In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import geo

In [None]:
# Read data_bc.csv, keep the rows whose addressRegion is 'BC', and order them
# by addressLocality.
data = pd.read_csv('data_bc.csv')
dataBC = (
    data
    .loc[data['addressRegion'].eq('BC')]
    .sort_values(by='addressLocality')
)
# dataBC.head()

In [None]:
# Narrow the frame to the columns used downstream. DataFrame.filter(items=...)
# keeps only the labels that exist, so an absent column is skipped rather than
# raising.
dataBC = dataBC.filter(
    items=[
        'streetAddress',
        'addressLocality',
        'addressRegion',
        'postalCode',
        'latitude',
        'longitude',
        'price',
        'property-beds',
        'property-baths',
        'property-sqft',
        # Property Type values: Agricultural, Condo, Condo/Townhouse, Duplex,
        # Manufactured Home, Mobile Home, MultiFamily, Single Family, Townhome,
        # Vacant Land
        'Property Type',
        'Square Footage',
    ]
)

# Only 'Single Family' listings are analysed from here on.
data_bc_single_family = dataBC.loc[dataBC['Property Type'].eq('Single Family')]

In [None]:
# Localities used to restrict the listings to the Metro Vancouver area.
metro_vancouver_cities = [
    "Vancouver",
    "Burnaby",
    "Richmond",
    "Surrey",
    "Coquitlam",
    "North Vancouver",
    "West Vancouver",
    "New Westminster",
    "Delta",
    "Port Coquitlam",
    "Port Moody",
    "Langley",
]
data_bc_single_family = data_bc_single_family.loc[
    data_bc_single_family['addressLocality'].isin(metro_vancouver_cities)
]

# Small working sample: the first five rows of the filtered listings.
sample = data_bc_single_family.head(5)
sample

In [None]:
# data_bc_single_family.sort_values(by='Price', ascending=False)
# data_bc_single_family.to_csv('data_bc_1.csv')

In [None]:
# Coordinates of the sampled listings as a 2-D array of [latitude, longitude] rows.
lat_lon_array = sample.loc[:, ['latitude', 'longitude']].to_numpy()
lat_lon_array

In [None]:
# For every sampled coordinate, look up nearby amenities and average the
# haversine distance to each category (convenience shopping, transit, schools).
# NOTE(review): geo.get_specific_amenities and geo.haversine are project helpers
# that are not visible here; the radius is presumably in metres and the
# returned distances presumably in kilometres -- confirm against the geo module.

SEARCH_RADIUS = 3000
# Fixed typo: this used to read 'convencince', which can never match a shop
# value, so convenience stores were silently left out of the average.
CONVENIENCE_SHOPS = ('convenience', 'grocery')
TRANSIT_AMENITIES = ('bus_station', 'subway_station', 'railway_station')
SCHOOL_AMENITIES = ('school', 'university')


def _mean_or_nan(distances):
    """Return the mean of ``distances``, or NaN when the list is empty.

    Calling ``np.mean`` on an empty list also yields NaN but emits a
    "Mean of empty slice" RuntimeWarning for every listing without a match.
    """
    return np.mean(distances) if distances else np.nan


results = []
for lat, lon in lat_lon_array:
    amenities = geo.get_specific_amenities(lat, lon, radius=SEARCH_RADIUS)
    conv_distance = []
    transit_distance = []
    school_distance = []

    # A falsy lookup result (None / empty) simply leaves every category empty.
    for a in amenities or []:
        dist = geo.haversine(lat, lon, a['latitude'], a['longitude'])
        # Each amenity is counted in at most one category, checked in this order.
        if a.get('shop') in CONVENIENCE_SHOPS:
            conv_distance.append(dist)
        elif a.get('amenity') in TRANSIT_AMENITIES:
            transit_distance.append(dist)
        elif a.get('amenity') in SCHOOL_AMENITIES:
            school_distance.append(dist)

    results.append({
        'latitude': lat,
        'longitude': lon,
        'avg_convenience_dist': _mean_or_nan(conv_distance),
        'avg_transit_distance': _mean_or_nan(transit_distance),
        'avg_school_distance': _mean_or_nan(school_distance)
    })

# One row per coordinate; NaN marks a category with no amenity found.
results_df = pd.DataFrame(results)
results_df

In [None]:
# Attach the per-coordinate amenity averages from results_df to the listings.
merge_keys = ['latitude', 'longitude']
amenity_cols = [col for col in results_df.columns if col not in merge_keys]

sample = (
    sample
    # Remove columns left over from a previous run of this cell; otherwise a
    # re-run would produce duplicated '_x' / '_y' columns.
    .drop(columns=amenity_cols, errors='ignore')
    .merge(
        # Two listings at the same coordinates yield two identical key rows in
        # results_df; without de-duplication the join would multiply rows.
        results_df.drop_duplicates(subset=merge_keys),
        how='left',
        on=merge_keys,
        # Fail loudly if the right-hand side is ever not unique per coordinate.
        validate='many_to_one',
    )
)

In [None]:
# Display the sample after the amenity-distance columns have been merged in.
sample

In [None]:
# Inspect the price column, which is later divided by the median income.
print(sample['price'])

In [None]:
# Pull the "median employment income in 2020 for full-year full-time workers"
# figure out of CensusProfile2021.csv.
# NOTE(review): the filters assume column 0 holds the topic text, column 1 the
# characteristic text and 'Unnamed: 2' the value -- confirm against the file.
censusdata = pd.read_csv('CensusProfile2021.csv', encoding='latin1')

# Step 1: rows whose first column mentions individual income.
filtered_df = censusdata[censusdata.iloc[:, 0].str.contains("Income of individuals in", case=False, na=False)]
# Step 2: of those, rows whose second column is an average or a median.
final_df = filtered_df[filtered_df.iloc[:, 1].str.contains("average|median", case=False, na=False)]
# Step 3: rows that refer to 2020 in either of the first two columns.
final_2020_df = final_df[
    final_df.iloc[:, 0].str.contains("2020", na=False) |
    final_df.iloc[:, 1].str.contains("2020", na=False)
]
# Step 4: the single characteristic of interest.
final_2020_df = final_2020_df[final_2020_df.iloc[:, 1].str.contains("Median employment income in 2020 for full-year full-time workers in 2020", case=False, na=False)]
# print(final_2020_df)
#final_2020_df.to_csv('CensusData_cleaned')

# Convert the value column to numbers. assign() builds a new frame, which
# avoids the SettingWithCopyWarning raised by writing into a filtered slice.
final_2020_df = final_2020_df.assign(
    **{'Unnamed: 2': pd.to_numeric(final_2020_df['Unnamed: 2'], errors='coerce')}
)

if final_2020_df.empty:
    raise ValueError(
        "No 'Median employment income in 2020 for full-year full-time workers' "
        "row found in CensusProfile2021.csv"
    )

median_income = final_2020_df['Unnamed: 2'].iloc[0]
if pd.isna(median_income):
    # errors='coerce' turns unparseable text into NaN; stop here instead of
    # letting NaN flow into every price-to-income ratio.
    raise ValueError("Median income value in 'Unnamed: 2' could not be parsed as a number")

In [12]:

# Express each listing price as a multiple of the median income.
sample['Price-to-income Ratio'] = sample['price'].div(median_income)
sample

Unnamed: 0,streetAddress,addressLocality,addressRegion,postalCode,latitude,longitude,price,property-beds,property-baths,property-sqft,Property Type,Square Footage,avg_convenience_dist,avg_transit_distance,avg_school_distance,Price-to-income Ratio
0,"51 N SEA AVENUE, Burnaby, BC V5B 1K4",Burnaby,BC,V5B 1K4,49.284733,-122.979824,2188000.0,5.0,3.0,2515,Single Family,"2,515 SQFT",,,2.351229,33.661538
1,"7815 ALLMAN STREET, Burnaby, BC V5E 2A9",Burnaby,BC,V5E 2A9,49.235055,-122.950665,3999000.0,8.0,9.0,6399,Single Family,"6,399 SQFT",2.691901,,2.436122,61.523077
2,"8274 WEDGEWOOD STREET, Burnaby, BC V3N 1C4",Burnaby,BC,V3N 1C4,49.231928,-122.92704,1868000.0,3.0,1.0,1775,Single Family,"1,775 SQFT",,2.714781,,28.738462
3,"6555 DENBIGH AVENUE, Burnaby, BC V5H 3R7",Burnaby,BC,V5H 3R7,49.224338,-122.984424,3990000.0,5.0,4.0,2736,Single Family,"2,736 SQFT",0.250622,,1.973735,61.384615
4,"7088 HALIFAX STREET, Burnaby, BC V5A 1L8",Burnaby,BC,V5A 1L8,49.26771,-122.956486,2998888.0,6.0,5.0,4946,Single Family,"4,946 SQFT",1.324208,,2.681776,46.136738


The code below normalizes the data once it has been fully cleaned. It can be ignored for now; we will come back to it once we have clean data with all of our columns.

In [None]:
# Min-max scale the numeric features and encode the Yes/No flags as 1/0.
# NOTE(review): 'Bedrooms', 'Bathrooms', 'Acreage', 'Garage' and 'Parking' are
# not among the columns kept by the earlier column filter in this notebook
# (which keeps 'property-beds', 'property-baths', 'property-sqft'), and the
# displayed 'Square Footage' values are strings such as "2,515 SQFT". This cell
# will raise until the cleaned data provides these columns as numbers --
# confirm the final column names.
features = ['price', 'Bedrooms', 'Bathrooms', 'Acreage', 'Square Footage']
scaler = MinMaxScaler()

data_scaled = sample.copy()
data_scaled[features] = scaler.fit_transform(sample[features])

# The listing column is lowercase 'price'; the previous code read a
# non-existent 'Price' column here and failed with a KeyError. 'Price' is the
# inverted scaled price, so that a cheaper listing gets a higher value.
data_scaled['Price'] = 1 - data_scaled['price']
# Values other than 'Yes' / 'No' become NaN under map().
data_scaled['Garage'] = data_scaled['Garage'].map({'Yes': 1, 'No': 0})
data_scaled['Parking'] = data_scaled['Parking'].map({'Yes': 1, 'No': 0})
data_scaled.head()

In [None]:
# Composite score: an equally weighted sum of the listed columns of data_scaled.
score_features = [
    'Price', 'Bedrooms', 'Bathrooms',
    'Square Footage', 'Acreage',
    'Garage', 'Parking',
]

# Every feature gets the same weight, 1 / number of features.
weights = np.full(len(score_features), 1 / len(score_features))

# Add the score to a copy of the scaled frame and rank from best to worst.
data_scored = (
    data_scaled
    .assign(Score=data_scaled[score_features].dot(weights))
    .sort_values(by='Score', ascending=False)
)
data_scored