# The Battle of Neighborhoods

In [1]:
import pandas as pd
import numpy as np
import lxml

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import folium

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV



## Import Data

In [2]:
# Scrape every table on the Wikipedia page; the postal-code table is the first one.
wiki_tables = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    flavor='lxml',
)
df_neighbor = wiki_tables[0]
df_neighbor.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [3]:
# Parse the venue file once. `locs` is an independent copy of the raw rows,
# so later reassignments of `df_venues` do not affect it and the CSV is not
# read from disk a second time.
df_venues = pd.read_csv('toronto_venues.csv')
locs = df_venues.copy()
df_venues.head()

Unnamed: 0.1,Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,0,"Malvern, Rouge",43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa
1,1,"Malvern, Rouge",43.806686,-79.194353,Harvey's,43.80002,-79.198307,Restaurant
2,2,"Malvern, Rouge",43.806686,-79.194353,Wendy's,43.802008,-79.19808,Fast Food Restaurant
3,3,"Malvern, Rouge",43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
4,4,"Malvern, Rouge",43.806686,-79.194353,RBC Royal Bank,43.798782,-79.19709,Bank


In [4]:
# The first table on the page holds the per-postal-code price listing.
price_tables = pd.read_html('https://housepricehub.com/areas')
df_prices = price_tables[0]
df_prices.head()

Unnamed: 0,Postal Code,City,Average Price,Average Price Per Frontage Ft,Total listings,Average Price Trend
0,K0E,Spencerville,"$1,149,289","$5,435",46,
1,K6V,Brockville,"$313,700","$6,580",8,
2,K0G,Merrickville,"$690,405","$4,702",86,
3,K0C,Green Valley,"$597,166","$4,215",85,
4,L0H,Whitchurch-Stouffville,"$2,157,574",,12,


## Transform Data

Neighborhood data cleansing: rows with an unassigned borough are removed, and the neighborhoods are grouped by postal code.

In [5]:
# Keep only rows whose borough is assigned and use the US spelling for the
# neighborhood column.
assigned = df_neighbor.loc[df_neighbor['Borough'] != 'Not assigned']
assigned = assigned.rename(columns={'Neighbourhood': 'Neighborhood'})

# Collapse to one row per (postal code, borough), joining the neighborhood
# names with ", "; the borough column is not needed afterwards.
joined_names = assigned.groupby(["Postal Code", "Borough"])["Neighborhood"].apply(", ".join)
df_neighbor = joined_names.reset_index().drop(columns=["Borough"])
df_neighbor.head()

Unnamed: 0,Postal Code,Neighborhood
0,M1B,"Malvern, Rouge"
1,M1C,"Rouge Hill, Port Union, Highland Creek"
2,M1E,"Guildwood, Morningside, West Hill"
3,M1G,Woburn
4,M1H,Cedarbrae


Venue categories that occur 45 times or fewer are filtered out, and a pivot table of venue counts per neighborhood is built.

In [6]:
# A venue category is kept only if it occurs more than this many times.
MIN_CATEGORY_COUNT = 45

# Drop rows whose category is the literal 'Neighborhood' and keep the two
# columns needed for counting.
venue_pairs = df_venues.loc[
    df_venues['Venue Category'] != 'Neighborhood',
    ['Neighborhood', 'Venue Category'],
]

# Categories frequent enough to be used as features.
category_counts = venue_pairs['Venue Category'].value_counts()
gt_value = list(category_counts[category_counts > MIN_CATEGORY_COUNT].index)
venue_pairs = venue_pairs[venue_pairs['Venue Category'].isin(gt_value)]

# Count venues per (neighborhood, category). `assign` returns a new frame, so
# the helper column is never written into a filtered slice
# (avoids SettingWithCopyWarning).
df_venues = (
    venue_pairs
    .assign(count=1)
    .groupby(['Neighborhood', 'Venue Category'], as_index=False)
    .count()
)

# One row per neighborhood, one column per category; combinations that never
# occur become 0.
he_venues = pd.pivot_table(df_venues, values='count', index='Neighborhood', columns='Venue Category').fillna(0)
he_venues = he_venues.reset_index()
he_venues.head()

Venue Category,Neighborhood,Bakery,Bank,Bar,Café,Chinese Restaurant,Coffee Shop,Fast Food Restaurant,Gastropub,Grocery Store,...,Japanese Restaurant,Park,Pharmacy,Pizza Place,Pub,Restaurant,Sandwich Place,Sushi Restaurant,Thai Restaurant,Vegetarian / Vegan Restaurant
0,Agincourt,2.0,1.0,0.0,0.0,6.0,2.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,1.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,2.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,...,0.0,1.0,1.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0
3,Bayview Village,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,...,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",1.0,2.0,0.0,1.0,0.0,3.0,1.0,0.0,1.0,...,0.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0


Filter df_prices to Toronto and convert the average price to an integer, ready to be merged with df_neighbor.

In [7]:
# Keep only rows listed under Toronto, with the two columns used downstream.
# Rows without a listed price cannot be converted to a number, so they are
# dropped instead of making the conversion fail.
toronto_prices = (
    df_prices
    .loc[df_prices['City'] == 'Toronto', ['Postal Code', 'Average Price']]
    .dropna(subset=['Average Price'])
)

# Strip the currency symbol and thousands separators ("$1,149,289" -> 1149289).
# regex=False treats '$' as a literal character rather than a regex anchor.
price_as_int = (
    toronto_prices['Average Price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype('int64')
)

# `assign` builds a new frame, so nothing is written into a slice of the
# scraped table (avoids SettingWithCopyWarning).
df_prices = toronto_prices.assign(**{'Average Price': price_as_int})
df_prices.head()

Unnamed: 0,Postal Code,Average Price
512,M9W,1092236
531,N4V,2594950
608,M3B,6329289
610,M4V,4635857
611,M3C,8074714


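Final merge: prices are joined to neighborhoods by postal code, and the result is joined to the venue counts by neighborhood.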

In [8]:
# Attach prices to neighborhoods by postal code, then attach the venue counts
# by neighborhood name. Inner joins keep only rows present on both sides.
priced_neighborhoods = df_neighbor.merge(df_prices, how='inner', on='Postal Code')
with_venues = priced_neighborhoods.merge(he_venues, how='inner', on='Neighborhood')

# The neighborhood name was only needed as a join key.
df_total = with_venues.drop(columns=["Neighborhood"])
print('shape: ',df_total.shape)
df_total.head()

shape:  (90, 25)


Unnamed: 0,Postal Code,Average Price,Bakery,Bank,Bar,Café,Chinese Restaurant,Coffee Shop,Fast Food Restaurant,Gastropub,...,Japanese Restaurant,Park,Pharmacy,Pizza Place,Pub,Restaurant,Sandwich Place,Sushi Restaurant,Thai Restaurant,Vegetarian / Vegan Restaurant
0,M1B,922180,0.0,1.0,0.0,0.0,1.0,2.0,2.0,0.0,...,0.0,1.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
1,M1C,1463985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,1049085,0.0,2.0,0.0,0.0,0.0,1.0,2.0,0.0,...,0.0,0.0,2.0,4.0,0.0,1.0,1.0,0.0,0.0,0.0
3,M1G,1062085,0.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,1088000,2.0,2.0,0.0,0.0,1.0,2.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


## Prepare Data

In [9]:
# Target: the average price. Features: every remaining column except the
# postal-code identifier.
y = df_total['Average Price']
X = df_total.drop(columns=['Postal Code', 'Average Price'])

In [10]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print(f'x_train: {X_train.shape},       x_test: {X_test.shape}')

x_train: (67, 23),       x_test: (23, 23)


## Model and results

A random forest regressor is used for modeling.

In [11]:
# Hyperparameters of the forest, collected in one place.
forest_params = {
    'n_estimators': 200,
    'max_depth': 3,
    'min_samples_split': 6,
    'random_state': 42,
}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

RandomForestRegressor(max_depth=3, min_samples_split=6, n_estimators=200,
                      random_state=42)

The 10 venue categories with the highest feature importance for predicting the average property price.

In [12]:
# Column 0: feature (venue category) name; column 1: its importance in the
# fitted forest.
feature_names = list(X_train.columns)
res = pd.DataFrame({0: feature_names, 1: model.feature_importances_})
res.sort_values(by=1, ascending=False).head(10)

Unnamed: 0,0,1
14,Park,0.255882
9,Gym,0.116383
13,Japanese Restaurant,0.110412
8,Grocery Store,0.109402
10,Hotel,0.075849
18,Restaurant,0.072519
16,Pizza Place,0.054789
12,Italian Restaurant,0.039089
3,Café,0.036023
15,Pharmacy,0.031876


## Visualization

In [13]:
# Attach coordinates to every neighborhood (left join keeps neighborhoods even
# when no coordinates are found), then keep only postal codes with a price.
df_coor = pd.read_csv("Geospatial_Coordinates.csv")
df = df_neighbor.merge(df_coor, how='left', on='Postal Code')
df_map = df.merge(df_prices, how='inner', on='Postal Code')

# Venue categories to overlay on the map.
best = ['Park']

# Re-read the raw venue list and keep only the location columns of the
# selected categories. `isin` is the vectorized form of a per-row
# membership test.
locs = pd.read_csv('toronto_venues.csv')
locs = locs.loc[
    locs['Venue Category'].isin(best),
    ['Venue Latitude', 'Venue Longitude', 'Venue Category'],
]
locs.head()

Unnamed: 0,Venue Latitude,Venue Longitude,Venue Category
16,43.806266,-79.182675,Park
21,43.786257,-79.148776,Park
47,43.776594,-79.215052,Park
48,43.765397,-79.22013,Park
136,43.710527,-79.278966,Park


Visualization of the relationship between price and parks: neighborhoods are colored by price band, and parks are drawn as small black dots.

In [14]:
def add_circle_markers(target_map, points, radius):
    """Add one popup-labelled circle marker to `target_map` per point.

    Parameters
    ----------
    target_map : folium.Map
        Map the markers are added to.
    points : iterable of (lat, lng, text, color)
        Position, popup text and stroke/fill color of each marker.
    radius : int
        Marker radius passed to folium.CircleMarker.
    """
    for lat, lng, text, color in points:
        popup = folium.Popup('{}'.format(text), parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=radius,
            popup=popup,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.7,
            # NOTE(review): parse_html is forwarded exactly as before; it is
            # probably not a CircleMarker option - confirm before removing.
            parse_html=False).add_to(target_map)


# Split the price range into 4 equal-width bands and give each band a color.
df_map['colors'] = pd.cut(df_map['Average Price'], bins=4,
                          labels=['#F44336', '#9C27B0', '#3F51B5', '#03A9F4'])

map_toronto = folium.Map(location=[43.6534817, -79.3839347], zoom_start=10)

# Neighborhoods: larger markers, colored by price band, labelled by name.
neighborhood_points = zip(df_map['Latitude'], df_map['Longitude'],
                          df_map['Neighborhood'], df_map['colors'])
add_circle_markers(map_toronto, neighborhood_points, radius=5)

# Selected venues: small black dots labelled with their category.
venue_points = (
    (lat, lng, cat, '#000000')
    for lat, lng, cat in zip(locs['Venue Latitude'], locs['Venue Longitude'],
                             locs['Venue Category'])
)
add_circle_markers(map_toronto, venue_points, radius=1)

map_toronto