# Analyzing Neighborhoods in Toronto

In [99]:
#!conda install lxml --yes

In [100]:
#!conda install html5lib --yes

In [101]:
#!conda install BeautifulSoup4  --yes

In [6]:
import pandas as pd
import numpy as np

### Getting the dataframe from Wikipedia

In [7]:
# Scrape every table carrying the CSS class "wikitable" from the Wikipedia page
# listing Canadian postal codes that start with "M". `df` holds the list of
# tables that read_html returns.
postal_codes_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
df = pd.read_html(postal_codes_url, attrs={'class': 'wikitable'})

In [8]:
# pd.read_html returned a list of tables; keep the first one as the working DataFrame.
# NOTE: `df` is rebound from that list to a DataFrame here, so re-running this
# cell on its own (without re-running the read_html cell first) will fail.
df = pd.DataFrame(df[0])
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Filtering the dataframe

In [9]:
# Drop every row whose Borough is the 'Not assigned' placeholder.
has_borough = df['Borough'].ne('Not assigned')
df_filtered = df.loc[has_borough]
# Preview the remaining rows.
df_filtered.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [11]:
# Treat the 'Not assigned' placeholder as missing, then fall back to the
# borough name for any neighbourhood that is missing.
neighbourhood_filled = (
    df_filtered['Neighbourhood']
    .replace('Not assigned', np.nan)
    .fillna(df_filtered['Borough'])
)
# fillna returns a new Series, so the result must be stored back on the frame;
# assign() builds a new DataFrame instead of mutating a slice of `df` in place,
# which avoids SettingWithCopyWarning and keeps the cell re-runnable.
df_filtered = df_filtered.assign(Neighbourhood=neighbourhood_filled)
# Display the cleaned column.
df_filtered['Neighbourhood']

2                     Parkwoods
3              Victoria Village
4                  Harbourfront
5              Lawrence Heights
6                Lawrence Manor
                 ...           
281    Kingsway Park South West
282                   Mimico NW
283          The Queensway West
284       Royal York South West
285              South of Bloor
Name: Neighbourhood, Length: 210, dtype: object

### Grouping the data so that neighbourhoods sharing the same postcode are in the same row

In [12]:
# Collect the neighbourhood names of each (Postcode, Borough) pair into a list.
df_grouped_by_postcode = (
    df_filtered
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(list)
)

# Move the two grouping keys out of the index and back into ordinary columns.
df_grouped = df_grouped_by_postcode.reset_index()

# Collapse each list into a single comma-separated string.
df_grouped['Neighbourhood'] = df_grouped['Neighbourhood'].apply(
    lambda names: ", ".join(str(name) for name in names)
)
df_grouped.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [13]:
# Dimensions of the grouped frame as (rows, columns).
df_grouped.shape

(103, 3)

### Part 2: Getting latitude and longitude with CSV geospatial data

In [14]:
#!pip install wget

In [15]:
import wget

In [16]:
# Download the geospatial coordinates CSV; wget.download returns the local
# filename it saved the download under, which is kept in `file`.
file = wget.download('http://cocl.us/Geospatial_data')

  0% [                                                                                ]    0 / 2891100% [................................................................................] 2891 / 2891

In [17]:
# Load the coordinates from the path wget.download returned (`file`) rather
# than a hard-coded filename: wget saves under a different name when the file
# already exists, so the fixed name could silently point at a stale download.
# The key column is renamed to 'Postcode' so it matches df_grouped for the merge.
df_geo = pd.read_csv(file).rename(columns={'Postal Code': 'Postcode'})
df_geo.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [18]:
# Left-join the coordinates onto the grouped postcodes, so every grouped row is
# kept even if no coordinates are found for its postcode.
df_final = df_grouped.merge(df_geo, how='left', on='Postcode')
df_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Part 3: Clustering the data

In [19]:
# Latitude / longitude used as the centre point of the folium maps below.
toronto_lat, toronto_lon = 43.651070, -79.347015

In [20]:
#!conda install -c conda-forge folium=0.5.0 --yes

In [45]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

In [30]:
# Keep only the boroughs whose name contains the word "Toronto" and renumber
# the rows from zero.
is_toronto_borough = df_final['Borough'].str.contains('Toronto')
toronto_df = df_final.loc[is_toronto_borough].reset_index(drop=True)
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [47]:
# Create a map centred on Toronto using the latitude and longitude values.
map_toronto = folium.Map(location=[toronto_lat, toronto_lon], zoom_start=12)

# Add one circle marker per postcode row, with the neighbourhood name(s) as the popup.
for lat, lng, label in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Neighbourhood']):
    # parse_html belongs to Popup (it controls how the label text is rendered);
    # it is not a CircleMarker style option, so it is only passed here.
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

# Last expression of the cell, so the map is rendered as the cell output.
map_toronto

In [34]:
from sklearn.cluster import KMeans

In [42]:
# Cluster on the coordinates only.
X = toronto_df[['Latitude', 'Longitude']]

# Fit k-means with 5 clusters; random_state is fixed so repeated runs give the
# same assignment.
kmeans = KMeans(n_clusters=5, random_state=0)
kmeans.fit(X)

# Cluster index assigned to each row of X.
kmeans.labels_

array([4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 3, 3, 3, 1, 1, 1, 3, 0, 3, 3, 0, 0, 0, 1, 4])

In [43]:
# Attach each row's k-means cluster label as the first column.
# DataFrame.insert raises ValueError when the column already exists, so drop any
# earlier copy first; this keeps the cell safe to re-run (e.g. after re-fitting).
toronto_df = toronto_df.drop(columns=['Cluster Labels'], errors='ignore')
toronto_df.insert(0, 'Cluster Labels', kmeans.labels_)

## Map with the neighbourhoods separated

In [46]:
# Create the base map centred on Toronto.
map_clusters = folium.Map(location=[toronto_lat, toronto_lon], zoom_start=12)

# Build one distinct colour per cluster. The number of clusters is derived from
# the labels being plotted instead of being hard-coded, so the palette follows
# whatever number of clusters was used when fitting k-means.
n_clusters = int(toronto_df['Cluster Labels'].max()) + 1
colors_array = cm.rainbow(np.linspace(0, 1, n_clusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one marker per postcode row, coloured by its cluster label.
for lat, lon, poi, cluster in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Neighbourhood'], toronto_df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the palette directly; the former
    # `cluster - 1` made cluster 0 wrap around to the last colour via index -1.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7,
    ).add_to(map_clusters)

# Last expression of the cell, so the map is rendered as the cell output.
map_clusters