In [10]:
import pandas as pd
import requests
from bs4 import BeautifulSoup


# Download the Wikipedia page that lists the "M" postal codes.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops an HTTP error page from being parsed as if it were the real table.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
res = response.text
soup = BeautifulSoup(res,'lxml')

# The first <table> element on the page is used as the postal-code table.
table = soup.find_all('table')[0]
column_names = ['Postalcode','Borough','Neighborhood']

# Collect every table row first and build the DataFrame once; appending with
# df.loc[len(df)] re-allocates the frame on every row.
rows = []
for tr_cell in table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    # Rows that do not yield exactly one <td> per expected column are skipped.
    if len(row_data) == len(column_names):
        rows.append(row_data)

df = pd.DataFrame(rows, columns=column_names)
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Cleaning and Preprocessing

In [11]:
# Discard postal codes whose borough is 'Not assigned', then collapse rows that
# share a (Postalcode, Borough) pair into one row whose remaining text column
# is joined with ', '. sort=False keeps the groups in first-seen order.
has_borough = df['Borough'].ne('Not assigned')
df = (
    df.loc[has_borough]
    .groupby(['Postalcode', 'Borough'], sort=False)
    .agg(', '.join)
    .reset_index()
)
df

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [13]:
# Assign the borough name to the neighbourhood when the neighbourhood is
# missing. DataFrame.iterrows() hands back a copy of each row, so writing to
# that row never reaches df; a boolean mask with .loc updates df itself.
# Both the literal 'Not assigned' and an empty string are treated as missing.
unassigned = df['Neighborhood'].isin(['Not assigned', ''])
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

In [14]:
# Number of rows in the cleaned table
len(df.index)

103

In [18]:
# Import the geospatial data and rename its key column so that it matches
# the 'Postalcode' column used by df.
lat_lng_coords = pd.read_csv('https://cocl.us/Geospatial_data').rename(
    columns={'Postal Code': 'Postalcode'}
)

# Join the coordinates onto the postal-code table. The default (inner) merge
# keeps only postal codes present in both frames.
df = df.merge(lat_lng_coords, on='Postalcode')
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [20]:
# Keep only the rows whose borough name contains 'Toronto' and renumber the
# index from 0, discarding the old index values.
in_toronto = df['Borough'].str.contains('Toronto')
df_toronto = df.loc[in_toronto].reset_index(drop=True)
df_toronto.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [22]:
# Average coordinates per neighbourhood. The numeric columns are selected
# explicitly: df_toronto also carries the text columns Postalcode and Borough,
# and pandas >= 2.0 raises a TypeError when asked for the mean of those
# instead of silently dropping them as older versions did.
df_grouped = (
    df_toronto
    .groupby('Neighborhood')[['Latitude', 'Longitude']]
    .mean()
    .reset_index()
)
df_grouped.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Berczy Park,43.644771,-79.373306
1,"Brockton, Parkdale Village, Exhibition Place",43.636847,-79.428191
2,Business reply mail Processing Centre,43.662744,-79.321558
3,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442
4,Central Bay Street,43.657952,-79.387383


In [23]:
# Print, for each neighbourhood, its numeric columns ranked from largest to
# smallest value (at most num_top_venues of them).
num_top_venues = 5

for hood in df_grouped['Neighborhood']:
    print("----" + hood + "----")

    # Transpose the single matching row so each column becomes a
    # (name, value) record.
    hood_rows = df_grouped[df_grouped['Neighborhood'] == hood]
    temp = hood_rows.T.reset_index()
    temp.columns = ['venue', 'freq']

    # The first record is the Neighborhood name itself; drop it, then make the
    # values numeric and round them to two decimals.
    temp = temp.iloc[1:].astype({'freq': float}).round({'freq': 2})

    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----Berczy Park----
       venue   freq
0   Latitude  43.64
1  Longitude -79.37


----Brockton, Parkdale Village, Exhibition Place----
       venue   freq
0   Latitude  43.64
1  Longitude -79.43


----Business reply mail Processing Centre----
       venue   freq
0   Latitude  43.66
1  Longitude -79.32


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
       venue   freq
0   Latitude  43.63
1  Longitude -79.39


----Central Bay Street----
       venue   freq
0   Latitude  43.66
1  Longitude -79.39


----Christie----
       venue   freq
0   Latitude  43.67
1  Longitude -79.42


----Church and Wellesley----
       venue   freq
0   Latitude  43.67
1  Longitude -79.38


----Commerce Court, Victoria Hotel----
       venue   freq
0   Latitude  43.65
1  Longitude -79.38


----Davisville----
       venue   freq
0   Latitude  43.70
1  Longitude -79.39


----Davisville North----
       venue   freq
0   Latitude  43.71
1  Longitude

Clustering

In [29]:
# set number of clusters
from sklearn.cluster import KMeans
kclusters = 5

# K-means needs purely numeric input, so the neighbourhood name is dropped.
# The column is named with the `columns=` keyword: passing the axis as a
# second positional argument (drop('Neighborhood', 1)) was removed in pandas 2.0.
df_grouped_clustering = df_grouped.drop(columns='Neighborhood')

# run k-means clustering; random_state fixes the initialisation so the labels
# are reproducible between runs
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_grouped_clustering)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([0, 1, 4, 0, 0, 1, 0, 0, 2, 2], dtype=int32)

In [30]:
import colorsys

import folium # map rendering library (must be installed in the kernel's environment)

# Attach each neighbourhood's k-means label so markers can be coloured by
# cluster. kmeans was fitted on the rows of df_grouped, so labels_ lines up
# with df_grouped row for row.
toronto_merged = df_grouped.copy()
toronto_merged['Cluster Labels'] = kmeans.labels_

# Centre the map on the mean coordinate of the neighbourhoods being plotted.
latitude = toronto_merged['Latitude'].mean()
longitude = toronto_merged['Longitude'].mean()

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced hue per cluster,
# converted from HSV to a '#rrggbb' string using only the standard library
rainbow = [
    '#{:02x}{:02x}{:02x}'.format(
        *(int(round(255 * channel))
          for channel in colorsys.hsv_to_rgb(i / kclusters, 1.0, 1.0))
    )
    for i in range(kclusters)
]

# add markers to the map
for lat, lon, poi, cluster in zip(
        toronto_merged['Latitude'],
        toronto_merged['Longitude'],
        toronto_merged['Neighborhood'],
        toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # labels run from 0 to kclusters - 1, so they index the palette directly
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

ModuleNotFoundError: No module named 'folium'