In [1]:
import pandas as pd

<b>
Use Pandas' read_html' function to pull the list of postal codes out of the wikipedia page and assigned all entries to a dataframe. Then I display it for confirmation with the .head() function.
</b>

In [2]:
df=pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")[0]
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


<b>Pull all entries of the dataframe out that do not have a value of 'not assigned' in the Borough column then reassign the data frame to only include those entries.</b>

In [3]:
df = df[df.Borough != "Not assigned"]

<b>Group rows together by those that share the same 'Postcode' and 'Borough' entries and append all unique 'Neighbourhood' entries into a list seperated by commas. Here we also arrange the dataframe in order by postcode and reset the index for future convenience.</b>

In [4]:
df2 = df.groupby(['Postal code','Borough'])['Neighborhood'].apply(lambda x: ','.join(x.astype(str))).reset_index()
df2.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<b>Search the dataframe for entries in the 'Neighborhood' column with the value of 'not assigned' and replace those with the entry present in that row's 'Borough' column.</b>

In [5]:
df2.loc[df2['Neighborhood'] == ('Not assigned'), 'Neighborhood'] = df2['Borough']

<b>Here we Print out the number of rows present in the final dataframe using the .shape[] function.</b>

In [6]:
print("The final number of rows in this dataframe is", df2.shape[0])

The final number of rows in this dataframe is 103


<b>Read the Geospatial data in via Pandas' read_csv function and assigned it to a dataframe</b>

In [7]:
da = pd.read_csv('http://cocl.us/Geospatial_data')
da.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


<b>Join the dataframes on common Postal Code entries.</b>

In [10]:
df_toronto = pd.merge(df2, da, left_on = 'Postal code', right_on = 'Postal Code')
df_toronto.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,M1B,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,M1C,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [11]:
import numpy as np 

from geopy.geocoders import Nominatim
import requests 

from pandas.io.json import json_normalize 
import json

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans
import folium

<b>Let's check out the neighbourhood of Etobicoke (my brother used to live there)</b>

In [12]:
df_etobicoke = df_toronto[df_toronto['Borough'] == 'Etobicoke'].reset_index(drop=True)
df_etobicoke

Unnamed: 0,Postal code,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M8V,Etobicoke,New Toronto / Mimico South / Humber Bay Shores,M8V,43.605647,-79.501321
1,M8W,Etobicoke,Alderwood / Long Branch,M8W,43.602414,-79.543484
2,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North,M8X,43.653654,-79.506944
3,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...,M8Y,43.636258,-79.498509
4,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...,M8Z,43.628841,-79.520999
5,M9A,Etobicoke,Islington Avenue,M9A,43.667856,-79.532242
6,M9B,Etobicoke,West Deane Park / Princess Gardens / Martin Gr...,M9B,43.650943,-79.554724
7,M9C,Etobicoke,Eringate / Bloordale Gardens / Old Burnhamthor...,M9C,43.643515,-79.577201
8,M9P,Etobicoke,Westmount,M9P,43.696319,-79.532242
9,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...,M9R,43.688905,-79.554724


In [14]:
#from wikipedia
e_lat = 43.62
e_long = -79.51

# create map of Manhattan using latitude and longitude values
map_etobicoke = folium.Map(location=[e_lat, e_long], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_etobicoke['Latitude'], df_etobicoke['Longitude'], df_etobicoke['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_etobicoke)  
    
#map_etobicoke

In [19]:
CLIENT_ID = '4IKRNFS2YHQAGAISXQ5PPCAMPXPLGIEQGPEI2ZWFCFJPAV4L' 
CLIENT_SECRET = 'CBDOMH24XKR4DLJHRGJILEQYR40HUVYEJJ2KHUEUUTGRKZJZ'
VERSION = '20190623'


<b>Write a function to make pulling nearby venues easier</b>

In [20]:
def getNearbyVenues(names, latitudes, longitudes, radius=600):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            800, 
            300)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

<b>Call foursquare to get the nearby venues for each Neighbourhood</b>


In [21]:
etobicoke_venues = getNearbyVenues(names=df_etobicoke['Neighborhood'],
                                   latitudes=df_etobicoke['Latitude'],
                                   longitudes=df_etobicoke['Longitude']
                                  )

New Toronto / Mimico South / Humber Bay Shores
Alderwood / Long Branch
The Kingsway / Montgomery Road / Old Mill North
Old Mill South / King's Mill Park / Sunnylea / Humber Bay / Mimico NE / The Queensway East / Royal York South East / Kingsway Park South East
Mimico NW / The Queensway West / South of Bloor / Kingsway Park South West / Royal York South West
Islington Avenue
West Deane Park / Princess Gardens / Martin Grove / Islington / Cloverdale
Eringate / Bloordale Gardens / Old Burnhamthorpe / Markland Wood
Westmount
Kingsview Village / St. Phillips / Martin Grove Gardens / Richview Gardens
South Steeles / Silverstone / Humbergate / Jamestown / Mount Olive / Beaumond Heights / Thistletown / Albion Gardens
Northwest


<b>

Convert the Venue data to one hot encoding then built a data frame with it and their Neighbourhoods
</b>

In [22]:
etobicoke_onehot = pd.get_dummies(etobicoke_venues[['Venue Category']], prefix="", prefix_sep="")
etobicoke_onehot['Neighbourhood'] = etobicoke_venues['Neighbourhood'] 
fixed_columns = [etobicoke_onehot.columns[-1]] + list(etobicoke_onehot.columns[:-1])
etobicoke_venuelist = etobicoke_onehot.groupby('Neighbourhood').mean().reset_index()

<b>

Find the 5 most common venues for each Neighbourhood
</b>

In [23]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
e_neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
e_neighborhoods_venues_sorted['Neighbourhood'] = etobicoke_venuelist['Neighbourhood']

for ind in np.arange(etobicoke_venuelist.shape[0]):
    e_neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(etobicoke_venuelist.iloc[ind, :], num_top_venues)

e_neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Alderwood / Long Branch,Convenience Store,Pizza Place,Gym,Sandwich Place,Donut Shop
1,Eringate / Bloordale Gardens / Old Burnhamthor...,Coffee Shop,Beer Store,Pet Store,Convenience Store,Pizza Place
2,Islington Avenue,Pharmacy,Park,Grocery Store,Café,Bank
3,Kingsview Village / St. Phillips / Martin Grov...,Business Service,Bank,Chinese Restaurant,Intersection,Mobile Phone Shop
4,Mimico NW / The Queensway West / South of Bloo...,Gym / Fitness Center,Restaurant,Burrito Place,Italian Restaurant,Burger Joint


<b>

Run k-means to cluster the neighbourhoods in 5 clusters
</b>

In [24]:
# set number of clusters
kclusters = 5

etobicoke_venuelist_clustering = etobicoke_venuelist.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(etobicoke_venuelist_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([4, 4, 3, 0, 0, 0, 1, 2, 4, 0], dtype=int32)

In [25]:
e_neighborhoods_venues_sorted.insert(0, 'Clusters', kmeans.labels_)


<b>

Merge the information back into one dataframe for easy analysis and future use
</b>

In [28]:
df_etobicoke_merged = df_etobicoke

df_etobicoke_merged = df_etobicoke_merged.join(e_neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighborhood')

df_etobicoke_merged.dropna(axis=0,inplace = True)
df_etobicoke_merged['Clusters'] = df_etobicoke_merged['Clusters'].astype('int')

df_etobicoke_merged

Unnamed: 0,Postal code,Borough,Neighborhood,Postal Code,Latitude,Longitude,Clusters,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M8V,Etobicoke,New Toronto / Mimico South / Humber Bay Shores,M8V,43.605647,-79.501321,0,Pharmacy,Café,Fried Chicken Joint,Park,Music Venue
1,M8W,Etobicoke,Alderwood / Long Branch,M8W,43.602414,-79.543484,4,Convenience Store,Pizza Place,Gym,Sandwich Place,Donut Shop
2,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North,M8X,43.653654,-79.506944,0,Italian Restaurant,Breakfast Spot,Sushi Restaurant,Park,Bank
3,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...,M8Y,43.636258,-79.498509,2,Park,Construction & Landscaping,Gym / Fitness Center,Home Service,Flower Shop
4,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...,M8Z,43.628841,-79.520999,0,Gym / Fitness Center,Restaurant,Burrito Place,Italian Restaurant,Burger Joint
5,M9A,Etobicoke,Islington Avenue,M9A,43.667856,-79.532242,3,Pharmacy,Park,Grocery Store,Café,Bank
6,M9B,Etobicoke,West Deane Park / Princess Gardens / Martin Gr...,M9B,43.650943,-79.554724,4,Hotel,Convenience Store,Pizza Place,Bank,Mexican Restaurant
7,M9C,Etobicoke,Eringate / Bloordale Gardens / Old Burnhamthor...,M9C,43.643515,-79.577201,4,Coffee Shop,Beer Store,Pet Store,Convenience Store,Pizza Place
8,M9P,Etobicoke,Westmount,M9P,43.696319,-79.532242,4,Pizza Place,Golf Course,Sandwich Place,Middle Eastern Restaurant,Golf Driving Range
9,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...,M9R,43.688905,-79.554724,0,Business Service,Bank,Chinese Restaurant,Intersection,Mobile Phone Shop


<b>

Visualize the Results
</b>

In [30]:
# create map
map_clusters = folium.Map(location=[e_lat, e_long], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_etobicoke_merged['Latitude'], df_etobicoke_merged['Longitude'], df_etobicoke_merged['Neighborhood'],df_etobicoke_merged['Clusters']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters
