# Toronto Neighborhood Clustering

In [1]:
#taking care of all the imports
import pandas as pd
import numpy as np

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
import requests #for getting webpage data

!conda install -c conda-forge beautifulsoup4 --yes

from bs4 import BeautifulSoup #for web-scraping

#!conda install -c conda-forge geocoder --yes

#import geocoder #for getting addresses

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.0.2p             |       h470a237_1         3.1 MB  conda-forge
    geopy-1.17.0               |             py_0          49 KB  conda-forge
    certifi-2018.11.29         |        py36_1000         145 KB  conda-forge
    conda-4.5.11               |        py36_1000         651 KB  conda-forge
    ca-certificates-2018.11.29 |       ha4d7672_0         143 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.1 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0            conda-forge
    geopy:           

In [2]:
#getting the html page
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html_text=requests.get(url)

soup=BeautifulSoup(html_text.text,'html.parser')


### Getting the data from the Soup file into a DataFrame

In [3]:
toronto_base=pd.DataFrame()

#marker to avoid getting the first header data
skip_head=True

#looking for row tags
for tag in soup.find_all('tr'):
    #marker to tell which piece of data we should be looking at
    i=1
    
    #iterate through children within the table row
    for ch in tag.children:
        #skip if this is part of the table header or if it's not a table entry
        if(ch.name!=None and skip_head==False):
            if(i==1):  #then this is the Postal Code
                postcode=ch.string
                i=i+1
            elif(i==2):  #then this is the Borough
                borough=ch.string
                i=i+1
            elif(i==3):  #then this is the Neighborhood
                neigh=ch.string
                if(neigh!='Not assigned\n'):  #then need to dig a little deeper into the hyperlink tag to get the data
                    if(ch.a!=None):
                        neigh=ch.a.string
                
                i=i+1
            
    #re-set marker 
    i=1
    
    if(skip_head==False):  #then this is not the first time cycling through
        if(borough!='Not assigned'):  #using this to screen out postal codes that aren't assigned
            #adding the row to the dataframe
            toronto_base=toronto_base.append([[postcode,borough,neigh]])
            
    #after 1st time through, this marker now goes false now that we've skipped the table header
    skip_head=False
    
#add in column headers
toronto_base.columns=('PostalCode','Borough','Neighborhood')
toronto_base.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
0,M4A,North York,Victoria Village
0,M5A,Downtown Toronto,Harbourfront
0,M5A,Downtown Toronto,Regent Park
0,M6A,North York,Lawrence Heights


### Pre-Processing the Data

In [168]:
#create the new dataframe with the correct headers
toronto_final=pd.DataFrame(columns=('PostalCode','Borough','Neighborhood'))

#getting one copy of each of the Postal Codes and putting them in the new dataframe
toronto_final['PostalCode']=toronto_base['PostalCode'].unique()
#sorting alphabetically and eliminating the 'None' Postal Code
toronto_final=toronto_final.sort_values('PostalCode')[:-1]

#resetting the index and dropping the old index column
toronto_final.reset_index(inplace=True)
toronto_final.drop(['index'],axis=1,inplace=True)
toronto_final.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,,
1,M1C,,
2,M1E,,
3,M1G,,
4,M1H,,


In [169]:
#iterating through each Postal Code
for i in range(0,toronto_final.shape[0]):
    #getting the corresponding Borough from the original dataframe and copying it into the new dataframe
    temp=toronto_base.loc[lambda df: df.PostalCode==toronto_final.loc[i]['PostalCode']]['Borough'].unique()
    toronto_final.loc[i]['Borough']=temp[0]
    
    #getting the list of neighborhoods from the original dataframe
    temp=toronto_base.loc[lambda df: df.PostalCode==toronto_final.loc[i]['PostalCode']]['Neighborhood'].unique()
    s='' #creating a new string object
    
    #iterating through the neighborhood list and adding them into the string object
    for j in range(0,len(temp)):
        if(temp[j][-1]=='\n'): #then need to eliminate \n at the end            
            temp[j]=temp[j][:-1]
        if(temp[j]=='Not assigned'): #then there's no Neighborhood name and need to copy the Borough name
            temp[j]=toronto_final.loc[i]['Borough']
        if(s==''):  #then it's the first neighborhood entered
            s=temp[j]
        else:  #there's already neighborhoods in the list and we need a comma
            s=s+", "+temp[j]
    #put the list of neighborhoods into the new dataframe
    toronto_final.loc[i]['Neighborhood']=s
toronto_final.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [6]:
print(toronto_final.shape)

(103, 3)


### Getting Latitude/Longitude

In [7]:
####### Tried geocoder, but it didn't work.  Below is the code I was trying.
#lat_lng=None
#got_em_all=True
#c=0

#all_lat_lng=pd.DataFrame()

#for i in range(0,toronto_final.shape[0]):
 #   lat_lng=None
  #  k=0
   
    #while(lat_lng==None) and k<10:
   
        #g=geocoder.google("{}, Toronto, Ontario".format(toronto_final.loc[i]['PostalCode']))
    #    lat_lng=g.latlng
     #   k=k+1

#    if(lat_lng==None):
 #       got_em_all=False
  #  else:
   #     all_lat_lng=all_lat_lng.append([[toronto_final.loc[i]['PostalCode'],lat_lng[0],lat_lng[1]]])
    #    c=c+1
#all_lat_lng.columns=('PostalCode','Latitude','Longitude')

#if(got_em_all==True):
 #   print('Obtained all latitude/longitude pairs')
#else:
 #   print('Missed some latitude/longitude pairs')
  #  print('Only got {} of them.',c)

In [122]:

#loading the latitude/longitude data
geos=pd.read_csv('http://cocl.us/Geospatial_data')
#Updating column headings
geos.columns=('PostalCode','Latitude','Longitude')
print(geos.shape)
geos.head()



(103, 3)


Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [178]:
#joining the two dataframes into one
toronto_total=toronto_final.join(geos.set_index('PostalCode'),on='PostalCode')
toronto_total.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Obtaining FourSquare Data about the Neighborhoods

In [10]:
#Loading Foursquare credentials and other query variables
CLIENT_ID = '0SORGAFMXO02X4VNKQPWTGGSFTPHLJPX215FAS0TSWA4XX4G' # your Foursquare ID
CLIENT_SECRET = 'FUEFJ1YFLRXK3CY2VMYILJLXYIL3OP03Z524NRYEJTFSOLSW' # your Foursquare Secret
VERSION = '20181120' # Foursquare API version
LIMIT=100


In [11]:
#Function to pull out the category name from a table entry that still has lingering JSON features
#Accepts a row as parameter
def get_category_type(r):
    cat_list=r['categories']
    
    if(len(cat_list)==0):  #then there is no category
        return None
    else:
        return cat_list[0]['name']

In [128]:
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

radius=1000 #<--for many postal codes, a radius of 500 didn't bring up any venues
#creating a new dataframe to hold all of the data
pc_venue_list=pd.DataFrame()

#cycle through the rows of postal codes
for i in range(0,toronto_total.shape[0]):
    #querying foursquare for venue info surrounding the postal code
    fsurl="https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(CLIENT_ID,
                                                                                                                            CLIENT_SECRET,
                                                                                                                            VERSION,toronto_total.loc[i]['Latitude'],toronto_total.loc[i]['Longitude'],radius,LIMIT)
    results=requests.get(fsurl).json()
    
    #converting the appropriate parts of the json file into a dataframe
    venue_list_json=results['response']['groups'][0]['items']
    venue_list=json_normalize(venue_list_json)
    
    if venue_list.shape[0]!=0:  #checking to make sure there's data in the venue list for this postal code
        #pull the needed columns and adjust titles
        venue_list=venue_list.loc[:,['venue.name','venue.categories','venue.location.lat','venue.location.lng']]
        venue_list.columns=[col.split('.')[-1] for col in venue_list.columns]
        
        #filter down to the category name
        venue_list['categories']=venue_list.apply(get_category_type,axis=1)
        
        #cycle through the venues and place the information in the final dataframe
        for j in range(0,venue_list.shape[0]-1):
            pc_venue_list=pc_venue_list.append([(toronto_total.loc[i]['PostalCode'],toronto_total.loc[i]['Borough'],toronto_total.loc[i]['Neighborhood'],
                                  toronto_total.loc[i]['Latitude'],toronto_total.loc[i]['Longitude'],venue_list.loc[j]['name'],venue_list.loc[j]['lat'],
                                  venue_list.loc[j]['lng'],venue_list.loc[j]['categories'])])
    else:  #there's no venue data for this postal code--simply put the postal code info in the main dataframe
        pc_venue_list=pc_venue_list.append([(toronto_total.loc[i]['PostalCode'],toronto_total.loc[i]['Borough'],toronto_total.loc[i]['Neighborhood'],
                                  toronto_total.loc[i]['Latitude'],toronto_total.loc[i]['Longitude'],'NA','NA','NA','NA')])

#adjust column names
pc_venue_list.columns=('PostalCode','Borough','Neighborhood','Latitude','Longitude','Venue','Venue_Latitude','Venue_Longitude','Venue_Category')

pc_venue_list.head(20)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Venue,Venue_Latitude,Venue_Longitude,Venue_Category
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Images Salon & Spa,43.8023,-79.1986,Spa
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Caribbean Wave,43.7986,-79.1958,Caribbean Restaurant
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.802,-79.1981,Fast Food Restaurant
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Harvey's,43.8001,-79.1983,Fast Food Restaurant
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.8074,-79.1991,Fast Food Restaurant
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Tim Hortons,43.802,-79.1982,Coffee Shop
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Staples Morningside,43.8003,-79.1966,Paper / Office Supplies Store
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Lee Valley,43.8032,-79.1997,Hobby Shop
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Bus Stop: 85 & 116,43.8022,-79.1994,Bus Station
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Charley's Exotic Cuisine,43.801,-79.2002,Chinese Restaurant


### Processing Venue Information

In [282]:
#creating one-hot encoding of venues in each cit
toronto_onehot=pd.get_dummies(pc_venue_list['Venue_Category'])
#putting in the postal code column 
toronto_onehot.insert(0,'PostalCode',pc_venue_list['PostalCode'])

#creating a sum of each type of venue per postal code
toronto_grouped=toronto_onehot.groupby('PostalCode').sum()
toronto_grouped.head()

Unnamed: 0_level_0,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
M1B,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M1C,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M1E,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M1G,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M1H,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0


In [131]:
#Function to identify categories that aren't actually represented in the postal code area
#Accepts a row as parameter
def cat_screen(r):
    if r[0]>0:  #then this category is represented
        return r.name
    else:
        return 'NA'

In [283]:
#creating a new dataframe to hold the top 10 most-represented venues for each postal code
toronto_ranking=pd.DataFrame()
#cycle through each postal code
for i in range(0,toronto_grouped.shape[0]):
    #creating a temporary dataframe holding the venue categories sorted by commonality, and constraining it down to 10
    pc_df=pd.DataFrame(toronto_grouped.iloc[i]).sort_values(by=toronto_grouped.iloc[i].name,axis=0,ascending=False)
    pc_df=pc_df.iloc[0:10]

    #marking "NA" for neighborhoods that have less than 10 venue categories
    pc_df['Status']=pc_df.apply(cat_screen,axis=1)
    pc_df=pc_df.transpose()
    
    #adding general postal code data and top 10 categories to the new dataframe
    toronto_ranking=toronto_ranking.append([(toronto_total.loc[i]['PostalCode'],toronto_total.loc[i]['Borough'],toronto_total.loc[i]['Neighborhood'],
                                           toronto_total.loc[i]['Latitude'],toronto_total.loc[i]['Longitude'],pc_df.loc['Status'][0],
                                            pc_df.loc['Status'][1],pc_df.loc['Status'][2],pc_df.loc['Status'][3],pc_df.loc['Status'][4],
                                            pc_df.loc['Status'][5],pc_df.loc['Status'][6],pc_df.loc['Status'][7],pc_df.loc['Status'][8],
                                            pc_df.loc['Status'][9])])
#fixing column names
toronto_ranking.columns=('PostalCode','Borough','Neighborhood','Latitude','Longitude','1st Most Common Venue','2nd Most Common Venue',
                         '3rd Most Common Venue','4th Most Common Venue','5th Most Common Venue','6th Most Common Venue','7th Most Common Venue',
                         '8th Most Common Venue','9th Most Common Venue','10th Most Common Venue')
toronto_ranking.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Fast Food Restaurant,Bakery,Hobby Shop,Paper / Office Supplies Store,Spa,Gym,Chinese Restaurant,Coffee Shop,Caribbean Restaurant,Greek Restaurant
0,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Breakfast Spot,Burger Joint,Italian Restaurant,,,,,,,
0,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Fast Food Restaurant,Pizza Place,Coffee Shop,Pharmacy,Greek Restaurant,Beer Store,Bank,Gym,Liquor Store,Supermarket
0,M1G,Scarborough,Woburn,43.770992,-79.216917,Coffee Shop,Park,Fast Food Restaurant,Pharmacy,Chinese Restaurant,Indian Restaurant,,,,
0,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,Coffee Shop,Bakery,Indian Restaurant,Pizza Place,Thai Restaurant,Sporting Goods Shop,Burger Joint,Caribbean Restaurant,Chinese Restaurant,Pharmacy


In [284]:
#re-setting the index
toronto_ranking.reset_index(inplace=True)
toronto_ranking.drop(['index'],axis=1,inplace=True)
toronto_ranking.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Fast Food Restaurant,Bakery,Hobby Shop,Paper / Office Supplies Store,Spa,Gym,Chinese Restaurant,Coffee Shop,Caribbean Restaurant,Greek Restaurant
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Breakfast Spot,Burger Joint,Italian Restaurant,,,,,,,
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Fast Food Restaurant,Pizza Place,Coffee Shop,Pharmacy,Greek Restaurant,Beer Store,Bank,Gym,Liquor Store,Supermarket
3,M1G,Scarborough,Woburn,43.770992,-79.216917,Coffee Shop,Park,Fast Food Restaurant,Pharmacy,Chinese Restaurant,Indian Restaurant,,,,
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,Coffee Shop,Bakery,Indian Restaurant,Pizza Place,Thai Restaurant,Sporting Goods Shop,Burger Joint,Caribbean Restaurant,Chinese Restaurant,Pharmacy


### Performing K-Means Clustering

In [285]:
from sklearn.cluster import KMeans

#prepping data for K-Means clustering--getting a new one-hot encoding, grouped by mean instead of sum
X=toronto_onehot.groupby('PostalCode').mean()
X.reset_index(inplace=True)
X=X.drop('PostalCode',axis=1)
X.head()

Unnamed: 0,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,0.038462,0.0


In [286]:
#performing the K-Means Clustering with 7 clusters
n_clusters=7
kcluster=KMeans(init='k-means++',n_clusters=n_clusters)
kcluster.fit(X)
kcluster_labels=kcluster.labels_


In [287]:
#getting the latitude/longitude for Toronto
tgeo = Nominatim()
tloc = tgeo.geocode('Toronto,Ontario')

#initializing colors for the various clusters
map_colors=['yellow','red','blue','green','purple','orange','black']
#initializing the map
toronto_map=folium.Map(location=[tloc.latitude,tloc.longitude],zoom_start=10)

#creating markers for the different area codes--stripping out apostrophes from the neighborhood names
toronto_markers=folium.map.FeatureGroup()
for lat,lng,label,group in zip(toronto_ranking['Latitude'],toronto_ranking['Longitude'],toronto_ranking['Neighborhood'],kcluster_labels):
    toronto_markers.add_child(folium.features.CircleMarker([lat,lng],radius=5,color=map_colors[group],fill_color=map_colors[group],
                                                           fill=True,popup=label.replace("'","")))
toronto_map.add_child(toronto_markers)

#displaying the map
toronto_map


