# SEGMENTING AND CLUSTERING NEIGHBORHOODS OF TORONTO

### 1 Importing Libraries

In [1]:
import numpy as np            # for efficient numerical operations
import pandas as pd           # for data frame analytics
import requests               # for pulling html pages and creating html documents
from bs4 import BeautifulSoup # for scraping html documents

### 2 Parsing the Wikipedia page on Toronto postal codes

By means of the <i>requests</i> library which has been loaded above, the link provided in the description of the assignment can be employed to retrieve the html page pointed to be that link. This is acchieved via the <i>requests.get()</i> function. Once the html page is retrieved the underlying html document stored in the <i>text</i> attribute of the page object. Lastly, this hmtl document is parsed by means of the <i>BeautifulSoup</i> library.

In [2]:
html_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html_page = requests.get(html_link)
html_doc = html_page.text
soup = BeautifulSoup(html_doc,'html.parser')

### 3 Scraping the html document

#### 3.1 Searching the postal code table in the html document and creating a dataframe from it
Firstly, the soup object is searched for the table by means of the <i>find()</i> function and specifying the html tag <i>table</i>. With the help of that same function the rows (html tagged <i>tr</i>) are then extracted from the table, and headers (html tagged <i>th</i>) are extracted from the first row. These headers, together with the number of rows are employed to create a raw pandas dataframe <i>table_df</i> which contains all rows of the table found on the wikipedia page.

In [3]:
table = soup.find('table')                                                   # searching for table in html document
rows = table.findAll('tr')                                                   # gathering all rows in that table
header = rows[0].findAll('th')                                               # gathering all column headers in first row of the table
col_num = len(header)                                                        # number of columns equals number of headers found and hence the length of the header object
row_num = len(rows)-1                                                        # number of rows equals number of table rows minus one since first row did contain column headers

columns = list()                                                             # writing column headers into a list
for c in range(col_num):
    if c != col_num-1:
        columns.append(header[c].string.lower())
    else:
        columns.append(header[c].string[:-1].lower())

table_df = pd.DataFrame(columns=columns,index=range(row_num))                # creating a pandas data frame with column names defined by the list of column headers and index defined by number of rows

for i in range(row_num):                                                     # filling the table row by row
    row = rows[i+1].findAll('td')
    for c in range(col_num):
        table_df.iloc[i,c] = row[c].string
        if table_df.iloc[i,c] == None:                                       # the entry of the wikipedia table may have been a string or a hyperlink. If the latter the td object does not provide the string
            table_df.iloc[i,c] = row[c].a.string                             # but its child 'a' does

for i in table_df.index:                                                     # removing line breaks
    if table_df.loc[i,'neighbourhood'][-1:] == '\n':
        table_df.loc[i,'neighbourhood']=table_df.loc[i,'neighbourhood'][:-1]
        
table_df.head()

Unnamed: 0,postcode,borough,neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### 3.2 Manipulating the raw dataframe
Next, those rows of the raw dataframe <i>table_df</i> data frame who have no borough assigned, are dropped. Next, if among the remaining entries the neighborhood is not assigned, the neighborhood name will be the same as the borough name.

In [4]:
for i in table_df.index:
    if table_df.loc[i,'borough'] == 'Not assigned':
        table_df = table_df.drop(i,axis=0)
    elif table_df.loc[i,'neighbourhood'] == 'Not assigned':
        table_df.loc[i,'neighbourhood'] = table_df.loc[i,'borough']
        
table_df.head()

Unnamed: 0,postcode,borough,neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


#### 3.3  Creating a clean postal code dataframe from the raw data frame
Lastly, from the raw dataframe <i>table_df</i> the unique postal codes are collected and employed to create a clean dataframe <i>code_df</i> which lists codes, borough names and all associated neighborhoods.

In [5]:
code_list = list(table_df['postcode'].unique())
code_df = pd.DataFrame(columns=columns,index=range(len(code_list)))

for i in code_df.index:
    code = code_list[i]
    sub_df = table_df[table_df['postcode']==code]
    hood_string = ''
    for j in range(sub_df.shape[0]):
        hood_string = hood_string + sub_df.loc[sub_df.index[j],'neighbourhood'] + ','
    hood_string = hood_string[:-2]
    code_df.loc[i,'postcode'] = code
    code_df.loc[i,'borough'] = sub_df.loc[sub_df.index[0],'borough']
    code_df.loc[i,'neighbourhood'] = hood_string
    
code_df.sort_values('postcode',inplace=True)
code_df.reset_index(inplace=True)
code_df.drop('index',axis=1,inplace=True)
    
code_df.head()

Unnamed: 0,postcode,borough,neighbourhood
0,M1B,Scarborough,"Rouge,Malver"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Unio"
2,M1E,Scarborough,"Guildwood,Morningside,West Hil"
3,M1G,Scarborough,Wobur
4,M1H,Scarborough,Cedarbra


Finally, the number of rows of the <i>code_df</i> dataframe can be infered from its <i>shape[0]</i> object.

In [52]:
print('The Toronto postal code dataframe contains {} boroughs and {} postal codes.'.format(len(code_df['borough'].unique()),code_df.shape[0]))
code_df.postcode.unique()
toronto_onehot.postcode.unique()

The Toronto postal code dataframe contains 11 boroughs and 103 postal codes.


array(['M1B', 'M1C', 'M1E', 'M1G', 'M1H', 'M1J', 'M1K', 'M1L', 'M1M',
       'M1N', 'M1P', 'M1R', 'M1S', 'M1T', 'M1V', 'M1W', 'M2H', 'M2J',
       'M2K', 'M2L', 'M2M', 'M2N', 'M2P', 'M2R', 'M3A', 'M3B', 'M3C',
       'M3H', 'M3J', 'M3K', 'M3L', 'M3M', 'M3N', 'M4A', 'M4B', 'M4C',
       'M4E', 'M4G', 'M4H', 'M4J', 'M4K', 'M4L', 'M4M', 'M4N', 'M4P',
       'M4R', 'M4S', 'M4T', 'M4V', 'M4W', 'M4X', 'M4Y', 'M5A', 'M5B',
       'M5C', 'M5E', 'M5G', 'M5H', 'M5J', 'M5K', 'M5L', 'M5M', 'M5N',
       'M5P', 'M5R', 'M5S', 'M5T', 'M5V', 'M5W', 'M5X', 'M6A', 'M6B',
       'M6C', 'M6E', 'M6G', 'M6H', 'M6J', 'M6K', 'M6L', 'M6M', 'M6N',
       'M6P', 'M6R', 'M6S', 'M7A', 'M7R', 'M7Y', 'M8V', 'M8W', 'M8X',
       'M8Y', 'M8Z', 'M9B', 'M9C', 'M9L', 'M9M', 'M9N', 'M9P', 'M9R',
       'M9V', 'M9W'], dtype=object)

### 4 Geolocating and mapping Toronto neighborhoods

#### 4.1 Assigning latitude and longitude

Since the geocoder strategy did not work, the csv file containing the georeferences for the Toronto postal codes is loaded into a dataframe which is then employed in order to look up the postal codes needed.

In [7]:
geodata_df = pd.read_csv('https://cocl.us/Geospatial_data') # loading geodata in a pandas dataframe
geodata_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [8]:
code_df['latitude'] = ''                                                               # adding latitude column to code_df data frame
code_df['longitude'] = ''                                                              # adding longitude column to code_df data frame

for i in code_df.index:
    code = code_df.loc[i,'postcode']
    code_df.loc[i,'latitude'] = geodata_df[geodata_df['Postal Code']==code].iloc[0,1]  # assigning latidtude by reduceing geodata_df to the entry matching the postal code and then calling its latitude column 1
    code_df.loc[i,'longitude'] = geodata_df[geodata_df['Postal Code']==code].iloc[0,2] # assigning longidtude by reduceing geodata_df to the entry matching the postal code and then calling its longitude column 2
    
code_df.head()

Unnamed: 0,postcode,borough,neighbourhood,latitude,longitude
0,M1B,Scarborough,"Rouge,Malver",43.8067,-79.1944
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Unio",43.7845,-79.1605
2,M1E,Scarborough,"Guildwood,Morningside,West Hil",43.7636,-79.1887
3,M1G,Scarborough,Wobur,43.771,-79.2169
4,M1H,Scarborough,Cedarbra,43.7731,-79.2395


#### 4.2 Mapping the neighborhoods

In [9]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium                                    # map rendering library

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
folium                    0.5.0                      py_0    conda-forge


In [10]:
toronto_map = folium.Map(location=[code_df.latitude.mean(),code_df.longitude.mean()],zoom_start=12)

for lat,lng,borough,code in zip(code_df['latitude'],code_df['longitude'],code_df['borough'],code_df['postcode']):
    label = '{}, {}'.format(code,borough)
    label = folium.Popup(label,parse_html=True)
    folium.CircleMarker([lat,lng],radius=5,popup=label,color='blue',fill=True,fill_color='#3186cc',fill_opacity=0.7,parse_html=False).add_to(toronto_map)  
    
toronto_map

### 5 Exploring Toronto neighborhoods via the Foursquare API

In [11]:
CLIENT_ID = '41HXMY040EFZ5BW2JCZHSGGPG4ZBNJ1BVDCG2XHIE2FO5Y5B'     # Foursquare ID
CLIENT_SECRET = 'GZNPZKZ2VCTSQ3BW2SYFAH2CZ0VAQYXLS1UW1RGF1BOFTFIQ' # Foursquare Secret
VERSION = '20180605'                                               # Foursquare API version

In [18]:
def GetVenues(names,latitudes,longitudes,radius=500,limit=100):
    
    venues_list=[]
    for name,lat,lng in zip(names,latitudes,longitudes):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'\
              .format(CLIENT_ID,CLIENT_SECRET,VERSION,lat,lng,radius,limit)
        results = requests.get(url).json()["response"]['groups'][0]['items']
        venues_list.append([(name,lat,lng,\
                             v['venue']['name'],\
                             v['venue']['location']['lat'],\
                             v['venue']['location']['lng'],\
                             v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['postcode','psotcode latitude','postcode longitude','venue','venue latitude','venue longitude','venue category']
    
    return(nearby_venues)

In [53]:
toronto_venues = GetVenues(names=code_df['postcode'],latitudes=code_df['latitude'],longitudes=code_df['longitude'])
print('{} Toronto venues found for {} postcode areas and in {} unique categories.'\
      .format(toronto_venues.shape[0],len(toronto_venues['postcode'].unique()),len(toronto_venues['venue category'].unique())))

2253 Toronto venues found in 101 code areas and 275 unique categories.


In [26]:
toronto_venues.head()

Unnamed: 0,postcode,psotcode latitude,postcode longitude,venue,venue latitude,venue longitude,venue category
0,M1B,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,M1B,43.806686,-79.194353,Interprovincial Group,43.80563,-79.200378,Print Shop
2,M1C,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,M1C,43.784535,-79.160497,Affordable Toronto Movers,43.787919,-79.162977,Moving Target
4,M1C,43.784535,-79.160497,Scarborough Historical Society,43.788755,-79.162438,History Museum


In [72]:
new = pd.Series([code]+list(np.zeros(len(toronto_onehot.columns)-1)),index=toronto_onehot.columns)

In [74]:
toronto_onehot = pd.get_dummies(toronto_venues[['venue category']], prefix="", prefix_sep="")
toronto_onehot['postcode'] = toronto_venues['postcode'] 
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

for i in code_df.index:
    code = code_df.loc[i,'postcode']
    if code not in toronto_onehot.postcode.unique():
        new = pd.Series([code]+list(np.zeros(len(toronto_onehot.columns)-1)),index=toronto_onehot.columns)
        toronto_onehot = toronto_onehot.append(new,ignore_index=True)

print('Shape: ',toronto_onehot.shape)
toronto_onehot.head()

Shape:  (2255, 276)


Unnamed: 0,postcode,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
toronto_grouped = toronto_onehot.groupby('postcode').mean().reset_index()
print('Shape: ',toronto_grouped.shape)
toronto_grouped.head()

Shape:  (103, 276)


Unnamed: 0,postcode,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
num_top_venues = 5

for code in toronto_grouped['postcode']:
    print("----"+code+"----")
    temp = toronto_grouped[toronto_grouped['postcode'] == code].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq',ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----M1B----
                       venue  freq
0       Fast Food Restaurant   0.5
1                 Print Shop   0.5
2  Middle Eastern Restaurant   0.0
3              Movie Theater   0.0
4                      Motel   0.0


----M1C----
            venue  freq
0   Moving Target  0.33
1             Bar  0.33
2  History Museum  0.33
3   Movie Theater  0.00
4           Motel  0.00


----M1E----
                venue  freq
0         Pizza Place  0.17
1      Medical Center  0.17
2   Electronics Store  0.17
3  Mexican Restaurant  0.17
4      Breakfast Spot  0.17


----M1G----
               venue  freq
0        Coffee Shop  0.50
1  Korean Restaurant  0.25
2  Indian Restaurant  0.25
3  Accessories Store  0.00
4      Movie Theater  0.00


----M1H----
                 venue  freq
0     Hakka Restaurant  0.12
1  Fried Chicken Joint  0.12
2   Athletics & Sports  0.12
3               Lounge  0.12
4               Bakery  0.12


----M1J----
                venue  freq
0    Business Service  0.33
1   

                venue  freq
0            Bus Line  0.25
1         Swim School  0.25
2                Park  0.25
3  Dim Sum Restaurant  0.25
4   Accessories Store  0.00


----M4P----
               venue  freq
0              Hotel  0.12
1               Park  0.12
2            Dog Run  0.12
3       Burger Joint  0.12
4  Food & Drink Shop  0.12


----M4R----
                 venue  freq
0       Clothing Store  0.15
1  Sporting Goods Shop  0.10
2          Coffee Shop  0.10
3          Yoga Studio  0.05
4            Gift Shop  0.05


----M4S----
                venue  freq
0         Pizza Place  0.08
1      Sandwich Place  0.08
2        Dessert Shop  0.08
3  Seafood Restaurant  0.05
4                Café  0.05


----M4T----
               venue  freq
0         Playground  0.25
1       Tennis Court  0.25
2         Restaurant  0.25
3               Park  0.25
4  Accessories Store  0.00


----M4V----
                 venue  freq
0    Convenience Store  0.13
1                  Pub  0.13
2        

                       venue  freq
0             Baseball Field   0.5
1               Home Service   0.5
2  Middle Eastern Restaurant   0.0
3              Movie Theater   0.0
4                      Motel   0.0


----M8Z----
             venue  freq
0   Discount Store  0.08
1    Grocery Store  0.08
2  Supplement Shop  0.08
3   Sandwich Place  0.08
4    Burrito Place  0.08


----M9B----
                       venue  freq
0                       Bank   1.0
1          Accessories Store   0.0
2  Middle Eastern Restaurant   0.0
3                      Motel   0.0
4        Monument / Landmark   0.0


----M9C----
            venue  freq
0     Pizza Place  0.17
1  Shopping Plaza  0.17
2        Pharmacy  0.17
3      Beer Store  0.17
4    Liquor Store  0.17


----M9L----
                       venue  freq
0                Pizza Place   0.5
1        Empanada Restaurant   0.5
2  Middle Eastern Restaurant   0.0
3              Movie Theater   0.0
4                      Motel   0.0


----M9M----
      

In [76]:
def CommonVenues(row,venue_num):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:venue_num]

In [77]:
venue_num = 5
indicators = ['st','nd','rd']

columns = ['postcode']
for ind in np.arange(venue_num):
    try:
        columns.append('{}{} most common venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th most common venue'.format(ind+1))

codesvenues_sorted = pd.DataFrame(columns=columns)
codesvenues_sorted['postcode'] = toronto_grouped['postcode']

for ind in np.arange(toronto_grouped.shape[0]):
    codesvenues_sorted.iloc[ind, 1:] = CommonVenues(toronto_grouped.iloc[ind, :],venue_num)

codesvenues_sorted.head()

Unnamed: 0,postcode,1st most common venue,2nd most common venue,3rd most common venue,4th most common venue,5th most common venue
0,M1B,Print Shop,Fast Food Restaurant,Donut Shop,Dessert Shop,Dim Sum Restaurant
1,M1C,Moving Target,History Museum,Bar,Empanada Restaurant,Electronics Store
2,M1E,Electronics Store,Medical Center,Rental Car Location,Mexican Restaurant,Breakfast Spot
3,M1G,Coffee Shop,Indian Restaurant,Korean Restaurant,Doner Restaurant,Dessert Shop
4,M1H,Hakka Restaurant,Fried Chicken Joint,Lounge,Thai Restaurant,Bakery


In [78]:
from sklearn.cluster import KMeans

In [91]:
kclusters = 3
toronto_grouped_clustering = toronto_grouped.drop('postcode',axis=1)
kmeans = KMeans(n_clusters=kclusters,random_state=0).fit(toronto_grouped_clustering)

In [92]:
toronto_merged = code_df
toronto_merged['cluster'] = kmeans.labels_
toronto_merged = toronto_merged.join(codesvenues_sorted.set_index('postcode'),on='postcode')
toronto_merged.head()

Unnamed: 0,postcode,borough,neighbourhood,latitude,longitude,cluster,1st most common venue,2nd most common venue,3rd most common venue,4th most common venue,5th most common venue
0,M1B,Scarborough,"Rouge,Malver",43.8067,-79.1944,1,Print Shop,Fast Food Restaurant,Donut Shop,Dessert Shop,Dim Sum Restaurant
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Unio",43.7845,-79.1605,2,Moving Target,History Museum,Bar,Empanada Restaurant,Electronics Store
2,M1E,Scarborough,"Guildwood,Morningside,West Hil",43.7636,-79.1887,2,Electronics Store,Medical Center,Rental Car Location,Mexican Restaurant,Breakfast Spot
3,M1G,Scarborough,Wobur,43.771,-79.2169,2,Coffee Shop,Indian Restaurant,Korean Restaurant,Doner Restaurant,Dessert Shop
4,M1H,Scarborough,Cedarbra,43.7731,-79.2395,2,Hakka Restaurant,Fried Chicken Joint,Lounge,Thai Restaurant,Bakery


In [93]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [101]:
cluster_map = folium.Map(location=[code_df.latitude.mean(),code_df.longitude.mean()],zoom_start=12)

x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.plasma(np.linspace(0,1,len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat,lon,poi,cluster in zip(toronto_merged['latitude'],toronto_merged['longitude'],toronto_merged['postcode'],toronto_merged['cluster']):
    label = folium.Popup(str(poi)+' cluster '+str(cluster),parse_html=True)
    folium.CircleMarker([lat,lon],radius=5,popup=label,color='gray',fill=True,fill_color=rainbow[cluster-1],fill_opacity=1).add_to(cluster_map)

cluster_map