# Capstone Course

## Week 3: Foursquare Neighborhood Clustering

### Part 1: Webscraping to Gather Toronto Postal Codes

In [40]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import folium
import numpy as np
import pgeocode

#### We will be webscraping the [website](https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M) with Canada postal codes starting with M.

In [41]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url=url)

In [42]:
response.status_code

200

#### Using Beautiful Soup to parse the data

#### Finding the first table which will have the data we need

In [43]:
soup = BeautifulSoup(response.text)
table = soup.find('table')

#### We are going to loop through all the rows and for each row we loop through each column

#### We collect the first element of the contents of the 'p' tag as this will contain the postal code

#### Then we find the span tag

#### If the first element in the contents of span is 'Not assigned' then we move on

#### Otherwise we parse the text with '(' to get the borough and the neighborhood.  We also replace/remove unwanted character

#### Using all this info we build a data frame

In [44]:
data = {'PostalCode':[], 'Borough':[], 'Neighborhood':[]}

for row in table.find_all('tr'):
    for cell in row.find_all('td'):
        postal_code = cell.find('p').contents[0]
        span = cell.find('span')
        if span.contents[0] == 'Not assigned': continue
        # print(span.text,'\n')
        borough = span.text.split('(')[0]
        neighborhoods = span.text.split('(')[1]
        neighborhoods = neighborhoods[0:-1]
        neighborhoods = neighborhoods.replace('/',',').replace(' ','')
        data['PostalCode'].append(postal_code)
        data['Borough'].append(borough)
        data['Neighborhood'].append(neighborhoods)

toronto_neighbor_df = pd.DataFrame(data)
toronto_neighbor_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,VictoriaVillage
2,M5A,Downtown Toronto,"RegentPark,Harbourfront"
3,M6A,North York,"LawrenceManor,LawrenceHeights"
4,M7A,Queen's Park,OntarioProvincialGovernment


In [45]:
toronto_neighbor_df.shape

(103, 3)

### Part 2: Collecting Geolocation data

#### I was not able to get geocoder so instead I used pgeocode

#### For each postal code we collect the lat and long and then add then to our table

In [46]:
geolocator = pgeocode.Nominatim('ca')
postal_codes = toronto_neighbor_df['PostalCode'].tolist()
lats, longs = [], []

for code in postal_codes:
    g = geolocator.query_postal_code(code)
    if not g.empty:
        lats.append(g.latitude)
        longs.append(g.longitude)


In [47]:
toronto_neighbor_df['Latitude'] = lats
toronto_neighbor_df['Longitude'] = longs
toronto_neighbor_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,VictoriaVillage,43.7276,-79.3148
2,M5A,Downtown Toronto,"RegentPark,Harbourfront",43.6555,-79.3626
3,M6A,North York,"LawrenceManor,LawrenceHeights",43.7223,-79.4504
4,M7A,Queen's Park,OntarioProvincialGovernment,43.6641,-79.3889


#### I initially chose to look only at boroughs with the name 'Toronto' in them but later I decided I wanted more data so I commented that out.

#### It is important to drop rows containing NA otherwise you are going to run into issues later on

In [48]:
only_toronto_df = toronto_neighbor_df.copy()
# only_toronto_df = toronto_neighbor_df[toronto_neighbor_df['Borough'].str.contains('Toronto')].reset_index().drop('index',axis=1)
only_toronto_df.dropna(axis=0,inplace=True)
print(only_toronto_df.shape)
only_toronto_df.head()

(102, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,VictoriaVillage,43.7276,-79.3148
2,M5A,Downtown Toronto,"RegentPark,Harbourfront",43.6555,-79.3626
3,M6A,North York,"LawrenceManor,LawrenceHeights",43.7223,-79.4504
4,M7A,Queen's Park,OntarioProvincialGovernment,43.6641,-79.3889


### Part 3: Collecting venue info from foursquare

#### I removed the client id and secret before posting this

In [49]:
url = 'https://api.foursquare.com/v2/venues/explore'
CLIENT_ID = 'W0LCDX3ONGPXPXJ5QB5R25ITIWZTTOHHIA42C0DBS2GWYDMR'
CLIENT_SECRET = 'LQAINTXPOARRZDRC20NGNF3VQTJ4HIL05GND4EHREM5BU5HL'


#### All the venue collection here

#### For each row in our postal code/borough/neighborhood dataframe we collect the postal code, borough, and neighborhood for later

#### Then we prep a request parameter query to pass to foursquare

#### We parse the request response to gather the venue information

#### Finally we build a venue table

In [51]:
venue_data = {'Postal Code':[],'Borough':[],'Neighborhoods':[],'Venue Name':[],'Venue Address':[],'Venue Latitude':[],'Venue Longitude':[],'Venue Category':[]}
for index in only_toronto_df.index:
    postal_code = only_toronto_df['PostalCode'][index]
    borough = only_toronto_df['Borough'][index]
    neighborhood = only_toronto_df['Neighborhood'][index]
    lat = only_toronto_df['Latitude'][index]
    long = only_toronto_df['Longitude'][index]
    
    params = {'ll':f'{lat},{long}', 'radius':1000,'limit':1000,'client_id':CLIENT_ID,'client_secret':CLIENT_SECRET,'v':'20210720'}
    response = requests.get(url=url, params=params)

    for venue in response.json()['response']['groups'][0]['items']:
        venue_data['Postal Code'].append(postal_code)
        venue_data['Borough'].append(borough)
        venue_data['Neighborhoods'].append(neighborhood)

        venue_details = venue['venue']

        venue_data['Venue Name'].append(venue_details['name'])
        venue_data['Venue Address'].append(', '.join(venue_details['location']['formattedAddress']))
        venue_data['Venue Latitude'].append(venue_details['location']['lat'])
        venue_data['Venue Longitude'].append(venue_details['location']['lng'])
        venue_data['Venue Category'].append(venue_details['categories'][0]['name'])

venue_df = pd.DataFrame(venue_data)
print(venue_df.shape)
venue_df.head()


(4938, 8)


Unnamed: 0,Postal Code,Borough,Neighborhoods,Venue Name,Venue Address,Venue Latitude,Venue Longitude,Venue Category
0,M3A,North York,Parkwoods,Allwyn's Bakery,"81 Underhill drive, Toronto ON M3A 1Z5, Canada",43.75984,-79.324719,Caribbean Restaurant
1,M3A,North York,Parkwoods,Brookbanks Park,"Toronto, Toronto ON, Canada",43.751976,-79.33214,Park
2,M3A,North York,Parkwoods,Tim Hortons,"215 Brookbanks (York Miils Rd), Toronto ON M3A...",43.760668,-79.326368,Café
3,M3A,North York,Parkwoods,A&W,"1277 York Mills Road, Toronto ON M3A 1Z5, Canada",43.760643,-79.326865,Fast Food Restaurant
4,M3A,North York,Parkwoods,Food Basics,"1277 York Mills Rd (at Parkwoods Village Dr.),...",43.760549,-79.326045,Supermarket


#### Ploting the data using folium

#### Note I was not able to get folium to plot in VS Code Notebooks so instead I saved the html and took a screenshot of it.

In [52]:
colors = ['red', 'blue', 'green', 'purple', 'orange', 'darkred','lightred', 'beige', 'darkblue', 'darkgreen', 'cadetblue', 'darkpurple', 'pink', 'lightblue', 'lightgreen', 'gray', 'black', 'lightgray']
m = folium.Map(location=[only_toronto_df['Latitude'].mean(),only_toronto_df['Longitude'].mean()],zoom_start=12)
for i, (name, group) in enumerate(only_toronto_df.groupby('Borough')):
    color = colors[i]
    for index in group.index:
        folium.map.Marker([only_toronto_df['Latitude'][index], only_toronto_df['Longitude'][index]], popup=only_toronto_df['Neighborhood'][index], icon=folium.Icon(color=color)).add_to(m)
m

In [53]:
m.save("toronta_map.html")

<img src='Toronto_Boroughs_2.png'>

#### Part 4: Clustering

#### Making a one hot encoding table

In [54]:
toronto_hot_df = pd.get_dummies(venue_df[['Venue Category']],prefix='',prefix_sep='')
toronto_hot_df['Neighborhood'] = venue_df['Neighborhoods'].values
columns = [toronto_hot_df.columns[-1]] + list(toronto_hot_df.columns[:-1])
toronto_hot_df = toronto_hot_df[columns]
toronto_hot_df.head()

Unnamed: 0,Zoo Exhibit,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,...,Vietnamese Restaurant,Volleyball Court,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Grouping by Neighborhood so we can mean the venues columns

In [55]:
toronto_grouped = toronto_hot_df.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Zoo Exhibit,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,...,Vietnamese Restaurant,Volleyball Court,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,Agincourt),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.020833,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
1,"Alderwood,LongBranch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
2,"BathurstManor,WilsonHeights,DownsviewNorth",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
3,BayviewVillage,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
4,"BedfordPark,LawrenceManorEast",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.02381,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,"Willowdale,Newtonbrook",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
97,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
98,WoodbineHeights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
99,"YorkMills,SilverHills",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0


#### Printing out the top 5 venue categories per Neighborhood

In [56]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt)----
                venue  freq
0  Chinese Restaurant  0.17
1       Shopping Mall  0.06
2         Pizza Place  0.04
3              Bakery  0.04
4                Pool  0.02


----Alderwood,LongBranch----
               venue  freq
0               Park  0.08
1        Pizza Place  0.08
2     Discount Store  0.08
3  Convenience Store  0.08
4                Bar  0.04


----BathurstManor,WilsonHeights,DownsviewNorth----
         venue  freq
0  Coffee Shop  0.10
1  Pizza Place  0.07
2  Gas Station  0.07
3         Park  0.07
4         Bank  0.07


----BayviewVillage----
                 venue  freq
0                 Park  0.25
1          Golf Course  0.12
2  Japanese Restaurant  0.12
3                Trail  0.12
4                 Bank  0.12


----BedfordPark,LawrenceManorEast----
                  venue  freq
0           Coffee Shop  0.07
1    Italian Restaurant  0.07
2        Sandwich Place  0.05
3                  Bank  0.05
4  Fast Food Restaurant  0.05


----BerczyPark----
 

#### Function to get the top n venues types for a neighborhood

In [57]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

#### Making a table with the most popular venue types per neighborhood

In [58]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Agincourt),Chinese Restaurant,Shopping Mall,Pizza Place,Bakery,Pool
1,"Alderwood,LongBranch",Park,Pizza Place,Discount Store,Convenience Store,Bar
2,"BathurstManor,WilsonHeights,DownsviewNorth",Coffee Shop,Pizza Place,Gas Station,Park,Bank
3,BayviewVillage,Park,Golf Course,Japanese Restaurant,Trail,Bank
4,"BedfordPark,LawrenceManorEast",Coffee Shop,Italian Restaurant,Sandwich Place,Bank,Fast Food Restaurant


#### Using sklearn to cluster the neighborhoods by their venues

In [76]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 3

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 2, 1, 1, 2, 1, 1, 0])

In [77]:
toronto_clustered = kmeans.predict(toronto_grouped_clustering)

#### Merging the cluster groups into the main postal code table

In [78]:
toronto_grouped['Cluster'] = toronto_clustered
toronto_grouped.dropna(axis=0,inplace=True)
clustered_df = only_toronto_df.merge(toronto_grouped[['Neighborhood','Cluster']],how='left',on='Neighborhood')
clustered_df.dropna(axis=0,inplace=True)
clustered_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster
0,M3A,North York,Parkwoods,43.7545,-79.33,0.0
1,M4A,North York,VictoriaVillage,43.7276,-79.3148,0.0
2,M5A,Downtown Toronto,"RegentPark,Harbourfront",43.6555,-79.3626,1.0
3,M6A,North York,"LawrenceManor,LawrenceHeights",43.7223,-79.4504,1.0
4,M7A,Queen's Park,OntarioProvincialGovernment,43.6641,-79.3889,1.0


#### Plotting in folium but this time by using the cluster group for the color

#### Again showing the plot did not work so instead I am showing the screenshot

In [79]:
m = folium.Map(location=[clustered_df['Latitude'].mean(),clustered_df['Longitude'].mean()],zoom_start=12)
for i, (name, group) in enumerate(clustered_df.groupby('Borough')):
    for index in group.index:
        folium.map.Marker([clustered_df['Latitude'][index], clustered_df['Longitude'][index]], 
            popup=clustered_df['Neighborhood'][index], icon=folium.Icon(color=colors[int(clustered_df['Cluster'][index])])).add_to(m)
m

In [80]:
m.save("toronta_clusters.html")

#### It looks like the K Means algorithm it finding that most of the downtown neighborhoods are similar to each other which probably makes sense while the other neighborhoods are more likely to be similar even though they are not close geographically.  

#### I also played around with the number of clusters and I liked 3 the best.

<img src='Toronto_Clusters_2.png'>

#### Showing the number of neighborhoods in each cluster

In [81]:
clustered_df['Cluster'].value_counts()

1.0    50
0.0    42
2.0     9
Name: Cluster, dtype: int64

In [82]:
toronto_grouped['Cluster'] = toronto_clustered
cluster_means = toronto_grouped.groupby('Cluster').mean().reset_index()
cluster_means.head()

Unnamed: 0,Cluster,Zoo Exhibit,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,...,Vietnamese Restaurant,Volleyball Court,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,0,0.015306,0.001701,0.0,0.0,0.001253,0.0,0.000952,0.0,0.0,...,0.010306,0.0,0.001348,0.0,0.0,0.0,0.001082,0.001082,0.000627,0.0
1,1,0.0,0.0,0.000425,0.0004,0.0,0.0004,0.001,0.001,0.001,...,0.003708,0.000513,0.0,0.0002,0.001323,0.0002,0.001606,0.000449,0.008368,0.00022
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.018519,0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Cluster 2 is mostly about parks while the other two are more about restaurants and coffee shops

In [84]:
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Cluster']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
cluster_venues_sorted = pd.DataFrame(columns=columns)
num_top_venues = 5

# create a new dataframe
cluster_venues_sorted = pd.DataFrame(columns=columns)
cluster_venues_sorted['Cluster'] = toronto_grouped['Cluster']

for ind, (cluster, group) in enumerate(toronto_grouped.iloc[:,1:].groupby('Cluster')):
    # print(cluster)
    # print(group.mean())
    cluster_venues_sorted.iloc[ind, 0] = cluster
    cluster_venues_sorted.iloc[ind, 1:] = return_most_common_venues(group.iloc[:,:-1].mean() , num_top_venues)
    

cluster_venues_sorted.head(3)

Unnamed: 0,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,0,Coffee Shop,Pizza Place,Park,Sandwich Place,Pharmacy
1,1,Coffee Shop,Café,Restaurant,Italian Restaurant,Park
2,2,Park,Café,Coffee Shop,Baseball Field,Bank
