# Capstone Project

In [107]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

## Prepare the dataframe

### Script data from Wikipedia page

In [12]:
# script the html page
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url)
if response.status_code == 200:
    soup = BeautifulSoup(response.content,'lxml')

In [52]:
# get the PostalCode and save to a list
PostalCode = []
for tag in soup.find('table').find_all('b'):
    PostalCode.append(tag.get_text())

In [53]:
# get the Borough(Neighborhood) and save to a list
Borough = []
for tag in soup.find('table').find_all('span'):
    Borough.append(tag.get_text())

In [85]:
# create a dataframe and put the data above into it
df = pd.DataFrame(columns = ['PostalCode','Borough','Neighborhood'])
df.PostalCode = PostalCode
df.Borough = Borough

In [86]:
# remove rows that contain unassigned boroughs
df = df[df.Borough != 'Not assigned'].reset_index(drop = True)

In [87]:
# clean the 'Borough' and 'Neighborhood' columns to the right format
for i in range(len(df.Borough)):
    l = df.Borough[i].split('(')
    df.Borough[i] = l[0]
    try:
        if l[1][-1] == ')':
            df.Neighborhood[i] = l[1][:,-1].replace('/',',')
        else:
            df.Neighborhood[i] = l[1].replace('/',',')
    except:
        df.Neighborhood[i] = l[0]

In [88]:
df = df.drop(4,0).reset_index(drop = True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,North York
1,M4A,North York,North York
2,M5A,Downtown Toronto,Downtown Toronto
3,M6A,North York,North York
4,M9A,Etobicoke,Etobicoke


In [89]:
df.shape

(102, 3)

### Add coordinates

In [90]:
coor = pd.read_csv('Geospatial_Coordinates.csv')

In [91]:
df = df.merge(coor, left_on = 'PostalCode', right_on = 'Postal Code', how = 'left')

In [92]:
df = df.drop('Postal Code',1)

### Group the df by boroughs

In [95]:
df_bor = df.groupby('Borough',as_index = False).mean()

In [99]:
# clean up the boroughs names
df_bor.Borough[2] = 'Downtown Toronto'
df_bor.Borough[4] = 'East Toronto'
df_bor.Borough[8] = 'Etobicoke'
df_bor.Borough[9] = 'Mississauga'
df_bor.drop(6,0,inplace = True)
df_bor = df_bor.groupby('Borough',as_index = False).mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [100]:
df_bor

Unnamed: 0,Borough,Latitude,Longitude
0,Central Toronto,43.70198,-79.398954
1,Downtown Toronto,43.65053,-79.379515
2,East Toronto,43.666927,-79.323493
3,East York,43.704043,-79.335287
4,Etobicoke,43.681273,-79.565701
5,Mississauga,43.636966,-79.615819
6,North York,43.750727,-79.429338
7,Scarborough,43.766229,-79.249085
8,West Toronto,43.652653,-79.44929
9,York,43.690797,-79.472633


## Clustering Boroughs

### Get venues categories of each borough

In [101]:
CLIENT_ID = '3GSOCBER1WPKAWDHRZWW2FAYTTMQZT2OVZGXVEJOTY1Y4DTJ' # your Foursquare ID
CLIENT_SECRET = 'SFZU24TJPXDXBFNDEW0K0BMHSBVNS2DA0Q50DCMKBKDHFO1F' # your Foursquare Secret
VERSION = '20180605'

def getNearbyVenues(names, latitudes, longitudes, radius=2000,LIMIT = 50):
    
    venues_list=[]
    
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name,   
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Borough', 'Venue Category']
    
    return(nearby_venues)

In [102]:
toronto_venues = getNearbyVenues(names=df_bor['Borough'],latitudes=df_bor['Latitude'],longitudes=df_bor['Longitude'])

Central Toronto
Downtown Toronto
East Toronto
East York
Etobicoke
Mississauga
North York
Scarborough
West Toronto
York


In [104]:
print(toronto_venues.shape)
toronto_venues.head()

(500, 2)


Unnamed: 0,Borough,Venue Category
0,Central Toronto,Indonesian Restaurant
1,Central Toronto,Italian Restaurant
2,Central Toronto,General Entertainment
3,Central Toronto,Italian Restaurant
4,Central Toronto,Restaurant


In [105]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Borough'] = toronto_venues['Borough'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_grouped = toronto_onehot.groupby('Borough').mean().reset_index()
toronto_grouped

Unnamed: 0,Borough,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Asian Restaurant,Athletics & Sports,BBQ Joint,Bagel Shop,Bakery,...,Thrift / Vintage Store,Toy / Game Store,Trail,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Wings Joint,Xinjiang Restaurant,Yoga Studio
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02,...,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.02
1,Downtown Toronto,0.0,0.04,0.0,0.02,0.0,0.0,0.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0
2,East Toronto,0.0,0.02,0.0,0.0,0.02,0.0,0.02,0.0,0.02,...,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0
3,East York,0.04,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.02,...,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.02
4,Etobicoke,0.0,0.04,0.0,0.0,0.0,0.02,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0
5,Mississauga,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,North York,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Scarborough,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.02,0.02,0.0
8,West Toronto,0.0,0.02,0.0,0.0,0.0,0.0,0.02,0.0,0.04,...,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0
9,York,0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.0,0.06,...,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Cluster boroughs

In [108]:
kclusters = 3

toronto_grouped_clustering = toronto_grouped.drop('Borough', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_ 

array([2, 2, 2, 0, 0, 1, 0, 0, 2, 0], dtype=int32)

In [109]:
df_bor['Clusters'] = kmeans.labels_ 
df_bor

Unnamed: 0,Borough,Latitude,Longitude,Clusters
0,Central Toronto,43.70198,-79.398954,2
1,Downtown Toronto,43.65053,-79.379515,2
2,East Toronto,43.666927,-79.323493,2
3,East York,43.704043,-79.335287,0
4,Etobicoke,43.681273,-79.565701,0
5,Mississauga,43.636966,-79.615819,1
6,North York,43.750727,-79.429338,0
7,Scarborough,43.766229,-79.249085,0
8,West Toronto,43.652653,-79.44929,2
9,York,43.690797,-79.472633,0


### Visualize the clusters

In [110]:
# create map
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_bor['Latitude'], df_bor['Longitude'], df_bor['Borough'], df_bor['Clusters']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters