## Data Science Capstone Week 3 Assignment: Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import os
import warnings

import folium
import geocoder
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
warnings.filterwarnings('ignore')

#### Using BeautifulSoup to parse the Wiki table. There are some SSL problems on my network, so I need to use verify=False.

In [2]:
# Download the Wikipedia page listing Canadian postal codes starting with "M"
# and parse it into a BeautifulSoup tree.
url = r'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# NOTE(review): verify=False turns off TLS certificate validation; it is kept
# only because of the local SSL problem described above this cell.
# The timeout stops the notebook from hanging forever on a stalled connection.
response = requests.get(url, verify=False, timeout=30)

# Fail here on an HTTP error instead of parsing an error page as the article.
response.raise_for_status()

text = response.text
soup = BeautifulSoup(text, 'lxml')

#### Get all the table rows except the header row and strip surrounding whitespace. I'll process the raw table as a dataframe.

In [3]:
# Locate the postal-code table and turn every data row into
# [postal code, borough, neighborhood] with surrounding whitespace removed.
table = soup.find('table', {'class':'wikitable sortable'})
table_rows = []

for tr in table.find_all('tr'):
    cells = tr.find_all('td')

    # Rows with fewer than three <td> cells (e.g. the header row, which has
    # none) carry no data and are skipped.
    if len(cells) < 3:
        continue

    table_rows.append([cells[pos].text.strip() for pos in range(3)])

raw_table_df = pd.DataFrame(table_rows)
raw_table_df.columns = ['PostalCode', 'Borough', 'Neighborhood']

#### Process all 'not assigned' values accordingly, get rid of duplicates and join neighborhoods in the same PostalCode/Borough with a comma.

In [4]:
# Keep only rows that have a borough. The explicit copy makes the result an
# independent frame, so the assignment below never writes into a slice of
# raw_table_df.
df = raw_table_df[raw_table_df['Borough'] != 'Not assigned'].copy()

# Where the neighborhood is missing, fall back to the borough name.
# This is done with a boolean mask and .loc because assigning to the row
# objects yielded by iterrows() is not guaranteed to modify the DataFrame.
unassigned = df['Neighborhood'] == 'Not assigned'
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

# Remove exact duplicate rows, then collapse each PostalCode/Borough pair into
# a single row whose neighborhoods are joined with ', '.
df = df.drop_duplicates()
df = df.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()

In [5]:
# Preview the first rows of the grouped PostalCode/Borough/Neighborhood table.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [6]:
# (rows, columns) of the grouped table.
df.shape

(103, 3)

## Q2: Coordinates

In [7]:
# Load the per-postal-code coordinates and rename the key column so it matches
# the 'PostalCode' column used by df.
coords_df = (
    pd.read_csv('Geospatial_Coordinates.csv')
    .rename(columns={'Postal Code': 'PostalCode'})
)

# Attach the coordinate columns to each postal code (default inner join on the key).
df = pd.merge(df, coords_df, on='PostalCode')

In [8]:
# Preview the table after the coordinate merge (Latitude/Longitude columns added).
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [9]:
# (rows, columns) after the merge; the row count shows whether any postal code was dropped.
df.shape

(103, 5)

## Q3: Clustering

My goal is to cluster Toronto postal codes by venue pricing from Foursquare.

In [10]:
# Keep only the postal codes whose borough name contains "Toronto".
is_toronto = df['Borough'].str.contains("Toronto")
tor_df = df.loc[is_toronto]

In [11]:
# Preview the rows whose Borough contains "Toronto".
tor_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [12]:
# Look up the coordinates of Toronto; they are used as the centre of the maps.
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; raise a clear error
# here rather than an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [13]:
# Base map centred on the geocoded Toronto coordinates.
map_tor = folium.Map(location=[latitude, longitude], zoom_start=11)

# Appearance shared by every marker.
# NOTE(review): parse_html is a Popup option; confirm whether CircleMarker
# does anything with it.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle per Toronto postal code; the popup shows "neighborhood, borough".
marker_fields = tor_df[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for row in marker_fields.itertuples(index=False):
    popup_text = '{}, {}'.format(row.Neighborhood, row.Borough)
    folium.CircleMarker(
        [row.Latitude, row.Longitude],
        popup=folium.Popup(popup_text, parse_html=True),
        **marker_style
    ).add_to(map_tor)

In [14]:
# Display the marker map as the cell output.
map_tor

In [15]:
# Foursquare API settings.
# The credentials are read from the environment so they are never stored in the
# notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel; a missing variable raises KeyError naming it.
# NOTE(review): the key pair previously committed in this cell should be
# treated as leaked and regenerated.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']
VERSION = '20180605'  # value sent as the API "v" parameter
# NOTE(review): confirm this does not exceed the per-request maximum the API allows.
LIMIT = 500

I will cluster the postal codes by the relative frequency of nearby venues in each price tier, from \\$ to \\$$$$.

In [16]:
# For every Toronto postal code, count the nearby venues in each price tier
# (1 to 4) and convert the counts to relative frequencies.
names = tor_df['PostalCode']
latitudes = tor_df['Latitude']
longitudes = tor_df['Longitude']

EXPLORE_URL = 'https://api.foursquare.com/v2/venues/explore'
radius = 500
postal_price_freq = []


def count_venues_at_price(lat, lng, price):
    """Return the number of venues the explore endpoint lists for one price tier.

    Parameters
    ----------
    lat, lng : float
        Coordinates sent as the ``ll`` parameter.
    price : int
        Price tier sent as the ``price`` parameter.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    # The query string is built by requests from this dict, so values are
    # URL-encoded and the credentials are not spliced into a URL by hand.
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, lng),
        'radius': radius,
        'price': price,
        'limit': LIMIT,
    }
    # NOTE(review): verify=False is kept from the original cell; it disables
    # TLS certificate validation on a request that carries the API secret.
    response = requests.get(EXPLORE_URL, params=params, verify=False, timeout=30)
    # Report the HTTP error itself rather than a KeyError on the JSON below.
    response.raise_for_status()
    return len(response.json()['response']['groups'][0]['items'])


for name, lat, lng in zip(names, latitudes, longitudes):

    price_buckets = [count_venues_at_price(lat, lng, price) for price in range(1, 5)]

    # Percentage frequency. A postal code with no priced venues at all keeps
    # its all-zero counts, which also avoids dividing by zero.
    total = sum(price_buckets)
    if total > 0:
        price_buckets = [count / total for count in price_buckets]

    postal_price_freq.append([name] + price_buckets)


In [17]:
# Turn the collected [postal code, tier-1..tier-4 frequency] rows into a
# labelled DataFrame and preview it.
price_columns = ['PostalCode'] + ['Price{}'.format(tier) for tier in range(1, 5)]

price_freq_df = pd.DataFrame(postal_price_freq)
price_freq_df.columns = price_columns
price_freq_df.head()

Unnamed: 0,PostalCode,Price1,Price2,Price3,Price4
0,M4E,1.0,0.0,0.0,0.0
1,M4K,0.419355,0.483871,0.096774,0.0
2,M4L,0.5,0.5,0.0,0.0
3,M4M,0.32,0.56,0.12,0.0
4,M4N,0.0,0.0,0.0,0.0


In [18]:
# set number of clusters
kclusters = 5

tor_clustering = price_freq_df.drop('PostalCode', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tor_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 0, 1, 0, 2, 1, 1, 1, 4, 1])

In [19]:
# Attach the k-means label of each postal code to the price table and to tor_df.
# The cell is written to be re-runnable: a bare insert() raises if the column
# already exists, and merging twice would create suffixed duplicate columns.

# Start from the price table without any label column left by an earlier run.
price_freq_df = price_freq_df[[col for col in price_freq_df.columns if col != 'Cluster Labels']].copy()
price_freq_df.insert(0, 'Cluster Labels', kmeans.labels_)

# Drop columns a previous run already merged into tor_df, then merge fresh ones.
stale_cols = [col for col in price_freq_df.columns if col != 'PostalCode' and col in tor_df.columns]
tor_df = tor_df.drop(columns=stale_cols).merge(price_freq_df, on='PostalCode')

This map shows the Toronto postal codes coloured by their venue-pricing cluster.

In [20]:
# Map of the Toronto postal codes, coloured by their k-means cluster label.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per label
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add markers to the map
for lat, lon, poi, cluster in zip(tor_df['Latitude'], tor_df['Longitude'], tor_df['Neighborhood'], tor_df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # The cell assumes labels run from 0 to kclusters - 1, so the label is used
    # directly as the colour index. The previous `cluster-1` index only worked
    # for label 0 because index -1 wraps around to the last colour.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters