# Toronto segmentation and clustering

This notebook will be used to answer all three questions of the __Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto__.

Each answer will have a leading __Answer to Question ?__ and some explanation at the beginning for easy reference.  
  
  
  

In [60]:
from io import StringIO

import requests
import pandas as pd
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes
import folium

## Answer to Question 1

1. Scraped the Toronto postal codes into a dataframe using BeautifulSoup  
2. Cleaned the data (removed "Not assigned" boroughs and replaced / by , in neighborhoods. All neighborhoods are assigned, so nothing to do there)

In [11]:
# Download the Wikipedia page that lists the postal codes starting with "M".
# The timeout keeps a stalled connection from hanging the notebook forever.
res = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
# Fail here on an HTTP error instead of trying to parse an error page below.
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')
# The postal-code table is taken to be the first <table> element on the page.
table = soup.find_all('table')[0]
# Wrap the markup in a file-like object: recent pandas versions deprecate
# passing literal HTML strings to read_html.
torontodf = pd.read_html(StringIO(str(table)))[0]

In [13]:
# Remove, in place, every row whose borough is the "Not assigned" placeholder.
not_assigned_rows = torontodf.index[torontodf['Borough'] == "Not assigned"]
torontodf.drop(index=not_assigned_rows, inplace=True)

In [14]:
# Turn the " /" separator between neighborhood names into a comma.
# The pattern contains no regex metacharacters, so a literal match is equivalent.
torontodf['Neighborhood'] = torontodf['Neighborhood'].str.replace(" /", ",", regex=False)

In [15]:
# Preview the first five cleaned rows.
torontodf.head(n=5)

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [16]:
# (rows, columns) of the cleaned postal-code table.
torontodf.shape

(103, 3)

## Answer to Question 2

1. Downloaded the Geospatial data into a CSV file  
2. Merged the dataframes using 'Postal code' for the key

In [17]:
#!wget -q -O 'Geospatial.csv' https://cocl.us/Geospatial_data

In [18]:
# Load the coordinates table from the local CSV. The download command in the
# previous cell is commented out, so the file must already exist on disk.
geospatial = pd.read_csv(filepath_or_buffer="Geospatial.csv")

In [19]:
# Align the key column's spelling with the one used in torontodf so the two
# frames share a column name for the merge.
geospatial = geospatial.rename(columns={"Postal Code": "Postal code"})

In [20]:
# Left-join the coordinates onto the postal-code table; with no explicit key,
# pandas joins on the column names the two frames have in common.
torontodf = pd.merge(torontodf, geospatial, how='left')

In [21]:
# Preview the first five rows after adding the coordinates.
torontodf.head(n=5)

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [22]:
# (rows, columns) after the merge; the row count should match the pre-merge table.
torontodf.shape

(103, 5)

## Answer to Question 3

1. Started by mapping all the different Neighborhood locations on a map centered in Toronto, Canada  
2. Defined the getNearbyVenues function from the NY analysis, made a little change getting the data because sometimes the result is empty
3. Called the function and got the venues in the vicinity of the Neighborhoods, increasing the radius to 1000 meters to get more data per location
4. Created columns with all the venues' categories and grouped by Neighborhood to get the frequency of each category
5. Got the top 10 of the most frequent venues for each Neighborhood
6. Executed the K-means configured for 5 clusters
7. Represented the clusters on the map


In [25]:
# create map of Toronto centered on coordinates given by Google search
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# add one circle marker per row of torontodf, labelled "<neighborhood>, <borough>"
for lat, lng, borough, neighborhood in zip(torontodf['Latitude'], torontodf['Longitude'], torontodf['Borough'], torontodf['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html is an option of Popup (set above), not of CircleMarker, so it
    # is no longer passed to the marker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

In [26]:
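# The code was removed by Watson Studio for sharing.
# NOTE: presumably this cell defined CLIENT_ID, CLIENT_SECRET and VERSION, which
# getNearbyVenues reads below; they must be set before running the rest of the notebook.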

In [102]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Walked in parallel with ``zip``; one API request is made per triple.
    radius : int, optional
        Value sent as the ``radius`` query parameter (default 1000).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty, but still has these columns, when there is
        nothing to report.

    Raises
    ------
    RuntimeError
        If no usable response is obtained for a location after several attempts.

    Notes
    -----
    Reads the module-level names CLIENT_ID, CLIENT_SECRET and VERSION.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    max_attempts = 5
    limit = 100

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            limit)

        # Bounded retry: an empty or malformed answer makes .json() raise or
        # leaves the expected keys missing, so those cases are caught and retried.
        items = None
        for _ in range(max_attempts):
            try:
                payload = requests.get(url, timeout=30).json()
                items = payload['response']['groups'][0]['items']
                break
            except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
                continue
        if items is None:
            # The URL is deliberately left out of the message: it embeds the credentials.
            raise RuntimeError(
                'No usable Foursquare response for {!r} after {} attempts'.format(name, max_attempts))

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category gets a missing value instead of crashing
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero rows.
    return pd.DataFrame(venue_rows, columns=columns)

In [104]:
# Fetch the venues around every neighborhood, using the function's default radius.
toronto_venues = getNearbyVenues(
    torontodf['Neighborhood'],
    torontodf['Latitude'],
    torontodf['Longitude'],
)
toronto_venues.shape

(4940, 7)

In [105]:
# Count the distinct venue categories (missing values counted too, as unique() does).
n_categories = toronto_venues['Venue Category'].nunique(dropna=False)
print('There are {} unique categories.'.format(n_categories))

There are 335 unique categories.


In [106]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [107]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``, ignoring its first element.

    ``row`` is a pandas Series whose first element is skipped; the remaining
    values are ranked in descending order and the index labels of the top
    ``num_top_venues`` are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

num_top_venues = 10

# ordinal suffixes for ranks 1-3; every later rank uses "th"
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # explicit bounds check instead of a bare except around an IndexError
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill the ranked-category columns row by row
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Chinese Restaurant,Shopping Mall,Caribbean Restaurant,Bakery,Pool Hall,Supermarket,Latin American Restaurant,Sri Lankan Restaurant,Breakfast Spot,Bubble Tea Shop
1,"Alderwood, Long Branch",Discount Store,Pharmacy,Convenience Store,Pizza Place,Pub,Dance Studio,Park,Coffee Shop,Intersection,Sandwich Place
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Restaurant,Diner,Gas Station,Sandwich Place,Frozen Yogurt Shop,Fried Chicken Joint,Sushi Restaurant,Dog Run
3,Bayview Village,Japanese Restaurant,Bank,Gas Station,Grocery Store,Chinese Restaurant,Park,Café,Restaurant,Skating Rink,Trail
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Italian Restaurant,Sandwich Place,Bank,Fast Food Restaurant,Pizza Place,Cosmetics Shop,Restaurant,Thai Restaurant,Intersection


In [108]:
# set number of clusters
kclusters = 5

# feature matrix: every column except the neighborhood name
# (keyword form; the positional axis argument is not accepted by pandas 2.x)
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering; n_init is pinned because scikit-learn's default has
# changed between releases, which would otherwise change the labels
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# add clustering labels; drop a previous copy first so this cell can be re-run
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted.drop(columns='Cluster Labels', inplace=True)
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [109]:
# merge to add latitude/longitude for each neighborhood
# Left-join torontodf onto the clustered table on their shared column names,
# which brings in the latitude/longitude for each neighborhood.
toronto_merged = pd.merge(neighborhoods_venues_sorted, torontodf, how='left')

In [110]:
# create map
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters-1, so they index the palette directly
    # (the previous cluster-1 sent label 0 to the last color via index -1).
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters