<h1>Coursera Segmenting and Clustering Neighborhoods in Toronto</h1>
<h1>Section 1:</h1>
In this section, I will be gathering all the Toronto postal codes from the web and grouping neighborhoods by those postal codes.

In [1]:
# Import / install required packages.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import requests
from pandas.io.json import json_normalize
!conda install -c conda-forge folium=0.5.0 --yes
import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    branca-0.3.1               |             py_0          25 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    altair-4.0.1               |             py_0         575 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.0 MB

The following NEW packages will be 

Below, I opened the provided URL and scraped all required data.

In [3]:
# Open target URL.
html = urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

# Create BeautifulSoup web-parser.
soup = BeautifulSoup(html, 'html.parser')

# Find attribute table, scrap data and clean text.
table = soup.find('table', {'class': 'wikitable sortable'})
rows = table.find_all('tr')
list_rows = []

for row in rows:
    cells = row.find_all('td')
    str_cells = str(cells)
    clean = re.compile('<.*?>')
    clean2 = (re.sub(clean, '',str_cells))
    if len(clean2) > 2:
        list_rows.append(clean2.strip('[]')[:-1])
    else:
        continue
 
# Create dataframe of all collected web data.
df = pd.DataFrame(list_rows)
postalcode_df = df[0].str.split(',', expand=True)
postalcode_df.columns = ['PostalCode', 'Borough', 'Neighborhood']

# Remove rows were "Borough" = "Not Assinged" then group neighborhoods by Borough.
for column in postalcode_df.columns.values:
    postalcode_df[column] = postalcode_df[column].str.strip()

postalcode_df = postalcode_df[postalcode_df.Borough != 'Not assigned'].reset_index(drop=True)
postalcode_df.loc[(postalcode_df.Neighborhood == 'Not assigned'), 'Neighborhood'] = postalcode_df[postalcode_df.Neighborhood == 'Not assigned'].Borough.item()
postalcode_df = postalcode_df.groupby(['PostalCode','Borough']).Neighborhood.unique().apply(lambda x: ', '.join(x)).reset_index()
postalcode_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [4]:
# Check DataFrame shape.
postalcode_df.shape

(103, 3)

<h1>Section 2:</h1>

In [5]:
# Read in coordinate data for gathered postal codes.
df=pd.read_csv('https://cocl.us/Geospatial_data')
df.columns = ['PostalCode', 'Latitude', 'Longitude']
df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [6]:
# Combine coordinates and Borough / Neighborhood DataFrames.
coordinate_df = postalcode_df.join(df.set_index('PostalCode'), on='PostalCode')
coordinate_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [7]:
# Check DataFrame shape.
coordinate_df.shape

(103, 5)

<h1>Section 3:</h1>

In [9]:
# Retrieve all Borough names.
coordinate_df.Borough.unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke', "Queen's Park"], dtype=object)

In [10]:
# Remove all boroughs withough "Toronto" in the name.
for borough in coordinate_df.Borough.unique():
    if "Toronto" not in borough:
        coordinate_df = coordinate_df[coordinate_df.Borough != borough]
        
coordinate_df.Borough.unique()

array(['East Toronto', 'Central Toronto', 'Downtown Toronto',
       'West Toronto'], dtype=object)

In [11]:
coordinate_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [12]:
# Store Foursquare credentials and parameters.
CLIENT_ID = '' # Confidential.
CLIENT_SECRET = '' # Confidential.
VERSION = '20180605' 
LIMIT = 100
radius = 500

In [13]:
# Retrieve venue data from Foursquare.
venues_list=[]
for name, lat, lng in zip(coordinate_df['Neighborhood'], 
                          coordinate_df['Latitude'], 
                          coordinate_df['Longitude']):

    # create the API request URL
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        lat, 
        lng, 
        radius, 
        LIMIT)

    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']

    # return only relevant information for each nearby venue
    venues_list.append([(
        name, 
        lat, 
        lng, 
        v['venue']['name'], 
        v['venue']['location']['lat'], 
        v['venue']['location']['lng'],  
        v['venue']['categories'][0]['name']) for v in results])

toronto_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
toronto_venues.columns = ['Neighborhood', 
                         'Neighborhood Latitude', 
                         'Neighborhood Longitude', 
                         'Venue', 
                         'Venue Latitude', 
                         'Venue Longitude', 
                         'Venue Category']

toronto_venues.head()

KeyError: 'groups'

In [14]:
# Remove "Neighborhood" from list of venues
toronto_venues = toronto_venues[toronto_venues['Venue Category'] != 'Neighborhood']
toronto_venues.groupby('Neighborhood').count().head()

NameError: name 'toronto_venues' is not defined

In [15]:
# Print number of unique catagories.
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

NameError: name 'toronto_venues' is not defined

In [16]:
# Print top 120 most common venue types.
toronto_venues['Venue Category'].value_counts().head(10)

NameError: name 'toronto_venues' is not defined

In [17]:
# Create table of venues per neighborhood.
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

NameError: name 'toronto_venues' is not defined

In [18]:
# Check DataFrame shape.
toronto_onehot.shape

NameError: name 'toronto_onehot' is not defined

In [19]:
# Aggregate neighborhood venues by means.
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

NameError: name 'toronto_onehot' is not defined

In [20]:
# Check DataFrame shape.
toronto_grouped.shape

NameError: name 'toronto_grouped' is not defined

In [21]:
# Create table of most common venues per neighborhood.
# Define function.
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

NameError: name 'toronto_grouped' is not defined

In [22]:
# Conduct Elbow Sum-of-Squares test on clustering DataFrame.
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

k_range = range(1,11)
sum_of_squares = []

for k in k_range:
    KM = KMeans(n_clusters=k)
    KM.fit(toronto_grouped_clustering)
    sum_of_squares.append(KM.inertia_)
    
plt.plot(k_range, sum_of_squares, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('SS')
plt.xticks(k_range)
plt.show()

NameError: name 'toronto_grouped' is not defined

In [23]:
# Conduct silhouette test on clustering DataFrame.
from sklearn.metrics import silhouette_samples, silhouette_score

best=0

for n_clusters in range(2,11):
    KM = KMeans(n_clusters=n_clusters)
    KM_labels = KM.fit_predict(toronto_grouped_clustering)
    silhouette_avg = silhouette_score(toronto_grouped_clustering, KM_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    if silhouette_avg > best:
        best = silhouette_avg
        best_n_clusters = n_clusters
        
print('Optimal KMeans n_clusters parameter:', best_n_clusters)

NameError: name 'toronto_grouped_clustering' is not defined

In [24]:
# run k-means clustering.
kclusters = 4
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# Print count of neighborhoods per cluster.
pd.Series(kmeans.labels_).value_counts()

NameError: name 'toronto_grouped_clustering' is not defined

In [25]:
# add clustering labels.
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_merged = coordinate_df.drop('PostalCode', axis=1)

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood.
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
toronto_merged.head()

NameError: name 'kmeans' is not defined

In [26]:
# Import Nominatim.
from geopy.geocoders import Nominatim

# Retrieve coordinates for Toronto.
address = 'Toronto, ON'
geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [27]:
# Create mpa visual with clustered neighborhoods.
# Create folium map.
zoom = 12
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=zoom, min_zoom=zoom, max_zoom=zoom)

# Set color scheme for the clusters.
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow((np.linspace(0, 1, len(ys))))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map.
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

NameError: name 'toronto_merged' is not defined

In [28]:
df_cluster_0 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
df_cluster_0.head()

NameError: name 'toronto_merged' is not defined

In [29]:
df_cluster_1 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
df_cluster_1

NameError: name 'toronto_merged' is not defined

In [30]:
df_cluster_2 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
df_cluster_2

NameError: name 'toronto_merged' is not defined

In [31]:
df_cluster_3 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
df_cluster_3

NameError: name 'toronto_merged' is not defined

In [32]:
for i, column in enumerate (df_cluster_3.columns[1:]):
    if i < 3:
        print(column + '(%):')
        print(round(df_cluster_3[column].value_counts()/len(df_cluster_3[column])*100,2))
        print('')

NameError: name 'df_cluster_3' is not defined