In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests


# 1) Extracting raw table from Wiki

In [2]:
# getting data from Wikipedia
wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
raw_wikipedia_page= requests.get(wikipedia_link).text

# using beautiful soup to parse the HTML/XML codes.
soup = BeautifulSoup(raw_wikipedia_page,'xml')
wikitables= soup.find_all('table')
df= pd.read_html(str(wikitables[0]), index_col=None, header=0)[0]

Group data which has the same PostCode and Borough

In [3]:

# Remove not assigned rows 
df = df[df.Borough != 'Not assigned'].reset_index(drop=True)


# Group neighborhoods by postal and borough
neighborhoods = df.groupby(['Postcode','Borough'], as_index=False).agg(lambda x: ','.join(x))

In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.

In [4]:
neighborhoods.shape

(103, 3)

# 2) Merge Geo Data to data frame

Geocoder package is unreliable, using csv file from: http://cocl.us/Geospatial_data instead.

In [5]:

import urllib

In [6]:
testfile = urllib.request
testfile.urlretrieve ("http://cocl.us/Geospatial_data", "newyork_geodata.csv")

print('Data downloaded!')

Data downloaded!


In [7]:
geo_df = pd.read_csv('newyork_geodata.csv')
geo_df = geo_df.rename(index=str, columns={"Postal Code": "Postcode", "Latitude": "Latitude", "Longitude":"Longitude"})

toronto_df = pd.merge(neighborhoods, geo_df, on='Postcode')

toronto_df.rename(columns={"Neighbourhood": "Neighborhood"}, inplace=True)

toronto_df.head(15)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


Explore and cluster the neighborhoods in Toronto. You can decide to work with only boroughs that contain the word Toronto and then replicate the same analysis we did to the New York City data. It is up to you.

Just make sure:

1.to add enough Markdown cells to explain what you decided to do and to report any observations you make.

2.to generate maps to visualize your neighborhoods and how they cluster together.

In [8]:
import folium 

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

Collecting package metadata: done
Solving environment: | 
  - anaconda::ca-certificates-2018.03.07-0, anaconda::certifi-2018.11.29-py37_0, anaconda::openssl-1.1.1a-h1de35cc_0
  - anaconda::ca-certificates-2018.03.07-0, anaconda::openssl-1.1.1a-h1de35cc_0, defaults::certifi-2018.11.29-py37_0
  - anaconda::certifi-2018.11.29-py37_0, anaconda::openssl-1.1.1a-h1de35cc_0, defaults::ca-certificates-2018.03.07-0
  - anaconda::openssl-1.1.1a-h1de35cc_0, defaults::ca-certificates-2018.03.07-0, defaults::certifi-2018.11.29-py37_0
  - anaconda::certifi-2018.11.29-py37_0, defaults::ca-certificates-2018.03.07-0, defaults::openssl-1.1.1a-h1de35cc_0
  - defaults::ca-certificates-2018.03.07-0, defaults::certifi-2018.11.29-py37_0, defaults::openssl-1.1.1a-h1de35cc_0
  - anaconda::ca-certificates-2018.03.07-0, anaconda::certifi-2018.11.29-py37_0, defaults::openssl-1.1.1a-h1de35cc_0
  - anaconda::ca-certificates-2018.03.07-0, defaults::certifi-2018.11.29-py37_0, defaults::openssl-1.1.1a-h1de35ccdone

# A

In [10]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="tl-toronto-neigh")
location = geolocator.geocode(address)
toronto_latitude = location.latitude
toronto_longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(toronto_latitude, toronto_longitude))

GeocoderTimedOut: Service timed out

In [None]:
map_toronto = folium.Map(location = [toronto_latitude, toronto_longitude], zoom_start = 10.7)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    

map_toronto

In [None]:
# Explore boroughs that have Toronto in their names.
toronto_area = ['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']
toronto_df = toronto_df[toronto_df['Borough'].isin(toronto_area)].reset_index(drop=True)
toronto_df.head()

In [None]:
#Use FourSquare API to explore the boroughs
CLIENT_ID = 'EVB2U5OA4QXIYGXVUK4JCLCJLAK0RKBJRWGWAUYIMBE2ZJ0N'
CLIENT_SECRET = 'XNAFJHATEZJLBS0LHO2OP2TDPMYIRG2AYE4BTKOLTQJVGI05'
VERSION = '20180605'
radius = 500
LIMIT = 100

venues = []

for lat, long, post, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Postcode'], toronto_df['Borough'], toronto_df['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post, 
            borough,
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [None]:

# nearby_venues
toronto_venues_df = pd.DataFrame(venues)
toronto_venues_df.columns = ['Postal Code', 
                             'Borough', 
                             'Neighborhood', 
                             'Borough Latitude', 
                             'Borough Longitude', 
                             'Venue Name', 
                             'Venue Latitude', 
                             'Venue Longitude', 
                             'Venue Category']
toronto_venues_df.head()

In [None]:
# The one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues_df[['Venue Category']], prefix="", prefix_sep="")

toronto_onehot['Postal Code'] = toronto_venues_df['Postal Code'] 
toronto_onehot['Borough'] = toronto_venues_df['Borough'] 
toronto_onehot['Neighborhood'] = toronto_venues_df['Neighborhood'] 

Columns = list(toronto_onehot.columns[-3:]) + list(toronto_onehot.columns[:-3])
toronto_onehot = toronto_onehot[Columns]

toronto_onehot.head()

In [None]:
toronto_onehot = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_onehot

Let's print each neighborhood along with the top 5 most common venues

In [None]:
num_top_venues = 5

for hood in toronto_onehot['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_onehot[toronto_onehot['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

write a function to sort the venues in descending order.

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

create the new dataframe and display the top 10 venues for each neighborhood.

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_onehot['Neighborhood']

for ind in np.arange(toronto_onehot.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_onehot.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

# 3) Explore and Cluster Neighborhoods 

Run k-means to cluster the neighborhood into 5 clusters.

In [None]:
#import k-means from clustering stage
from sklearn.cluster import KMeans
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [None]:
# set number of clusters
kclusters = 3

toronto_onehot = toronto_onehot.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_onehot)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

In [None]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged_df = Toronto_df

# merge
toronto_merged_df = toronto_merged_df.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged_df.head() # check the last columns!

In [None]:
# create map
MapClusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged_df['Latitude'], toronto_merged_df['Longitude'], toronto_merged_df['Neighborhood'], toronto_merged_df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(MapClusters)
       
MapClusters

# Examine Clusters Examine Cluster

Cluster #1 ==> Park and Playground

In [None]:
toronto_merged_df.loc[toronto_merged_df['Cluster Labels'] == 0, toronto_merged_df.columns[[1] + list(range(5, toronto_merged_df.shape[1]))]]

Cluster #2 ==> Garden

In [None]:
toronto_merged_df.loc[toronto_merged_df['Cluster Labels'] == 1, toronto_merged_df.columns[[1] + list(range(5, toronto_merged_df.shape[1]))]]

Cluster #3 ==> Living Areas.

In [None]:
toronto_merged_df.loc[toronto_merged_df['Cluster Labels'] == 2, toronto_merged_df.columns[[1] + list(range(5, toronto_merged_df.shape[1]))]]