In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests


# 1) Extracting raw table from Wiki

In [2]:
# getting data from Wikipedia
wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
raw_wikipedia_page= requests.get(wikipedia_link).text

# using beautiful soup to parse the HTML/XML codes.
soup = BeautifulSoup(raw_wikipedia_page,'xml')
wikitables= soup.find_all('table')
df= pd.read_html(str(wikitables[0]), index_col=None, header=0)[0]

Group data which has the same PostCode and Borough

In [3]:

# Remove not assigned rows 
df = df[df.Borough != 'Not assigned'].reset_index(drop=True)


# Group neighborhoods by postal and borough
neighborhoods = df.groupby(['Postcode','Borough'], as_index=False).agg(lambda x: ','.join(x))

In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.

In [4]:
neighborhoods.shape

(103, 3)

# 2) Merge Geo Data to data frame

Geocoder package is unreliable, using csv file from: http://cocl.us/Geospatial_data instead.

In [5]:

import urllib

In [6]:
testfile = urllib.request
testfile.urlretrieve ("http://cocl.us/Geospatial_data", "toronto_geodata.csv")

print('Data downloaded!')

Data downloaded!


In [7]:
geo_df = pd.read_csv('toronto_geodata.csv')
geo_df = geo_df.rename(index=str, columns={"Postal Code": "Postcode", "Latitude": "Latitude", "Longitude":"Longitude"})

toronto_df = pd.merge(neighborhoods, geo_df, on='Postcode')

toronto_df.rename(columns={"Neighbourhood": "Neighborhood"}, inplace=True)

toronto_df.head(15)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


Explore and cluster the neighborhoods in Toronto. You can decide to work with only boroughs that contain the word Toronto and then replicate the same analysis we did to the New York City data. It is up to you.

Just make sure:

1.to add enough Markdown cells to explain what you decided to do and to report any observations you make.

2.to generate maps to visualize your neighborhoods and how they cluster together.

In [8]:
import folium 

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

Collecting package metadata: done
Solving environment: done

# All requested packages already installed.



In [10]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="tl-toronto-neigh")
location = geolocator.geocode(address)
toronto_latitude = location.latitude
toronto_longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(toronto_latitude, toronto_longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


In [11]:
map_toronto = folium.Map(location = [toronto_latitude, toronto_longitude], zoom_start = 10.7)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    

map_toronto

In [12]:
# Explore boroughs that have Toronto in their names.
toronto_area = ['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']
toronto_df = toronto_df[toronto_df['Borough'].isin(toronto_area)].reset_index(drop=True)
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [13]:
#Use FourSquare API to explore the boroughs
CLIENT_ID = 'EVB2U5OA4QXIYGXVUK4JCLCJLAK0RKBJRWGWAUYIMBE2ZJ0N'
CLIENT_SECRET = 'XNAFJHATEZJLBS0LHO2OP2TDPMYIRG2AYE4BTKOLTQJVGI05'
VERSION = '20180605'
radius = 500
LIMIT = 100

venues = []

for lat, long, post, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Postcode'], toronto_df['Borough'], toronto_df['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post, 
            borough,
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [14]:

# nearby_venues
toronto_venues_df = pd.DataFrame(venues)
toronto_venues_df.columns = ['Postal Code', 
                             'Borough', 
                             'Neighborhood', 
                             'Borough Latitude', 
                             'Borough Longitude', 
                             'Venue Name', 
                             'Venue Latitude', 
                             'Venue Longitude', 
                             'Venue Category']
toronto_venues_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Borough Latitude,Borough Longitude,Venue Name,Venue Latitude,Venue Longitude,Venue Category
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
1,M4E,East Toronto,The Beaches,43.676357,-79.293031,Starbucks,43.678798,-79.298045,Coffee Shop
2,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [21]:
# The one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues_df[['Venue Category']], prefix="", prefix_sep="")

toronto_onehot['Postal Code'] = toronto_venues_df['Postal Code'] 
toronto_onehot['Borough'] = toronto_venues_df['Borough'] 
toronto_onehot['Neighborhood'] = toronto_venues_df['Neighborhood'] 

Columns = list(toronto_onehot.columns[-3:]) + list(toronto_onehot.columns[:-3])
toronto_onehot = toronto_onehot[Columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Postal Code,Borough,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,0,M4E,East Toronto,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,M4E,East Toronto,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,M4E,East Toronto,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,M4E,East Toronto,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,M4K,East Toronto,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
toronto_onehot = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_onehot

Unnamed: 0,Neighborhood,Yoga Studio,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.01
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton,Exhibition Place,Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.0,0.0,0.083333,0.083333,0.083333,0.166667,0.166667,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown,St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.012346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.012346,0.0,0.0,0.012346,0.0,0.0
7,"Chinatown,Grange Park,Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.05,0.0,0.05,0.01,0.0,0.01
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.011494,0.011494,0.011494,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.011494,0.011494,0.0,0.011494,0.0


Let's print each neighborhood along with the top 5 most common venues

In [23]:
num_top_venues = 5

for hood in toronto_onehot['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_onehot[toronto_onehot['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
                 venue  freq
0          Coffee Shop  0.06
1                 Café  0.05
2           Steakhouse  0.04
3      Thai Restaurant  0.04
4  American Restaurant  0.04


----Berczy Park----
            venue  freq
0     Coffee Shop  0.07
1    Cocktail Bar  0.05
2      Restaurant  0.05
3             Pub  0.04
4  Farmers Market  0.04


----Brockton,Exhibition Place,Parkdale Village----
                   venue  freq
0            Coffee Shop  0.10
1                   Café  0.10
2  Performing Arts Venue  0.10
3         Breakfast Spot  0.10
4      Convenience Store  0.05


----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0  Light Rail Station  0.12
1       Auto Workshop  0.06
2          Restaurant  0.06
3          Smoke Shop  0.06
4                 Spa  0.06


----CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara----
              venue  freq
0    Airport Lounge  0.17

write a function to sort the venues in descending order.

In [24]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

create the new dataframe and display the top 10 venues for each neighborhood.

In [25]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_onehot['Neighborhood']

for ind in np.arange(toronto_onehot.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_onehot.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Steakhouse,American Restaurant,Thai Restaurant,Gym,Hotel,Restaurant,Asian Restaurant,Bar
1,Berczy Park,Coffee Shop,Cocktail Bar,Restaurant,Farmers Market,Bakery,Steakhouse,Cheese Shop,Café,Beer Bar,Pub
2,"Brockton,Exhibition Place,Parkdale Village",Performing Arts Venue,Breakfast Spot,Café,Coffee Shop,Grocery Store,Climbing Gym,Convenience Store,Burrito Place,Stadium,Bar
3,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Farmers Market,Garden,Park,Fast Food Restaurant,Spa,Brewery,Burrito Place,Restaurant,Auto Workshop
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Lounge,Airport Service,Airport Terminal,Boat or Ferry,Airport,Airport Food Court,Airport Gate,Harbor / Marina,Sculpture Garden,Women's Store


# 3) Explore and Cluster Neighborhoods 

Run k-means to cluster the neighborhood into 5 clusters.

In [31]:
#import k-means from clustering stage
from sklearn.cluster import KMeans
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [32]:
# set number of clusters
kclusters = 3

toronto_onehot = toronto_onehot.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_onehot)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

KeyError: "['Neighborhood'] not found in axis"

In [29]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged_df = Toronto_df

# merge
toronto_merged_df = toronto_merged_df.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged_df.head() # check the last columns!

ValueError: cannot insert Cluster Labels, already exists

In [30]:
# create map
MapClusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged_df['Latitude'], toronto_merged_df['Longitude'], toronto_merged_df['Neighborhood'], toronto_merged_df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(MapClusters)
       
MapClusters

NameError: name 'latitude' is not defined

# Examine Clusters Examine Cluster

Cluster #1 ==> Park and Playground

In [None]:
toronto_merged_df.loc[toronto_merged_df['Cluster Labels'] == 0, toronto_merged_df.columns[[1] + list(range(5, toronto_merged_df.shape[1]))]]

Cluster #2 ==> Garden

In [None]:
toronto_merged_df.loc[toronto_merged_df['Cluster Labels'] == 1, toronto_merged_df.columns[[1] + list(range(5, toronto_merged_df.shape[1]))]]

Cluster #3 ==> Living Areas.

In [None]:
toronto_merged_df.loc[toronto_merged_df['Cluster Labels'] == 2, toronto_merged_df.columns[[1] + list(range(5, toronto_merged_df.shape[1]))]]