# Machine Learning Assignment Week 3 Final

### Part 1: Transferring Data into a Pandas DataFrame Using BeautifulSoup

###### Installing BeautifulSoup Library

In [5]:
pip install BeautifulSoup4

Note: you may need to restart the kernel to use updated packages.


###### Importing required libraries

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests


##### Fetching data from Wikipedia

In [2]:
# Download the Wikipedia page that lists the postal codes starting with "M".
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# reports an HTTP error here instead of passing an error page to the parser.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

In [3]:
# Parse the downloaded page text with the 'html.parser' backend.
soup = BeautifulSoup(source, 'html.parser')

##### Converting data into pandas dataframe

In [4]:
# Locate the sortable wiki table and work on its <tbody>.
data = soup.find('table', class_='wikitable sortable').tbody

columns = ['PostCode', 'Borough', 'Neighborhood']

# Collect all rows first and build the frame once. Calling DataFrame.append
# inside the loop copies the whole frame on every iteration, and the method
# no longer exists in pandas 2.x.
table_rows = []
for row in data.find_all('tr'):
    row_data = [col.text for col in row.find_all('td')]
    # Rows without <td> cells (e.g. a header row) give an empty list; rows with
    # fewer cells than expected are skipped instead of raising IndexError.
    if len(row_data) >= len(columns):
        table_rows.append(row_data[:len(columns)])

toronto_df = pd.DataFrame(table_rows, columns=columns)
toronto_df.head()

Unnamed: 0,PostCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


##### Cleaning the data

In [5]:
# Keep only the text before the first newline (the scraped cells end in '\n').
toronto_df['Neighborhood'] = toronto_df['Neighborhood'].str.split('\n').str[0]

# Removing all rows having 'Not assigned' value to both Neighborhood and Borough column.
# .copy() makes the filtered result an independent frame, so the .loc
# assignment below does not raise SettingWithCopyWarning.
has_neighborhood = toronto_df['Neighborhood'] != 'Not assigned'
has_borough = toronto_df['Borough'] != 'Not assigned'
toronto_df = toronto_df[has_neighborhood | has_borough].copy()

# Assigning Borough to Neighborhood columns having values as 'Not assigned'
unnamed = toronto_df['Neighborhood'] == 'Not assigned'
toronto_df.loc[unnamed, 'Neighborhood'] = toronto_df.loc[unnamed, 'Borough']

##### Merging rows having same 'PostCode' values

In [6]:
# Collapse rows that share a postal code into one row: keep the first Borough
# listed for the code and join all of its neighborhoods with commas.
# Aggregating each column explicitly avoids joining the boroughs and then
# splitting on ',' again, which would truncate a borough name that itself
# contains a comma.
toronto_df = toronto_df.groupby('PostCode', as_index=False).agg(
    {'Borough': 'first', 'Neighborhood': ','.join}
)

In [7]:
toronto_df.head(12)


Unnamed: 0,PostCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [8]:
toronto_df.shape

(103, 3)

##### This fulfils the assignment requirement of showing that there are 103 rows and 3 columns to this data frame from which the first 12 rows have been displayed.

##### Save for future

In [9]:
# Write the table to 'task1.csv' (without the index column); Part 2 reloads it.
toronto_df.to_csv('task1.csv', index=False)

## 2. Collecting Latitude and Longitude from Post Codes

In [10]:
import pandas as pd
# Reload the table that Part 1 wrote to 'task1.csv' and preview its first rows.
df = pd.read_csv('task1.csv')
df.head()

Unnamed: 0,PostCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


###### Installing the geocoder library 

In [11]:
!pip install geocoder
import geocoder



##### Collecting Location Coordinates

In [12]:
def get_latlng(postal_code, max_attempts=10, retry_delay=0.5):
    """Return the ``latlng`` value the ArcGIS geocoder reports for a postal code.

    The query sent to ``geocoder.arcgis`` is ``"<postal_code>, Toronto, Ontario"``.
    The lookup is retried while the result's ``latlng`` is ``None``, but only
    up to ``max_attempts`` times, so a code that cannot be resolved no longer
    blocks the notebook forever.

    Parameters
    ----------
    postal_code : str
        Postal code to look up, e.g. ``'M4G'``.
    max_attempts : int, optional
        Maximum number of geocoder calls before giving up (must be >= 1).
    retry_delay : float, optional
        Seconds to wait between failed attempts; ``0`` disables the pause.

    Returns
    -------
    The first non-``None`` ``latlng`` value returned by the geocoder
    (a ``[latitude, longitude]`` pair in the sample output).

    Raises
    ------
    ValueError
        If ``max_attempts`` is smaller than 1.
    RuntimeError
        If every attempt returned ``None``.
    """
    import time  # local import: only needed for the pause between retries

    if max_attempts < 1:
        raise ValueError('max_attempts must be at least 1')

    query = '{}, Toronto, Ontario'.format(postal_code)
    for attempt in range(1, max_attempts + 1):
        lat_lng_coords = geocoder.arcgis(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
        # Pause before retrying, but not after the final failed attempt.
        if attempt < max_attempts and retry_delay > 0:
            time.sleep(retry_delay)

    raise RuntimeError(
        'No coordinates found for {!r} after {} attempts'.format(postal_code, max_attempts)
    )
    
get_latlng('M4G')

[43.70949500000006, -79.36398897099997]

In [13]:
# Geocode every postal code in table order, so coords[i] belongs to post_codes[i].
post_codes = df['PostCode']
coords = list(map(get_latlng, post_codes.tolist()))

In [14]:
# Turn the list of coordinate pairs into a two-column frame, then copy each
# column into df (pandas aligns the values on the row index).
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
for coord_column in df_coords.columns:
    df[coord_column] = df_coords[coord_column]

In [15]:
df.head(12)

Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.785665,-79.158725
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.765815,-79.175193
3,M1G,Scarborough,Woburn,43.768369,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944
5,M1J,Scarborough,Scarborough Village,43.743125,-79.23175
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.726276,-79.263625
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.713054,-79.285055
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.724235,-79.227925
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.69677,-79.259967


In [16]:
df.shape

(103, 5)

##### This satisfies the assignment requirement of producing a dataframe containing 103 rows and 5 columns, including the latitude and longitude coordinates. The first 12 rows have been displayed.

##### Saving for future

In [17]:
# Write the table, now including Latitude and Longitude, to 'task2.csv' without the index column.
df.to_csv('task2.csv', index=False)

### 3. Exploring and Clustering the Toronto Neighborhoods 

##### Installing geopy and folium

In [18]:
pip install geopy

Note: you may need to restart the kernel to use updated packages.


In [19]:
pip install folium

Note: you may need to restart the kernel to use updated packages.


In [20]:
from geopy.geocoders import Nominatim
import folium
import numpy as np
from sklearn.cluster import KMeans

import matplotlib.cm as cm
import matplotlib.colors as colors

##### Fetching geo coordinates

In [21]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="tl-toronto-neigh")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


##### Creating a Map of Toronto with Boroughs markers on top using Folium

In [22]:

# Base map centred on the Toronto coordinates stored in latitude/longitude.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# One blue circle per row of df; its popup reads "Borough (PostCode): Neighborhood".
marker_fields = ('Latitude', 'Longitude', 'PostCode', 'Borough', 'Neighborhood')
for lat, long, post, borough, neigh in zip(*(df[field] for field in marker_fields)):
    label = "{} ({}): {}".format(borough, post, neigh)
    marker = folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=folium.Popup(label, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto


##### Reducing the number of Boroughs to explore

To reduce the number of calls to the Foursquare API, we will only explore boroughs that have Toronto in their names.

In [23]:
# Keep only the rows whose Borough is one of the four listed boroughs and
# renumber the rows from zero.
toronto_boroughs = ['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']
in_selected_borough = df['Borough'].isin(toronto_boroughs)
toronto_central_df = df.loc[in_selected_borough].reset_index(drop=True)
print(toronto_central_df.shape)
toronto_central_df.head()

(38, 5)


Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676531,-79.295425
1,M4K,East Toronto,"The Danforth West,Riverdale",43.683178,-79.355105
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.667965,-79.314667
3,M4M,East Toronto,Studio District,43.660629,-79.334855
4,M4N,Central Toronto,Lawrence Park,43.72842,-79.387133


In [24]:
# Rebuild the map at a closer zoom, this time showing only the selected boroughs.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# One blue circle per row of toronto_central_df; its popup reads
# "Borough (PostCode): Neighborhood".
marker_fields = ('Latitude', 'Longitude', 'PostCode', 'Borough', 'Neighborhood')
for lat, long, post, borough, neigh in zip(*(toronto_central_df[field] for field in marker_fields)):
    label = "{} ({}): {}".format(borough, post, neigh)
    marker = folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=folium.Popup(label, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

In [25]:
pip install foursquare

Note: you may need to restart the kernel to use updated packages.


In [26]:
import json
import requests 
import pandas as pd

##### Using FourSquare API to explore the Boroughs

In [42]:
import os

# Foursquare credentials are read from the environment so that secrets are
# never stored in the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before running; a missing variable raises KeyError.
VERSION = '20180605'
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']
radius = 500
LIMIT = 100

EXPLORE_URL = 'https://api.foursquare.com/v2/venues/explore'

# One tuple per venue: the area's identifying fields and coordinates followed
# by the venue's name, coordinates and first category name.
venues = []

for lat, long, post, borough, neighborhood in zip(toronto_central_df['Latitude'], toronto_central_df['Longitude'], toronto_central_df['PostCode'], toronto_central_df['Borough'], toronto_central_df['Neighborhood']):
    # Let requests build and escape the query string rather than formatting
    # the URL by hand.
    response = requests.get(
        EXPLORE_URL,
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, long),
            'radius': radius,
            'limit': LIMIT,
        },
        timeout=30,
    )
    # Stop on HTTP errors (bad credentials, quota exceeded, ...) instead of
    # failing later with a KeyError on the JSON payload.
    response.raise_for_status()
    results = response.json()["response"]['groups'][0]['items']

    for venue in results:
        # NOTE(review): assumes every venue has at least one category; an
        # empty 'categories' list would raise IndexError here.
        venues.append((
            post,
            borough,
            neighborhood,
            lat,
            long,
            venue['venue']['name'],
            venue['venue']['location']['lat'],
            venue['venue']['location']['lng'],
            venue['venue']['categories'][0]['name']))

In [44]:
# Passing the column names to the constructor (instead of assigning .columns
# afterwards) also works when `venues` is empty; the old form raised a
# length-mismatch error in that case.
venue_columns = ['PostCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']
venues_df = pd.DataFrame(venues, columns=venue_columns)
print(venues_df.shape)
venues_df.head()

(1758, 9)


Unnamed: 0,PostCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M4E,East Toronto,The Beaches,43.676531,-79.295425,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,East Toronto,The Beaches,43.676531,-79.295425,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,M4E,East Toronto,The Beaches,43.676531,-79.295425,Grover Pub and Grub,43.679181,-79.297215,Pub
3,M4E,East Toronto,The Beaches,43.676531,-79.295425,Upper Beaches,43.680563,-79.292869,Neighborhood
4,M4K,East Toronto,"The Danforth West,Riverdale",43.683178,-79.355105,Dollarama,43.686197,-79.355989,Discount Store


In [46]:
venues_df.groupby(['PostCode', 'Borough', 'Neighborhood'])['VenueName'].count()


PostCode  Borough           Neighborhood                                                                                        
M4E       East Toronto      The Beaches                                                                                               4
M4K       East Toronto      The Danforth West,Riverdale                                                                               4
M4L       East Toronto      The Beaches West,India Bazaar                                                                            20
M4M       East Toronto      Studio District                                                                                          48
M4N       Central Toronto   Lawrence Park                                                                                             2
M4P       Central Toronto   Davisville North                                                                                          7
M4R       Central Toronto   North Toronto West         

In [47]:
len(venues_df['VenueCategory'].unique())

216

##### Analyze venues in each area

##### one hot encoding

In [49]:
# One indicator column per venue category (no prefix, so the column name is the category).
category_dummies = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# Identifier columns for each venue row. The neighborhood column is stored
# as 'Neighborhoods' in the one-hot table.
area_labels = venues_df[['PostCode', 'Borough', 'Neighborhood']].rename(
    columns={'Neighborhood': 'Neighborhoods'}
)

# Identifiers first, followed by all category indicators.
fixed_columns = list(area_labels.columns) + list(category_dummies.columns)
toronto_central_onehot = pd.concat([area_labels, category_dummies], axis=1)

print(toronto_central_onehot.shape)
toronto_central_onehot.head()

(1758, 219)


Unnamed: 0,PostCode,Borough,Neighborhoods,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Trail,Train Station,Tram Station,Tunnel,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4K,East Toronto,"The Danforth West,Riverdale",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0



##### Get the frequency of occurrence of each category in an area

In [73]:
# Average the indicator columns within each area: every value becomes the
# share of that area's venues that fall in the category.
area_keys = ['PostCode', 'Borough', 'Neighborhoods']
toronto_central_venues_freq = toronto_central_onehot.groupby(area_keys, as_index=False).mean()
print(toronto_central_venues_freq.shape)
toronto_central_venues_freq.head()

(37, 219)


Unnamed: 0,PostCode,Borough,Neighborhoods,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Trail,Train Station,Tram Station,Tunnel,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,M4E,East Toronto,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,East Toronto,"The Danforth West,Riverdale",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M4L,East Toronto,"The Beaches West,India Bazaar",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,East Toronto,Studio District,0.0,0.041667,0.020833,0.0,0.0,0.041667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.020833,0.0,0.0,0.0
4,M4N,Central Toronto,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### Get the 10 most common venue types in each area

In [74]:

num_top_venues = 10

indicators = ['st', 'nd', 'rd']


def _ordinal(position):
    """Return ``position`` with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st ...)."""
    # 11, 12 and 13 take 'th' even though their last digit is 1, 2 or 3.
    if position % 100 in (11, 12, 13) or not 1 <= position % 10 <= len(indicators):
        return '{}th'.format(position)
    return '{}{}'.format(position, indicators[position % 10 - 1])


# create columns according to number of top venues
areaColumns = ['PostCode', 'Borough', 'Neighborhoods']
freqColumns = ['{} Most Common Venue'.format(_ordinal(rank)) for rank in range(1, num_top_venues + 1)]
columns = areaColumns + freqColumns

# For every area, rank the category columns (everything after the three
# identifier columns) by frequency and keep the names of the highest ones.
top_rows = []
for ind in range(toronto_central_venues_freq.shape[0]):
    row_categories = toronto_central_venues_freq.iloc[ind, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    top_rows.append(row_categories_sorted.index.values[0:num_top_venues])

# Build the result in one step: identifier columns next to the ranked names.
neighborhoods_venues_sorted = pd.concat(
    [
        toronto_central_venues_freq[areaColumns],
        pd.DataFrame(top_rows, columns=freqColumns, index=toronto_central_venues_freq.index),
    ],
    axis=1,
)

# Reassign instead of sorting in place so re-running the cell stays predictable.
neighborhoods_venues_sorted = neighborhoods_venues_sorted.sort_values(freqColumns)
neighborhoods_venues_sorted.head()

Unnamed: 0,PostCode,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
31,M6J,West Toronto,"Little Portugal,Trinity",Bar,Restaurant,Asian Restaurant,Cocktail Bar,Coffee Shop,Bakery,Wine Bar,Vietnamese Restaurant,French Restaurant,New American Restaurant
1,M4K,East Toronto,"The Danforth West,Riverdale",Bus Line,Park,Grocery Store,Discount Store,Yoga Studio,Elementary School,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant
4,M4N,Central Toronto,Lawrence Park,Bus Line,Swim School,Yoga Studio,Food & Drink Shop,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
35,M6S,West Toronto,"Runnymede,Swansea",Café,Coffee Shop,Bakery,Pizza Place,Italian Restaurant,Flower Shop,Restaurant,French Restaurant,Bookstore,Pilates Studio
24,M5S,Downtown Toronto,"Harbord,University of Toronto",Café,Coffee Shop,Restaurant,Bakery,Bar,Italian Restaurant,Japanese Restaurant,Bookstore,Gym,Pizza Place



##### Clustering areas
##### Using the KMeans algorithm to cluster the central Toronto areas into 3 clusters

In [None]:
kclusters = 3

# Feature matrix: the per-area category frequencies without the identifier
# columns. `columns=` replaces the positional axis argument, which pandas 2.x
# no longer accepts.
toronto_central_venues_freq_clustering = toronto_central_venues_freq.drop(
    columns=['PostCode', 'Borough', 'Neighborhoods']
)

# n_init is set explicitly so the number of restarts does not depend on the
# installed scikit-learn version's default.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_central_venues_freq_clustering)

# The labels follow the row order of toronto_central_venues_freq, which has
# fewer rows than toronto_central_df when an area returned no venues. Pair
# each label with its postal code and merge on that key instead of assigning
# the raw label array by position.
cluster_labels = pd.DataFrame({
    'PostCode': toronto_central_venues_freq['PostCode'].values,
    'Cluster': kmeans.labels_,
})

# The merges return new frames, so toronto_central_df itself is left
# untouched. The inner merge drops areas that have no cluster label.
toronto_central_clustered_df = (
    toronto_central_df
    .merge(cluster_labels, on='PostCode', how='inner')
    .merge(
        neighborhoods_venues_sorted.drop(columns=['Borough', 'Neighborhoods']),
        on='PostCode',
        how='left',
    )
    .sort_values(['Cluster'] + freqColumns)
)
toronto_central_clustered_df.head()