In [170]:
#Import Beautiful Soup, lxml, requests to scrap data from Toronto Neighborhood in Wikipedia
!pip install wikipedia
from bs4 import BeautifulSoup
import wikipedia as wp
import numpy as np 

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print("Libraries Imported!")

Libraries Imported!


# Get Table from Wikipedia 

In [171]:
html = wp.page("List of postal codes of Canada: M").html().encode("UTF-8")
df = pd.read_html(html)[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [172]:
#delete Borough which is not assigned
df = df[df['Borough']!= 'Not assigned']
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [173]:
df['Borough'].unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [174]:
df['Neighborhood'].unique()

array(['Parkwoods', 'Victoria Village', 'Regent Park, Harbourfront',
       'Lawrence Manor, Lawrence Heights',
       "Queen's Park, Ontario Provincial Government",
       'Islington Avenue, Humber Valley Village', 'Malvern, Rouge',
       'Don Mills', 'Parkview Hill, Woodbine Gardens',
       'Garden District, Ryerson', 'Glencairn',
       'West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale',
       'Rouge Hill, Port Union, Highland Creek', 'Woodbine Heights',
       'St. James Town', 'Humewood-Cedarvale',
       'Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood',
       'Guildwood, Morningside, West Hill', 'The Beaches', 'Berczy Park',
       'Caledonia-Fairbanks', 'Woburn', 'Leaside', 'Central Bay Street',
       'Christie', 'Cedarbrae', 'Hillcrest Village',
       'Bathurst Manor, Wilson Heights, Downsview North',
       'Thorncliffe Park', 'Richmond, Adelaide, King',
       'Dufferin, Dovercourt Village', 'Scarborough Village',
       'Fairview, H

In [175]:
#combine neighbourhoods where postcode & Borough are the same
df = df.groupby(['Postal Code','Borough'])['Neighborhood'].apply(lambda x:', '.join(x)).to_frame().reset_index()

In [176]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [177]:
for index, row in df.iterrows():
    if row['Neighborhood'] == 'Not assigned':
        row['Neighborhood'] == df['Borough']
    

In [178]:
colnames = ['Postcode','Borough','Neighborhood']
df.columns = colnames

In [179]:
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Add geographical coordinates to the neighborhoods

In [180]:
import pandas as pd
import io
import requests

url="https://cocl.us/Geospatial_data"
s=requests.get(url).content #bytes
lat_lng=pd.read_csv(io.StringIO(s.decode('utf-8'))) #s.decode: change bytes to string to _io.StringIO
lat_lng.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [181]:
#merge lat_lng to df
lat_lng = lat_lng.rename(columns={'Postal Code':'Postcode'})
toronto_df = pd.merge(df, lat_lng, on='Postcode')
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [182]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
       len(toronto_df['Borough'].unique()),
       toronto_df.shape[0]))  # shape check the dimension of df(n,m)

The dataframe has 10 boroughs and 103 neighborhoods.


# Scrap the distribution of population from wilipedia

In [183]:
html = wp.page('Demographics of Toronto').html().encode('UTF-8')

In [184]:
demo_df = pd.read_html(html)
demo_df = pd.DataFrame(demo_df)[0]

In [185]:
#TORONTO & EAST YORK
TE  = demo_df[12]
TE

Unnamed: 0,Riding,Population,Ethnic Origin #1,%,Ethnic Origin #2,%.1,Ethnic Origin #3,%.2,Ethnic Origin #4,%.3,Ethnic Origin #5,%.4,Ethnic Origin #6,%.5,Ethnic Origin #7,%.6,Ethnic Origin #8,%.7,Ethnic Origin #9,%.8
0,Spadina-Fort York,114315,English,16.4,Chinese,16.0,Irish,14.6,Canadian,14.0,Scottish,13.2,French,7.7,German,7.6,,,,
1,Beaches-East York,108435,English,24.2,Irish,19.9,Canadian,19.7,Scottish,18.9,French,8.7,German,8.4,,,,,,
2,Davenport,107395,Portuguese,22.7,English,13.6,Canadian,12.8,Irish,11.5,Italian,11.1,Scottish,11.0,,,,,,
3,Parkdale-High Park,106445,English,22.3,Irish,20.0,Scottish,18.7,Canadian,16.1,German,9.8,French,8.88,Polish,8.5,,,,
4,Toronto-Danforth,105395,English,22.9,Irish,19.5,Scottish,18.7,Canadian,18.4,Chinese,13.8,French,8.86,German,8.8,Greek,7.3,,
5,Toronto-St. Paul's,104940,English,18.5,Canadian,16.1,Irish,15.2,Scottish,14.8,Polish,10.3,German,7.9,Russian,7.7,Italian,7.3,French,7.2
6,University-Rosedale,100520,English,20.6,Irish,16.6,Scottish,16.3,Canadian,15.2,Chinese,14.7,German,8.7,French,7.7,Italian,7.4,,
7,Toronto Centre,99590,English,15.7,Canadian,13.7,Irish,13.4,Scottish,12.6,Chinese,12.5,French,7.2,,,,,,


In [186]:
#NORTH YORK
NY = demo_df[13]

In [187]:
#SCARBOROUGH
S = demo_df[14]

In [188]:
#ETOBICOKE & YORK
EY = demo_df[15]

In [189]:
toronto_df['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke'], dtype=object)

# Get location data using Foursquare

In [190]:
#Get geographical coordinate of Toronto
address = 'Toronto'

geolocator = Nominatim(user_agent = 'ny_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))


The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [191]:
#Connect Foursquare
CLIENT_ID = 'JIWNG1XSMPPPJ0HOWZQRGS2JPXEK4SLLHWJKQSEVROTFQRTE' # your Foursquare ID
CLIENT_SECRET = 'F143M2VMJGAH2GAD1WAVSR22CEMUYQATH5UET0PJSCHQWABZ' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 100
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)
radius = 500
#print(search_query + ' .... OK!')

url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, radius, LIMIT)
url

Your credentails:
CLIENT_ID: JIWNG1XSMPPPJ0HOWZQRGS2JPXEK4SLLHWJKQSEVROTFQRTE
CLIENT_SECRET:F143M2VMJGAH2GAD1WAVSR22CEMUYQATH5UET0PJSCHQWABZ


'https://api.foursquare.com/v2/venues/search?client_id=JIWNG1XSMPPPJ0HOWZQRGS2JPXEK4SLLHWJKQSEVROTFQRTE&client_secret=F143M2VMJGAH2GAD1WAVSR22CEMUYQATH5UET0PJSCHQWABZ&ll=43.6534817,-79.3839347&v=20180604&radius=500&limit=100'

In [192]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [193]:
#Send the GET request and examine the results
results = requests.get(url).json()
results

# assign relevant part of JSON to venues
venues = results['response']['venues']

# tranform venues into a dataframe
dataframe = json_normalize(venues)
dataframe.head()

Unnamed: 0,categories,hasPerk,id,location.address,location.cc,location.city,location.country,location.crossStreet,location.distance,location.formattedAddress,location.labeledLatLngs,location.lat,location.lng,location.postalCode,location.state,name,referralId
0,"[{'id': '4bf58dd8d48988d129941735', 'name': 'C...",False,4c093ee0340720a153728493,,CA,,Canada,,184,[Canada],"[{'label': 'display', 'lat': 43.65182710471462...",43.651827,-79.383949,,,City Hall Council Chambers,v-1591253247
1,"[{'id': '4bf58dd8d48988d129941735', 'name': 'C...",False,4ad4c05ef964a5208ff620e3,100 Queen St. W.,CA,Toronto,Canada,at Bay St.,38,"[100 Queen St. W. (at Bay St.), Toronto ON M5H...","[{'label': 'display', 'lat': 43.65313989695342...",43.65314,-79.383967,M5H 2N2,ON,Toronto City Hall,v-1591253247
2,"[{'id': '4bf58dd8d48988d129941735', 'name': 'C...",False,5b193c42598e64002ca79b96,100 Queen St W,CA,Toronto,Canada,,3,"[100 Queen St W, Toronto ON M5H 2N2, Canada]","[{'label': 'display', 'lat': 43.653454, 'lng':...",43.653454,-79.383952,M5H 2N2,ON,City of Toronto Civic Innovation Office,v-1591253247
3,"[{'id': '4bf58dd8d48988d15a941735', 'name': 'G...",False,4c0121fd9a950f47fa9208c6,100 Queen Street West,CA,Toronto,Canada,,6,"[100 Queen Street West, Toronto ON, Canada]","[{'label': 'display', 'lat': 43.65350358617817...",43.653504,-79.383866,,ON,City Hall Podium Green Roof,v-1591253247
4,"[{'id': '4bf58dd8d48988d162941735', 'name': 'O...",False,4fa43f81e4b098f42a5681a1,Nathan Phillips Square,CA,Toronto,Canada,,95,"[Nathan Phillips Square, Toronto ON, Canada]","[{'label': 'display', 'lat': 43.652622, 'lng':...",43.652622,-79.383923,,ON,the Archer / Three-Way Piece No. 2,v-1591253247


In [194]:
# keep only columns that include venue name, and anything that is associated with location
filtered_columns = ['name', 'categories'] + [col for col in dataframe.columns if col.startswith('location.')] + ['id']
dataframe_filtered = dataframe.loc[:, filtered_columns]

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# filter the category for each row
dataframe_filtered['categories'] = dataframe_filtered.apply(get_category_type, axis=1)

# clean column names by keeping only last term
dataframe_filtered.columns = [column.split('.')[-1] for column in dataframe_filtered.columns]

dataframe_filtered.head()

Unnamed: 0,name,categories,address,cc,city,country,crossStreet,distance,formattedAddress,labeledLatLngs,lat,lng,postalCode,state,id
0,City Hall Council Chambers,City Hall,,CA,,Canada,,184,[Canada],"[{'label': 'display', 'lat': 43.65182710471462...",43.651827,-79.383949,,,4c093ee0340720a153728493
1,Toronto City Hall,City Hall,100 Queen St. W.,CA,Toronto,Canada,at Bay St.,38,"[100 Queen St. W. (at Bay St.), Toronto ON M5H...","[{'label': 'display', 'lat': 43.65313989695342...",43.65314,-79.383967,M5H 2N2,ON,4ad4c05ef964a5208ff620e3
2,City of Toronto Civic Innovation Office,City Hall,100 Queen St W,CA,Toronto,Canada,,3,"[100 Queen St W, Toronto ON M5H 2N2, Canada]","[{'label': 'display', 'lat': 43.653454, 'lng':...",43.653454,-79.383952,M5H 2N2,ON,5b193c42598e64002ca79b96
3,City Hall Podium Green Roof,Garden,100 Queen Street West,CA,Toronto,Canada,,6,"[100 Queen Street West, Toronto ON, Canada]","[{'label': 'display', 'lat': 43.65350358617817...",43.653504,-79.383866,,ON,4c0121fd9a950f47fa9208c6
4,the Archer / Three-Way Piece No. 2,Other Great Outdoors,Nathan Phillips Square,CA,Toronto,Canada,,95,"[Nathan Phillips Square, Toronto ON, Canada]","[{'label': 'display', 'lat': 43.652622, 'lng':...",43.652622,-79.383923,,ON,4fa43f81e4b098f42a5681a1


In [195]:
#Create function to know how many venues there are in Toronto

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [196]:
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [None]:
#List of Neighborhood that have venues in Toronto
toronto_venues = getNearbyVenues(names=toronto_df['Neighborhood'],
                                   latitudes=toronto_df['Latitude'],
                                   longitudes=toronto_df['Longitude']
                                  )

In [None]:
print('{} venues were returned by Foursquare.'.format(toronto_venues.shape[0]))
toronto_venues.head()

# Folium Library and Leaflet Map

In [None]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], 
                                           toronto_df['Borough'], toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

# Relationship between neighborhood and Chinese Restaurant

In [None]:
toronto_df.head()

In [None]:
toronto_venues.head()

In [None]:
#Number of venues per neighborhood
toronto_venues.groupby('Neighborhood').count()
x = toronto_venues.groupby('Neighborhood').count()
x.index.name = 'Neighborhood'
x.reset_index(inplace = True)
x.head()

In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
manhattan_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

In [None]:
# add neighborhood column back to dataframe
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

In [None]:
toronto_chinese = toronto_grouped[['Neighborhood','Chinese Restaurant']]
toronto_chinese.head()

In [None]:
toronto_merged = pd.merge(toronto_df, toronto_chinese, on='Neighborhood')

In [None]:
toronto_merged

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(18,6))
ax = sns.violinplot(x="Borough", y="Chinese Restaurant", data=toronto_merged)
plt.show()

In [None]:
plt.figure(figsize = (18,6))
ax2 = plt.bar(x = 'Borough', height = 'Chinese Restaurant', data = toronto_merged)
plt.show()

In [None]:
#With the help of this bar plots we can identify the boroughs with densely populated Chinese restaurants. 
#It is drawn using seaborn library to show the distribution of Chinese restaurants in different boroughs.
##Let’s also visualize the neighborhood with Chinese Restaurants:

In [None]:
neighbor_form = toronto_merged[toronto_merged['Chinese Restaurant']>0]
plt.figure(figsize = (22,6))
ax2 = plt.bar(x = 'Neighborhood', height = 'Chinese Restaurant', data = neighbor_form)
plt.xticks(rotation=90)
plt.show()

# Relationship between Neighborhood and Chinese Population

In [None]:
df = TE.append([NY,S,EY], ignore_index=True)

In [None]:
df

In [None]:
df_now = df.iloc[[0,4,6,7,8,10,13,14,15,16,17,19,20],:]
df_now

In [None]:
Chinese_population = {'Chinese_Population':[18290.4,14776.44,14544.51,12448.75,
                                            30407.895,35335.44,11400.48,8292.13,
                                           11818.15,7797.24,48985.75,7179.165,45486.26],'Riding': df_now['Riding']}

In [None]:
df_Chinese_population = pd.DataFrame(Chinese_population)

In [None]:
df_Chinese_population

In [None]:
plt.figure(figsize = (22,6))
ax2 = plt.bar(x = 'Riding', height = 'Chinese_Population', data = df_Chinese_population)
plt.xticks(rotation=90)
plt.show()

In [None]:
#This analysis & visualization of the relationship between neighborhoods & Chinese population present in those neighborhoods helps us in identifying the highly populated Chinese neighborhoods. 
#Once we identify those neighborhoods it helps us in deciding where to place the new Chinese restaurant. 
#Chinese restaurant placed in an densely populated Chinese neighborhood is more likely to get more Chinese customers than a restaurant placed in a neighborhood with less or no Chiense population. 
#Thus this analysis helps in the determining the success of the new Chinese restaurant.

# Predictive Modeling - KMeans

In [None]:
from sklearn.cluster import KMeans

toronto_part_clustering = toronto_merged.drop(['Neighborhood', 'Postcode', 'Borough'],1)

toronto_part_clustering.head()

In [None]:
KM = KMeans(n_clusters = 2, max_iter=10)
KM.fit(toronto_part_clustering)

In [None]:
error_cost = []
for i in range(3,11):
    KM = KMeans(n_clusters = i, max_iter = 100)
    try:
        KM.fit(toronto_part_clustering)
    except ValueError:
        print("error on line",i)
    
    error_cost.append(KM.inertia_/100)
    


In [None]:
#plot
plt.plot(range(3,11), error_cost, color='r', linewidth='3')
plt.xlabel('K values')
plt.ylabel('Squared Error(cost)')
plt.grid(color='white', linestyle='-', linewidth=2)
plt.show()

In [None]:
#Instantiate the clustering model and visualizer
!pip install yellowbrick
from yellowbrick.cluster import KElbowVisualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(4,13))

#fit data to the visualizer
visualizer.fit(toronto_part_clustering)
visualizer.show()

In [None]:
#After analysing using elbow method using distortion score & Squared error for each K value, looks like K = 7 is the best value.
#Clustering the Toronto Neighborhood Using K-Means with K =7

In [None]:
kclusters = 7

kmeans = KMeans(n_clusters = kclusters, random_state=0).fit(toronto_part_clustering)

kmeans.labels_

In [None]:
#Add group label to each row
toronto_merged.insert(0, 'Cluster Labels', kmeans.labels_)

# Exam Cluster

In [None]:
#cluster 0 
toronto_merged.loc[toronto_merged['Cluster Labels']==0]

In [None]:
#Cluster 1
toronto_merged.loc[toronto_merged['Cluster Labels']==1]

In [None]:
#Cluster 2
toronto_merged.loc[toronto_merged['Cluster Labels']==2]

In [None]:
#Cluster 3
toronto_merged.loc[toronto_merged['Cluster Labels']==3]

In [None]:
#Cluster 4
toronto_merged.loc[toronto_merged['Cluster Labels']==4]

In [None]:
#Cluster 5
toronto_merged.loc[toronto_merged['Cluster Labels']==5]

In [None]:
#Cluster 6
toronto_merged.loc[toronto_merged['Cluster Labels']==6]

In [None]:
#Conclusion: Cluster 4 contains all the neighborhoods which is densely populated with Chinese restaurants. 