## Part 1: Pull Data From URL and Make Your Dataframe

In [54]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab *already downloaded
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    branca-0.4.0               |             py_0          26 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         713 KB

The following NEW packages will be INSTALLED:

    altair:  4.1.0-py_1 conda-forge
    branca:  0.4.0-py_0 conda-forge
    folium:  0.5.0-py_0 conda-forge
    vincent: 0.4.4-py_1 conda-forge


Down

In [55]:
url='https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050'

df = pd.read_html(url)

df_postcodes=df[0]

print('imported dataframe has', df_postcodes['Postcode'].count(), "postcodes entriess")

df_postcodes.head(10)
#df_postcodes.dtypes

imported dataframe has 287 postcodes entriess


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,Islington Avenue


Clean the data - Get rid of N/A Boroughs and for any N/A Nieghbhorhoods, add the Borough name to Nieghborhoods

In [56]:
#df_postcodes["Borough"].replace("nan", "Not assigned", inplace=True)
#df_postcodes["Neighbourhood"].replace("nan", "Not assigned", inplace=True)


df_postcodes = df_postcodes[df_postcodes["Borough"] != "Not assigned"]
#df_postcodes.dropna(subset=["Borough"], axis=0, inplace=True)

#Replace the N/A values of Neighborhood with the Borough Value
df_postcodes["Neighbourhood"].replace("Not assigned", df_postcodes["Borough"], inplace=True)

df_postcodes.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [57]:
#Now combine all the neighbourhoods with the same postal codes
df = df_postcodes.groupby(["Postcode","Borough"])["Neighbourhood"].apply(list)
df = df.sample(frac=1).reset_index()
df["Neighbourhood"] = df["Neighbourhood"].str.join(', ')
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1P,Scarborough,"Dorset Park, Scarborough Town Centre, Wexford ..."
1,M3C,North York,"Flemingdon Park, Don Mills South"
2,M8W,Etobicoke,"Alderwood, Long Branch"
3,M9N,York,Weston
4,M4P,Central Toronto,Davisville North
5,M1G,Scarborough,Woburn
6,M5M,North York,"Bedford Park, Lawrence Manor East"
7,M6M,York,"Del Ray, Keelesdale, Mount Dennis, Silverthorn"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M5P,Central Toronto,"Forest Hill North, Forest Hill West"


In [58]:
df.shape

(103, 3)

## Part 2: Get Latitudes and Longitudes

In [71]:
#!pip install geocoder
#import geocoder

lat_list=[]
lng_list=[]
for i in range(df.shape[0]):
    address='{}, Toronto, Ontario'.format(df.at[i,'Postcode'])
    g=geocoder.arcgis(address)
    lat_list.append(g.latlng[0])
    lng_list.append(g.latlng[1])
    

#df.drop(['Postal Code_x', 'Latitude_x', 'Longitude_x','Postal Code_y'],axis=1, inplace=True)
df.drop(['Postal Code_y'], axis=1, inplace=True)
df.rename(columns= {'Latitude_y':'Latitude', 'Longitude_y':'Longitude'}, inplace=True)

df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1P,Scarborough,"Dorset Park, Scarborough Town Centre, Wexford ...",43.75741,-79.273304
1,M3C,North York,"Flemingdon Park, Don Mills South",43.7259,-79.340923
2,M8W,Etobicoke,"Alderwood, Long Branch",43.602414,-79.543484
3,M9N,York,Weston,43.706876,-79.518188
4,M4P,Central Toronto,Davisville North,43.712751,-79.390197
5,M1G,Scarborough,Woburn,43.770992,-79.216917
6,M5M,North York,"Bedford Park, Lawrence Manor East",43.733283,-79.41975
7,M6M,York,"Del Ray, Keelesdale, Mount Dennis, Silverthorn",43.691116,-79.476013
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M5P,Central Toronto,"Forest Hill North, Forest Hill West",43.696948,-79.411307


## Part 3: Clustering with K-Means

Get the latitude and latitude for Toronto

In [73]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


Create map of Toronto with the neighbourhoods imposed on it

In [79]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label ='{}, {}'.format(neighbourhood, borough) 
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Simply the above map and segment and cluster the neighbhourhoods in Scarborough since there seems to be quite a few neighbhourhoods in that borough

In [81]:
Scarborough_data = df[df['Borough'] == 'Scarborough'].reset_index(drop=True)
Scarborough_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1P,Scarborough,"Dorset Park, Scarborough Town Centre, Wexford ...",43.75741,-79.273304
1,M1G,Scarborough,Woburn,43.770992,-79.216917
2,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
3,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
4,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353


The geographical coordinates of Scarborough

In [83]:
map_Scarborough = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(Scarborough_data['Latitude'], Scarborough_data['Longitude'], Scarborough_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Scarborough)  
    
map_Scarborough

Define Foursquare Creditentials and Version

In [84]:
CLIENT_ID = '5SZD033DFFP1BYV0EJNAO10QIFGX1T2MCSY2JMTQOP5BAYAH'
CLIENT_SECRET = 'WPMLWHCCGOSDK0GXBULP30WM0XEKVC5Y5NQYFZBBU0UA3YLJ'
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 5SZD033DFFP1BYV0EJNAO10QIFGX1T2MCSY2JMTQOP5BAYAH
CLIENT_SECRET:WPMLWHCCGOSDK0GXBULP30WM0XEKVC5Y5NQYFZBBU0UA3YLJ
