In [2]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
import json
import requests
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

In [7]:
from bs4 import BeautifulSoup

In [3]:
# GET request to scrape data from wikipedia
data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [4]:
# create the empty lists to store info from wikipedia
postalcodes = []
boroughs = []
neighborhoods = []

In [8]:
# Create beautifulsoup object to parse HTML data
soup = BeautifulSoup(data, 'html.parser')

In [9]:
# ask Soup to find the table
soup.find('table').find_all('tr')

# for each row of the table, find all the table data
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')

In [10]:
# add the data in the table to our empty lists
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        postalcodes.append(cells[0].text.rstrip('\n'))
        boroughs.append(cells[1].text.rstrip('\n'))
        neighborhoods.append(cells[2].text.rstrip('\n'))

In [11]:
# create a new DataFrame from the three lists
TorontoDF = pd.DataFrame({"PostalCode": postalcodes,
                           "Borough": boroughs,
                           "Neighborhood": neighborhoods})

TorontoDF.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [12]:
# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
# In order to satisfy the above requirement, I will use the drop command from pandas to drop any value with a borough that is not assigned

TorontoDF = TorontoDF[TorontoDF.Borough != "Not assigned"].reset_index(drop=True)
TorontoDF.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [13]:
# For any neighborhood that has the value 'Not assigned' replace that value with the Borough instead
for index, row in TorontoDF.iterrows():
    if row["Neighborhood"] == "Not assigned":
        row["Neighborhood"] = row["Borough"]
        
TorontoDF.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [14]:
#Get coordinate data of postal codes
latlong = pd.read_csv("http://cocl.us/Geospatial_data")
latlong.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
# Change name of column to match Toronto_df
latlong.rename(columns={"Postal Code": "PostalCode"}, inplace=True)
latlong.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [16]:
# merge two table on the column "PostalCode"
TorontoDF = TorontoDF.merge(latlong, on="PostalCode", how="left")
TorontoDF.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [17]:
# Load my foursquare credentials to use the API

CLIENT_ID = '31OIH0QK2KJZDWMLAEUPETODOP05CK1Q22050KU3OXPU0QXT'
CLIENT_SECRET = 'CXTSYGJMIBJIKREJZ40Y00SFCISR3R0DWG20CHQGEECBLQFM'
VERSION = '20180605'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 31OIH0QK2KJZDWMLAEUPETODOP05CK1Q22050KU3OXPU0QXT
CLIENT_SECRET:CXTSYGJMIBJIKREJZ40Y00SFCISR3R0DWG20CHQGEECBLQFM


In [18]:
#create a list of venues within 500 meters of each neighborhood, max of 100 venues

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            500, 
            100)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venues in venues for item in venues])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [19]:
# Get venues into a dataframe

venuesDF = getNearbyVenues(names=TorontoDF['Neighborhood'],
                                   latitudes=TorontoDF['Latitude'],
                                   longitudes=TorontoDF['Longitude']
                                  )

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue, Humber Valley Village
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto, Broadview North (Old East York)
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmo

In [24]:
# take a look
venuesDF.head(10)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
5,Victoria Village,43.725882,-79.315572,Eglinton Ave E & Sloane Ave/Bermondsey Rd,43.726086,-79.31362,Intersection
6,Victoria Village,43.725882,-79.315572,Pizza Nova,43.725824,-79.31286,Pizza Place
7,Victoria Village,43.725882,-79.315572,Cash Money,43.725486,-79.312665,Financial or Legal Service
8,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
9,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop


In [31]:
#check total number of venues in each neighborhood
Total = venuesDF["Neighborhood"].value_counts().to_frame()

In [32]:
Total.head()

Unnamed: 0,Neighborhood
"Toronto Dominion Centre, Design Exchange",100
"Harbourfront East, Union Station, Toronto Islands",100
"Garden District, Ryerson",100
"Commerce Court, Victoria Hotel",100
"Richmond, Adelaide, King",100


In [46]:
Total = Total.reset_index()

In [33]:
venuesDF.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Total
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park,
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop,
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena,
3,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop,
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant,


In [50]:
Total = Total.rename(columns={"Neighborhood":"Total"})
Total.head()

Unnamed: 0,index,Total
0,"Toronto Dominion Centre, Design Exchange",100
1,"Harbourfront East, Union Station, Toronto Islands",100
2,"Garden District, Ryerson",100
3,"Commerce Court, Victoria Hotel",100
4,"Richmond, Adelaide, King",100


In [51]:
Total = Total.rename(columns={"index":"Neighborhood"})
Total.head()

Unnamed: 0,Neighborhood,Total
0,"Toronto Dominion Centre, Design Exchange",100
1,"Harbourfront East, Union Station, Toronto Islands",100
2,"Garden District, Ryerson",100
3,"Commerce Court, Victoria Hotel",100
4,"Richmond, Adelaide, King",100


In [34]:
venuesDF.drop(['Total'], axis=1)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
5,Victoria Village,43.725882,-79.315572,Eglinton Ave E & Sloane Ave/Bermondsey Rd,43.726086,-79.31362,Intersection
6,Victoria Village,43.725882,-79.315572,Pizza Nova,43.725824,-79.31286,Pizza Place
7,Victoria Village,43.725882,-79.315572,Cash Money,43.725486,-79.312665,Financial or Legal Service
8,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
9,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop


In [39]:
venuesDF = venuesDF.drop(['Neighborhood Latitude','Neighborhood Longitude','Total','Venue','Venue Latitude', 'Venue Longitude'], axis=1)

In [52]:
venuesDF.head(10)

Unnamed: 0,Neighborhood,Venue Category
0,Parkwoods,Park
1,Parkwoods,Food & Drink Shop
2,Victoria Village,Hockey Arena
3,Victoria Village,Coffee Shop
4,Victoria Village,Portuguese Restaurant
5,Victoria Village,Intersection
6,Victoria Village,Pizza Place
7,Victoria Village,Financial or Legal Service
8,"Regent Park, Harbourfront",Bakery
9,"Regent Park, Harbourfront",Coffee Shop


In [42]:
venuesDF["Venue Category"].value_counts()

Coffee Shop                        186
Café                                98
Restaurant                          66
Pizza Place                         52
Park                                52
Sandwich Place                      44
Hotel                               44
Italian Restaurant                  41
Japanese Restaurant                 39
Bakery                              39
Gym                                 37
Bar                                 34
Clothing Store                      33
Sushi Restaurant                    29
Pub                                 28
American Restaurant                 27
Grocery Store                       26
Fast Food Restaurant                25
Bank                                24
Seafood Restaurant                  23
Gastropub                           22
Breakfast Spot                      22
Thai Restaurant                     21
Pharmacy                            20
Beer Bar                            19
Ice Cream Shop           