## Data Preprocessing for Toronto 

In [1]:
import json
import os

import numpy as np
import pandas as pd
import requests

!pip install geopy
from geopy.geocoders import Nominatim

!pip install beautifulsoup4
from bs4 import BeautifulSoup

!pip install folium
import folium



### Obtaining the Borough and Neighborhood details from the Wikipedia page

In [2]:
# Download the Wikipedia page that lists the "M" postal codes.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
source = response.text
soup = BeautifulSoup(source, 'lxml')
table = soup.find('table', class_='wikitable sortable')
if table is None:
    # soup.find() returns None when nothing matches; report that explicitly
    raise ValueError("No table with class 'wikitable sortable' was found on the page")
headerlist = ['Postcode', 'Borough', 'Neighborhood']

# Build CSV text: one line per table row, cells separated by commas.
# Cell text is stripped so the row separator no longer depends on the last
# cell happening to end with a newline. Rows without any <td> cell contribute
# nothing, exactly as before.
csv_lines = []
for tr in table.find_all("tr"):
    cells = [td.text.strip() for td in tr.find_all("td")]
    if cells:
        csv_lines.append(','.join(cells))
dftable = ''.join(line + '\n' for line in csv_lines)
print(dftable)

M1A,Not assigned,Not assigned
M2A,Not assigned,Not assigned
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,Harbourfront
M6A,North York,Lawrence Heights
M6A,North York,Lawrence Manor
M7A,Downtown Toronto,Queen's Park
M8A,Not assigned,Not assigned
M9A,Etobicoke,Islington Avenue
M1B,Scarborough,Rouge
M1B,Scarborough,Malvern
M2B,Not assigned,Not assigned
M3B,North York,Don Mills North
M4B,East York,Woodbine Gardens
M4B,East York,Parkview Hill
M5B,Downtown Toronto,Ryerson
M5B,Downtown Toronto,Garden District
M6B,North York,Glencairn
M7B,Not assigned,Not assigned
M8B,Not assigned,Not assigned
M9B,Etobicoke,Cloverdale
M9B,Etobicoke,Islington
M9B,Etobicoke,Martin Grove
M9B,Etobicoke,Princess Gardens
M9B,Etobicoke,West Deane Park
M1C,Scarborough,Highland Creek
M1C,Scarborough,Rouge Hill
M1C,Scarborough,Port Union
M2C,Not assigned,Not assigned
M3C,North York,Flemingdon Park
M3C,North York,Don Mills South
M4C,East York,Woodbine Heights
M5C,Downtown Toronto,St. James

### Loading the output into a DataFrame with column headers

In [3]:
# Persist the scraped CSV text, then load it back as a DataFrame.
# The context manager closes (and therefore flushes) the file before pandas
# reads it, so read_csv never sees a partially written file.
with open("postcode.csv", "wb") as file:
    file.write(bytes(dftable, encoding="ascii", errors="ignore"))
T_df = pd.read_csv('postcode.csv', header=None)
T_df.columns = headerlist

# Remove the records whose Borough is "Not assigned"
indexnames = T_df[T_df["Borough"] == "Not assigned"].index
T_df.drop(indexnames, inplace=True)

# Replace "Not assigned" Neighborhoods with the Borough name.
# The comparison must use the same spelling as the data ("Not assigned",
# lower-case "a"); the previous 'Not Assigned' literal never matched a row.
unassigned = T_df['Neighborhood'] == 'Not assigned'
T_df.loc[unassigned, 'Neighborhood'] = T_df.loc[unassigned, 'Borough']

In [4]:
# Read the geospatial CSV (downloaded from the URL below) into a DataFrame.
geospatial_url = 'https://cocl.us/Geospatial_data'
df_lat_long = pd.read_csv(geospatial_url)

In [5]:
# Relabel the three columns positionally; 'Postcode' is the key used when
# this table is merged with T_df.
coordinate_columns = ['Postcode', 'Latitude', 'Longitude']
df_lat_long.columns = coordinate_columns

In [6]:
# Attach coordinates to every borough/neighborhood row by joining on Postcode
# (pandas' default inner join keeps only postcodes present in both tables).
Toronto_df = T_df.merge(df_lat_long, on='Postcode')
Toronto_df

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
4,M6A,North York,Lawrence Manor,43.718518,-79.464763
5,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
6,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
7,M1B,Scarborough,Rouge,43.806686,-79.194353
8,M1B,Scarborough,Malvern,43.806686,-79.194353
9,M3B,North York,Don Mills North,43.745906,-79.352188


In [7]:
# Save the merged neighborhood/coordinate table, with a header row and
# without the index column.
Toronto_df.to_csv(
    'Toronto_Neighborhood_Locations.csv',
    encoding='utf-8',
    header=True,
    index=None,
)

First Dataset created! `Toronto_Neighborhood_Locations.csv`

### Generating a map to show the Neighborhood Locations

In [8]:
address = 'Toronto, ON'

# Nominatim highly discourages using the default 'user_agent'
geolocator = Nominatim(user_agent='T_app')
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the query has no match; fail with a clear
    # message instead of an AttributeError on `.latitude` below.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

print('The coordinates of Toronto is {},{}'.format(latitude,longitude))

The coordinates of Toronto is 43.653963,-79.387207


In [11]:
# Base map centred on the geocoded Toronto coordinates.
Toronto_map = folium.Map(location=[latitude, longitude], zoom_start=12, tiles='OpenStreetMap')

# Add one circle marker per neighborhood row, with a "neighborhood,borough" popup.
# The loop no longer reassigns the global `location` (the geocoder result),
# and the stray `parse_html` keyword is only passed to Popup, where it applies.
for lat, lng, borough, neighborhood in zip(Toronto_df['Latitude'], Toronto_df['Longitude'],
                                          Toronto_df['Borough'], Toronto_df['Neighborhood']):
    popup = folium.Popup('{},{}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(location=[lat, lng],
                        radius=5,
                        popup=popup,
                        color='blue',
                        fill=True,
                        fill_color='#ccc431',
                        fill_opacity=0.7).add_to(Toronto_map)
Toronto_map

### Using the Foursquare API to extract venues near each neighborhood

In [12]:
# Foursquare credentials. They are read from the environment when the
# variables are set, so real secrets never need to be saved in the notebook;
# otherwise the literal values below are used, as before.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '...')          #Foursquare Client ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '...')  #Foursquare Client secret
VERSION ='20180605'   #Foursquare Version

In [13]:
LIMIT = 200  # value sent as the `limit` query parameter of every explore request


def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint once per neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences consumed together with ``zip``; each triple
        describes one neighborhood and its coordinates.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but still has these columns) when no venue is
        returned. 'Venue Category' is None for venues without a category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    rows = []
    for name, lat, long in zip(names, latitudes, longitudes):
        print(name)
        # Let requests build and encode the query string instead of
        # formatting it by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, long),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        response.raise_for_status()  # surface HTTP errors instead of a KeyError below

        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []
        for item in items:
            venue = item['venue']
            # A venue may have an empty category list; indexing [0] would raise.
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                long,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing `columns=` keeps the schema even when `rows` is empty; assigning
    # `.columns` on an empty frame afterwards raises a length-mismatch error.
    nearby_venues = pd.DataFrame(rows, columns=columns)
    return(nearby_venues)

In [14]:
# Collect nearby venues for every neighborhood row of Toronto_df
# (default search radius of getNearbyVenues).
T_venues = getNearbyVenues(
    names=Toronto_df['Neighborhood'],
    latitudes=Toronto_df['Latitude'],
    longitudes=Toronto_df['Longitude'],
)

Parkwoods
Victoria Village
Harbourfront
Lawrence Heights
Lawrence Manor
Queen's Park
Islington Avenue
Rouge
Malvern
Don Mills North
Woodbine Gardens
Parkview Hill
Ryerson
Garden District
Glencairn
Cloverdale
Islington
Martin Grove
Princess Gardens
West Deane Park
Highland Creek
Rouge Hill
Port Union
Flemingdon Park
Don Mills South
Woodbine Heights
St. James Town
Humewood-Cedarvale
Bloordale Gardens
Eringate
Markland Wood
Old Burnhamthorpe
Guildwood
Morningside
West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor
Downsview North
Wilson Heights
Thorncliffe Park
Adelaide
King
Richmond
Dovercourt Village
Dufferin
Scarborough Village
Fairview
Henry Farm
Oriole
Northwood Park
York University
East Toronto
Harbourfront East
Toronto Islands
Union Station
Little Portugal
Trinity
East Birchmount Park
Ionview
Kennedy Park
Bayview Village
CFB Toronto
Downsview East
The Danforth West
Riverdale
Design Exchange
Toro

In [15]:
# Preview the first five venue rows.
T_venues.head(n=5)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.753259,-79.329656,TTC stop - 44 Valley Woods,43.755402,-79.333741,Bus Stop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [16]:
# Number of non-null entries per column for each neighborhood
# (i.e. how many venues were returned for it).
T_venues.groupby(by='Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adelaide,100,100,100,100,100,100
Agincourt,5,5,5,5,5,5
Agincourt North,3,3,3,3,3,3
Albion Gardens,9,9,9,9,9,9
Alderwood,8,8,8,8,8,8
Bathurst Manor,20,20,20,20,20,20
Bathurst Quay,15,15,15,15,15,15
Bayview Village,4,4,4,4,4,4
Beaumond Heights,9,9,9,9,9,9
Bedford Park,26,26,26,26,26,26


In [17]:
# One-hot encode the venue categories (column names are the bare category names).
T_onehot = pd.get_dummies(T_venues[['Venue Category']], prefix="", prefix_sep="")

# Add the neighborhood label for each venue row.
T_onehot['Neighborhood'] = T_venues['Neighborhood']

# Reorder so 'Neighborhood' is the first column, followed by the category
# columns in their existing order.
category_columns = [column for column in T_onehot.columns if column != 'Neighborhood']
T_onehot = T_onehot[['Neighborhood'] + category_columns]

T_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Average the one-hot columns per neighborhood: each value is the fraction of
# that neighborhood's venue rows that fall in the given category.
# (The former bare `T_onehot.shape` line was dropped: it was not the last
# expression of the cell, so its value was computed and silently discarded.)
T_grouped = T_onehot.groupby('Neighborhood').mean().reset_index()
T_grouped.head(20)

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Adelaide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.02,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Agincourt North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Albion Gardens,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Alderwood,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Bathurst Manor,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Bathurst Quay,0.0,0.0,0.066667,0.066667,0.066667,0.133333,0.133333,0.066667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Beaumond Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Bedford Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Save the per-neighborhood category frequencies, with a header row and
# without the index column.
T_grouped.to_csv(
    'Toronto_grouped.csv',
    encoding='utf-8',
    index=None,
    header=True,
)

Another Dataset is ready! `Toronto_grouped.csv`