**Collecting and scraping data**

-

In [1]:
#import Pandas and data

import pandas as pd
import requests

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html = requests.get(url).content
df_list = pd.read_html(html)
df = df_list[0]

Toronto=pd.DataFrame(df)

In [2]:
#delete 'Not Assigned' Borough and give Borough name to 'Not Assigned' Neighbourhood
import numpy as np

Toronto.drop( Toronto[Toronto['Borough'] == 'Not assigned' ].index , inplace=True)

Toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [3]:
Toronto.loc[Toronto['Neighbourhood']=='Not assigned'].index[0]

8

In [4]:
Toronto['Neighbourhood'][8] = Toronto['Borough'][8] 

In [5]:
Toronto=Toronto.groupby(['Postcode']).agg({'Borough':'first','Neighbourhood':'first'})
Toronto=Toronto.reset_index()
Toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Rouge
1,M1C,Scarborough,Highland Creek
2,M1E,Scarborough,Guildwood
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


-

-

**Adding coordinates for analysis**

-

In [6]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [7]:
df_data_1=df_data_1.sort_values(['Postal Code'])
df_data_1=df_data_1.rename(columns={'Postal Code':'Postcode'})

In [9]:
Toronto=Toronto.merge(df_data_1)
Toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,Rouge,43.806686,-79.194353
1,M1C,Scarborough,Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


-

-

**Analysis and K-menas**

-

In [10]:
import json 

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim 

import requests 
from pandas.io.json import json_normalize 

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 --yes 
import folium

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    certifi-2019.6.16          |           py36_1         149 KB  conda-forge
    ca-certificates-2019.6.16  |       hecc5488_0         145 KB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

In [11]:
Toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,Rouge,43.806686,-79.194353
1,M1C,Scarborough,Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [14]:
latitude=(43.6532)
longitude=(-79.3832)
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, neighbourhood in zip(Toronto['Latitude'],Toronto['Longitude'],Toronto['Neighbourhood']):
    label='{}'.format(neighbourhood)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_Toronto)  
    
map_Toronto

-

-

**Defining Forsquare credentials**

In [15]:
CLIENT_ID = 'FJFJZV0QQVS51SUFTMYVMRMUVTRMBLQHVFIVSBAFCSMCCBWF' 
CLIENT_SECRET = 'KZ3OTVTODQPX3JVR2OYOPHAWJ3MBQCNC13AAVQMRSMARSIT0' 
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: FJFJZV0QQVS51SUFTMYVMRMUVTRMBLQHVFIVSBAFCSMCCBWF
CLIENT_SECRET:KZ3OTVTODQPX3JVR2OYOPHAWJ3MBQCNC13AAVQMRSMARSIT0


In [16]:
Toronto.loc[0, 'Neighbourhood']

'Rouge'

-

**Analysis on the ROUGE Neighbourhood**

In [18]:
nh_ltd = Toronto.loc[0, 'Latitude'] 
nh_lng = Toronto.loc[0, 'Longitude'] 

nh_name = Toronto.loc[0, 'Neighbourhood']

print('Latitude and longitude values of {} are {}, {}.'.format(nh_name, 
                                                               nh_ltd, 
                                                               nh_lng))

Latitude and longitude values of Rouge are 43.806686299999996, -79.19435340000001.


In [19]:
#top100 Venues in 1km range
LIMIT=100
radius=1000

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    nh_ltd, 
    nh_lng, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=FJFJZV0QQVS51SUFTMYVMRMUVTRMBLQHVFIVSBAFCSMCCBWF&client_secret=KZ3OTVTODQPX3JVR2OYOPHAWJ3MBQCNC13AAVQMRSMARSIT0&v=20180605&ll=43.806686299999996,-79.19435340000001&radius=1000&limit=100'

In [20]:
results = requests.get(url).json()

In [21]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [23]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) 

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

print(nearby_venues.shape)
nearby_venues.head()

(18, 4)


Unnamed: 0,name,categories,lat,lng
0,Images Salon & Spa,Spa,43.802283,-79.198565
1,Caribbean Wave,Caribbean Restaurant,43.798558,-79.195777
2,Staples Morningside,Paper / Office Supplies Store,43.800285,-79.196607
3,Wendy's,Fast Food Restaurant,43.802008,-79.19808
4,Harvey's,Fast Food Restaurant,43.800106,-79.198258


-

-

**Further Analysis on Toronto neighborhoods per Venues available**

- Getting nearby Venues per Neighbourhood

In [24]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # get request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [26]:
Toronto_venues = getNearbyVenues(names=Toronto['Neighbourhood'],
                                   latitudes=Toronto['Latitude'],
                                   longitudes=Toronto['Longitude']
                                  )

Rouge
Highland Creek
Guildwood
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park
Clairlea
Cliffcrest
Birch Cliff
Dorset Park
Maryvale
Agincourt
Clarks Corners
Agincourt North
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview
Bayview Village
Silver Hills
Newtonbrook
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park
Bathurst Manor
Northwood Park
CFB Toronto
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West
The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park
Deer Park
Rosedale
Cabbagetown
Church and Wellesley
Harbourfront
Ryerson
St. James Town
Berczy Park
Central Bay Street
Adelaide
Harbourfront East
Design Exchange
Commerce Court
Bedford Park
Roselawn
Forest Hill North
The Annex
Harbord
Chinatown
CN Tower
Stn A PO Boxes 25 The Esplanade
First Canadian Place
Lawr

-

-

-

- Getting top Venues per neighbourhood in Toronto

-

In [27]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe and move it as first
Toronto_onehot['Neighbourhood'] = Toronto['Neighbourhood'] 

fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]


In [28]:
Toronto_final = Toronto_onehot.groupby('Neighbourhood').mean().reset_index()
Toronto_final.head()

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,Adelaide,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Agincourt,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Agincourt North,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Albion Gardens,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Alderwood,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Top 3 Venues for each Neighbourhood in Toronto**

In [33]:
num_top_venues = 3

for hood in Toronto_final['Neighbourhood']:
    print("----"+hood+"----")
    temp = Toronto_final[Toronto_final['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide----
               venue  freq
0        Coffee Shop   1.0
1  Accessories Store   0.0
2       Noodle House   0.0


----Agincourt----
                  venue  freq
0      Greek Restaurant   1.0
1             Nightclub   0.0
2  Pakistani Restaurant   0.0


----Agincourt North----
               venue  freq
0             Bakery   1.0
1  Accessories Store   0.0
2          Nightclub   0.0


----Albion Gardens----
                     venue  freq
0               Hobby Shop   1.0
1        Accessories Store   0.0
2  New American Restaurant   0.0


----Alderwood----
                     venue  freq
0            Bowling Alley   1.0
1        Accessories Store   0.0
2  New American Restaurant   0.0


----Bathurst Manor----
                           venue  freq
0                  Smoothie Shop   1.0
1              Accessories Store   0.0
2  Paper / Office Supplies Store   0.0


----Bayview Village----
                  venue  freq
0    Italian Restaurant   1.0
1     Accessories Store  