# Segmenting and Clustering Neighborhoods in Toronto

## Web Scraping Toronto Postal Codes

In [141]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json # library to handle JSON files
import requests # library to handle requests
# import k-means from clustering stage
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 
import folium # map rendering library

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-3.3.0               |           py36_0         747 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.2 MB

The following NEW packages will be 

In [72]:
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [73]:
page = requests.get(URL)

In [74]:
soup = BeautifulSoup(page.text, 'html.parser')

In [75]:
columns = ['PostalCode','Borough','Neighborhood']

In [76]:
postaltable = soup.find_all('table')[0]

### Converting Postal Codes html table info to Pandas Dataframe

In [78]:
df = pd.read_html(str(postaltable))

In [79]:
df[0].head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [80]:
df[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287 entries, 0 to 286
Data columns (total 3 columns):
Postcode         287 non-null object
Borough          287 non-null object
Neighbourhood    287 non-null object
dtypes: object(3)
memory usage: 6.8+ KB


In [81]:
postalcodes = df[0]

In [82]:
postalcodes.columns = columns

In [83]:
postalcodes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Narrow down the dataframe to ignore/exclude Not Assigned Boroughs

In [84]:
postalcodes = postalcodes[postalcodes['Borough'] != 'Not assigned']

### Grouping the Neighborhoods w.r.t each Boroughs

In [93]:
toronto_pcodes = postalcodes.groupby(['PostalCode','Borough'])['Neighborhood'].agg(', '.join).to_frame()

In [98]:
toronto_pcodes = toronto_pcodes.reset_index()

In [100]:
toronto_pcodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
PostalCode      103 non-null object
Borough         103 non-null object
Neighborhood    103 non-null object
dtypes: object(3)
memory usage: 2.5+ KB


In [101]:
toronto_pcodes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [103]:
toronto_pcodes.shape

(103, 3)

#### overall 103 postal codes are identified and integrated in the dataframe

# Append Coordinates with help of GeoSpatial data

In [111]:
!wget -q -O 'geospatial_coordinates.csv' http://cocl.us/Geospatial_data
print('data downloaded')

data downloaded


In [112]:
geo_coord = pd.read_csv('geospatial_coordinates.csv')
geo_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merging Coordinates with Toronto Postal Codes dataframe

In [117]:
toronto_postal_coord = pd.merge(left=toronto_pcodes, right=geo_coord, left_on = 'PostalCode', right_on = 'Postal Code', how='inner')

In [120]:
toronto_postal_coord.drop(columns=['Postal Code'], inplace=True)

In [121]:
toronto_postal_coord.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Explore and cluster the neighborhoods in Toronto

In [126]:
toronto_postal_coord['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       "Queen's Park", 'Mississauga', 'Etobicoke'], dtype=object)

In [130]:
toronto_postal_coord = toronto_postal_coord[toronto_postal_coord['Borough'].str.contains('Toronto')]

In [135]:
toronto_postal_coord = toronto_postal_coord.reset_index()

In [137]:
toronto_postal_coord.drop(columns='index', inplace=True)

In [138]:
toronto_postal_coord.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [144]:
# Toronto Coordinates
Latitude = 43.6532
Longitude = -79.3832

### Toronto Neighborhood map view

In [147]:
map_toronto = folium.Map(location=[Latitude, Longitude], zoom_start = 12)

for lat, lng, lbl in zip(toronto_postal_coord['Latitude'], toronto_postal_coord['Longitude'], toronto_postal_coord['Neighborhood']):
    label = folium.Popup(lbl, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = label,
        fill = True,
        color = 'blue',
        fill_color = 'yellow',
        fill_opacity = 0.7,
        parse_html = False
    ).add_to(map_toronto)
map_toronto

### Define Foursquare Credentials and Version

In [161]:
# The code was removed by Watson Studio for sharing.

Your credentails:
CLIENT_ID: 1XWYOQ0GV5OZHK4E2J2N051LVBLAQGSNW3GBHUZXBZI3RPDW
CLIENT_SECRET:MHSY1JRZTVAFLUYQ0R4HIQVQLQ5XM5THVUBYSWJ4D32XRGLH


### Exploring the first neighborhood of Toronto via Foursquare API results

In [152]:
toronto_postal_coord.loc[0, 'Neighborhood']

'The Beaches'

In [154]:
first_neighbor_name = toronto_postal_coord.loc[0, 'Neighborhood']
first_neighbor_lat = toronto_postal_coord.loc[0, 'Latitude']
first_neighbor_long = toronto_postal_coord.loc[0, 'Longitude']

print('Latitude and longitude values of {} are {}, {}.'.format(first_neighbor_name, 
                                                               first_neighbor_lat, 
                                                               first_neighbor_long))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


#### Now, let's get the top 100 venues that are in The Beaches within a radius of 500 meters.

###### First, let's create the GET request URL. Name your URL **url**.

In [205]:

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
CLIENT_ID,
CLIENT_SECRET,
VERSION,
first_neighbor_lat,
first_neighbor_long,
500,
100
)



###### Send the GET request and examine the resutls

In [156]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5de291d6882fc7001ba46598'},
 'response': {'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.67682094413784,
          'lng': -79.29394208780985}],
        'distanc

In [165]:
results = results['response']['groups'][0]['items']

###### Transforming the json results to pandas dataframe 

In [167]:
from pandas.io.json import json_normalize
venues = json_normalize(results)

In [204]:
print('{} venues were returned by Foursquare.'.format(venues.shape[0]))

4 venues were returned by Foursquare.


In [168]:
venues.head()

Unnamed: 0,reasons.count,reasons.items,referralId,venue.categories,venue.id,venue.location.address,venue.location.cc,venue.location.city,venue.location.country,venue.location.crossStreet,...,venue.location.formattedAddress,venue.location.labeledLatLngs,venue.location.lat,venue.location.lng,venue.location.postalCode,venue.location.state,venue.name,venue.photos.count,venue.photos.groups,venue.venuePage.id
0,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4bd461bc77b29c74a07d9282-0,"[{'id': '4bf58dd8d48988d159941735', 'name': 'T...",4bd461bc77b29c74a07d9282,Glen Manor,CA,Toronto,Canada,Queen St.,...,"[Glen Manor (Queen St.), Toronto ON, Canada]","[{'label': 'display', 'lat': 43.67682094413784...",43.676821,-79.293942,,ON,Glen Manor Ravine,0,[],
1,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4ad4c062f964a52011f820e3-1,"[{'id': '50aa9e744b90af0d42d5de0e', 'name': 'H...",4ad4c062f964a52011f820e3,125 Southwood Dr,CA,Toronto,Canada,,...,"[125 Southwood Dr, Toronto ON M4E 0B8, Canada]","[{'label': 'display', 'lat': 43.678879, 'lng':...",43.678879,-79.297734,M4E 0B8,ON,The Big Carrot Natural Food Market,0,[],75150878.0
2,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4b8daea1f964a520480833e3-2,"[{'id': '4bf58dd8d48988d11b941735', 'name': 'P...",4b8daea1f964a520480833e3,676 Kingston Rd.,CA,Toronto,Canada,at Main St.,...,"[676 Kingston Rd. (at Main St.), Toronto ON M4...","[{'label': 'display', 'lat': 43.67918143494101...",43.679181,-79.297215,M4E 1R4,ON,Grover Pub and Grub,0,[],
3,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4df91c4bae60f95f82229ad5-3,"[{'id': '4f2a25ac4b909258e854f55f', 'name': 'N...",4df91c4bae60f95f82229ad5,,CA,Toronto,Canada,,...,"[Toronto ON, Canada]","[{'label': 'display', 'lat': 43.68056321147582...",43.680563,-79.292869,,ON,Upper Beaches,0,[],


###### Cleaning up the Venues dataframe for easy reference

In [169]:
venues.columns

Index(['reasons.count', 'reasons.items', 'referralId', 'venue.categories',
       'venue.id', 'venue.location.address', 'venue.location.cc',
       'venue.location.city', 'venue.location.country',
       'venue.location.crossStreet', 'venue.location.distance',
       'venue.location.formattedAddress', 'venue.location.labeledLatLngs',
       'venue.location.lat', 'venue.location.lng', 'venue.location.postalCode',
       'venue.location.state', 'venue.name', 'venue.photos.count',
       'venue.photos.groups', 'venue.venuePage.id'],
      dtype='object')

In [171]:
filtered_columns = ['venue.categories','venue.name','venue.location.lat','venue.location.lng']
venues = venues.loc[:, filtered_columns]

In [172]:
venues.head()

Unnamed: 0,venue.categories,venue.name,venue.location.lat,venue.location.lng
0,"[{'id': '4bf58dd8d48988d159941735', 'name': 'T...",Glen Manor Ravine,43.676821,-79.293942
1,"[{'id': '50aa9e744b90af0d42d5de0e', 'name': 'H...",The Big Carrot Natural Food Market,43.678879,-79.297734
2,"[{'id': '4bf58dd8d48988d11b941735', 'name': 'P...",Grover Pub and Grub,43.679181,-79.297215
3,"[{'id': '4f2a25ac4b909258e854f55f', 'name': 'N...",Upper Beaches,43.680563,-79.292869


In [180]:
venues.columns = [col.split('.')[-1] for col in venues.columns]
venues.head()

Unnamed: 0,categories,name,lat,lng
0,"[{'id': '4bf58dd8d48988d159941735', 'name': 'T...",Glen Manor Ravine,43.676821,-79.293942
1,"[{'id': '50aa9e744b90af0d42d5de0e', 'name': 'H...",The Big Carrot Natural Food Market,43.678879,-79.297734
2,"[{'id': '4bf58dd8d48988d11b941735', 'name': 'P...",Grover Pub and Grub,43.679181,-79.297215
3,"[{'id': '4f2a25ac4b909258e854f55f', 'name': 'N...",Upper Beaches,43.680563,-79.292869


In [202]:
venues['categories'] = [None if len(row) == 0  else row[0]['name'] for row in venues['categories']] 

In [203]:
venues.head()

Unnamed: 0,categories,name,lat,lng
0,Trail,Glen Manor Ravine,43.676821,-79.293942
1,Health Food Store,The Big Carrot Natural Food Market,43.678879,-79.297734
2,Pub,Grover Pub and Grub,43.679181,-79.297215
3,Neighborhood,Upper Beaches,43.680563,-79.292869


## Explore Neighborhoods in Toronto

###### Let's create a function to repeat the same process to all the neighborhoods in Toronto

In [206]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    LIMIT = 10
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

###### Calling the function

In [207]:
toronto_venues = getNearbyVenues(names=toronto_postal_coord['Neighborhood'],
                                   latitudes=toronto_postal_coord['Latitude'],
                                   longitudes=toronto_postal_coord['Longitude']
                                  )

The Beaches
The Danforth West, Riverdale
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
The Annex, North Midtown, Yorkville
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dovercourt Village, Dufferin
Little Portugal, Trinity
Brockton, Exhibition Place, Parkdale Village
High Park, The Junction Sout

In [209]:
print(toronto_venues.shape)
toronto_venues.head()

(339, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [210]:
toronto_venues.groupby(['Neighborhood'])['Venue'].count()

Neighborhood
Adelaide, King, Richmond                                                                                      10
Berczy Park                                                                                                   10
Brockton, Exhibition Place, Parkdale Village                                                                  10
Business Reply Mail Processing Centre 969 Eastern                                                             10
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara    10
Cabbagetown, St. James Town                                                                                   10
Central Bay Street                                                                                            10
Chinatown, Grange Park, Kensington Market                                                                     10
Christie                                                                           

In [218]:
print('Number of unique Categories of Venues present in Toronto : {}'.format(len(toronto_venues['Venue Category'].unique())))

Number of unique Categories of Venues present in Toronto : 114


## Analyze Neighborhood and its Venues 

In [271]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = pd.Series(toronto_venues['Neighborhood'])

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,American Restaurant,Arts & Crafts Store,Asian Restaurant,Auto Workshop,...,Sushi Restaurant,Swim School,Tea Room,Tennis Court,Thai Restaurant,Theater,Theme Restaurant,Trail,Vegetarian / Vegan Restaurant,Wine Bar
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
