# Segmenting and Clustering neighbourhoods in Toronto

## Assignment for Applied Data Science Capstone

## Part1 -

### Importing data

In [39]:
import requests 
import pandas as pd

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = requests.get(url)

#using Pandas to read html into dataframe
data = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", header = 0)

df=data[0]
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### Cleaning the dataframe

In [40]:
df_cleaned = df[df.Borough != "Not assigned"]
df_cleaned

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [41]:
#checking if any "Not assigned" values are left in the Neighbourhood column
df_cleaned.loc[df_cleaned["Neighborhood"] == "Not assigned"]

Unnamed: 0,Postal Code,Borough,Neighborhood


In [42]:
df_cleaned.shape

(103, 3)

## Part 2 -

### Adding geographic information

In [43]:
df_geo = pd.read_csv(r"C:\Users\adrif\Documents\Learning\IBM Data Sciene\C9 - Applied Data Science Capstone\Geospatial_Coordinates.csv")
df_geo.head(20)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [44]:
#merge tables based on the Postal Code column
df = pd.merge(df_cleaned, df_geo, how="outer", on=["Postal Code"])

import numpy as np

df[["Latitude", "Longitude"]] = np.round(df[["Latitude", "Longitude"]], decimals=2)
df.shape

(103, 5)

In [45]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75,-79.33
1,M4A,North York,Victoria Village,43.73,-79.32
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65,-79.36
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72,-79.46
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66,-79.39


## Part 3 -
### Exploring Toronto neighbourhoods

I want to find if there are any swimming pools in the neighbourhoods that belong to Downtown Toronto.

In [73]:
#keeping only the data of Downtown Toronto neighbourhoods
df_downtown = df[df.Borough == "Downtown Toronto"]
df_downtown.set_index("Postal Code", inplace = True)
df_downtown.head()

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65,-79.36
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66,-79.39
M5B,Downtown Toronto,"Garden District, Ryerson",43.66,-79.38
M5C,Downtown Toronto,St. James Town,43.65,-79.38
M5E,Downtown Toronto,Berczy Park,43.64,-79.37


### Getting data from Foursquare

Preparing a request to get data on all swimming pools from Foursquare that are located in the above Downtown Toronto neighbourhoods.

In [80]:
#preparing Foursquare query
CLIENT_ID = 'QKXLFOGEM2RLE3AHMICC2ZHUXOZDWTFT1HLK0IP4O0MTWAUT'
CLIENT_SECRET = 'OCOYFA0RMR0M4VGKAJQZDZV50GV2MAGPZAWTJZT3MIKG2BXA'
VERSION = '20180604'
LIMIT = 200
search_query = "swimming pool"
radius = 2000

#Getting json data for Parkwoods
latitude = df_downtown.loc["M5A", "Latitude"]
longitude = df_downtown.loc["M5A", "Longitude"]

url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&query={}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            latitude, 
            longitude,
            search_query,
            radius, 
            LIMIT)

results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5f04ea51e5b2125c2d78c193'},
 'response': {'venues': [{'id': '51c0b4bf498e78941e4f6a9d',
    'name': 'Swimming Pool',
    'location': {'address': '123 Queen St W',
     'crossStreet': 'between Bay St & University Ave',
     'lat': 43.652266,
     'lng': -79.384815,
     'labeledLatLngs': [{'label': 'display',
       'lat': 43.652266,
       'lng': -79.384815}],
     'distance': 2014,
     'cc': 'CA',
     'city': 'Toronto',
     'state': 'ON',
     'country': 'Canada',
     'formattedAddress': ['123 Queen St W (between Bay St & University Ave)',
      'Toronto ON',
      'Canada']},
    'categories': [{'id': '4bf58dd8d48988d132951735',
      'name': 'Hotel Pool',
      'pluralName': 'Hotel Pools',
      'shortName': 'Pool',
      'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/parks_outdoors/pool_',
       'suffix': '.png'},
      'primary': True}],
    'referralId': 'v-1594157865',
    'hasPerk': False},
   {'id': '55adb867498e5577b1ee186

In [127]:
#function to retrieve data for each downtown neighbourhood
def getSwimmingpools(names, latitudes, longitudes):
    
    pools_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
       
        url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&query={}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng,
            search_query,
            radius, 
            LIMIT)
        
        
        results = requests.get(url).json()["response"]["venues"]
        
        
    return(results)

In [128]:
downtown_swimming = getSwimmingpools(names = df_downtown["Neighborhood"], latitudes = df_downtown["Latitude"], longitudes = df_downtown["Longitude"])

print(downtown_swimming)

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
Stn A PO Boxes
St. James Town, Cabbagetown
First Canadian Place, Underground city
Church and Wellesley
[{'id': '51c0b4bf498e78941e4f6a9d', 'name': 'Swimming Pool', 'location': {'address': '123 Queen St W', 'crossStreet': 'between Bay St & University Ave', 'lat': 43.652266, 'lng': -79.384815, 'labeledLatLngs': [{'label': 'display', 'lat': 43.652266, 'lng': -79.384815}], 'distance': 2011, 'cc': 'CA', 'city': 'Toronto', 'state': 'ON', 'country': 'Canada', 'formattedAddress': ['123 Queen St W (between Bay St & Un

### Creating & cleaning pandas dataframe

In [139]:
df_dt_swimming = pd.json_normalize(downtown_swimming)

df_dt_swimming = df_dt_swimming[["name", "categories", "location.address", "location.lat", "location.lng", "location.postalCode"]]
df_dt_swimming.rename(columns = {"name":"Name", "location.address": "Address", "location.lat": "Latitude", "location.lng": "Longitude", "location.postalCode": "Post Code"}, inplace = True)

def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

df_dt_swimming['categories'] = df_dt_swimming.apply(get_category_type, axis=1)
df_dt_swimming.rename(columns = {"categories":"Categories"}, inplace = True)

print(df_dt_swimming.shape)
df_dt_swimming.head()

(50, 6)


Unnamed: 0,Name,Categories,Address,Latitude,Longitude,Post Code
0,Swimming Pool,Hotel Pool,123 Queen St W,43.652266,-79.384815,
1,swimming pool @ Holiday inn,Pool,,43.661551,-79.380566,
2,Residences at College Park Swimming Pool,Gym Pool,763 Bay St.,43.660025,-79.385018,M5G 2R3
3,Swimming Pool at Hilton Garden Inn,Hotel Pool,200 Dundas Street East,43.657455,-79.376674,M5A 4R6
4,Swimming Pool at The Merchandise Lofts,Gym Pool,155 Dalhousie Street,43.657048,-79.37688,M5B 2P7


### Visualisation

Creating a map of all swimming pools located in Downtown Toronto.

In [151]:
import folium

# create map
map_dt_swimming = folium.Map(location=[43.65,-79.37], zoom_start=13)

# add markers to the map
markers_colors = []
for lat, lng, label in zip(df_dt_swimming['Latitude'], df_dt_swimming['Longitude'], df_dt_swimming['Name']):
    labels = folium.Popup(str(label), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=labels,
        color="blue",
        fill=True,
        fill_color="blue",
        fill_opacity=0.7).add_to(map_dt_swimming)
       
map_dt_swimming