# Segmenting and Clustering Neighborhoods in Toronto

### Part 1

#### We will start by building a code to scrape the "List of postal codes of Canada: M" Wikipedia page:

In [1]:
pip install BeautifulSoup4

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [3]:
url = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text

In [4]:
bsoup = BeautifulSoup(url,"html")

In [5]:
table = bsoup.find("table", {"class":"wikitable sortable"})

#### Next we obtain the table of postal codes and transform the data into a pandas dataframe consisting of three columns; PostalCode, Borough, and Neighborhood:

In [6]:
table_body = table.find("tbody")
rows = table_body.find_all("tr")
l = []
for row in rows:
    cols = row.find_all("td")
    cols = [ele.text.strip() for ele in cols]
    l.append(cols)
df_PBN = pd.DataFrame(l, columns = ["PostalCode", "Borough", "Neighborhood"])
df_PBN = df_PBN.drop(df_PBN.index[[0]]).reset_index(drop = True)
df_PBN.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Processing only cells that have an assigned borough:

In [7]:
df_drop = df_PBN.drop(df_PBN[df_PBN.Borough == "Not assigned"].index).reset_index(drop = True)
df_drop.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### If the neighborhood is not assigned then assign the neighborhood the same name as the borough:

In [8]:
for index, row in df_drop.iterrows():
    if row ["Neighborhood"] == "Not assigned":
        row ["Neighborhood"] = row ["Borough"]
df_drop.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### Combine neighborhoods with the same postal code:

In [9]:
neighborhoods = df_drop.groupby(["PostalCode", "Borough"])["Neighborhood"].apply(lambda x: ", ".join(x.astype(str))).reset_index()
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### The number of rows of your dataframe:

In [10]:
print("The dataframe has {} neighborhoods.".format(neighborhoods.shape[0]))

The dataframe has 103 neighborhoods.




### Part 2

#### Using the csv file to create the dataframe:

In [11]:
import io

In [12]:
url = requests.get("http://cocl.us/Geospatial_data").content
df_merge = pd.read_csv(io.StringIO(url.decode("utf-8")))

#### Merging the two data frame on the column Postal Codes:

In [13]:
df_merge.columns = ["PostalCode", "Latitude", "Longitude"]
neighborhoods = pd.merge(df_merge, neighborhoods, on = "PostalCode")
neighborhoods.head()

Unnamed: 0,PostalCode,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,"Rouge, Malvern"
1,M1C,43.784535,-79.160497,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


#### Reordering the columns of the data frame:

In [14]:
neighborhoods = neighborhoods[["PostalCode", "Borough", "Neighborhood", "Latitude", "Longitude"]]
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437




### Part 3

#### Map of Toronto neighborhoods:

In [15]:
import folium
import json
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

In [16]:
toronto_map = folium.Map(location = [43.6532, -79.3832], zoom_start = 10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods["Latitude"], neighborhoods["Longitude"], neighborhoods["Borough"], neighborhoods["Neighborhood"]):
    label = "{}, {}".format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = label,
        color = "blue",
        fill = True,
        fill_color = "#3186cc",
        fill_opacity = 0.25,
        parse_html = False).add_to(toronto_map)  
    
toronto_map

#### Connecting with foursquare credentials and version:

In [17]:
CLIENT_ID = '22LA0W1UYAA2MYXY0WMLO3ZR1Y0Y2D5TGXELMPXQEOC1OF4S' # your Foursquare ID
CLIENT_SECRET = '1ZLFGQJ0EJHIGNRTYC4TVVSB3LT0ACLLAQJ32KTN5LUQ1MAP' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 22LA0W1UYAA2MYXY0WMLO3ZR1Y0Y2D5TGXELMPXQEOC1OF4S
CLIENT_SECRET:1ZLFGQJ0EJHIGNRTYC4TVVSB3LT0ACLLAQJ32KTN5LUQ1MAP


#### First neighborhood and it's location:

In [18]:
neighborhoods.loc[0, "Neighborhood"]

neighborhood_lat = neighborhoods.loc[0, "Latitude"]
neighborhood_lng = neighborhoods.loc[0, "Longitude"]
neighborhood_name = neighborhoods.loc[0, "Neighborhood"]

print("Latitude and longitude values of {} are {}, {}.".format(neighborhood_name,
                                                               neighborhood_lat, 
                                                               neighborhood_lng))

Latitude and longitude values of Rouge, Malvern are 43.806686299999996, -79.19435340000001.


#### The first 100 locations within a kilometer radius of the first neighborhood:

In [20]:
LIMIT = 100
radius = 1000

url = "https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_lat, 
    neighborhood_lng, 
    radius, 
    LIMIT)

url

'https://api.foursquare.com/v2/venues/explore?&client_id=22LA0W1UYAA2MYXY0WMLO3ZR1Y0Y2D5TGXELMPXQEOC1OF4S&client_secret=1ZLFGQJ0EJHIGNRTYC4TVVSB3LT0ACLLAQJ32KTN5LUQ1MAP&v=20180605&ll=43.806686299999996,-79.19435340000001&radius=1000&limit=100'

In [21]:
results = requests.get(url).json()

def type_of_category(row):
    try:
        categories_list = row["categories"]
    except:
        categories_list = row["venue.categories"]     
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]["name"]

In [22]:
locations = results["response"]["groups"][0]["items"]
nearby_locations = json_normalize(locations)

#### Filtered columns:

In [23]:
filtered_columns = ["venue.name", "venue.categories", "venue.location.lat", "venue.location.lng"]
nearby_locations = nearby_locations.loc[:, filtered_columns]

#### Filtered category for each row:

In [24]:
nearby_locations["venue.categories"] = nearby_locations.apply(type_of_category, axis = 1)

#### Clean the columns:

In [25]:
nearby_locations.columns = [col.split(".")[-1] for col in nearby_locations.columns]
nearby_locations

Unnamed: 0,name,categories,lat,lng
0,Images Salon & Spa,Spa,43.802283,-79.198565
1,Staples Morningside,Paper / Office Supplies Store,43.800285,-79.196607
2,Caribbean Wave,Caribbean Restaurant,43.798558,-79.195777
3,Wendy's,Fast Food Restaurant,43.802008,-79.19808
4,Wendy's,Fast Food Restaurant,43.807448,-79.199056
5,Harvey's,Fast Food Restaurant,43.800106,-79.198258
6,Tim Hortons,Coffee Shop,43.802,-79.198169
7,Lee Valley,Hobby Shop,43.803161,-79.199681
8,FASTSIGNS,Business Service,43.807882,-79.201968
9,Tim Hortons / Esso,Coffee Shop,43.801863,-79.199296


In [26]:
print("There are {} venues returned by Foursquare.".format(nearby_locations.shape[0]))

There are 20 venues returned by Foursquare.


#### All the locations within a kilometer of Toronto:

In [27]:
def nearbylocations(names, latitudes, longitudes, radius = 1000):
    list_of_locations = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        url = "https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        results = requests.get(url).json()["response"]["groups"][0]["items"]
        list_of_locations.append([(
            name, 
            lat, 
            lng, 
            v["venue"]["name"], 
            v["venue"]["location"]["lat"], 
            v["venue"]["location"]["lng"],  
            v["venue"]["categories"][0]["name"]) for v in results])
    nearby_locations = pd.DataFrame([item for list_of_locations in list_of_locations for item in list_of_locations])
    nearby_locations.columns = ["Neighborhood", 
                  "Neighborhood Latitude", 
                  "Neighborhood Longitude", 
                  "Venue", 
                  "Venue Latitude", 
                  "Venue Longitude", 
                  "Venue Category"]
    return(nearby_locations)

In [28]:
toronto_locations = nearbylocations(names = neighborhoods["Neighborhood"],
                                    latitudes = neighborhoods["Latitude"],
                                    longitudes = neighborhoods["Longitude"])

Rouge, Malvern
Highland Creek, Rouge Hill, Port Union
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park, Ionview, Kennedy Park
Clairlea, Golden Mile, Oakridge
Cliffcrest, Cliffside, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Scarborough Town Centre, Wexford Heights
Maryvale, Wexford
Agincourt
Clarks Corners, Sullivan, Tam O'Shanter
Agincourt North, L'Amoreaux East, Milliken, Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
Silver Hills, York Mills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park, Don Mills South
Bathurst Manor, Downsview North, Wilson Heights
Northwood Park, York University
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens, Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West, 

In [29]:
print(toronto_locations.shape)

(4880, 7)


In [30]:
toronto_locations

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa
1,"Rouge, Malvern",43.806686,-79.194353,Staples Morningside,43.800285,-79.196607,Paper / Office Supplies Store
2,"Rouge, Malvern",43.806686,-79.194353,Caribbean Wave,43.798558,-79.195777,Caribbean Restaurant
3,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.802008,-79.198080,Fast Food Restaurant
4,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
...,...,...,...,...,...,...,...
4875,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437,46 Martingrove North,43.732211,-79.589618,Bus Line
4876,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437,Panorama Park,43.747021,-79.583497,Park
4877,Northwest,43.706748,-79.594054,Tim Hortons,43.714657,-79.593716,Coffee Shop
4878,Northwest,43.706748,-79.594054,Saand Rexdale,43.705072,-79.598725,Drugstore


#### Number of venues in each location:
#### Note that the downtown neighborhoods have the highest number of venues.

In [31]:
toronto_locations.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Agincourt,44,44,44,44,44,44
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",28,28,28,28,28,28
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",16,16,16,16,16,16
"Alderwood, Long Branch",26,26,26,26,26,26
...,...,...,...,...,...,...
Willowdale West,12,12,12,12,12,12
Woburn,9,9,9,9,9,9
"Woodbine Gardens, Parkview Hill",17,17,17,17,17,17
Woodbine Heights,26,26,26,26,26,26


#### The frequency of the first 3 venue in each neighborhood of Toronto:

In [33]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_locations[["Venue Category"]], prefix = " ", prefix_sep = " ")

# add neighborhood column and move it to the first column
toronto_onehot["Neighborhood"] = toronto_locations["Neighborhood"] 
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
toronto_onehot.shape
toronto_grouped = toronto_onehot.groupby("Neighborhood").mean().reset_index()
toronto_grouped
toronto_grouped.shape
num_top_venues = 3

for hood in toronto_grouped["Neighborhood"]:
    print("----" + hood + "----")
    temp = toronto_grouped[toronto_grouped["Neighborhood"] == hood].T.reset_index()
    temp.columns = ["venue", "freq"]
    temp = temp.iloc[1:]
    temp['freq'] = temp["freq"].astype(float)
    temp = temp.round({"freq": 2})
    print(temp.sort_values("freq", ascending = False).reset_index(drop = True).head(num_top_venues))
    print('\n')
    
    
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending = False)
    return row_categories_sorted.index.values[0:num_top_venues]

----Adelaide, King, Richmond----
           venue  freq
0    Coffee Shop  0.06
1          Hotel  0.05
2           Café  0.04


----Agincourt----
                  venue  freq
0    Chinese Restaurant  0.16
1         Shopping Mall  0.09
2                Bakery  0.05


----Agincourt North, L'Amoreaux East, Milliken, Steeles East----
                  venue  freq
0    Chinese Restaurant  0.21
1                  Park  0.07
2                Bakery  0.07


----Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown----
             venue  freq
0      Pizza Place  0.19
1    Grocery Store  0.19
2      Coffee Shop  0.06


----Alderwood, Long Branch----
                 venue  freq
0       Discount Store  0.12
1             Pharmacy  0.12
2    Convenience Store  0.08


----Bathurst Manor, Downsview North, Wilson Heights----
                 venue  freq
0          Pizza Place  0.07
1          Coffee Shop  0.07
2    Convenience Store  0.04


---