<h1>Segmenting and Clustering Neighborhoods in Toronto<\h1>

before getting all started up ,first lets import all the libraries required to explore the Data

In [1]:
#!conda install -c anaconda beautifulsoup4 -y 
#!conda install -c anaconda lxml -y

In [2]:
import pandas as pd
import numpy as np
import urllib.request
from bs4 import BeautifulSoup
import requests
import lxml
import folium
from sklearn.cluster import KMeans 
import matplotlib.cm as cm
import matplotlib.colors as colors


<h1>Question 1</h1>
<h2>Get The Data</h2>
lets get the required data from the given url

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, "html")


In [4]:
all_tables=soup.find_all("table")
#all_tables

now to find the right table for our problem 

In [5]:
right_table=soup.find('table', class_='wikitable sortable')
#right_table

<h2>Convert</h2>
now lets convert the table in html to a Dataframe 

In [6]:
# storing data of columns in respective lists 
A=[]
B=[]
C=[]

for row in right_table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))

In [7]:
#ctreating a datafram with respective coloumns
df=pd.DataFrame(A,columns=['PostalCode'])
df['Borough']=B
df['Neighborhood']=C
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,\n
1,M2A\n,Not assigned\n,\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,Regent Park / Harbourfront\n


<h2>Preprocessing</h2>
Now lets remove all the unnecessary text from the dataframe

In [8]:
#removing "\n" string from all the rows using strip()
for column in df.columns:
    df[column] = df[column].map(lambda x : x.strip('\n'))
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


Removing all the rows with unassigned Boroughs

In [9]:
#replacing all '/' with ',' as asked 
df['Neighborhood'] = df['Neighborhood'].str.replace('/',',',regex = False)
#replacing all "Not assigned" rows in Borough with 'NaN'
df.replace(to_replace = ['Not assigned' , ''] , value = [np.nan ,np.nan],inplace = True)
#dropping all the rows with 'NaN' in the column 'Borough'
df.dropna(subset = ['Borough'],axis = 0,inplace = True)
df.reset_index(inplace = True , drop = True)
df.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern , Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill , Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


Lets find out if there are any rows in the column 'Neighborhood' to replace them with the same name as the corresponding "Borough" of the same row

In [10]:
df['Neighborhood'].isna().sum()

0

Hence all the rows are filled with the data without any 'Nan'

In [11]:
#finding the number of rows and columns in the dataframe 
df.shape

(103, 3)

<h1>Question 2</h1>
<h2>Get The Data</h2>
Lets convert the url of the .csv file to a pandas Dataframe


In [12]:
url = 'http://cocl.us/Geospatial_data'
df_GC = pd.read_csv(url)
df_GC.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


now we have to merge this dataframe with the dataframe obtained at the end of question 1 

In [13]:
#matching the postal codes of both the dataframes
df.sort_values(by = 'PostalCode' , inplace = True)
df.reset_index(drop = True , inplace = True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [14]:
#adding the columns to the dataframe
df['Latitude'] = df_GC['Latitude']
df['Longitude'] = df_GC['Longitude']
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [15]:
#toronto_map = folium.Map(location = [43.6532 , -79.3832] , zoom_start = 12 , tiles = 'stamen terrain')
#toronto_map

To find the total number of unique 'Borough's

In [16]:
df['Borough'].unique().size

10

<h1>Question 3</h1>
<h2>Exploring The Data</h2>
lets explore the 'Downtown Toronto' Borough

In [17]:
#forming a dataframe with only the neighborhoods of Downtown Toronto 
downtown = df[df['Borough'] == 'Downtown Toronto'].reset_index(drop = True)
downtown.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"St. James Town , Cabbagetown",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
4,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


We are going to get the data about all the venues in the 'Downtown Toronto' using the foursquare API calls  

In [18]:
#details required for requesting API calls 
CLIENT_ID = 'UD3APJJHD41HZJV1OFAGV4GD35MYPGJRE1OK41S0KBYHZMX0' 
CLIENT_SECRET = 'TIPSUGZPPSE2S0DSSQ4K4CX0N03U3SIZKNETVL3ABBB2H1QJ'
VERSION = '20200422'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)
LIMIT = 100
radius =500

Your credentails:
CLIENT_ID: UD3APJJHD41HZJV1OFAGV4GD35MYPGJRE1OK41S0KBYHZMX0
CLIENT_SECRET:TIPSUGZPPSE2S0DSSQ4K4CX0N03U3SIZKNETVL3ABBB2H1QJ


Lets define a function to request the API calls at a time and convert it into a dataframe

In [19]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [20]:
downtown_venues = getNearbyVenues(names = downtown['Neighborhood'],latitudes = downtown['Latitude'] , longitudes = downtown['Longitude'])

Rosedale
St. James Town , Cabbagetown
Church and Wellesley
Regent Park , Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond , Adelaide , King
Harbourfront East , Union Station , Toronto Islands
Toronto Dominion Centre , Design Exchange
Commerce Court , Victoria Hotel
University of Toronto , Harbord
Kensington Market , Chinatown , Grange Park
CN Tower , King and Spadina , Railway Lands , Harbourfront West , Bathurst Quay , South Niagara , Island airport
Stn A PO Boxes
First Canadian Place , Underground city
Christie
Queen's Park , Ontario Provincial Government


In [21]:
downtown_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
1,Rosedale,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
2,Rosedale,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
3,Rosedale,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail
4,"St. James Town , Cabbagetown",43.667967,-79.367675,Cranberries,43.667843,-79.369407,Diner


Lets find out different types of venue categories in the Downtown Toronto using one hot encoding

In [22]:
downtown_onehot = pd.get_dummies(downtown_venues[['Venue Category']], prefix = '' , prefix_sep = '')
downtown_onehot['Neighborhood'] = downtown_venues['Neighborhood']

In [23]:
#finding the mean of all the categories
downtown_mean = downtown_onehot.groupby('Neighborhood').mean().reset_index()
print(downtown_mean.shape)
downtown_mean.head()

(19, 206)


Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Theater,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017241,...,0.0,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.0
1,"CN Tower , King and Spadina , Railway Lands , ...",0.0625,0.0625,0.125,0.1875,0.125,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.014925,0.0,0.0,0.0,0.0,0.014925
3,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.014085,0.0,0.0,0.0,...,0.014085,0.014085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028169


there are about 15 different types of venue categories combined in all the Neighborhoods of Downtown Toronto

In [24]:
#function to get the most common venues of a neighborhood
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [25]:
num_top_venues = 10

indt = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for indice in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(indice+1, indt[indice]))
    except:
        columns.append('{}th Most Common Venue'.format(indice+1))

# create a new dataframe
dt_venues_sorted = pd.DataFrame(columns=columns)
dt_venues_sorted['Neighborhood'] = downtown_mean['Neighborhood']

for ind in np.arange(downtown_mean.shape[0]):
    dt_venues_sorted.iloc[ind, 1:] = return_most_common_venues(downtown_mean.iloc[ind, :], num_top_venues)

dt_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Beer Bar,Seafood Restaurant,Restaurant,Bakery,Italian Restaurant,Cheese Shop,Farmers Market,Café
1,"CN Tower , King and Spadina , Railway Lands , ...",Airport Service,Airport Lounge,Airport Terminal,Harbor / Marina,Sculpture Garden,Airport Food Court,Boat or Ferry,Boutique,Coffee Shop,Plane
2,Central Bay Street,Coffee Shop,Italian Restaurant,Sandwich Place,Café,Fried Chicken Joint,Bubble Tea Shop,Salad Place,Thai Restaurant,Japanese Restaurant,Department Store
3,Christie,Grocery Store,Café,Park,Italian Restaurant,Candy Store,Restaurant,Diner,Gas Station,Baby Store,Nightclub
4,Church and Wellesley,Sushi Restaurant,Coffee Shop,Japanese Restaurant,Restaurant,Yoga Studio,Burger Joint,Pub,Men's Store,Hotel,Mediterranean Restaurant


<h2> Clustering </h2>
lets cluster all the neighborhoods with 

In [26]:
# set number of clusters
kclusters = 5

downtown_clustering = downtown_mean.drop(columns ='Neighborhood')

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(downtown_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 3, 4, 2, 0, 0, 0, 0, 0, 0], dtype=int32)

In [27]:
dt_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

dt_f = downtown

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
dt_f = dt_f.join(dt_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

dt_f.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,1,Park,Playground,Trail,Deli / Bodega,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center
1,M4X,Downtown Toronto,"St. James Town , Cabbagetown",43.667967,-79.367675,0,Coffee Shop,Café,Pizza Place,Bakery,Chinese Restaurant,Market,Italian Restaurant,Pub,Restaurant,Japanese Restaurant
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,0,Sushi Restaurant,Coffee Shop,Japanese Restaurant,Restaurant,Yoga Studio,Burger Joint,Pub,Men's Store,Hotel,Mediterranean Restaurant
3,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636,0,Coffee Shop,Park,Bakery,Pub,Café,Restaurant,Breakfast Spot,Theater,Beer Store,Performing Arts Venue
4,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Clothing Store,Coffee Shop,Café,Restaurant,Japanese Restaurant,Bubble Tea Shop,Cosmetics Shop,Middle Eastern Restaurant,Tea Room,Diner


<h2>Mapping</h2>
Lets represent plot all the cluster points on a map using folium library

In [28]:
# create map
map_clusters = folium.Map(location=[43.6532 , -79.3832], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(dt_f['Latitude'], dt_f['Longitude'], dt_f['Neighborhood'], dt_f['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [29]:
dt_f.shape

(19, 16)

<h2>Examining the Clusters</h2>

In [31]:
dt_f.loc[dt_f['Cluster Labels'] == 0, dt_f.columns[[1] + list(range(5, dt_f.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Downtown Toronto,0,Coffee Shop,Café,Pizza Place,Bakery,Chinese Restaurant,Market,Italian Restaurant,Pub,Restaurant,Japanese Restaurant
2,Downtown Toronto,0,Sushi Restaurant,Coffee Shop,Japanese Restaurant,Restaurant,Yoga Studio,Burger Joint,Pub,Men's Store,Hotel,Mediterranean Restaurant
3,Downtown Toronto,0,Coffee Shop,Park,Bakery,Pub,Café,Restaurant,Breakfast Spot,Theater,Beer Store,Performing Arts Venue
4,Downtown Toronto,0,Clothing Store,Coffee Shop,Café,Restaurant,Japanese Restaurant,Bubble Tea Shop,Cosmetics Shop,Middle Eastern Restaurant,Tea Room,Diner
5,Downtown Toronto,0,Coffee Shop,Café,Gastropub,Hotel,Italian Restaurant,American Restaurant,Cocktail Bar,Seafood Restaurant,Gym,Farmers Market
6,Downtown Toronto,0,Coffee Shop,Cocktail Bar,Beer Bar,Seafood Restaurant,Restaurant,Bakery,Italian Restaurant,Cheese Shop,Farmers Market,Café
8,Downtown Toronto,0,Coffee Shop,Café,Restaurant,Gym,Deli / Bodega,Clothing Store,American Restaurant,Thai Restaurant,Hotel,Breakfast Spot
9,Downtown Toronto,0,Coffee Shop,Aquarium,Café,Hotel,Restaurant,Italian Restaurant,Sporting Goods Shop,Fried Chicken Joint,Scenic Lookout,Brewery
10,Downtown Toronto,0,Coffee Shop,Hotel,Café,Restaurant,Japanese Restaurant,Salad Place,Seafood Restaurant,American Restaurant,Beer Bar,Asian Restaurant
11,Downtown Toronto,0,Coffee Shop,Restaurant,Café,Hotel,Gym,American Restaurant,Japanese Restaurant,Seafood Restaurant,Deli / Bodega,Italian Restaurant


In [32]:
dt_f.loc[dt_f['Cluster Labels'] == 1, dt_f.columns[[1] + list(range(5, dt_f.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,1,Park,Playground,Trail,Deli / Bodega,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center


In [33]:
dt_f.loc[dt_f['Cluster Labels'] == 2, dt_f.columns[[1] + list(range(5, dt_f.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
17,Downtown Toronto,2,Grocery Store,Café,Park,Italian Restaurant,Candy Store,Restaurant,Diner,Gas Station,Baby Store,Nightclub


In [34]:
dt_f.loc[dt_f['Cluster Labels'] == 3, dt_f.columns[[1] + list(range(5, dt_f.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,Downtown Toronto,3,Airport Service,Airport Lounge,Airport Terminal,Harbor / Marina,Sculpture Garden,Airport Food Court,Boat or Ferry,Boutique,Coffee Shop,Plane


In [35]:
dt_f.loc[dt_f['Cluster Labels'] == 4, dt_f.columns[[1] + list(range(5, dt_f.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
7,Downtown Toronto,4,Coffee Shop,Italian Restaurant,Sandwich Place,Café,Fried Chicken Joint,Bubble Tea Shop,Salad Place,Thai Restaurant,Japanese Restaurant,Department Store
18,Downtown Toronto,4,Coffee Shop,Sushi Restaurant,Diner,College Cafeteria,Sandwich Place,Restaurant,Burger Joint,Burrito Place,Café,Park
