# <center>  Segmenting and Clustering Neighborhood in Toronto 

## Importing required libraries

In [1]:
import matplotlib.cm as cm
import matplotlib.colors as colors
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import folium

## Taking the url to be webscrapped and passing into and BeautifulSoup object

In [2]:
url=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(url, 'lxml')

## Taking the required column values

In [3]:
table_contents=[]
table=soup.find('table')

# iterating over the 'td' cells and the value assigned cells
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

df=pd.DataFrame(table_contents)

# converting the lengthier borough name into precised structured name
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


## shape of dataframe

In [4]:
df.shape

(103, 3)

## finding latitude and longitude over postalcode

In [5]:
# taking the co-ordinates from the provided geo-spatial data sheet
gecoder_csv = pd.read_csv('Geospatial_Coordinates.csv')
gecoder_csv.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [6]:
for _,row in df.iterrows():
    postalcode=row['PostalCode']
    if str(postalcode)==str(gecoder_csv['Postal Code'].any()):
        df['latitude']=gecoder_csv['Latitude']
        df['longitude']=gecoder_csv['Longitude']
    else:
        pass
    
df

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude
0,M3A,North York,Parkwoods,43.806686,-79.194353
1,M4A,North York,Victoria Village,43.784535,-79.160497
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.770992,-79.216917
4,M7A,Queen's Park,Ontario Provincial Government,43.773136,-79.239476
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.706876,-79.518188
99,M4Y,Downtown Toronto,Church and Wellesley,43.696319,-79.532242
100,M7Y,East Toronto Business,Enclave of M4L,43.688905,-79.554724
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.739416,-79.588437


## Getting the geo-location for toronto, ontario

In [7]:
from geopy.geocoders import Nominatim

In [8]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
tor_latitude = location.latitude
tor_longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(tor_latitude, tor_longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [9]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[tor_latitude, tor_longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['latitude'], df['longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [10]:
scarborough_data = df[df['Borough'] == 'Scarborough'].reset_index(drop=True)
print(scarborough_data.shape)
scarborough_data.head()

(17, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude
0,M1B,Scarborough,"Malvern, Rouge",43.727929,-79.262029
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.7942,-79.262029
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.778517,-79.346556
3,M1G,Scarborough,Woburn,43.77012,-79.408493
4,M1H,Scarborough,Cedarbrae,43.745906,-79.352188


In [11]:
address = 'Scarborough, Toronto'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Scarborough are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Scarborough are 43.7729744, -79.2576479.


In [12]:
# create map of Manhattan using latitude and longitude values
map_scarborough = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(scarborough_data['latitude'], scarborough_data['longitude'], scarborough_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_scarborough)  
    
map_scarborough

## foursquare for the analysis
### Defining foursquare credentials

In [None]:
CLIENT_ID =  # your Foursquare ID
CLIENT_SECRET =  # your Foursquare Secret
VERSION =  # Foursquare API version
LIMIT =  # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

### exploring first neighborhood

In [14]:
scarborough_data.loc[0, 'Neighborhood']

'Malvern, Rouge'

In [15]:
neighborhood_latitude = scarborough_data.loc[0, 'latitude'] # neighborhood latitude value
neighborhood_longitude = scarborough_data.loc[0, 'longitude'] # neighborhood longitude value

neighborhood_name = scarborough_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Malvern, Rouge are 43.7279292, -79.26202940000002.


In [None]:
limit=100
radius=500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

url

In [17]:
results = requests.get(url).json()

In [18]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [19]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Tim Hortons,Coffee Shop,43.726895,-79.266157
1,Kennedy GO Station,Train Station,43.732275,-79.262418
2,Bros. CONVENIENCE,Convenience Store,43.727781,-79.265708
3,Hakka No.1,Chinese Restaurant,43.727688,-79.266057
4,Giant Tiger,Department Store,43.727447,-79.26624


In [20]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))  ## total venues in the defined radius

7 venues were returned by Foursquare.


## Explore all neighborhood in Toronto

In [21]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [22]:
# type your answer here
scarborough_venues = getNearbyVenues(names=scarborough_data['Neighborhood'],
                                   latitudes=scarborough_data['latitude'],
                                   longitudes=scarborough_data['longitude']
                                  )

Malvern, Rouge
Rouge Hill, Port Union, Highland Creek
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park, Ionview, East Birchmount Park
Golden Mile, Clairlea, Oakridge
Cliffside, Cliffcrest, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Wexford Heights, Scarborough Town Centre
Wexford, Maryvale
Agincourt
Clarks Corners, Tam O'Shanter, Sullivan
Milliken, Agincourt North, Steeles East, L'Amoreaux East
Steeles West, L'Amoreaux West
Upper Rouge


In [23]:
print(scarborough_venues.shape)
scarborough_venues

(119, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.727929,-79.262029,Tim Hortons,43.726895,-79.266157,Coffee Shop
1,"Malvern, Rouge",43.727929,-79.262029,Kennedy GO Station,43.732275,-79.262418,Train Station
2,"Malvern, Rouge",43.727929,-79.262029,Bros. CONVENIENCE,43.727781,-79.265708,Convenience Store
3,"Malvern, Rouge",43.727929,-79.262029,Hakka No.1,43.727688,-79.266057,Chinese Restaurant
4,"Malvern, Rouge",43.727929,-79.262029,Giant Tiger,43.727447,-79.266240,Department Store
...,...,...,...,...,...,...,...
114,Upper Rouge,43.643515,-79.577201,Bros. CONVENIENCE,43.727781,-79.265708,Convenience Store
115,Upper Rouge,43.643515,-79.577201,Hakka No.1,43.727688,-79.266057,Chinese Restaurant
116,Upper Rouge,43.643515,-79.577201,Giant Tiger,43.727447,-79.266240,Department Store
117,Upper Rouge,43.643515,-79.577201,Tandy Leather,43.726974,-79.266513,Hobby Shop


In [24]:
scarborough_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,7,7,7,7,7,7
"Birch Cliff, Cliffside West",7,7,7,7,7,7
Cedarbrae,7,7,7,7,7,7
"Clarks Corners, Tam O'Shanter, Sullivan",7,7,7,7,7,7
"Cliffside, Cliffcrest, Scarborough Village West",7,7,7,7,7,7
"Dorset Park, Wexford Heights, Scarborough Town Centre",7,7,7,7,7,7
"Golden Mile, Clairlea, Oakridge",7,7,7,7,7,7
"Guildwood, Morningside, West Hill",7,7,7,7,7,7
"Kennedy Park, Ionview, East Birchmount Park",7,7,7,7,7,7
"Malvern, Rouge",7,7,7,7,7,7


In [25]:
print('There are {} uniques categories.'.format(len(scarborough_venues['Venue Category'].unique())))

There are 7 uniques categories.


## Analyze each neighborhood

In [26]:
# one hot encoding
scarborough_onehot = pd.get_dummies(scarborough_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
scarborough_onehot['Neighborhood'] = scarborough_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [scarborough_onehot.columns[-1]] + list(scarborough_onehot.columns[:-1])
scarborough_onehot = scarborough_onehot[fixed_columns]

scarborough_onehot.head()

Unnamed: 0,Neighborhood,Chinese Restaurant,Coffee Shop,Convenience Store,Department Store,Discount Store,Hobby Shop,Train Station
0,"Malvern, Rouge",0,1,0,0,0,0,0
1,"Malvern, Rouge",0,0,0,0,0,0,1
2,"Malvern, Rouge",0,0,1,0,0,0,0
3,"Malvern, Rouge",1,0,0,0,0,0,0
4,"Malvern, Rouge",0,0,0,1,0,0,0


In [27]:
scarborough_onehot.shape

(119, 8)

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [28]:
scarborough_grouped = scarborough_onehot.groupby('Neighborhood').mean().reset_index()
scarborough_grouped

Unnamed: 0,Neighborhood,Chinese Restaurant,Coffee Shop,Convenience Store,Department Store,Discount Store,Hobby Shop,Train Station
0,Agincourt,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857
1,"Birch Cliff, Cliffside West",0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857
2,Cedarbrae,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857
3,"Clarks Corners, Tam O'Shanter, Sullivan",0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857
4,"Cliffside, Cliffcrest, Scarborough Village West",0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857
5,"Dorset Park, Wexford Heights, Scarborough Town...",0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857
6,"Golden Mile, Clairlea, Oakridge",0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857
7,"Guildwood, Morningside, West Hill",0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857
8,"Kennedy Park, Ionview, East Birchmount Park",0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857
9,"Malvern, Rouge",0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857


In [29]:
scarborough_grouped.shape

(17, 8)

In [30]:
num_top_venues = 5

for hood in scarborough_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = scarborough_grouped[scarborough_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                venue  freq
0  Chinese Restaurant  0.14
1         Coffee Shop  0.14
2   Convenience Store  0.14
3    Department Store  0.14
4      Discount Store  0.14


----Birch Cliff, Cliffside West----
                venue  freq
0  Chinese Restaurant  0.14
1         Coffee Shop  0.14
2   Convenience Store  0.14
3    Department Store  0.14
4      Discount Store  0.14


----Cedarbrae----
                venue  freq
0  Chinese Restaurant  0.14
1         Coffee Shop  0.14
2   Convenience Store  0.14
3    Department Store  0.14
4      Discount Store  0.14


----Clarks Corners, Tam O'Shanter, Sullivan----
                venue  freq
0  Chinese Restaurant  0.14
1         Coffee Shop  0.14
2   Convenience Store  0.14
3    Department Store  0.14
4      Discount Store  0.14


----Cliffside, Cliffcrest, Scarborough Village West----
                venue  freq
0  Chinese Restaurant  0.14
1         Coffee Shop  0.14
2   Convenience Store  0.14
3    Department Store  0.14
4   

In [31]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [32]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = scarborough_grouped['Neighborhood']

for ind in np.arange(scarborough_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(scarborough_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Agincourt,Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store
1,"Birch Cliff, Cliffside West",Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store
2,Cedarbrae,Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store
3,"Clarks Corners, Tam O'Shanter, Sullivan",Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store
4,"Cliffside, Cliffcrest, Scarborough Village West",Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store


## Cluster neighborhoods

In [33]:
# set number of clusters
kclusters = 5

scarborough_grouped_clustering = scarborough_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(scarborough_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:20] 

  return_n_iter=True)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [34]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

scarborough_merged = scarborough_data

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
scarborough_merged = scarborough_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

scarborough_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.727929,-79.262029,0,Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.7942,-79.262029,0,Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.778517,-79.346556,0,Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store
3,M1G,Scarborough,Woburn,43.77012,-79.408493,0,Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store
4,M1H,Scarborough,Cedarbrae,43.745906,-79.352188,0,Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store


#### visualizing clusters

In [35]:
# create map
map_clusters = folium.Map(location=[tor_latitude, tor_longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(scarborough_merged['latitude'], scarborough_merged['longitude'], scarborough_merged['Neighborhood'], scarborough_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine clusters

In [36]:
scarborough_merged.loc[scarborough_merged['Cluster Labels'] == 0, scarborough_merged.columns[[1] + list(range(5, scarborough_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Scarborough,0,Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store
1,Scarborough,0,Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store
2,Scarborough,0,Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store
3,Scarborough,0,Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store
4,Scarborough,0,Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store
5,Scarborough,0,Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store
6,Scarborough,0,Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store
7,Scarborough,0,Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store
8,Scarborough,0,Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store
9,Scarborough,0,Train Station,Hobby Shop,Discount Store,Department Store,Convenience Store
