# <center> Segmenting and Clustering Neighborhoods in Toronto Peer Assignment 

### Installing Libraries

In [1]:
!pip install pandas
!pip install requests
!pip install bs4
!pip install plotly
!conda install -c conda-forge geopy --yes
!conda install -c conda-forge folium=0.5.0 --yes 
!pip install geocoder
!pip install pgeocode
print("Done")

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... 
  - anaconda/win-64::ca-certificates-2020.10.14-0
  - defaults/win-64::ca-certificates-2020.10.14-0done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... 
  - anaconda/win-64::ca-certificates-2020.10.14-0
  - defaults/win-64::ca-certificates-2020.10.14-0done

# All requested packages already installed.

Done


In [2]:
#Install libraries for web scrapping
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium 
import geocoder
print("Done")

Done


### Part of data scraping from the Wiki page

In [3]:
#Please, use an old version of Wikipage! Check out Discussion on the Peer Assignment Week 3 page
#Uase request library to download the Wiki webpage
url = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1011037969"
canada_html = requests.get(url).text

# getting dataframes from Wiki webpage. On this webpage there are 3 dataframes
dataframe_list = pd.read_html(url,flavor='bs4')
print(len(dataframe_list))

3


In [4]:
# Searching for necessary table index. The search is by word, which the table contains for sure. 
#In our case this is first post code "M1A"
for index,table in enumerate(dataframe_list):
    if ("M1A" in str(table)):
        table_index = index
print(table_index)
#the index of our table is 0

0


In [5]:
# now we have our raw dataframe 
toronto_data= dataframe_list[0]
toronto_data= pd.DataFrame(toronto_data)
print(toronto_data.shape)
toronto_data.head()

(180, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Data processing

In [6]:
# Replace missing data with NaN
toronto_data.replace("Not assigned", np.nan, inplace = True)
toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [7]:
#Missing data to dataframe
missing_data=toronto_data.isnull()
missing_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,False,True,True
1,False,True,True
2,False,False,False
3,False,False,False
4,False,False,False


In [8]:
#
for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("")    

Postal Code
False    180
Name: Postal Code, dtype: int64

Borough
False    103
True      77
Name: Borough, dtype: int64

Neighbourhood
False    103
True      77
Name: Neighbourhood, dtype: int64



In [9]:
# Drop rows with NaN. According to the task, we deop row where "Borough" is NAN
toronto_data.dropna(subset=["Borough"], axis=0, inplace=True)

# reset index, because we droped two rows
toronto_data.reset_index(drop=True, inplace=True)
toronto_data

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [10]:
#Checking missing values again
missing_data=toronto_data.isnull()
for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("")    

Postal Code
False    103
Name: Postal Code, dtype: int64

Borough
False    103
Name: Borough, dtype: int64

Neighbourhood
False    103
Name: Neighbourhood, dtype: int64



In [11]:
#Shape of our data frame
toronto_data.shape

(103, 3)

In [12]:
#@hidden_cell
import pgeocode
nomi = pgeocode.Nominatim('ca')
lat_log= nomi.query_postal_code("M2A")

In [13]:
#getting longitude and latitude
latitude1 = []
longitude1 = []
for i in range(len(toronto_data['Postal Code'])):
    coordinates= nomi.query_postal_code(toronto_data['Postal Code'].iloc[i])
    latitude1.append(coordinates.latitude)
    longitude1.append(coordinates.longitude)

toronto_data['Latitude'] = latitude1
toronto_data['Longitude'] = longitude1


In [14]:
print('The dataframe has {} boroughs and {} neighbourhoods.'.format(len(toronto_data['Borough'].unique()),
        toronto_data.shape[0]))
toronto_data.head()


The dataframe has 11 boroughs and 103 neighbourhoods.


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889


In [15]:
missing_data=toronto_data.isnull()
for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("")  

Postal Code
False    103
Name: Postal Code, dtype: int64

Borough
False    103
Name: Borough, dtype: int64

Neighbourhood
False    103
Name: Neighbourhood, dtype: int64

Latitude
False    102
True       1
Name: Latitude, dtype: int64

Longitude
False    102
True       1
Name: Longitude, dtype: int64



In [16]:
# Drop rows with NaN. According to the task, we deop row where "Borough" is NAN
toronto_data.dropna(subset=["Latitude"], axis=0, inplace=True)

# reset index, because we droped two rows
toronto_data.reset_index(drop=True, inplace=True)


### Ready to use Dataframe 

In [17]:
toronto_data

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.3300
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889
...,...,...,...,...,...
97,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.6518,-79.5076
98,M4Y,Downtown Toronto,Church and Wellesley,43.6656,-79.3830
99,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.7804,-79.2505
100,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.6325,-79.4939


#### Use geopy library to get the latitude and longitude values of Toronto

In [18]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


#### Map of Toronto with neighborhoods

In [19]:
# create map of New York using latitude and longitude values
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Borough'], toronto_data['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

#### Define Foursquare Credentials and Version

In [20]:
# @hidden_cell
CLIENT_ID = 'ZURMMC1TY3A2PMPQWRZI5LPW5QYSMFE5VOCBTAZHKM0YVV13' # your Foursquare ID
CLIENT_SECRET = 'SEOVOCS0XWAB2WMOTMFICLVWKHAYY2LB5WQEFT3KP20VADLE' # your Foursquare Secret
VERSION = '20120609' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: ZURMMC1TY3A2PMPQWRZI5LPW5QYSMFE5VOCBTAZHKM0YVV13
CLIENT_SECRET:SEOVOCS0XWAB2WMOTMFICLVWKHAYY2LB5WQEFT3KP20VADLE


###### Explore Toronto

In [21]:
radius = 1000
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=ZURMMC1TY3A2PMPQWRZI5LPW5QYSMFE5VOCBTAZHKM0YVV13&client_secret=SEOVOCS0XWAB2WMOTMFICLVWKHAYY2LB5WQEFT3KP20VADLE&v=20120609&ll=43.6534817,-79.3839347&radius=1000&limit=100'

In [22]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '605117717bc2ec28de63ed67'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Bay Street Corridor',
  'headerFullLocation': 'Bay Street Corridor, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 235,
  'suggestedBounds': {'ne': {'lat': 43.66248170900001,
    'lng': -79.37151886118865},
   'sw': {'lat': 43.64448169099999, 'lng': -79.39635053881135}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 2,
       'items': [{'summary': 'Lots of people like this place',
         'type': 'general',
         'reasonName': 'rawLikesReason'}]},
      'venue': {'id': '5227bb01498e17bf485e6202',
       'name': 'Downtown Toronto',
       'contact': {},
       'location': {'lat': 43.65323167517444,
        'lng': -79.38529600606677,
        'labeledLatLngs': [{'label': 'display',
          'lat'

In [23]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [24]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = pd.json_normalize (venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Downtown Toronto,Neighborhood,43.653232,-79.385296
1,Nathan Phillips Square,Plaza,43.65227,-79.383516
2,Indigo,Bookstore,43.653515,-79.380696
3,Japango,Sushi Restaurant,43.655268,-79.385165
4,Chatime 日出茶太,Bubble Tea Shop,43.655542,-79.384684


In [25]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

100 venues were returned by Foursquare.


### Explore Neighbourhoods in Toronto

In [26]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [27]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighbourhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue, Humber Valley Village
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto, Broadview North (Old East York)
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmo

In [28]:
print(toronto_venues.shape)
toronto_venues.head()

(2163, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.7545,-79.33,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.7545,-79.33,TTC stop - 44 Valley Woods,43.755402,-79.333741,Bus Stop
2,Parkwoods,43.7545,-79.33,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Parkwoods,43.7545,-79.33,Brookbanks Pool,43.751389,-79.332184,Pool
4,Victoria Village,43.7276,-79.3148,Victoria Village Arena,43.723481,-79.315635,Hockey Arena


In [29]:
toronto_venues.groupby('Neighbourhood').count()


Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,5,5,5,5,5,5
"Alderwood, Long Branch",6,6,6,6,6,6
"Bathurst Manor, Wilson Heights, Downsview North",6,6,6,6,6,6
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",24,24,24,24,24,24
...,...,...,...,...,...,...
"Willowdale, Willowdale West",6,6,6,6,6,6
Woburn,1,1,1,1,1,1
Woodbine Heights,5,5,5,5,5,5
York Mills West,4,4,4,4,4,4


In [30]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 256 uniques categories.


In [31]:
neighbors = np.asarray(toronto_venues['Neighbourhood'])
neighbors

array(['Parkwoods', 'Parkwoods', 'Parkwoods', ...,
       'Mimico NW, The Queensway West, South of Bloor, Kingsway Park South West, Royal York South West',
       'Mimico NW, The Queensway West, South of Bloor, Kingsway Park South West, Royal York South West',
       'Mimico NW, The Queensway West, South of Bloor, Kingsway Park South West, Royal York South West'],
      dtype=object)

### Analyze Each Neighborhood

In [32]:
# one hot encoding


toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()


Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
print(toronto_onehot.shape)

toronto_onehot.head()

(2163, 257)


Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category


In [34]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
print(toronto_grouped.shape)
toronto_grouped

(95, 257)


Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,"Willowdale, Willowdale West",0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91,Woburn,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92,Woodbine Heights,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93,York Mills West,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### Each neighborhood along with the top 5 most common venues

In [35]:
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                       venue  freq
0  Latin American Restaurant   0.2
1            Badminton Court   0.2
2                  Newsagent   0.2
3               Skating Rink   0.2
4             Breakfast Spot   0.2


----Alderwood, Long Branch----
            venue  freq
0     Pizza Place  0.17
1             Pub  0.17
2     Coffee Shop  0.17
3             Gym  0.17
4  Sandwich Place  0.17


----Bathurst Manor, Wilson Heights, Downsview North----
                      venue  freq
0               Pizza Place  0.17
1             Deli / Bodega  0.17
2       Fried Chicken Joint  0.17
3  Mediterranean Restaurant  0.17
4               Coffee Shop  0.17


----Bayview Village----
                             venue  freq
0                      Gas Station  0.25
1                            Trail  0.25
2                      Flower Shop  0.25
3                             Park  0.25
4  Molecular Gastronomy Restaurant  0.00


----Bedford Park, Lawrence Manor East----
                v

##### Top 10 venues for each neighborhood

In [36]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [37]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Breakfast Spot,Latin American Restaurant,Badminton Court,Skating Rink,Newsagent,Yoga Studio,Falafel Restaurant,Escape Room,Ethiopian Restaurant,Event Space
1,"Alderwood, Long Branch",Pizza Place,Pub,Sandwich Place,Coffee Shop,Gym,Convenience Store,Curling Ice,Dance Studio,Fish Market,Fish & Chips Shop
2,"Bathurst Manor, Wilson Heights, Downsview North",Pizza Place,Deli / Bodega,Middle Eastern Restaurant,Fried Chicken Joint,Coffee Shop,Mediterranean Restaurant,Dance Studio,Escape Room,Flea Market,Fish Market
3,Bayview Village,Flower Shop,Trail,Park,Gas Station,Falafel Restaurant,Electronics Store,Escape Room,Ethiopian Restaurant,Event Space,Farmers Market
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Sandwich Place,Italian Restaurant,Pharmacy,Juice Bar,Comfort Food Restaurant,Pizza Place,Pub,Restaurant,Café


 ### Cluster Neighborhoods

In [38]:
# set number of clusters
kclusters = 2

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1])

In [39]:
neighborhoods_venues_sorted['Neighbourhood']

0                                           Agincourt
1                              Alderwood, Long Branch
2     Bathurst Manor, Wilson Heights, Downsview North
3                                     Bayview Village
4                   Bedford Park, Lawrence Manor East
                           ...                       
90                        Willowdale, Willowdale West
91                                             Woburn
92                                   Woodbine Heights
93                                    York Mills West
94                           York Mills, Silver Hills
Name: Neighbourhood, Length: 95, dtype: object

In [40]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head() 

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.7545,-79.33,0.0,Food & Drink Shop,Park,Pool,Bus Stop,Farmers Market,Electronics Store,Escape Room,Ethiopian Restaurant,Event Space,Falafel Restaurant
1,M4A,North York,Victoria Village,43.7276,-79.3148,1.0,French Restaurant,Pizza Place,Hockey Arena,Park,Intersection,Coffee Shop,Portuguese Restaurant,Event Space,Eastern European Restaurant,Electronics Store
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626,1.0,Coffee Shop,Breakfast Spot,Restaurant,Bakery,Distribution Center,Pub,Electronics Store,Event Space,Food Truck,Spa
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504,1.0,Clothing Store,Coffee Shop,Restaurant,Cosmetics Shop,Women's Store,Juice Bar,Food Court,Sandwich Place,Men's Store,Furniture / Home Store
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889,1.0,Sushi Restaurant,Italian Restaurant,Bubble Tea Shop,Distribution Center,Burrito Place,Café,Mexican Restaurant,Ethiopian Restaurant,Escape Room,Beer Bar


In [47]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=[cluster],
        fill=True,
        fill_color=[cluster],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine Clusters

Cluster 1

In [42]:
 toronto_merged.loc[ toronto_merged['Cluster Labels'] == 0,  toronto_merged.columns[[1] + list(range(5,  toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,0.0,Food & Drink Shop,Park,Pool,Bus Stop,Farmers Market,Electronics Store,Escape Room,Ethiopian Restaurant,Event Space,Falafel Restaurant
7,North York,0.0,River,Gym,Park,Pool,Trail,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Escape Room,Ethiopian Restaurant
13,North York,0.0,River,Gym,Park,Pool,Trail,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Escape Room,Ethiopian Restaurant
16,York,0.0,Deli / Bodega,Grocery Store,Hockey Arena,Playground,Park,Field,Trail,Eastern European Restaurant,Electronics Store,Escape Room
21,York,0.0,Park,Bakery,Gym,Women's Store,Sporting Goods Shop,Mexican Restaurant,Beer Store,Escape Room,Ethiopian Restaurant,Event Space
27,North York,0.0,Park,Residential Building (Apartment / Condo),Yoga Studio,Falafel Restaurant,Eastern European Restaurant,Electronics Store,Escape Room,Ethiopian Restaurant,Event Space,Farmers Market
32,Scarborough,0.0,Spa,Grocery Store,Park,Farmers Market,Electronics Store,Escape Room,Ethiopian Restaurant,Event Space,Falafel Restaurant,Fast Food Restaurant
35,East York,0.0,Intersection,Park,Convenience Store,Greek Restaurant,Yoga Studio,Falafel Restaurant,Electronics Store,Escape Room,Ethiopian Restaurant,Event Space
36,Downtown Toronto,0.0,Music Venue,Harbor / Marina,Park,Café,Electronics Store,Escape Room,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farmers Market
39,North York,0.0,Flower Shop,Trail,Park,Gas Station,Falafel Restaurant,Electronics Store,Escape Room,Ethiopian Restaurant,Event Space,Farmers Market


Cluster 2

In [43]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,North York,1.0,French Restaurant,Pizza Place,Hockey Arena,Park,Intersection,Coffee Shop,Portuguese Restaurant,Event Space,Eastern European Restaurant,Electronics Store
2,Downtown Toronto,1.0,Coffee Shop,Breakfast Spot,Restaurant,Bakery,Distribution Center,Pub,Electronics Store,Event Space,Food Truck,Spa
3,North York,1.0,Clothing Store,Coffee Shop,Restaurant,Cosmetics Shop,Women's Store,Juice Bar,Food Court,Sandwich Place,Men's Store,Furniture / Home Store
4,Downtown Toronto,1.0,Sushi Restaurant,Italian Restaurant,Bubble Tea Shop,Distribution Center,Burrito Place,Café,Mexican Restaurant,Ethiopian Restaurant,Escape Room,Beer Bar
5,Etobicoke,1.0,Pharmacy,Park,Grocery Store,Skating Rink,Bank,Event Space,Eastern European Restaurant,Electronics Store,Escape Room,Ethiopian Restaurant
...,...,...,...,...,...,...,...,...,...,...,...,...
96,Downtown Toronto,1.0,Coffee Shop,Café,Hotel,Restaurant,Gym,Asian Restaurant,Deli / Bodega,Salad Place,Japanese Restaurant,Steakhouse
97,Etobicoke,1.0,Bank,Bakery,Sushi Restaurant,Breakfast Spot,Bar,Boutique,Coffee Shop,Pub,Restaurant,Liquor Store
98,Downtown Toronto,1.0,Japanese Restaurant,Sushi Restaurant,Coffee Shop,Gay Bar,Restaurant,Yoga Studio,Hotel,Fast Food Restaurant,Mediterranean Restaurant,Men's Store
99,East Toronto,1.0,Italian Restaurant,Restaurant,Coffee Shop,Yoga Studio,Breakfast Spot,Bookstore,Sushi Restaurant,Bank,Japanese Restaurant,Martial Arts School
