## Installing needed packages

In [120]:
# !conda install -c conda-forge BeautifulSoup4 --yes 
# !conda install -c conda-forge requests --yes 
# !conda install -c conda-forge lxml --yes 
# !conda install -c conda-forge html5lib --yes 
# !conda install -c conda-forge geopy --yes

### Importing needed libraries

In [121]:
from bs4 import BeautifulSoup # for webscarping
import requests #to call the link
import numpy as np
import pandas as pd

#for maps
import folium

#for json files
import json
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#import for the lat and lang as the geocoder is not working
from geopy.geocoders import Nominatim
print('All imported')

All imported


## Part 1 begins here

### Using Beautiful Soup to parse the link after reading it using requests

In [5]:
#!conda install -c conda-forge lxml --yes 
source =  requests.get ('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup  = BeautifulSoup(source,'lxml')
#print (soup.prettify())

#### Converting all needed tables from the link in the dataframe

In [6]:
for table in soup.find_all('table',class_='wikitable sortable'):
    s = table
#print(type(s))
st =  str(s)
dfs =  pd.read_html(st)[0]
print(dfs)

    Postcode           Borough           Neighborhood
0        M1A      Not assigned           Not assigned
1        M2A      Not assigned           Not assigned
2        M3A        North York              Parkwoods
3        M4A        North York       Victoria Village
4        M5A  Downtown Toronto           Harbourfront
..       ...               ...                    ...
282      M8Z         Etobicoke              Mimico NW
283      M8Z         Etobicoke     The Queensway West
284      M8Z         Etobicoke  Royal York South West
285      M8Z         Etobicoke         South of Bloor
286      M9Z      Not assigned           Not assigned

[287 rows x 3 columns]


## Optional - directly using ipython to display 'table'

In [7]:
#!conda install -c conda-forge ipython --yes 
#from IPython.display import display_html
#display_html(st, raw=True)

### To directly read a table from the link - will be using this for the problem

In [8]:
## using df as the final dataframe for part 1 of the question

df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
print(df)

    Postcode           Borough           Neighborhood
0        M1A      Not assigned           Not assigned
1        M2A      Not assigned           Not assigned
2        M3A        North York              Parkwoods
3        M4A        North York       Victoria Village
4        M5A  Downtown Toronto           Harbourfront
..       ...               ...                    ...
282      M8Z         Etobicoke              Mimico NW
283      M8Z         Etobicoke     The Queensway West
284      M8Z         Etobicoke  Royal York South West
285      M8Z         Etobicoke         South of Bloor
286      M9Z      Not assigned           Not assigned

[287 rows x 3 columns]


### Renaming the column Postcode to PostalCode to match with the requirement

In [9]:
df = df.rename(columns= {"Postcode": "PostalCode"})
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Filtering out the 'Not assigned' Borough values

In [10]:
df_filtered = df[df['Borough']!='Not assigned']
df_filtered

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
...,...,...,...
281,M8Z,Etobicoke,Kingsway Park South West
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West


#### First copying over the Borough to Neighborhood for Neighborhood values as 'Not assigned'. This will help if we have to aggregate neighborhood values in one row





In [11]:
df_copy = df_filtered.copy()
df_copy.loc[df_copy['Neighborhood'] == 'Not assigned','Neighborhood'] = df_copy['Borough']
#df_copy

#### To check if the column value was copied - first check the uncopied dataframe and then the updated i.e. df_copy

In [12]:
print('Rows where Neighborhood and borough are same is:\n ',df_filtered.loc[df_filtered['Neighborhood']== df_filtered['Borough']]) # returns empty as there's no same value
print('Rows where Neighborhood is Not assigned:\n',df_filtered.loc[df_filtered['Neighborhood']== 'Not assigned']) # returns one row where Neighborhood is Not assigned
print('Rows where Neighborhood is copied from Borough:\n',df_copy.loc[df_copy['Neighborhood']== df_copy['Borough']]) # returns the row where neighborhood value was updated

Rows where Neighborhood and borough are same is:
  Empty DataFrame
Columns: [PostalCode, Borough, Neighborhood]
Index: []
Rows where Neighborhood is Not assigned:
   PostalCode       Borough  Neighborhood
7        M7A  Queen's Park  Not assigned
Rows where Neighborhood is copied from Borough:
   PostalCode       Borough  Neighborhood
7        M7A  Queen's Park  Queen's Park


#### Checking the rows where the Neighborhood values would need to be aggregated

In [13]:
df_cnt = df_copy.groupby(['PostalCode','Borough']).count()
print (df_cnt)
print('Final data should have {} rows'.format(df_cnt.shape[0]))

                        Neighborhood
PostalCode Borough                  
M1B        Scarborough             2
M1C        Scarborough             3
M1E        Scarborough             3
M1G        Scarborough             1
M1H        Scarborough             1
...                              ...
M9N        York                    1
M9P        Etobicoke               1
M9R        Etobicoke               4
M9V        Etobicoke               8
M9W        Etobicoke               1

[103 rows x 1 columns]
Final data should have 103 rows


#### Creating the final dataset as per requirements

In [14]:
df_final_part1 =  df_copy.groupby(['PostalCode','Borough']).agg(Neighborhood = ('Neighborhood',', '.join)).reset_index()
print(df_final_part1)

    PostalCode      Borough                                       Neighborhood
0          M1B  Scarborough                                     Rouge, Malvern
1          M1C  Scarborough             Highland Creek, Rouge Hill, Port Union
2          M1E  Scarborough                  Guildwood, Morningside, West Hill
3          M1G  Scarborough                                             Woburn
4          M1H  Scarborough                                          Cedarbrae
..         ...          ...                                                ...
98         M9N         York                                             Weston
99         M9P    Etobicoke                                          Westmount
100        M9R    Etobicoke  Kingsview Village, Martin Grove Gardens, Richv...
101        M9V    Etobicoke  Albion Gardens, Beaumond Heights, Humbergate, ...
102        M9W    Etobicoke                                          Northwest

[103 rows x 3 columns]


In [15]:
print('Final data has {} rows'.format(df_final_part1.shape[0]))

Final data has 103 rows


## Part 2 begins here

In [16]:
# !conda install -c conda-forge geocoder --yes
# import geocoder as gc
# print("Geocoder installed and imported.")

##### Trying to ude geocoder but returns "Denied Request error"

In [17]:
#  lat_lng = None
# while(lat_lng is None) :
#         g = gc.google('{}, Toronto, Ontario'.format(df_copy['PostalCode'].to_string()))
#         print (g)        #print('{}, Canada'.format(df_copy['PostalCode'].to_string()))
#         lat_lng = g.latlng
# print(df_copy['Borough'].to_string())

#### Using the same geo package as in the other exercises but this also fails at some addresses

In [18]:
# for i in range(1,df_cnt.shape[0]+1): # for unique count
#     #print (i)
#     address = df_copy['PostalCode'].iloc[i]+ ', Downtown Toronto, Ontario'
#     print(address)
#     location = geolocator.geocode(address)
#     print(location)
#     latitude = location.latitude
#     longitude = location.longitude
# print('The geograpical coordinate of CA are {}, {}.'.format(latitude, longitude))

#### Reading from the CSV file provided for the exercise :(

In [19]:
df_csv = pd.read_csv('Geospatial_Coordinates.csv')
df_csv

##Renaming the Postal Code to PostalCode

df_csv = df_csv.rename(columns= {"Postal Code": "PostalCode"})
df_csv.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Above dataframe will need to be joined with df_final_part1 to add lat and lng values

##### Using merge to add Latitude and Longitude

In [20]:
df_part2 = pd.merge(df_final_part1, df_csv, left_on ='PostalCode', right_on = 'PostalCode')
df_part2

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437


### Part 3 begins here

##### Limiting the data to Boroughs having "Downtown Toronto" in the name

In [21]:
df_Toronto = df_part2[df_part2['Borough'].str.contains('Downtown Toronto')].reset_index(drop=True)
df_Toronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
4,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
5,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568
9,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752


### Visualizing DowntownToronto in the map

#### First checking co-ordinates of Toronto

In [24]:
address = 'Downtown Toronto, ON'
geolocator = Nominatim(user_agent="ca_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.6563221, -79.3809161.


#### Checking Downtown Toronto on the map marking the Neighborhoods

In [25]:
#map creation
map_DT = folium.Map(location=[latitude,longitude],zoom_start=11)

#adding markers on above map

for lat, lng, label in zip(df_Toronto['Latitude'],df_Toronto['Longitude'],df_Toronto['Neighborhood']) :
    label = folium.Popup(label, parse_html= True)
    folium.CircleMarker( 
        [lat,lng], 
        radius=5, 
        popup= label,
        color = 'green',
        fill = True,
        fill_color = '#33FF4F',
        fill_opacity = 0.7,
        parse_html =False).add_to(map_DT)
map_DT

### Exploring the neighborhoods around Downtown Toronto

###### Using existing Foursquare creds

In [26]:
CLIENT_ID = 'ANUSTZNTX5MOJM10FOMOTATTFTLZ2C4WVMAGBCTTRCWQZEOO' # your Foursquare ID
CLIENT_SECRET = 'V4PHMEO5TGQ22DL1KYFB4TFDKX4N4SQ4AS5XOXBSLM0I4Z1W' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: ANUSTZNTX5MOJM10FOMOTATTFTLZ2C4WVMAGBCTTRCWQZEOO
CLIENT_SECRET:V4PHMEO5TGQ22DL1KYFB4TFDKX4N4SQ4AS5XOXBSLM0I4Z1W


#### Taking any one of the neighborhoods to explore first

In [27]:
df_Toronto.loc[3,'Neighborhood']

'Harbourfront'

###### Let's shorten Harbourfront as HF for upcoming analysis

In [28]:
HF_name = df_Toronto.loc[3,'Neighborhood']
HF_lat  =  df_Toronto.loc[3,'Latitude']
HF_lng  = df_Toronto.loc[3,'Longitude']

print('Location credentials for HF : Name {}, Lat {}, Long{}'.format(HF_name, HF_lat, HF_lng))

Location credentials for HF : Name Harbourfront, Lat 43.6542599, Long-79.3606359


##### Getting url for exploration using Foursquare API and limiting the results to 30

In [86]:
radius = 500
LIMIT = 50
url =  'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, HF_lat, HF_lng, VERSION, radius, LIMIT)
## Getting the response from the API

results = requests.get(url).json()#['response']['venues']
#results

#### Leveraging the get category type function

In [68]:
def get_category_type(i):
    try :
        cateogories =  i['venue.categories']
    except :
        categories =  i['categories']
        
        if len(categories) == 0:
            return None
        else:
            return categories[0]['name']

#### Cleaning the json and loading in dataframe

In [87]:
#only taking the required values
venues = results['response']['venues']

#flatten the json

df_HF = json_normalize(venues)
filtered_columns = ['name','categories','location.lat','location.lng']
filtered_columns
                    
HF_venues = df_HF.loc[:, filtered_columns]
HF_venues['categories'] = HF_venues.apply(get_category_type,axis=1)
                  
HF_venues.columns= [col.split(".")[-1] for col in HF_venues.columns]    
    
HF_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Sackville Playground,Park,43.654656,-79.359871
1,TTC Streetcar #503 Kingston Rd,Moving Target,43.661689,-79.3374
2,Cam's Auto Service,Automotive Shop,43.654195,-79.360545
3,Tandem Coffee,Coffee Shop,43.653559,-79.361809
4,TTC Streetcar #504 King St,Moving Target,43.65677,-79.45302


##### Rows returned by Foursquare

In [70]:
HF_venues.shape[0]
LIMIT == HF_venues.shape[0]

True

##### Limit that we set matches with the shape

#### Now to replicate the process for all other neighborhoods of Downtown Toronto - we will create a function

In [88]:
def getNearbyVenues(names, lats, lngs, radius = 500):
    
    venues_list = []
    for name, lat, lng in zip(names, lats, lngs):
        print(name) # to get the names of the venues after callign the function
        
        #using the similar method as for Harbourfront
        
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        #print(url)
        results = requests.get(url).json()['response']['groups'][0]['items']
        #print (results)
        
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
        
        nearby_ven = pd.DataFrame([item for venue_list in venues_list for item in venue_list ])
        nearby_ven.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']

        
    return(nearby_ven)

In [89]:
downtown_venues = getNearbyVenues(names=df_Toronto['Neighborhood'],
                                 lats= df_Toronto['Latitude'],
                                 lngs =df_Toronto['Longitude']
                                 )


Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Queen's Park


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
1,Rosedale,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
2,Rosedale,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
3,Rosedale,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail
4,"Cabbagetown, St. James Town",43.667967,-79.367675,Cranberries,43.667843,-79.369407,Diner


In [90]:
downtown_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
1,Rosedale,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
2,Rosedale,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
3,Rosedale,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail
4,"Cabbagetown, St. James Town",43.667967,-79.367675,Cranberries,43.667843,-79.369407,Diner


In [91]:
downtown_venues.shape

(760, 7)

##### Checking number of venues for each neighborhood

In [92]:
downtown_venues.groupby('Neighborhood').count().reset_index()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Adelaide, King, Richmond",50,50,50,50,50,50
1,Berczy Park,50,50,50,50,50,50
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",16,16,16,16,16,16
3,"Cabbagetown, St. James Town",43,43,43,43,43,43
4,Central Bay Street,50,50,50,50,50,50
5,"Chinatown, Grange Park, Kensington Market",50,50,50,50,50,50
6,Christie,17,17,17,17,17,17
7,Church and Wellesley,50,50,50,50,50,50
8,"Commerce Court, Victoria Hotel",50,50,50,50,50,50
9,"Design Exchange, Toronto Dominion Centre",50,50,50,50,50,50


In [93]:
print('There are {} uniques categories.'.format(len(downtown_venues['Venue Category'].unique())))

There are 176 uniques categories.


### Analyzing each neighborhood before clustering

In [94]:
# doing onehot encoding

downtown_onehot =  pd.get_dummies(downtown_venues['Venue Category'])

# rename of column required as the Venue category value is also 'Neighborhood'
downtown_onehot = downtown_onehot.rename(columns={"Neighborhood":"Neighboring"})

downtown_onehot = pd.concat([downtown_venues['Neighborhood'],downtown_onehot],axis=1)

#### One hot encoding for frequency calculation

In [97]:
downtown_grouped = downtown_onehot.groupby('Neighborhood').mean().reset_index()
d = downtown_grouped.iloc[0,:]
#downtown_grouped
#print(d.iloc[1:].sort_values(ascending=False))
#d.iloc[1:].sort_values(ascending=False).index.values[0:3]

###### Checking sample data

In [107]:
downtown_grouped.head()

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theater,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0625,0.0625,0.0625,0.125,0.1875,0.125,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.0,0.02


#### Checking top venues in the neighborhoods

In [119]:
top_venue_num = 10

for hood in downtown_grouped['Neighborhood']:
   
    #selecting each row and transposing to get frequency of venues for each neighborhood
    temp = downtown_grouped[downtown_grouped['Neighborhood']==hood].T.reset_index()
    
    #selecting from 2nd row as the first row has neighborhood values
    temp = temp.iloc[1:]
    
    #defining columns for the temp table
    temp.columns=['Venue','Freq']
    print(hood)
    #printing top 2 venues for each neighborhood
    print(temp.sort_values('Freq',ascending = False).head(top_venue_num).reset_index(drop=True))
    print('\n')

Adelaide, King, Richmond
                  Venue  Freq
0            Steakhouse  0.06
1           Coffee Shop  0.06
2      Asian Restaurant  0.06
3                  Café  0.06
4           Pizza Place  0.04
5      Sushi Restaurant  0.04
6                 Hotel  0.04
7                   Bar  0.04
8             Gastropub  0.04
9  Colombian Restaurant  0.02


Berczy Park
                Venue  Freq
0         Coffee Shop   0.1
1        Cocktail Bar  0.06
2              Bakery  0.04
3  Seafood Restaurant  0.04
4         Cheese Shop  0.04
5                Café  0.04
6          Steakhouse  0.04
7            Beer Bar  0.04
8      Farmers Market  0.04
9        Gourmet Shop  0.02


CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
                Venue    Freq
0     Airport Service  0.1875
1      Airport Lounge   0.125
2    Airport Terminal   0.125
3             Airport  0.0625
4        Airport Gate  0.0625
5       Boat or Ferry  0.0625
6   

#### Creating a function to get top venues for each neighborhood and storing them in pandas dataframe

In [101]:
def get_top_venues(row, num_top):
    row_categ_top = row.iloc[1:].sort_values(ascending=False)
    return row_categ_top.index.values[0:num_top]

#downtown_grouped.iloc[1, 1:]
#get_top_venues(downtown_grouped.iloc[1,1:], 5)

##### Creating the dataframe now for all neighborhoods

In [114]:
num_top = 10  #to get top venues
columns = ['Neighborhood']
indicators = ['st', 'nd', 'rd'] #for column names as they will append to 1,2,3 to make them 1st, 2nd, 3rd
for i in range(1, num_top):
    try:
        columns.append('{}{} Most Common Venue'.format(i, indicators[i-1]))
    except:     
        columns.append('{}th Most common venue'.format(i))

neighborhood_venues_top = pd.DataFrame(columns=columns) #adding the columns to the dataframe
neighborhood_venues_top['Neighborhood'] = downtown_grouped['Neighborhood'] #copying neighborhood data to the new dataframe

for j in range(0, downtown_grouped.shape[0]):
    neighborhood_venues_top.iloc[j, 1:] = get_top_venues(downtown_grouped.iloc[j, :], num_top-1)
    
neighborhood_venues_top

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most common venue,5th Most common venue,6th Most common venue,7th Most common venue,8th Most common venue,9th Most common venue
0,"Adelaide, King, Richmond",Coffee Shop,Steakhouse,Asian Restaurant,Café,Gastropub,Bar,Sushi Restaurant,Pizza Place,Hotel
1,Berczy Park,Coffee Shop,Cocktail Bar,Beer Bar,Bakery,Seafood Restaurant,Farmers Market,Steakhouse,Cheese Shop,Café
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Service,Airport Lounge,Airport Terminal,Airport,Boat or Ferry,Coffee Shop,Plane,Rental Car Location,Sculpture Garden
3,"Cabbagetown, St. James Town",Coffee Shop,Restaurant,Pub,Café,Italian Restaurant,Pizza Place,Bakery,Pharmacy,Outdoor Sculpture
4,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Ice Cream Shop,Bubble Tea Shop,Chinese Restaurant,Seafood Restaurant,Sandwich Place,Ramen Restaurant
5,"Chinatown, Grange Park, Kensington Market",Vegetarian / Vegan Restaurant,Café,Vietnamese Restaurant,Mexican Restaurant,Chinese Restaurant,Bakery,Caribbean Restaurant,Dumpling Restaurant,Comfort Food Restaurant
6,Christie,Grocery Store,Café,Park,Athletics & Sports,Nightclub,Restaurant,Baby Store,Italian Restaurant,Candy Store
7,Church and Wellesley,Gastropub,Japanese Restaurant,Restaurant,Burger Joint,Coffee Shop,Men's Store,Gay Bar,Gym,Hobby Shop
8,"Commerce Court, Victoria Hotel",Coffee Shop,Café,Hotel,Restaurant,American Restaurant,Japanese Restaurant,Beer Bar,Seafood Restaurant,Deli / Bodega
9,"Design Exchange, Toronto Dominion Centre",Coffee Shop,Café,Deli / Bodega,Restaurant,Steakhouse,Gastropub,Japanese Restaurant,Sandwich Place,Bakery


## Clustering Neighborhoods

In [115]:
kclusters = 5 # for number of clusters

downtown_clustering =  downtown_grouped.drop('Neighborhood',axis=1) #dropping the column Neighborhood as it won't be needed for clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(downtown_clustering)
kmeans.labels_[0:18] #checking the cluster groups for each neighborhood

array([2, 0, 3, 0, 2, 0, 4, 0, 2, 2, 2, 0, 2, 0, 1, 2, 0, 0], dtype=int32)

#### Merging two datasets to get additional fields along with clusters for the neighborhood

In [116]:
neighborhood_venues_top.insert(0,'Cluster Labels',kmeans.labels_) #inserting the column Cluster Labels

merged_downtown_data = pd.merge(df_Toronto,neighborhood_venues_top,how='inner',on='Neighborhood') #joining on Neighborhood column values

#merged_downtown_data

#### Seeing the clustering on the map

In [117]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(merged_downtown_data['Latitude'], merged_downtown_data['Longitude'], merged_downtown_data['Neighborhood'], merged_downtown_data['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### We can check the data for each cluster using below

In [118]:
downtown_merged =  merged_downtown_data

downtown_merged.loc[downtown_merged['Cluster Labels']==0, downtown_merged.columns[[2] + list(range(5, downtown_merged.shape[1]))]].sort_values('Cluster Labels').reset_index()

Unnamed: 0,index,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most common venue,5th Most common venue,6th Most common venue,7th Most common venue,8th Most common venue,9th Most common venue
0,1,"Cabbagetown, St. James Town",0,Coffee Shop,Restaurant,Pub,Café,Italian Restaurant,Pizza Place,Bakery,Pharmacy,Outdoor Sculpture
1,2,Church and Wellesley,0,Gastropub,Japanese Restaurant,Restaurant,Burger Joint,Coffee Shop,Men's Store,Gay Bar,Gym,Hobby Shop
2,5,St. James Town,0,Coffee Shop,Café,Restaurant,Japanese Restaurant,Gastropub,Cocktail Bar,Park,Beer Bar,Italian Restaurant
3,6,Berczy Park,0,Coffee Shop,Cocktail Bar,Beer Bar,Bakery,Seafood Restaurant,Farmers Market,Steakhouse,Cheese Shop,Café
4,9,"Harbourfront East, Toronto Islands, Union Station",0,Coffee Shop,Aquarium,Plaza,Brewery,Hotel,Park,Café,Sandwich Place,Salad Place
5,12,"Harbord, University of Toronto",0,Café,Restaurant,Italian Restaurant,Japanese Restaurant,Bar,Bookstore,Bakery,Pub,Poutine Place
6,13,"Chinatown, Grange Park, Kensington Market",0,Vegetarian / Vegan Restaurant,Café,Vietnamese Restaurant,Mexican Restaurant,Chinese Restaurant,Bakery,Caribbean Restaurant,Dumpling Restaurant,Comfort Food Restaurant
7,15,Stn A PO Boxes 25 The Esplanade,0,Restaurant,Cocktail Bar,Café,Cheese Shop,Bakery,Coffee Shop,Beer Bar,Japanese Restaurant,Farmers Market
