In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
import os # access to environment variables (API credentials)

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

Libraries imported.


In [2]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
url

'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Scraping the data from the Wikipedia page of Canadian postal codes and arranging it in a pandas DataFrame

In [3]:
page = requests.get(url)
page

<Response [200]>

In [4]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [5]:
import lxml.html as lh

In [6]:
#Store the contents of the website under doc
doc = lh.fromstring(page.content)

In [7]:
#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

In [8]:
#Check the length of the first 12 rows
[len(T) for T in tr_elements[:12]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [9]:
# Select every table row of the parsed page.
tr_elements = doc.xpath('//tr')

# Pair the text of each cell of the first row (the header row) with an empty
# list; the list will collect the values of that column.
col = [(header_cell.text_content(), []) for header_cell in tr_elements[0]]

In [10]:
#Since out first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 10, the //tr data is not from our table 
    if len(T)!=3:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

In [11]:
[len(C) for (title,C) in col]

[288, 288, 288]

In [12]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

In [13]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood\n
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [14]:
df.rename(columns={'Neighbourhood\n':'Neighbourhood'}, inplace=True)

In [15]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [16]:
# Remove the trailing newline carried by the scraped cells. rstrip only removes
# newline characters, so re-running this cell cannot cut off real characters.
t=df['Neighbourhood'].str.rstrip('\n').to_frame()

In [17]:
df['Neighbourhood']=t.Neighbourhood

In [18]:
# df = df.set_index("Borough")
# df=df.drop('Not assigned',axis=0)

In [19]:
index_names=df[df['Borough']=="Not assigned"].index

In [20]:
df.drop(index_names, inplace=True)

In [21]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [22]:
# Collapse rows that share a postcode and borough into a single row whose
# neighbourhoods are joined with commas. The keys are passed as a list: a tuple
# is deprecated as a list of keys and newer pandas reads it as one single key.
df=df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(','.join).to_frame().reset_index()

  """Entry point for launching an IPython kernel.


In [23]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [24]:
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the in-place call acts on an intermediate object and is not
# guaranteed to write through to df.
df['Neighbourhood']=df['Neighbourhood'].replace('Not assigned',"Queen's Park")

# Final Output of the DataFrame Required for exploring and clustering of the Neighbourhoods

In [25]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [26]:
df.shape

(103, 3)

## Loading the data for geographical coordinates of the neighbourhoods

In [27]:
site='http://cocl.us/Geospatial_data'

In [28]:
l=pd.read_csv(site)

In [29]:
l.rename(columns={'Postal Code':'Postcode'}, inplace=True)
l.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [30]:
df1=pd.merge(df,l, on='Postcode', how='outer')

In [31]:
df1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [32]:
Etobicoke_data = df1[df1['Borough'] == 'Etobicoke'].reset_index(drop=True)
Etobicoke_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M8V,Etobicoke,"Humber Bay Shores,Mimico South,New Toronto",43.605647,-79.501321
1,M8W,Etobicoke,"Alderwood,Long Branch",43.602414,-79.543484
2,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North",43.653654,-79.506944
3,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout...",43.636258,-79.498509
4,M8Z,Etobicoke,"Kingsway Park South West,Mimico NW,The Queensw...",43.628841,-79.520999


# Exploring the Neighbourhood of "Central Toronto"

In [33]:
CentralToronto_data = df1[df1['Borough'] == 'Central Toronto'].reset_index(drop=True)
CentralToronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
2,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
3,M4S,Central Toronto,Davisville,43.704324,-79.38879
4,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316


In [34]:
address = 'Central Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when no match is found; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Central Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Central Toronto are 43.653963, -79.387207.


# Map of central Toronto

In [35]:
# create a map centred on the geocoded 'Central Toronto' coordinates
map_Central_Toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row of CentralToronto_data; the popup shows the
# row's Neighbourhood value
for lat, lng, label in zip(CentralToronto_data['Latitude'], CentralToronto_data['Longitude'], CentralToronto_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        # NOTE(review): parse_html is also passed to Popup above; confirm that
        # CircleMarker actually uses this keyword.
        parse_html=False).add_to(map_Central_Toronto)  
    
map_Central_Toronto

In [64]:
# Read the Foursquare credentials from the environment so they are never stored
# in the notebook; an unset variable falls back to an empty string.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Report only whether each credential is present: cell output is saved with the
# notebook, so the values themselves must not be printed.
print('Your credentails:')
print('CLIENT_ID: ' + ('set' if CLIENT_ID else 'not set'))
print('CLIENT_SECRET:' + ('set' if CLIENT_SECRET else 'not set'))

Your credentails:
CLIENT_ID: 
CLIENT_SECRET:


Let's explore the 'Lawrence Park' neighbourhood in Central Toronto

In [37]:
CentralToronto_data.loc[0, 'Neighbourhood']

'Lawrence Park'

Let's get the neighbourhood's latitude and longitude

In [38]:
neighborhood_latitude = CentralToronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = CentralToronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = CentralToronto_data.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Lawrence Park are 43.7280205, -79.3887901.


## Now let's get the top 50 venues near "Lawrence Park" within a radius of 500 m.

In [39]:
# type your answer here
LIMIT = 50 # limit of number of venues returned by Foursquare API


radius = 500 # define radius


url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url 

'https://api.foursquare.com/v2/venues/explore?&client_id=<CLIENT_ID>&client_secret=<CLIENT_SECRET>&v=20180605&ll=43.7280205,-79.3887901&radius=500&limit=50'

In [40]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5d803641a30619002ce526db'},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 3,
  'suggestedBounds': {'ne': {'lat': 43.7325205045, 'lng': -79.3825744605273},
   'sw': {'lat': 43.7235204955, 'lng': -79.3950057394727}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '50e6da19e4b0d8a78a0e9794',
       'name': 'Lawrence Park Ravine',
       'location': {'address': '3055 Yonge Street',
        'crossStreet': 'Lawrence Avenue East',
        'lat': 43.72696303913755,
        'lng': -79.39438246708775,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.72696303913755,
          'lng': -79.39438246708775}],
        'distance': 465,
        'cc': 'CA',
  

### Now let's load the data into a pandas DataFrame so that we can understand it better.

In [41]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue record.

    Parameters
    ----------
    row : mapping-like (e.g. a dict or a pandas Series)
        Must provide the category list under the key 'categories' or, if that
        key is absent, under 'venue.categories'.

    Returns
    -------
    The 'name' of the first entry of the category list, or None when the list
    is empty or the stored value is not a list/tuple (for example NaN).

    Raises
    ------
    KeyError
        If neither key is present in `row`.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; other errors propagate.
        categories_list = row['venue.categories']

    # A missing value or an empty list both mean "no category".
    if not isinstance(categories_list, (list, tuple)) or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [42]:
# venue records are the items of the first group in the API response
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns: keep only name, category list and coordinates
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row: replace the category list with the name of
# its first entry (see get_category_type)
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the part after the last '.' of each column name
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Lawrence Park Ravine,Park,43.726963,-79.394382
1,Zodiac Swim School,Swim School,43.728532,-79.38286
2,TTC Bus #162 - Lawrence-Donway,Bus Line,43.728026,-79.382805


In [43]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

3 venues were returned by Foursquare.


### Now we shall explore all the neighbourhoods of Central Toronto

In [44]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare explore endpoint once per location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Iterated together with zip(); each triple describes one location.
    radius : int, default 500
        Sent as the API's `radius` parameter (the notebook text describes it
        as metres).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue is returned. 'Venue Category' is
        None for a venue without categories.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = ['Neighbourhood', 
               'Neighbourhood Latitude', 
               'Neighbourhood Longitude', 
               'Venue', 
               'Venue Latitude', 
               'Venue Longitude', 
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; surface HTTP errors instead of failing later
        # with a KeyError on the missing payload
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for v in items:
            venue = v['venue']
            # a venue may carry an empty category list
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing the column names to the constructor also works for zero rows
    return pd.DataFrame(rows, columns=columns)

In [45]:
CentralToronto_venues = getNearbyVenues(names=CentralToronto_data['Neighbourhood'],
                                   latitudes=CentralToronto_data['Latitude'],
                                   longitudes=CentralToronto_data['Longitude']
                                  )

Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park,Summerhill East
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
Roselawn
Forest Hill North,Forest Hill West
The Annex,North Midtown,Yorkville


### Now let's check the size of the resulting DataFrame

In [46]:
print(CentralToronto_venues.shape)
CentralToronto_venues.head()

(120, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Lawrence Park,43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
2,Lawrence Park,43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
3,Davisville North,43.712751,-79.390197,Sherwood Park,43.716551,-79.387776,Park
4,Davisville North,43.712751,-79.390197,Summerhill Market North,43.715499,-79.392881,Food & Drink Shop


In [47]:
CentralToronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Davisville,35,35,35,35,35,35
Davisville North,8,8,8,8,8,8
"Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West",16,16,16,16,16,16
"Forest Hill North,Forest Hill West",4,4,4,4,4,4
Lawrence Park,3,3,3,3,3,3
"Moore Park,Summerhill East",2,2,2,2,2,2
North Toronto West,23,23,23,23,23,23
Roselawn,3,3,3,3,3,3
"The Annex,North Midtown,Yorkville",26,26,26,26,26,26


#### Let's find out how many unique categories can be curated from all the returned venues

In [48]:
print('There are {} uniques categories.'.format(len(CentralToronto_venues['Venue Category'].unique())))

There are 63 uniques categories.


# Analyzing each Neighbourhood

In [49]:
# one hot encoding
CentralToronto_onehot = pd.get_dummies(CentralToronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
CentralToronto_onehot['Neighbourhood'] = CentralToronto_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [CentralToronto_onehot.columns[-1]] + list(CentralToronto_onehot.columns[:-1])
CentralToronto_onehot = CentralToronto_onehot[fixed_columns]

CentralToronto_onehot.head()

Unnamed: 0,Neighbourhood,American Restaurant,Asian Restaurant,BBQ Joint,Bagel Shop,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Chinese Restaurant,Clothing Store,Coffee Shop,Cosmetics Shop,Dessert Shop,Diner,Farmers Market,Flower Shop,Food & Drink Shop,Fried Chicken Joint,Furniture / Home Store,Garden,Gourmet Shop,Greek Restaurant,Gym,Gym / Fitness Center,Health & Beauty Service,History Museum,Home Service,Hotel,Indian Restaurant,Indoor Play Area,Italian Restaurant,Japanese Restaurant,Jewelry Store,Jewish Restaurant,Light Rail Station,Liquor Store,Metro Station,Mexican Restaurant,Park,Pharmacy,Pizza Place,Pool,Pub,Rental Car Location,Restaurant,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Shoe Store,Spa,Sporting Goods Shop,Sports Bar,Supermarket,Sushi Restaurant,Swim School,Tennis Court,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,Lawrence Park,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Davisville North,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Davisville North,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [50]:
CentralToronto_onehot.shape

(120, 64)

#### Next, let's group rows by neighbourhood and take the mean of the frequency of occurrence of each category

In [51]:
CentralToronto_grouped = CentralToronto_onehot.groupby('Neighbourhood').mean().reset_index()
CentralToronto_grouped

Unnamed: 0,Neighbourhood,American Restaurant,Asian Restaurant,BBQ Joint,Bagel Shop,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Chinese Restaurant,Clothing Store,Coffee Shop,Cosmetics Shop,Dessert Shop,Diner,Farmers Market,Flower Shop,Food & Drink Shop,Fried Chicken Joint,Furniture / Home Store,Garden,Gourmet Shop,Greek Restaurant,Gym,Gym / Fitness Center,Health & Beauty Service,History Museum,Home Service,Hotel,Indian Restaurant,Indoor Play Area,Italian Restaurant,Japanese Restaurant,Jewelry Store,Jewish Restaurant,Light Rail Station,Liquor Store,Metro Station,Mexican Restaurant,Park,Pharmacy,Pizza Place,Pool,Pub,Rental Car Location,Restaurant,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Shoe Store,Spa,Sporting Goods Shop,Sports Bar,Supermarket,Sushi Restaurant,Swim School,Tennis Court,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Davisville,0.0,0.0,0.0,0.0,0.0,0.028571,0.0,0.0,0.057143,0.028571,0.0,0.057143,0.0,0.085714,0.028571,0.028571,0.028571,0.0,0.028571,0.0,0.0,0.028571,0.028571,0.057143,0.0,0.0,0.0,0.0,0.0,0.028571,0.028571,0.057143,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.028571,0.028571,0.057143,0.0,0.0,0.0,0.028571,0.0,0.085714,0.028571,0.0,0.0,0.0,0.0,0.0,0.057143,0.0,0.0,0.028571,0.028571,0.0,0.0,0.0,0.0
1,Davisville North,0.0,0.125,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",0.0625,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0625,0.0,0.0,0.0,0.0,0.0625,0.0,0.125,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0625,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0
3,"Forest Hill North,Forest Hill West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0
4,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Moore Park,Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
6,North Toronto West,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.043478,0.043478,0.130435,0.086957,0.0,0.043478,0.043478,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.043478,0.0,0.0,0.0,0.0,0.043478,0.043478,0.043478,0.0,0.0,0.043478,0.043478,0.130435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478
7,Roselawn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"The Annex,North Midtown,Yorkville",0.038462,0.038462,0.038462,0.0,0.0,0.0,0.038462,0.0,0.115385,0.0,0.0,0.115385,0.038462,0.0,0.0,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,0.0,0.038462,0.0,0.0,0.0,0.0,0.038462,0.0,0.038462,0.038462,0.0,0.038462,0.038462,0.076923,0.0,0.038462,0.0,0.0,0.0,0.115385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,0.0


In [52]:
CentralToronto_grouped.shape

(9, 64)

#### Let's print each neighborhood along with the top 5 most common venues

In [53]:
num_top_venues = 5

for hood in CentralToronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = CentralToronto_grouped[CentralToronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Davisville----
                venue  freq
0      Sandwich Place  0.09
1        Dessert Shop  0.09
2  Italian Restaurant  0.06
3         Coffee Shop  0.06
4                 Gym  0.06


----Davisville North----
               venue  freq
0  Food & Drink Shop  0.12
1               Park  0.12
2                Gym  0.12
3     Clothing Store  0.12
4              Hotel  0.12


----Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West----
                 venue  freq
0          Coffee Shop  0.12
1                  Pub  0.12
2  American Restaurant  0.06
3         Liquor Store  0.06
4          Pizza Place  0.06


----Forest Hill North,Forest Hill West----
                venue  freq
0       Jewelry Store  0.25
1               Trail  0.25
2  Mexican Restaurant  0.25
3    Sushi Restaurant  0.25
4  Salon / Barbershop  0.00


----Lawrence Park----
                 venue  freq
0                 Park  0.33
1          Swim School  0.33
2             Bus Line  0.33
3           Restaurant  0

#### Let's print each neighborhood along with the top 5 most common venues

#### Let's put that into a *pandas* dataframe

In [54]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of `row`, ignoring its first entry.

    The first element of `row` is skipped, the remaining values are sorted in
    descending order, and the index labels of the first `num_top_venues` of
    them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [55]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # the first three ranks take their suffix from `indicators`; every later
    # rank gets 'th'
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = CentralToronto_grouped['Neighbourhood']

# fill each row with that neighbourhood's most frequent venue categories
for ind in np.arange(CentralToronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(CentralToronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,Dessert Shop,Sandwich Place,Italian Restaurant,Sushi Restaurant,Pizza Place,Café,Coffee Shop,Gym,Park,Greek Restaurant
1,Davisville North,Hotel,Gym,Sandwich Place,Park,Clothing Store,Food & Drink Shop,Breakfast Spot,Asian Restaurant,Health & Beauty Service,Gym / Fitness Center
2,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",Pub,Coffee Shop,American Restaurant,Sports Bar,Vietnamese Restaurant,Light Rail Station,Liquor Store,Fried Chicken Joint,Pizza Place,Restaurant
3,"Forest Hill North,Forest Hill West",Sushi Restaurant,Trail,Jewelry Store,Mexican Restaurant,Gourmet Shop,Food & Drink Shop,Fried Chicken Joint,Furniture / Home Store,Garden,Yoga Studio
4,Lawrence Park,Bus Line,Park,Swim School,Yoga Studio,Food & Drink Shop,Fried Chicken Joint,Furniture / Home Store,Garden,Gourmet Shop,Greek Restaurant


## 4. Cluster Neighborhoods

### Run *k*-means to cluster the neighbourhoods into 5 clusters.

In [56]:
# set number of clusters
kclusters = 5

CentralToronto_grouped_clustering = CentralToronto_grouped.drop('Neighbourhood', 1)

# run k-means clusteringCentralToronto
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(CentralToronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 4, 3, 2, 0, 1, 0], dtype=int32)

### Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [57]:
# add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

CentralToronto_merged = CentralToronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
CentralToronto_merged = CentralToronto_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

CentralToronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,3,Bus Line,Park,Swim School,Yoga Studio,Food & Drink Shop,Fried Chicken Joint,Furniture / Home Store,Garden,Gourmet Shop,Greek Restaurant
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197,0,Hotel,Gym,Sandwich Place,Park,Clothing Store,Food & Drink Shop,Breakfast Spot,Asian Restaurant,Health & Beauty Service,Gym / Fitness Center
2,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,0,Sporting Goods Shop,Clothing Store,Coffee Shop,Yoga Studio,Shoe Store,Gym / Fitness Center,Furniture / Home Store,Mexican Restaurant,Park,Diner
3,M4S,Central Toronto,Davisville,43.704324,-79.38879,0,Dessert Shop,Sandwich Place,Italian Restaurant,Sushi Restaurant,Pizza Place,Café,Coffee Shop,Gym,Park,Greek Restaurant
4,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316,2,Tennis Court,Park,Yoga Studio,Farmers Market,Home Service,History Museum,Health & Beauty Service,Gym / Fitness Center,Gym,Greek Restaurant


Finally, let's visualize the resulting clusters

In [58]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(CentralToronto_merged['Latitude'], CentralToronto_merged['Longitude'], CentralToronto_merged['Neighbourhood'], CentralToronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 5. Examine Clusters

In [59]:
CentralToronto_merged.loc[CentralToronto_merged['Cluster Labels'] == 0, CentralToronto_merged.columns[[1] + list(range(5, CentralToronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Central Toronto,0,Hotel,Gym,Sandwich Place,Park,Clothing Store,Food & Drink Shop,Breakfast Spot,Asian Restaurant,Health & Beauty Service,Gym / Fitness Center
2,Central Toronto,0,Sporting Goods Shop,Clothing Store,Coffee Shop,Yoga Studio,Shoe Store,Gym / Fitness Center,Furniture / Home Store,Mexican Restaurant,Park,Diner
3,Central Toronto,0,Dessert Shop,Sandwich Place,Italian Restaurant,Sushi Restaurant,Pizza Place,Café,Coffee Shop,Gym,Park,Greek Restaurant
5,Central Toronto,0,Pub,Coffee Shop,American Restaurant,Sports Bar,Vietnamese Restaurant,Light Rail Station,Liquor Store,Fried Chicken Joint,Pizza Place,Restaurant
8,Central Toronto,0,Café,Sandwich Place,Coffee Shop,Pizza Place,American Restaurant,Cosmetics Shop,Indian Restaurant,Jewish Restaurant,Liquor Store,Metro Station


In [60]:
CentralToronto_merged.loc[CentralToronto_merged['Cluster Labels'] == 1, CentralToronto_merged.columns[[1] + list(range(5, CentralToronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Central Toronto,1,Home Service,Garden,Pool,Yoga Studio,Indoor Play Area,Hotel,History Museum,Health & Beauty Service,Gym / Fitness Center,Gym


In [61]:
CentralToronto_merged.loc[CentralToronto_merged['Cluster Labels'] == 2, CentralToronto_merged.columns[[1] + list(range(5, CentralToronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Central Toronto,2,Tennis Court,Park,Yoga Studio,Farmers Market,Home Service,History Museum,Health & Beauty Service,Gym / Fitness Center,Gym,Greek Restaurant


In [62]:
CentralToronto_merged.loc[CentralToronto_merged['Cluster Labels'] == 3, CentralToronto_merged.columns[[1] + list(range(5, CentralToronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,3,Bus Line,Park,Swim School,Yoga Studio,Food & Drink Shop,Fried Chicken Joint,Furniture / Home Store,Garden,Gourmet Shop,Greek Restaurant


In [63]:
CentralToronto_merged.loc[CentralToronto_merged['Cluster Labels'] == 4, CentralToronto_merged.columns[[1] + list(range(5, CentralToronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
7,Central Toronto,4,Sushi Restaurant,Trail,Jewelry Store,Mexican Restaurant,Gourmet Shop,Food & Drink Shop,Fried Chicken Joint,Furniture / Home Store,Garden,Yoga Studio


### Thank You Very Much