In [65]:
# Dependencies
import requests # library to handle requests
import lxml.html as lh
import bs4 as bs
import urllib.request

import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis

from geopy.geocoders import Nominatim

import folium # map rendering library

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

print('Libraries imported.')

Libraries imported.


#### 2. Build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe.

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
# -----------------------------------------------------
# Use the BeautifulSoup package or any other way you are comfortable with 
# to transform the data in the table on the Wikipedia page into the above pandas dataframe
# -----------------------------------------------------
def scrape_bs4(cname,cols):
    page  = urllib.request.urlopen(url).read()
    soup  = bs.BeautifulSoup(page,'lxml')
    table = soup.find("table",class_=cname)
    header = [head.findAll(text=True)[0].strip() for head in table.find_all("th")]
    data   = [[td.findAll(text=True)[0].strip() for td in tr.find_all("td")]
              for tr in table.find_all("tr")]
    data    = [row for row in data if len(row) == cols]
    # Store data to this temporary dataframe
    raw_df = pd.DataFrame(data,columns=header)
    return raw_df

In [4]:
# Python lxml - similar library that supports scraping with xpath
# def scrape_lxml(XPATH,cols):
#     page = requests.get(url)
#     doc = lh.fromstring(page.content)
#     table_content = doc.xpath(XPATH)
#     for table in table_content:
#         headers = [th.text_content().strip() for th in table.xpath('//th')]
#         headers = headers[0:3]
#         data    = [[td.text_content().strip() for td in tr.xpath('td')] 
#                    for tr in table.xpath('//tbody/tr')]
#         data    = [row for row in data if len(row) == cols]
#         raw_df = pd.DataFrame(data,columns=headers)
#         return raw_df

In [6]:
#Test in beautifulSoup
raw_TorontoPostalCodes = scrape_bs4("wikitable",3)

#Test in lxml ( for xpath based extraction)
#raw_TorontoPostalCodes = scrape_table_lxml("/html/body/div[3]/div[3]/div[4]/div/table[1]",3)

print("# Toronto Postal codes stored in data")
print(raw_TorontoPostalCodes.info(verbose = True))

# Toronto Postal codes stored in data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 289 entries, 0 to 288
Data columns (total 3 columns):
Postcode         289 non-null object
Borough          289 non-null object
Neighbourhood    289 non-null object
dtypes: object(3)
memory usage: 6.9+ KB
None


#### Data Cleanup

In [11]:
# -----------------------------------------------------
# Only process the cells that have an assigned borough. 
# Ignore cells with a borough that is Not assigned.
# -----------------------------------------------------
TorontoPostalCodes = raw_TorontoPostalCodes[~raw_TorontoPostalCodes['Borough'].isin(['Not assigned'])]

# Sort and Reset index.
TorontoPostalCodes = TorontoPostalCodes.sort_values(by = ['Postcode','Borough','Neighbourhood'], ascending = [1,1,1]).reset_index(drop = True)

# -----------------------------------------------------
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
# For example the value of the Borough and the Neighborhood columns will be Queen's Park.
# -----------------------------------------------------
TorontoPostalCodes.loc[TorontoPostalCodes['Neighbourhood'] == 'Not assigned', ['Neighbourhood']] = TorontoPostalCodes['Borough']
check_unassigned_post_state_sample = TorontoPostalCodes.loc[TorontoPostalCodes['Borough'] == 'Queen\'s Park']
#print('DEBUG:',check_unassigned_post_state_sample) ; # Print sample borough problem post state

# -----------------------------------------------------
# More than one neighborhood can exist in one postal code area. 
# For example, in the table on the Wikipedia page, you will notice that M5A is listed twice 
# and has two neighborhoods: Harbourfront and Regent Park. 
# These two rows will be combined into one row with the neighborhoods separated with a comma.
# -----------------------------------------------------
TorontoPostalCodes = TorontoPostalCodes.groupby(['Postcode','Borough'])['Neighbourhood'].apply(', '.join).reset_index()

In [12]:
TorontoPostalCodes.shape

(103, 3)

In [13]:
TorontoPostalCodes.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [17]:
TorontoPostalCodes.rename(columns = {'Postcode':'Postal Code'}, inplace=True)
TorontoPostalCodes.set_index("Postal Code", inplace = True)
TorontoPostalCodes.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [18]:
# Retrieve coordinates for the Postal Code
toronto_geocsv = 'https://cocl.us/Geospatial_data'
!wget -q -O 'toronto_m.geospatial_data.csv' toronto_geocsv
geocsv_data = pd.read_csv(toronto_geocsv).set_index("Postal Code")
geocsv_data.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [19]:
# Combining Two data frames with mapped postcodes
toronto_neighborhoods = TorontoPostalCodes.join(geocsv_data)
toronto_neighborhoods.head()

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497
M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [20]:
toronto_neighborhoods.shape

(103, 4)

In [23]:
# Use geopy library to get the latitude and longitude values of Toronto
address = 'Toronto, Ontario Canada'

geolocator = Nominatim(user_agent = "SegmentingClusteringNeighborhoods_Toronto")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto Canada are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto Canada are 43.653963, -79.387207.


In [26]:
# Create a map of Toronto with neighborhoods superimposed on top.

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_neighborhoods['Latitude'], toronto_neighborhoods['Longitude'], toronto_neighborhoods['Borough'], toronto_neighborhoods['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius = 4,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color = '#87cefa',
        fill_opacity = 0.5,
        parse_html = False).add_to(map_toronto)
    
map_toronto

In [60]:
# For the sake of simplicity we will only take the list of first 101 items
toronto_data = toronto_neighborhoods.head(101)
print(toronto_data.shape)
toronto_data.head()


(101, 4)


Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497
M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [59]:
# Define Foursquare Credentials and Version - Gmail
Client_ID = "ABC"
Client_Secret = "XYZ"
VERSION = '20190315'

In [35]:
def getNearbyVenues(names, latitudes, longitudes, radius=700):
    LIMIT = 100
    venues_list=[]
    count = 0
    for name, lat, lng in zip(names, latitudes, longitudes):
        count = count + 1
        print("----"+str(count)+"----Neighborhood = "+name+"----")    
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            Client_ID, 
            Client_Secret, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        venue_results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in venue_results])

#         if count == 2:
#             break
    
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [36]:
toronto_neighborhoods = toronto_neighborhoods
toronto_venues = getNearbyVenues(names = toronto_neighborhoods['Neighbourhood'],
                                   latitudes = toronto_neighborhoods['Latitude'],
                                   longitudes = toronto_neighborhoods['Longitude']
                                  )

----1----Neighborhood = Malvern, Rouge----
----2----Neighborhood = Highland Creek, Port Union, Rouge Hill----
----3----Neighborhood = Guildwood, Morningside, West Hill----
----4----Neighborhood = Woburn----
----5----Neighborhood = Cedarbrae----
----6----Neighborhood = Scarborough Village----
----7----Neighborhood = East Birchmount Park, Ionview, Kennedy Park----
----8----Neighborhood = Clairlea, Golden Mile, Oakridge----
----9----Neighborhood = Cliffcrest, Cliffside, Scarborough Village West----
----10----Neighborhood = Birch Cliff, Cliffside West----
----11----Neighborhood = Dorset Park, Scarborough Town Centre, Wexford Heights----
----12----Neighborhood = Maryvale, Wexford----
----13----Neighborhood = Agincourt----
----14----Neighborhood = Clarks Corners, Sullivan, Tam O'Shanter----
----15----Neighborhood = Agincourt North, L'Amoreaux East, Milliken, Steeles East----
----16----Neighborhood = L'Amoreaux West, Steeles West----
----17----Neighborhood = Upper Rouge----
----18----Neighbor

In [37]:
print(toronto_venues.shape)
toronto_venues.head()

(3447, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa
1,"Malvern, Rouge",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
2,"Malvern, Rouge",43.806686,-79.194353,Wendy's,43.802008,-79.19808,Fast Food Restaurant
3,"Malvern, Rouge",43.806686,-79.194353,Tim Hortons,43.802,-79.198169,Coffee Shop
4,"Malvern, Rouge",43.806686,-79.194353,Lee Valley,43.803161,-79.199681,Hobby Shop


In [38]:
toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Agincourt,8,8,8,8,8,8
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",17,17,17,17,17,17
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",13,13,13,13,13,13
"Alderwood, Long Branch",12,12,12,12,12,12
"Bathurst Manor, Downsview North, Wilson Heights",19,19,19,19,19,19
"Bathurst Quay, CN Tower, Harbourfront West, Island airport, King and Spadina, Railway Lands, South Niagara",21,21,21,21,21,21
Bayview Village,9,9,9,9,9,9
"Bedford Park, Lawrence Manor East",36,36,36,36,36,36
Berczy Park,100,100,100,100,100,100


In [39]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 316 uniques categories.


#### Neighbourhod Analysis

In [40]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
toronto_onehot.shape

(3447, 317)

In [42]:
# Grouping rows by neighborhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.010000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.030000,...,0.010000,0.00,0.000000,0.000000,0.000000,0.010000,0.00,0.000000,0.000000,0.000000
1,Agincourt,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
4,"Alderwood, Long Branch",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
5,"Bathurst Manor, Downsview North, Wilson Heights",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.052632,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
6,"Bathurst Quay, CN Tower, Harbourfront West, Is...",0.000000,0.000000,0.047619,0.047619,0.047619,0.095238,0.095238,0.142857,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
7,Bayview Village,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
8,"Bedford Park, Lawrence Manor East",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.027778,...,0.000000,0.00,0.027778,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
9,Berczy Park,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.010000


In [43]:
toronto_grouped.shape

(101, 317)

In [44]:
num_top_venues = 5
for neigh in toronto_grouped['Neighbourhood']:
    print("----"+neigh+"----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == neigh].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
              venue  freq
0       Coffee Shop  0.07
1              Café  0.06
2  Sushi Restaurant  0.04
3        Steakhouse  0.04
4               Bar  0.04


----Agincourt----
             venue  freq
0           Lounge  0.12
1        Pool Hall  0.12
2  Badminton Court  0.12
3   Breakfast Spot  0.12
4   Sandwich Place  0.12


----Agincourt North, L'Amoreaux East, Milliken, Steeles East----
                venue  freq
0  Chinese Restaurant  0.18
1        Noodle House  0.12
2           BBQ Joint  0.12
3         Pizza Place  0.12
4            Pharmacy  0.12


----Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown----
                  venue  freq
0         Grocery Store  0.15
1        Sandwich Place  0.08
2          Liquor Store  0.08
3           Pizza Place  0.08
4  Fast Food Restaurant  0.08


----Alderwood, Long Branch----
               venue  freq
0        Pizza Place  0.17
1        Gas Statio

                  venue  freq
0         Grocery Store  0.18
1           Pizza Place  0.09
2        Sandwich Place  0.09
3  Fast Food Restaurant  0.09
4        Discount Store  0.09


----Downsview West----
           venue  freq
0           Park  0.14
1  Grocery Store  0.14
2    Pizza Place  0.14
3           Bank  0.14
4  Shopping Mall  0.14


----East Birchmount Park, Ionview, Kennedy Park----
               venue  freq
0        Coffee Shop  0.20
1     Discount Store  0.13
2  Convenience Store  0.07
3   Department Store  0.07
4      Metro Station  0.07


----East Toronto----
                  venue  freq
0           Pizza Place  0.11
1           Coffee Shop  0.09
2                  Café  0.09
3     Convenience Store  0.06
4  Fast Food Restaurant  0.06


----Emery, Humberlea----
                venue  freq
0      Discount Store  0.25
1  Italian Restaurant  0.25
2   Convenience Store  0.25
3      Baseball Field  0.25
4              Museum  0.00


----Fairview, Henry Farm, Oriole----
    

             venue  freq
0      Coffee Shop  0.06
1   Breakfast Spot  0.06
2             Café  0.04
3  Thai Restaurant  0.04
4     Gourmet Shop  0.04


----Parkview Hill, Woodbine Gardens----
                  venue  freq
0  Fast Food Restaurant  0.12
1           Pizza Place  0.12
2               Brewery  0.06
3        Breakfast Spot  0.06
4                  Café  0.06


----Parkwoods----
                  venue  freq
0  Fast Food Restaurant  0.17
1     Food & Drink Shop  0.17
2                   Spa  0.17
3             Pet Store  0.17
4                  Park  0.17


----Queen's Park----
                venue  freq
0         Coffee Shop  0.22
1      Sandwich Place  0.08
2  Italian Restaurant  0.05
3                Café  0.04
4     Bubble Tea Shop  0.03


----Riverdale, The Danforth West----
                  venue  freq
0      Greek Restaurant  0.13
1           Coffee Shop  0.08
2                   Pub  0.04
3         Grocery Store  0.04
4  Fast Food Restaurant  0.04


----Rosedale----

In [45]:
# Sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [67]:
# Put the venues in a pandas dataframe
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.shape

(101, 11)

#### Run k-means to cluster the neighborhood into k clusters

In [68]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=1).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
print(kmeans.labels_[0:10])
print(len(kmeans.labels_))

[0 0 0 2 0 0 0 0 0 0]
101


In [69]:
toronto_data

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,0
M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497,0
M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0
M1G,Scarborough,Woburn,43.770992,-79.216917,2
M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0
M1J,Scarborough,Scarborough Village,43.744734,-79.239476,0
M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029,0
M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577,0
M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476,0
M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848,0


In [70]:
toronto_merged = toronto_data

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on ='Neighbourhood')

toronto_merged.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,0,Fast Food Restaurant,Coffee Shop,Hobby Shop,Spa,Business Service,Electronics Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497,0,Breakfast Spot,Burger Joint,Bar,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Empanada Restaurant
M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0,Pizza Place,Fast Food Restaurant,Mexican Restaurant,Sports Bar,Thrift / Vintage Store,Medical Center,Fried Chicken Joint,Beer Store,Moving Target,Electronics Store
M1G,Scarborough,Woburn,43.770992,-79.216917,2,Coffee Shop,Park,Business Service,Convenience Store,Yoga Studio,Electronics Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0,Coffee Shop,Indian Restaurant,Bakery,Thai Restaurant,Caribbean Restaurant,Fried Chicken Joint,Chinese Restaurant,Rental Car Location,Athletics & Sports,Asian Restaurant


In [71]:
# Visualize the clusters
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'],kmeans.labels_):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters