In [1]:
# Import All Libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0
import folium

print('All necessary libraries Imported')

Solving environment: done

# All requested packages already installed.

All necessary libraries Imported


In [3]:
# Web Scraping and creating dataframe.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html = urlopen(url) # Convert url to html
soup = BeautifulSoup(html) # Create Soup

In [4]:
# Extract all tables form the page. Visit page for a clear idea.
tables = soup.find_all('table') 

# Extract only First Table as only interested in Postcode, Borough and Neighbourhood Table.
table = tables[0] 

# Find all rows in the table.
rows = table.find_all('tr') 

# Create list of cleaned data
list_rows = []
for row in rows:
    cols = row.find_all('td') # Find column data in each row.
    clean_row=[]
    for col in cols:
        data = col.find(text=True).strip() # Find only text from each column data and remove new-line characters.
        clean_row.append(data)
    list_rows.append(clean_row)

 # Create data frame from cleaned data
df_postcode = pd.DataFrame(list_rows[1:], columns=['Postcode','Borough','Neighborhood'])

In [5]:
# Convert all data to strings
df_postcode = df_postcode.astype('str')

In [6]:
 # Remove all rows where Borough is not assigned
df_postcode = df_postcode[df_postcode['Borough'] != 'Not assigned']

In [7]:
# If Neighborhood is Not-Assigned, assign the same value as Borough
for x in df_postcode['Neighborhood']:
    if x == 'Not assigned\n':
        x = df_postcode['Borough']

In [8]:
# Groupby Postcode + Borough & join Neighborhood with ','
df_postcode_final = df_postcode.groupby(['Postcode','Borough'])['Neighborhood'].apply(lambda x: ','.join(x)).reset_index()

In [9]:
# Check results
print('Shape of final postcode data is {}'.format(df_postcode_final.shape))
df_postcode_final.head()

Shape of final postcode data is (103, 3)


Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [10]:
# Read Geo-Spatial Data and rename colun name
df_geo_data = pd.read_csv('http://cocl.us/Geospatial_data')
df_geo_data.rename(columns={'Postal Code':'Postcode'}, inplace=True)

print('Shape of df_geo_data: {}'.format(df_geo_data.shape))
print('Head of df_geo_data: \n{}'.format(df_geo_data.head()))

Shape of df_geo_data: (103, 3)
Head of df_geo_data: 
  Postcode   Latitude  Longitude
0      M1B  43.806686 -79.194353
1      M1C  43.784535 -79.160497
2      M1E  43.763573 -79.188711
3      M1G  43.770992 -79.216917
4      M1H  43.773136 -79.239476


In [11]:
# Merge df_postcode_data and df_geo_code based on postal code.
df_data = pd.merge(df_postcode_final, df_geo_data, on='Postcode', how='inner')

print('Shape of df_data = No of unique neighborhood/postcode: {}'.format(df_data.shape))
df_data.head()

Shape of df_data = No of unique neighborhood/postcode: (103, 5)


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [12]:
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim
# Retrieve Lati and Long of Toronto for Map creation.
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of Toronto, Canada are 43.653963, -79.387207.


In [13]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_data['Latitude'], df_data['Longitude'], df_data['Borough'], df_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

In [14]:
map_toronto

In [15]:
# Setup Foursquare Credentials
CLIENT_ID = 'YWLNZ2TWMJ4KDQGPEDQSEKMZU2L1SHPZXIMK42OCON21P5R5' # your Foursquare ID
CLIENT_SECRET = 'QOZSNPHZWO2RS2I01O4C1RF2G0SAVCEN0TT1BGBXMU3AMJUP' # your Foursquare Secret
VERSION = '20190901' # Foursquare API version

RADIUS = 500 # Radius of search in meters
LIMIT = 100 # Limit the count of search

In [16]:
def getNearbyVenues(neighborhood, lati, longi):
    
    venues_list=[]
    for name, LATI, LONGI in zip(neighborhood, lati, longi):
        print(name)
        
        # Setup url for API call
        url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}' \
                                            .format(CLIENT_ID, CLIENT_SECRET, LATI, LONGI, VERSION, RADIUS, LIMIT)
            
       # API call to get the json file
        results = requests.get(url).json()
        
        # Extract all the necessary categories from the venue.
        venue_details = results['response']['venues']

        category_details = []
        for venue in venue_details:
            category_details.append(venue['categories'])

        for detail in category_details:
            if len(detail) != 0:
                venues_list.append((name,detail[0]['name']))


    nearby_venues = pd.DataFrame(venues_list)
    nearby_venues.columns = ['Neighborhood', 'Venue Category']
    
    return(nearby_venues)

In [17]:
df_toronto_venues = getNearbyVenues(neighborhood=df_data['Neighborhood'], lati=df_data['Latitude'], longi=df_data['Longitude'])

Rouge,Malvern
Highland Creek,Rouge Hill,Port Union
Guildwood,Morningside,West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park,Ionview,Kennedy Park
Clairlea,Golden Mile,Oakridge
Cliffcrest,Cliffside,Scarborough Village West
Birch Cliff,Cliffside West
Dorset Park,Scarborough Town Centre,Wexford Heights
Maryvale,Wexford
Agincourt
Clarks Corners,Sullivan,Tam O'Shanter
Agincourt North,L'Amoreaux East,Milliken,Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview,Henry Farm,Oriole
Bayview Village
Silver Hills,York Mills
Newtonbrook,Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park,Don Mills South
Bathurst Manor,Downsview North,Wilson Heights
Northwood Park,York University
CFB Toronto,Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens,Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West,Riverdale
The Beaches West,Indi

In [18]:
print('Shape of df_toronto_venues: {}'.format(df_toronto_venues.shape))
print('Total no of unique neighborhoods in df_toronto_venues: {}'.format(len(np.unique(df_toronto_venues['Neighborhood']))))
df_toronto_venues.head()

Shape of df_toronto_venues: (9084, 2)
Total no of unique neighborhoods in df_toronto_venues: 103


Unnamed: 0,Neighborhood,Venue Category
0,"Rouge,Malvern",Gas Station
1,"Rouge,Malvern",Park
2,"Rouge,Malvern",Office
3,"Rouge,Malvern",Building
4,"Rouge,Malvern",Print Shop


In [19]:
# Drop duplicate values
df_toronto_venues = df_toronto_venues.drop_duplicates(['Neighborhood','Venue Category'], keep='first')

print('Shape of df_toronto_venues: {}'.format(df_toronto_venues.shape))
print('Total no of Unique Category in df_toronto_venues: {}'.format(len(np.unique(df_toronto_venues['Venue Category']))))
df_toronto_venues.head()

Shape of df_toronto_venues: (5756, 2)
Total no of Unique Category in df_toronto_venues: 506


Unnamed: 0,Neighborhood,Venue Category
0,"Rouge,Malvern",Gas Station
1,"Rouge,Malvern",Park
2,"Rouge,Malvern",Office
3,"Rouge,Malvern",Building
4,"Rouge,Malvern",Print Shop


In [20]:
# Apply One Hot Encoding on df_toronto_venues
venue_onehot = pd.get_dummies(df_toronto_venues[['Venue Category']],prefix='',prefix_sep='')

print('Shape of venue_onehot: {}'.format(venue_onehot.shape))
print('Total no of Categories in venue_onehot: {}'.format(len(venue_onehot.columns)))
venue_onehot.head()

Shape of venue_onehot: (5756, 506)
Total no of Categories in venue_onehot: 506


Unnamed: 0,ATM,Accessories Store,Acupuncturist,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,...,Watch Shop,Water Park,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Add neighborhood to venue_onehot and move it to 1st column for easy understanding
venue_onehot['Neighborhood'] = df_toronto_venues['Neighborhood']
new_col_seq = [venue_onehot.columns[-1]] + list(venue_onehot.columns[:-1])
venue_onehot = venue_onehot[new_col_seq]


print('Shape of venue_onehot: {}'.format(venue_onehot.shape))
print('Total no of Categories in venue_onehot: {}'.format(len(venue_onehot.columns)-1))
venue_onehot.head()

Shape of venue_onehot: (5756, 507)
Total no of Categories in venue_onehot: 506


Unnamed: 0,Neighborhood,ATM,Accessories Store,Acupuncturist,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,...,Watch Shop,Water Park,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,"Rouge,Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Rouge,Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Rouge,Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Rouge,Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Rouge,Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Group_by venue_onehot based on neighborhood to match df_toronto for joining. Here mean is considered.
neighborhood_onehot = venue_onehot.groupby('Neighborhood').mean().reset_index()

print('Total neighborhoods = Total Postcodes:{}'.format(neighborhood_onehot.shape[0]))
print('Total no of diff venue categories used: {}'.format(neighborhood_onehot.shape[1]-1))
neighborhood_onehot.head()

Total neighborhoods = Total Postcodes:103
Total no of diff venue categories used: 506


Unnamed: 0,Neighborhood,ATM,Accessories Store,Acupuncturist,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,...,Watch Shop,Water Park,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood,Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Create an Empty dataframe which shows top n venue categories in a neighborhood.
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhood_top_venues = pd.DataFrame(columns=columns)
neighborhood_top_venues['Neighborhood'] = neighborhood_onehot['Neighborhood']

neighborhood_top_venues.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",,,,,,,,,,
1,Agincourt,,,,,,,,,,
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",,,,,,,,,,
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",,,,,,,,,,
4,"Alderwood,Long Branch",,,,,,,,,,


In [24]:
# Add values to the above dataframe.
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)    
    return row_categories_sorted.index.values[0:num_top_venues]

for row in np.arange(neighborhood_onehot.shape[0]):
    neighborhood_top_venues.iloc[row, 1:] = return_most_common_venues(neighborhood_onehot.iloc[row, :], num_top_venues)
    
neighborhood_top_venues

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Pizza Place,Coffee Shop,Office,Opera House,Cosmetics Shop,Convention Center,Convenience Store,Comfort Food Restaurant,Pharmacy,Photography Studio
1,Agincourt,Spa,Mosque,Auto Garage,Non-Profit,Automotive Shop,Factory,Miscellaneous Shop,Badminton Court,Theme Park,Print Shop
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",Medical Center,BBQ Joint,Record Shop,Bakery,Design Studio,Tech Startup,Coffee Shop,Food Court,Church,Chinese Restaurant
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",Pizza Place,Dentist's Office,Art Gallery,Church,Financial or Legal Service,Caribbean Restaurant,Casino,College Cafeteria,Office,Garden
4,"Alderwood,Long Branch",Gym / Fitness Center,Bike Shop,Salon / Barbershop,Electronics Store,Medical Center,Shoe Store,Shop & Service,Market,Café,Lounge
5,"Bathurst Manor,Downsview North,Wilson Heights",Playground,Korean Restaurant,Residential Building (Apartment / Condo),Laundry Service,Fast Food Restaurant,Monument / Landmark,Church,Supermarket,Sushi Restaurant,Restaurant
6,Bayview Village,Spa,Medical Center,School,Meeting Room,Café,Shopping Mall,Shopping Plaza,Skate Park,Skating Rink,Electronics Store
7,"Bedford Park,Lawrence Manor East",Jewish Restaurant,Dentist's Office,Breakfast Spot,Fast Food Restaurant,Boutique,Coffee Shop,Bike Shop,Sushi Restaurant,Nail Salon,Thai Restaurant
8,Berczy Park,Building,Breakfast Spot,Bus Stop,Bus Station,Lounge,Park,Spa,Liquor Store,Library,Laundry Service
9,"Birch Cliff,Cliffside West",Parking,Thai Restaurant,Church,Laundry Service,Dive Bar,Discount Store,Student Center,Fire Station,Bistro,Diner


In [89]:
# Apply Kmeans clustering
from sklearn.cluster import KMeans
neighborhood_model_data = neighborhood_onehot.drop(['Neighborhood'], axis=1)

k = 5
kmeans = KMeans(n_clusters=k, random_state=0).fit(neighborhood_model_data)
len(kmeans.labels_)

103

In [90]:
# Is done to avoid rerun of complete code for any issues.
neighborhood_top_clustered_venues = neighborhood_top_venues.copy(deep=True)

In [91]:
# Add labels to neighborhood_top_clustered_venues
neighborhood_top_clustered_venues.insert(1, 'Cluster Labels', kmeans.labels_)
neighborhood_top_clustered_venues

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",1,Pizza Place,Coffee Shop,Office,Opera House,Cosmetics Shop,Convention Center,Convenience Store,Comfort Food Restaurant,Pharmacy,Photography Studio
1,Agincourt,2,Spa,Mosque,Auto Garage,Non-Profit,Automotive Shop,Factory,Miscellaneous Shop,Badminton Court,Theme Park,Print Shop
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",3,Medical Center,BBQ Joint,Record Shop,Bakery,Design Studio,Tech Startup,Coffee Shop,Food Court,Church,Chinese Restaurant
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",3,Pizza Place,Dentist's Office,Art Gallery,Church,Financial or Legal Service,Caribbean Restaurant,Casino,College Cafeteria,Office,Garden
4,"Alderwood,Long Branch",3,Gym / Fitness Center,Bike Shop,Salon / Barbershop,Electronics Store,Medical Center,Shoe Store,Shop & Service,Market,Café,Lounge
5,"Bathurst Manor,Downsview North,Wilson Heights",3,Playground,Korean Restaurant,Residential Building (Apartment / Condo),Laundry Service,Fast Food Restaurant,Monument / Landmark,Church,Supermarket,Sushi Restaurant,Restaurant
6,Bayview Village,3,Spa,Medical Center,School,Meeting Room,Café,Shopping Mall,Shopping Plaza,Skate Park,Skating Rink,Electronics Store
7,"Bedford Park,Lawrence Manor East",2,Jewish Restaurant,Dentist's Office,Breakfast Spot,Fast Food Restaurant,Boutique,Coffee Shop,Bike Shop,Sushi Restaurant,Nail Salon,Thai Restaurant
8,Berczy Park,1,Building,Breakfast Spot,Bus Stop,Bus Station,Lounge,Park,Spa,Liquor Store,Library,Laundry Service
9,"Birch Cliff,Cliffside West",1,Parking,Thai Restaurant,Church,Laundry Service,Dive Bar,Discount Store,Student Center,Fire Station,Bistro,Diner


In [92]:
# Join df_data and neighborhood_top_clustered_venues on Neighborhood for final o/p dataset.
clustered_toronto = pd.merge(df_data, neighborhood_top_clustered_venues, on='Neighborhood', how='inner')

In [93]:
neighborhood_top_clustered_venues.head()

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",1,Pizza Place,Coffee Shop,Office,Opera House,Cosmetics Shop,Convention Center,Convenience Store,Comfort Food Restaurant,Pharmacy,Photography Studio
1,Agincourt,2,Spa,Mosque,Auto Garage,Non-Profit,Automotive Shop,Factory,Miscellaneous Shop,Badminton Court,Theme Park,Print Shop
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",3,Medical Center,BBQ Joint,Record Shop,Bakery,Design Studio,Tech Startup,Coffee Shop,Food Court,Church,Chinese Restaurant
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",3,Pizza Place,Dentist's Office,Art Gallery,Church,Financial or Legal Service,Caribbean Restaurant,Casino,College Cafeteria,Office,Garden
4,"Alderwood,Long Branch",3,Gym / Fitness Center,Bike Shop,Salon / Barbershop,Electronics Store,Medical Center,Shoe Store,Shop & Service,Market,Café,Lounge


In [94]:
# Retrieve Lati and Long of Toronto for Map creation.
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="explorer")
location = geolocator.geocode(address)
toronto_lati = location.latitude
toronto_longi = location.longitude
print('The geograpical coordinate of {} are {}, {}.'.format(address, toronto_lati, toronto_longi))

The geograpical coordinate of Toronto, Canada are 43.653963, -79.387207.


In [95]:
# create clustered map of Toronto using latitude and longitude values
map_toronto_clustered = folium.Map(location=[toronto_lati, toronto_longi], zoom_start=10)

latitude = clustered_toronto['Latitude']
longitude = clustered_toronto['Longitude']
borough = clustered_toronto['Borough']
neighborhood = clustered_toronto['Neighborhood']
clusters = clustered_toronto['Cluster Labels']

colors = ['cyan', 'blue', 'green', 'yellow', 'red']
# add markers to map
for lat, lng, boro, neigh, cluster in zip(latitude, longitude, borough, neighborhood, clusters):
    label = '{}, {}, {}'.format(boro, neigh, cluster)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color=colors[cluster],
        fill=True,
        fill_color=colors[cluster],
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_clustered)

In [97]:
map_toronto_clustered

In [None]:
# Create a dataframe with top 10 venues per cluster.

In [225]:
one_hot = neighborhood_onehot.copy(deep=True)
one_hot.insert(1, 'Cluster Labels', kmeans.labels_)
one_hot.head()

Unnamed: 0,Neighborhood,Cluster Labels,ATM,Accessories Store,Acupuncturist,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,...,Watch Shop,Water Park,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,"Adelaide,King,Richmond",1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Agincourt,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood,Long Branch",3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [226]:
# Edit dataframe to groupby Clusters
one_hot = one_hot.drop(['Neighborhood'], axis=1)
one_hot.head()

Unnamed: 0,Cluster Labels,ATM,Accessories Store,Acupuncturist,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,...,Watch Shop,Water Park,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [227]:
# Groupby ClusterLables and take mean of all venues available
grouped_one_hot = test_df.groupby('Cluster Labels').mean().reset_index()
grouped_one_hot.head()

Unnamed: 0,Cluster Labels,ATM,Accessories Store,Acupuncturist,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,...,Watch Shop,Water Park,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.022222,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.001271,0.00082,0.0,0.000387,0.000827,0.000393,0.000443,0.000864,0.0,...,0.000393,0.0,0.001189,0.0,0.001274,0.00211,0.003137,0.002707,0.000488,0.000881
2,2,0.00119,0.009866,0.0,0.0,0.0,0.0,0.001276,0.0,0.0,...,0.0,0.0,0.001099,0.002778,0.00291,0.001587,0.008147,0.002289,0.0,0.001276
3,3,0.001574,0.000682,0.000395,0.0,0.001315,0.000837,0.001805,0.000453,0.0,...,0.0,0.001532,0.00053,0.002725,0.001923,0.001853,0.001067,0.00179,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037037,0.037037,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [228]:
# Create an Empty dataframe which shows top n venue categories in a cluster label.
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Cluster Labels']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
cluster_top_venues = pd.DataFrame(columns=columns)
cluster_top_venues['Cluster Labels'] = grouped_test_df['Cluster Labels']

cluster_top_venues.head()

Unnamed: 0,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,,,,,,,,,,
1,1,,,,,,,,,,
2,2,,,,,,,,,,
3,3,,,,,,,,,,
4,4,,,,,,,,,,


In [229]:
# Add values to the above dataframe.
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)    
    return row_categories_sorted.index.values[0:num_top_venues]

for row in np.arange(grouped_test_df.shape[0]):
    cluster_top_venues.iloc[row, 1:] = return_most_common_venues(grouped_test_df.iloc[row, :], num_top_venues)
    
cluster_top_venues

Unnamed: 0,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,High School,Donut Shop,Furniture / Home Store,Bakery,Bank,Baseball Field,Food Service,Storage Facility,Factory,Spa
1,1,Building,Office,Coffee Shop,Event Space,Café,Park,Residential Building (Apartment / Condo),Convenience Store,Miscellaneous Shop,Restaurant
2,2,Salon / Barbershop,Coffee Shop,Furniture / Home Store,Office,Sporting Goods Shop,Electronics Store,Bank,Gift Shop,Miscellaneous Shop,Clothing Store
3,3,Salon / Barbershop,Residential Building (Apartment / Condo),Office,Church,Bank,Convenience Store,Coffee Shop,Pharmacy,Grocery Store,Park
4,4,Baggage Claim,American Restaurant,Boat or Ferry,Sculpture Garden,Boutique,Flight School,Office,Coworking Space,Tree,Rental Car Location


In [230]:
# Add colors as used in map to Cluster labels.
for i in range(len(cluster_top_venues['Cluster Labels'])):
    cluster_top_venues['Cluster Labels'][i] = str(cluster_top_venues['Cluster Labels'][i]) + '-' + colors[i].upper()

cluster_top_venues.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0-CYAN,High School,Donut Shop,Furniture / Home Store,Bakery,Bank,Baseball Field,Food Service,Storage Facility,Factory,Spa
1,1-BLUE,Building,Office,Coffee Shop,Event Space,Café,Park,Residential Building (Apartment / Condo),Convenience Store,Miscellaneous Shop,Restaurant
2,2-GREEN,Salon / Barbershop,Coffee Shop,Furniture / Home Store,Office,Sporting Goods Shop,Electronics Store,Bank,Gift Shop,Miscellaneous Shop,Clothing Store
3,3-YELLOW,Salon / Barbershop,Residential Building (Apartment / Condo),Office,Church,Bank,Convenience Store,Coffee Shop,Pharmacy,Grocery Store,Park
4,4-RED,Baggage Claim,American Restaurant,Boat or Ferry,Sculpture Garden,Boutique,Flight School,Office,Coworking Space,Tree,Rental Car Location
