# Recreate Data Frame from Part 1

### Downloading Libraries

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

print('Libraries imported.')

Libraries imported.


### Extracting the html file from the URL

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r = requests.get(url).text

print('Data downloaded!')

Data downloaded!


### Using BeautfilSoup for table row containing postcodes

In [3]:
soup = BeautifulSoup(r,'lxml')
btable = soup.find('table')
tab_rows = btable.find_all('tr')

### Transform the data into a pandas dataframe

In [4]:
# define the dataframe columns
column_names = ['PostalCode', 'Borough', 'Neighborhood'] 

# instantiate the dataframe
postcodes = pd.DataFrame(columns=column_names)

for n in range(1,len(tab_rows)):
    row_t = tab_rows[n].text.split('\n')
    if row_t[2] != 'Not assigned':  # Skip rows with 'Not assigned' Boroughs
        if row_t[3] == 'Not assigned':  # Copying Borough name to 'Not assigned' Neighborhoods
            row_t[3] = row_t[2]
        if row_t[1] in postcodes.loc[:,'PostalCode'].unique():  # Appending the Neighborhood names to duplicate PostalCode areas
            postcodes.loc[tag,'Neighborhood'] = postcodes.loc[tag,'Neighborhood'] + ', ' + row_t[3]
        else: 
            postcodes.loc[n,:] = row_t[1:4]
            tag = n

postcodes.reset_index(drop=True,inplace=True)
postcodes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


# Recreate Part 2

### Download csv file with geographical coordinates of each postal code

In [5]:
import csv

!wget -q -O Geospatial_Coordinates.csv http://cocl.us/Geospatial_data
print('Data downloaded!')

# read csv file
coord = pd.read_csv('Geospatial_Coordinates.csv')
coord.head()

Data downloaded!


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Match Postal Codes to the coordinate values to populate the dataframe

In [6]:
for i,n in enumerate(postcodes['PostalCode']):
    res = coord[coord['Postal Code']==n]
    postcodes.loc[i,'Latitude'] = res.Latitude.values
    postcodes.loc[i,'Longitude'] = res.Longitude.values

postcodes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


# Part 3

### Downloading more libraries

In [22]:
# import k-means from clustering stage
from sklearn.cluster import KMeans
import numpy as np
import folium # map rendering library

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Libraries imported.')

Libraries imported.


### Define Foursquare Credentials and Version

In [11]:
CLIENT_ID = 'NHZOV5ISEMB3VWNLRUAQHL2DS4UIEWBROQXV0JYRWKHFV5XS' # your Foursquare ID
CLIENT_SECRET = '0LU1L02OT50RRR1GCA2KX01H0ENR2E2ZL132QX1GR5RFMKQF' # your Foursquare Secret
VERSION = '20181111' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: NHZOV5ISEMB3VWNLRUAQHL2DS4UIEWBROQXV0JYRWKHFV5XS
CLIENT_SECRET:0LU1L02OT50RRR1GCA2KX01H0ENR2E2ZL132QX1GR5RFMKQF


### Definte GetNearbyFunction from FourSquare

In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            100)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Run the function to obtain nearby venues for a subset of the neighborhood

In [80]:
toronto_data = postcodes[6:92].reset_index(drop=True)
toronto_venues = getNearbyVenues(toronto_data['Neighborhood'],toronto_data["Latitude"],toronto_data['Longitude'])

### Create a one hot encoding dataframe for the neighborhood's venues

In [81]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
#toronto_onehot.columns = ['Neighborhood'].extend(toronto_onehot.columns)
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

toronto_onehot.head()

Unnamed: 0,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Arepa Restaurant,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Group the one hot encoded dataframe by Neighborhood to obtain frequencies for each venue¶

In [82]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,...,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bathurst Manor, Downsview North, Wilson Heights",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0


### Define return_most_common_venues from FourSquare

In [83]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Top 5 most common venues for each Neighborhood

In [117]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Thai Restaurant,Steakhouse,American Restaurant
1,Agincourt,Breakfast Spot,Lounge,Clothing Store,Skating Rink,Dumpling Restaurant
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Park,Playground,Yoga Studio,Doner Restaurant,Dessert Shop
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Coffee Shop,Pizza Place,Sandwich Place,Fast Food Restaurant
4,"Bathurst Manor, Downsview North, Wilson Heights",Coffee Shop,Fast Food Restaurant,Sushi Restaurant,Middle Eastern Restaurant,Deli / Bodega


In [85]:
neighborhoods_venues_sorted.shape

(84, 6)

### Run k-means to cluster the neighborhood into 5 clusters

In [118]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([2, 2, 3, 2, 2, 2, 2, 2, 2, 2], dtype=int32)

### Create the dataframe for displaying the cluster at which each neighborhood belongs to

In [122]:
labels = abs(kmeans.labels_)
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', labels)

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_data.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,1.0,Fast Food Restaurant,Yoga Studio,Department Store,Falafel Restaurant,Event Space
1,M3B,North York,Don Mills North,43.745906,-79.352188,2.0,Gym / Fitness Center,Pool,Café,Caribbean Restaurant,Japanese Restaurant
2,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937,2.0,Fast Food Restaurant,Pizza Place,Gastropub,Gym / Fitness Center,Pharmacy
3,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,2.0,Clothing Store,Coffee Shop,Café,Cosmetics Shop,Middle Eastern Restaurant
4,M6B,North York,Glencairn,43.709577,-79.445073,2.0,Italian Restaurant,Bakery,Pizza Place,Japanese Restaurant,Pub
5,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ...",43.650943,-79.554724,4.0,Bank,Yoga Studio,Field,Farmers Market,Falafel Restaurant
6,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,2.0,Bar,Yoga Studio,Field,Farmers Market,Falafel Restaurant
7,M3C,North York,"Flemingdon Park, Don Mills South",43.725900,-79.340923,2.0,Coffee Shop,Asian Restaurant,Beer Store,Gym,Dim Sum Restaurant
8,M4C,East York,Woodbine Heights,43.695344,-79.318389,2.0,Skating Rink,Cosmetics Shop,Spa,Beer Store,Intersection
9,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,2.0,Coffee Shop,Restaurant,Hotel,Café,Clothing Store


### Generate Folium map (assuming Toronto city hall as the centre of Toronto)

In [49]:
# Assuming Toronto coordinates as 43.653170, -79.383541
tor_latitude = 43.653170
tor_longitude = -79.383541

# create map
map_clusters = folium.Map(location=[tor_latitude, tor_longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

ValueError: cannot convert float NaN to integer