In [75]:
import pandas as pd
import lxml.html as html
import numpy as np
import matplotlib.cm as cm
import matplotlib

In [4]:
url = 'http://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df = pd.read_html(url, header=0)[0]
df = df[df['Borough'] != 'Not assigned']

In [5]:
geo_df = pd.read_csv('https://cocl.us/Geospatial_data')

In [6]:
df.reset_index(drop=True)
merge_df = df.join(geo_df.set_index('Postal Code'), on='Postal Code')

In [7]:
merge_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [1]:
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
import requests
!conda install -c conda-forge folium=0.5.0 --yes
import folium
from sklearn.cluster import KMeans

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\DRONOV\miniconda3

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1g             |       he774522_0         5.7 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         5.7 MB

The following packages will be UPDATED:

  ca-certificates      anaconda::ca-certificates-2020.1.1-0 --> conda-forge::ca-certificates-2020.4.5.1-hecc5488_0
  conda                        anaconda::conda-4.8.3-py37_0 --> conda-forge::conda-4.8.3-py37hc8dfbb8_1

The following packages will be SUPERSEDED by a higher-priority channel:

  certifi               anaconda::certifi-2020.4.5.1-py37_0 --> conda-forge::certifi-2020

In [8]:
nb_df = merge_df.copy()
nb_df['Neighborhood'] = nb_df['Neighborhood'].str.split(', ')

In [9]:
nb_df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,[Parkwoods],43.753259,-79.329656
3,M4A,North York,[Victoria Village],43.725882,-79.315572
4,M5A,Downtown Toronto,"[Regent Park, Harbourfront]",43.654260,-79.360636
5,M6A,North York,"[Lawrence Manor, Lawrence Heights]",43.718518,-79.464763
6,M7A,Downtown Toronto,"[Queen's Park, Ontario Provincial Government]",43.662301,-79.389494
...,...,...,...,...,...
160,M8X,Etobicoke,"[The Kingsway, Montgomery Road, Old Mill North]",43.653654,-79.506944
165,M4Y,Downtown Toronto,[Church and Wellesley],43.665860,-79.383160
168,M7Y,East Toronto,[Business reply mail Processing Centre],43.662744,-79.321558
169,M8Y,Etobicoke,"[Old Mill South, King's Mill Park, Sunnylea, H...",43.636258,-79.498509


Let's parse the actual neighborhoods.

In [30]:
nb2_df = pd.DataFrame(columns=nb_df.columns.tolist())
for row in nb_df.iterrows():
    for neighborhood in row[1]['Neighborhood']:
        nb2_df = nb2_df.append({'Postal Code': row[1]['Postal Code'],
                       'Borough': row[1]['Borough'],
                       'Neighborhood': neighborhood,
                       'Latitude': row[1]['Latitude'],
                       'Longitude': row[1]['Longitude']            
        }, ignore_index=True)

In [40]:
nb2_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
4,M6A,North York,Lawrence Manor,43.718518,-79.464763


In [35]:
nb2_df.drop_duplicates(subset='Neighborhood', inplace=True)
nb2_df.shape

(201, 5)

In [37]:
address = 'Toronto'
geolocator = Nominatim(user_agent="canada_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [39]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)
for lat, lng, label in zip(nb2_df['Latitude'], nb2_df['Longitude'], nb2_df['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [45]:
# @hidden_cell
CLIENT_ID = 'FVL0LP5RVAUFF42PIQLTJ5Z434VVGVEDD0RNJSMON50RPCGN'
CLIENT_SECRET = '0MMJVQMWCX4I0XT3FDEXWUPN4YLK2ITVUVREBRLXEO0U1LJY'
VERSION = '20180605'
LIMIT = 100

In [46]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [48]:
toronto_venues =  getNearbyVenues(names=nb2_df['Neighborhood'],
                                   latitudes=nb2_df['Latitude'],
                                   longitudes=nb2_df['Longitude']
                                  )

Parkwoods
Victoria Village
Regent Park
Harbourfront
Lawrence Manor
Lawrence Heights
Queen's Park
Ontario Provincial Government
Islington Avenue
Malvern
Rouge
Don Mills
Parkview Hill
Woodbine Gardens
Garden District
Ryerson
Glencairn
West Deane Park
Princess Gardens
Martin Grove
Islington
Cloverdale
Rouge Hill
Port Union
Highland Creek
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate
Bloordale Gardens
Old Burnhamthorpe
Markland Wood
Guildwood
Morningside
West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor
Wilson Heights
Downsview North
Thorncliffe Park
Richmond
Adelaide
King
Dufferin
Dovercourt Village
Scarborough Village
Fairview
Henry Farm
Oriole
Northwood Park
York University
East Toronto
Harbourfront East
Union Station
Toronto Islands
Little Portugal
Trinity
Kennedy Park
Ionview
East Birchmount Park
Bayview Village
Downsview
The Danforth West
Riverdale
Toronto Dominion Centre
Design Ex

In [51]:
toronto_venues.shape

(4077, 7)

In [52]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adelaide,93,93,93,93,93,93
Agincourt,4,4,4,4,4,4
Agincourt North,2,2,2,2,2,2
Albion Gardens,12,12,12,12,12,12
Alderwood,11,11,11,11,11,11
...,...,...,...,...,...,...
Woodbine Heights,9,9,9,9,9,9
York Mills,1,1,1,1,1,1
York Mills West,4,4,4,4,4,4
York University,6,6,6,6,6,6


In [53]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 264 uniques categories.


In [55]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

Unnamed: 0,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
_ = toronto_onehot.pop('Neighborhood')
toronto_onehot.insert(0,'Neighborhood', _)

In [58]:
toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
toronto_onehot.shape

(4077, 264)

In [60]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head(15)

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Adelaide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021505,...,0.0,0.010753,0.0,0.0,0.0,0.0,0.0,0.0,0.010753,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Agincourt North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Albion Gardens,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Alderwood,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Bathurst Manor,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0
6,Bathurst Quay,0.0,0.0,0.058824,0.058824,0.058824,0.117647,0.176471,0.058824,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Beaumond Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Bedford Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [64]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Café,Restaurant,Deli / Bodega,Gym,Thai Restaurant,Clothing Store,Hotel,Pizza Place,Seafood Restaurant
1,Agincourt,Lounge,Latin American Restaurant,Clothing Store,Breakfast Spot,Electronics Store,Ethiopian Restaurant,Event Space,Eastern European Restaurant,Department Store,Drugstore
2,Agincourt North,Park,Playground,Dog Run,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Distribution Center,Doner Restaurant
3,Albion Gardens,Grocery Store,Liquor Store,Coffee Shop,Pharmacy,Pizza Place,Fried Chicken Joint,Beer Store,Fast Food Restaurant,Discount Store,Japanese Restaurant
4,Alderwood,Pizza Place,Dance Studio,Coffee Shop,Gym,Skating Rink,Pharmacy,Pub,Athletics & Sports,Sandwich Place,Pool


In [65]:
kclusters = 5
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

In [96]:
#neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_merged = nb2_df
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
toronto_merged.dropna(axis=0, inplace=True)
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype('int')

In [97]:
pd.set_option('display.max_rows', None)
toronto_merged

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,2,Park,Fast Food Restaurant,Food & Drink Shop,Yoga Studio,Dog Run,Dessert Shop,Diner,Discount Store,Distribution Center,Donut Shop
1,M4A,North York,Victoria Village,43.725882,-79.315572,1,Coffee Shop,Pizza Place,French Restaurant,Portuguese Restaurant,Hockey Arena,Yoga Studio,Distribution Center,Deli / Bodega,Department Store,Dessert Shop
2,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636,1,Coffee Shop,Pub,Bakery,Park,Breakfast Spot,Restaurant,Theater,Café,Health Food Store,Historic Site
3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,1,Coffee Shop,Pub,Bakery,Park,Breakfast Spot,Restaurant,Theater,Café,Health Food Store,Historic Site
4,M6A,North York,Lawrence Manor,43.718518,-79.464763,1,Furniture / Home Store,Clothing Store,Women's Store,Boutique,Miscellaneous Shop,Event Space,Athletics & Sports,Accessories Store,Vietnamese Restaurant,Coffee Shop
5,M6A,North York,Lawrence Heights,43.718518,-79.464763,1,Furniture / Home Store,Clothing Store,Women's Store,Boutique,Miscellaneous Shop,Event Space,Athletics & Sports,Accessories Store,Vietnamese Restaurant,Coffee Shop
6,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,1,Coffee Shop,Sushi Restaurant,Yoga Studio,Creperie,Bar,Beer Bar,Sandwich Place,Burrito Place,Restaurant,Café
7,M7A,Downtown Toronto,Ontario Provincial Government,43.662301,-79.389494,1,Coffee Shop,Sushi Restaurant,Yoga Studio,Creperie,Bar,Beer Bar,Sandwich Place,Burrito Place,Restaurant,Café
9,M1B,Scarborough,Malvern,43.806686,-79.194353,4,Fast Food Restaurant,Farmers Market,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Doner Restaurant,Dog Run
10,M1B,Scarborough,Rouge,43.806686,-79.194353,4,Fast Food Restaurant,Farmers Market,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Doner Restaurant,Dog Run


In [98]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [matplotlib.colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters