Part 1

In [1]:
import numpy as np
import pandas as pd
import urllib

In [2]:
page = urllib.request.urlopen("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
content = page.read()

In [3]:
#convert data to usable format
scontent = content.decode("UTF-8")

In [4]:
#extract data needed
tables = scontent[scontent.find("<table"):scontent.find("</table>")+8]

In [5]:
#read data using pandas
data = pd.read_html(tables, header = 0)[0]

In [6]:
#get rows that have a defined burough, neighbohoods are grouped by postal code
data = data[data.Borough != "Not assigned"]
data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
data.shape

(103, 3)

Part 2

In [8]:
csv_path = 'http://cocl.us/Geospatial_data'
df = pd.read_csv(csv_path).set_index('Postal Code')
df.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [9]:
Combined_data = data.merge(df, on='Postal Code', how='left')
Combined_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


Part 3

In [10]:
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
!conda install -c conda-forge folium=0.5.0 --yes
import folium
import requests
from tqdm import tqdm
from collections import deque
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0           conda-forge
    geopy:          

In [11]:
address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [14]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

neighborhoods = Combined_data

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'],
                                           neighborhoods['Longitude'],
                                           neighborhoods['Borough'],
                                           neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

FourSquare access

In [15]:
CLIENT_ID = 'TQV5UDZPIJNN5BTPK2ZNYGI2C2KJPJDS3PKP2KPIXDMLB1AW' # your Foursquare ID
CLIENT_SECRET = '52RMS13HRFWLJF0ALRXMVAQIMSRMFS5JUUKM4FNC4A5H5LFF' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: TQV5UDZPIJNN5BTPK2ZNYGI2C2KJPJDS3PKP2KPIXDMLB1AW
CLIENT_SECRET:52RMS13HRFWLJF0ALRXMVAQIMSRMFS5JUUKM4FNC4A5H5LFF


Gets the top 100 venues around by making use of the name and locations of neighborhoods in Toronto.

In [16]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT = 100):
    
    venues_list=[]
    for name, lat, lng in tqdm(zip(names, latitudes, longitudes), total = names.size):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [18]:
Toronto_venues = getNearbyVenues(Combined_data.Neighborhood,
                            Combined_data.Latitude,
                            Combined_data.Longitude)

100%|██████████| 103/103 [00:26<00:00,  3.55it/s]


In [19]:
print(Toronto_venues.shape)
Toronto_venues.head()

(2127, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [20]:
#number of venues per neighborhood
Toronto_venues.groupby("Neighborhood").Venue.count().sort_values(ascending=False).head()

Neighborhood
First Canadian Place, Underground city               100
Commerce Court, Victoria Hotel                       100
Garden District, Ryerson                             100
Harbourfront East, Union Station, Toronto Islands    100
Toronto Dominion Centre, Design Exchange             100
Name: Venue, dtype: int64

In [21]:
#number of unique categories
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 275 uniques categories.


In [22]:
#create a dataframe using venue data
Toronto_OHE = pd.get_dummies(Toronto_venues["Venue Category"],
                             prefix = "",
                             prefix_sep = "")

Toronto_OHE["Neighborhood"] = Toronto_venues["Neighborhood"]


nindex = list(Toronto_OHE.columns).index("Neighborhood")
cols = deque(Toronto_OHE.columns)
cols.rotate(-nindex)
cols = list(cols)
Toronto_OHE = Toronto_OHE[cols]

Toronto_OHE.head()

Unnamed: 0,Neighborhood,New American Restaurant,Nightclub,Noodle House,Office,Opera House,Optical Shop,Organic Grocery,Other Great Outdoors,Park,...,Mobile Phone Shop,Modern European Restaurant,Molecular Gastronomy Restaurant,Monument / Landmark,Moroccan Restaurant,Motel,Movie Theater,Museum,Music Venue,Nail Salon
0,Parkwoods,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
#check shape of new dataframe
Toronto_OHE.shape

(2127, 275)

In [24]:
# average number of venues per neighborhood
Toronto_grouped = Toronto_OHE.groupby('Neighborhood').mean().reset_index()
Toronto_grouped.head()

Unnamed: 0,Neighborhood,New American Restaurant,Nightclub,Noodle House,Office,Opera House,Optical Shop,Organic Grocery,Other Great Outdoors,Park,...,Mobile Phone Shop,Modern European Restaurant,Molecular Gastronomy Restaurant,Monument / Landmark,Moroccan Restaurant,Motel,Movie Theater,Museum,Music Venue,Nail Salon
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
Toronto_grouped.shape

(94, 275)

Cluster based on frequent venue categories per neighborhood

In [26]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [27]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Latin American Restaurant,Lounge,Breakfast Spot,Skating Rink,Art Gallery,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Nail Salon
1,"Alderwood, Long Branch",Pizza Place,Coffee Shop,Skating Rink,Gym,Dance Studio,Sandwich Place,Pub,Pharmacy,American Restaurant,Antique Shop
2,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Shopping Mall,Middle Eastern Restaurant,Supermarket,Grocery Store,Convenience Store,Sushi Restaurant,Sandwich Place,Pharmacy
3,Bayview Village,Bank,Café,Chinese Restaurant,Japanese Restaurant,Art Museum,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Sandwich Place,Italian Restaurant,Restaurant,Butcher,Thai Restaurant,Sushi Restaurant,Greek Restaurant,Grocery Store,American Restaurant


Neighborhood clustering, reduce noise to improve efficiency

In [29]:
pca = PCA(.95)
Toronto_grouped_clustering = pca.fit_transform(Toronto_grouped.drop('Neighborhood', 1))
Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', 1)

In [30]:
Toronto_grouped_clustering.shape

(94, 274)

Carry out Kmeans clustering

In [31]:
# set number of clusters
kclusters = 5

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
print(kmeans.labels_[0:10])
print(kmeans.labels_.shape)

[1 0 1 1 1 1 1 1 1 1]
(94,)


In [34]:
Toronto_grouped["Cluster Labels"] = kmeans.labels_

# add clustering labels
Toronto_combined = Combined_data.merge(Toronto_grouped, left_on = "Neighborhood", right_on = "Neighborhood", how = "outer")

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Toronto_combined = Toronto_combined.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Toronto_combined["Cluster Labels"] = Toronto_combined["Cluster Labels"].fillna(5).astype("int")

Toronto_combined.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,New American Restaurant,Nightclub,Noodle House,Office,Opera House,...,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0.0,0.0,0.0,0.0,0.0,...,Park,Food & Drink Shop,Airport Gate,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,Nail Salon
1,M4A,North York,Victoria Village,43.725882,-79.315572,0.0,0.0,0.0,0.0,0.0,...,Portuguese Restaurant,French Restaurant,Hockey Arena,Pizza Place,Coffee Shop,Intersection,Vietnamese Restaurant,Airport Service,BBQ Joint,Auto Workshop
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0.0,0.0,0.0,0.0,0.0,...,Coffee Shop,Pub,Bakery,Park,Breakfast Spot,Restaurant,Theater,Café,Art Gallery,Event Space
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0.0,0.0,0.0,0.0,0.0,...,Clothing Store,Furniture / Home Store,Accessories Store,Arts & Crafts Store,Event Space,Vietnamese Restaurant,Gift Shop,Boutique,Miscellaneous Shop,Coffee Shop
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0.0,0.0,0.0,0.0,0.0,...,Coffee Shop,Smoothie Shop,Yoga Studio,Gym,Bar,Beer Bar,Creperie,Italian Restaurant,Sushi Restaurant,Sandwich Place


In [35]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

kclusters = kclusters + 1

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_combined['Latitude'],
                                  Toronto_combined['Longitude'],
                                  Toronto_combined['Neighborhood'],
                                  Toronto_combined['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Analyze Clusters

Cluster 1 has many resturants

In [36]:
Toronto_combined.loc[Toronto_combined['Cluster Labels'] == 0, 
                     "1st Most Common Venue":"10th Most Common Venue"].head()

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Portuguese Restaurant,French Restaurant,Hockey Arena,Pizza Place,Coffee Shop,Intersection,Vietnamese Restaurant,Airport Service,BBQ Joint,Auto Workshop
11,Pizza Place,Pub,Japanese Restaurant,Sushi Restaurant,Park,Nail Salon,Art Gallery,Airport Terminal,American Restaurant,Antique Shop
52,Pizza Place,Nail Salon,Airport Gate,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum
70,Pizza Place,Intersection,Discount Store,Sandwich Place,Chinese Restaurant,Middle Eastern Restaurant,Coffee Shop,Nail Salon,Airport Terminal,American Restaurant
77,Sandwich Place,Bus Line,Pizza Place,Mobile Phone Shop,Nail Salon,Art Gallery,Airport Service,Airport Terminal,American Restaurant,Antique Shop


Cluster 2 has many shops

In [37]:
Toronto_combined.loc[Toronto_combined['Cluster Labels'] == 1, 
                     "1st Most Common Venue":"10th Most Common Venue"].head()

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Coffee Shop,Pub,Bakery,Park,Breakfast Spot,Restaurant,Theater,Café,Art Gallery,Event Space
3,Clothing Store,Furniture / Home Store,Accessories Store,Arts & Crafts Store,Event Space,Vietnamese Restaurant,Gift Shop,Boutique,Miscellaneous Shop,Coffee Shop
4,Coffee Shop,Smoothie Shop,Yoga Studio,Gym,Bar,Beer Bar,Creperie,Italian Restaurant,Sushi Restaurant,Sandwich Place
6,Fast Food Restaurant,Print Shop,Nail Salon,Art Gallery,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Museum,Airport Lounge
7,Asian Restaurant,Japanese Restaurant,Restaurant,Gym,Beer Store,Coffee Shop,Supermarket,Bike Shop,Dim Sum Restaurant,Art Gallery


Cluster 3 is an outlier containing one neigborhood

In [38]:
Toronto_combined.loc[Toronto_combined['Cluster Labels'] == 2, 
                     "1st Most Common Venue":"10th Most Common Venue"].head()

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
64,Convenience Store,Nail Salon,Art Gallery,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Museum,Bagel Shop


Cluster 4 is dominated ny a park

In [39]:
Toronto_combined.loc[Toronto_combined['Cluster Labels'] == 3, 
                     "1st Most Common Venue":"10th Most Common Venue"].head()

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Park,Food & Drink Shop,Airport Gate,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,Nail Salon
21,Park,Women's Store,Pool,BBQ Joint,Auto Workshop,Auto Garage,Athletics & Sports,Asian Restaurant,Arts & Crafts Store,Airport Lounge
35,Park,Convenience Store,Airport Gate,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,Nail Salon
61,Park,Swim School,Bus Line,Art Museum,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery
66,Bank,Convenience Store,Park,Auto Workshop,Auto Garage,Athletics & Sports,Asian Restaurant,Arts & Crafts Store,Art Museum,Airport Gate


Cluster 5 has a baseball field

In [40]:
Toronto_combined.loc[Toronto_combined['Cluster Labels'] == 4, 
                     "1st Most Common Venue":"10th Most Common Venue"].head()

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
58,Baseball Field,Nail Salon,Art Museum,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,Arts & Crafts Store
