<h1>Problem 1</h1>

In [1]:
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

We create a proper pandas dataframe from <a href="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M">this link</a>. We parse the table in its current state: skip 'Not assigned' fields and use regular expressions to separate borough from neighbourhood. Then we replace slashes with commas to fit the required format.

In [2]:
# Download the Wikipedia page that lists the "M" postal codes.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
html = response.text

# NOTE(review): the 'xml' parser is kept because the recorded output below was
# produced with it; 'html.parser' / 'lxml' is the usual choice for HTML pages --
# confirm the table is still found before switching.
soup = BeautifulSoup(html, 'xml')
table = soup.find('table')

# Collect the rows in a plain list and build the DataFrame once at the end;
# appending with df.loc[len(df)] re-allocates the frame on every row.
records = []
for row in table.find_all('tr'):
    for cell in row.find_all('td'):
        postal_code = cell.find('p').find('b').text
        borough = cell.find('p').find('span').text
        if borough == 'Not assigned':
            continue
        # Split "<borough>(<neighbourhoods>)" into its two parts; cells whose
        # text does not have that shape are skipped.
        match_result = re.match(r'(.*)\((.*)\)', borough)
        if match_result:
            m1 = match_result.group(1)
            # Neighbourhoods are separated by " / " on the page; use commas.
            m2 = match_result.group(2).replace(' / ', ', ')
            records.append([postal_code, m1, m2])

df = pd.DataFrame(records, columns=['Postalcode', 'Borough', 'Neighborhood'])
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [3]:
# Sanity check: (number of rows, number of columns) of the scraped table.
df.shape

(103, 3)

<h1>Problem 2</h1>

We download the geodata file and have a look at it

In [4]:
# Read the coordinate table and rename its key column ('Postal Code' ->
# 'Postalcode') so it matches the key column name used in `df`.
geodata = pd.read_csv('https://cocl.us/Geospatial_data').rename(
    columns={'Postal Code': 'Postalcode'}
)
geodata.head()

Unnamed: 0,Postalcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Now for each line in the original dataframe we look up a matching postal code and append latitude and longitude

In [5]:
# Attach the coordinates to every postal code (left join keeps all rows of df).
# Columns that geodata contributes are dropped first, so re-running this cell
# does not produce suffixed duplicates such as Latitude_x / Latitude_y.
coordinate_columns = [col for col in geodata.columns if col != 'Postalcode']
df = (
    df.drop(columns=coordinate_columns, errors='ignore')
      .merge(geodata, on="Postalcode", how='left')
)
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


<h1>Problem 3</h1>

First of all, we try to visualize our data.

In [6]:
import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    certifi-2019.11.28         |   py36h9f0ad1d_1         149 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    openssl-1.1.1e             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1

In [7]:
address = 'Toronto, Canada'

# Look the address up through Nominatim; `user_agent` is the client name sent
# with the request.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() yields None when nothing is found; stop with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise RuntimeError('Geocoding returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [80]:
# Base map centred on the geocoded coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# One blue circle per row of df, with a "<neighborhood>, <borough>" popup.
marker_columns = zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood'])
for lat, lng, borough, neighborhood in marker_columns:
    popup_text = '{}, {}'.format(neighborhood, borough)
    # NOTE(review): parse_html=False is forwarded to CircleMarker exactly as in
    # the original call; confirm whether CircleMarker actually consumes it.
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

In [81]:
# Foursquare credentials are read from the environment so they are never stored
# in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel; a missing variable raises KeyError naming it.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']  # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']  # your Foursquare Secret
VERSION = '20180605'  # Foursquare API version
LIMIT = 200  # value sent as the `limit` query parameter of each request

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Only the length of the secret is revealed, so saved outputs do not leak it.
print('CLIENT_SECRET:' + '*' * len(CLIENT_SECRET))

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


In [118]:
def getNearbyVenues(names, latitudes, longitudes, radius=1500):
    """Query the Foursquare ``venues/explore`` endpoint once per location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; they are consumed together with ``zip``, one
        request per (name, lat, lng) triple.
    radius : int, default 1500
        Sent as the ``radius`` query parameter of every request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue was returned at all.

    Raises
    ------
    requests.HTTPError
        If a request comes back with an HTTP error status.

    Notes
    -----
    Uses the notebook-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    Prints each name and the number of venues received for it.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and URL-encode the query string instead of
        # formatting it by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']
        print(len(results))

        # Keep only the relevant information for each nearby venue.
        for item in results:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue without any category gets None instead of raising
                # IndexError on categories[0].
                categories[0]['name'] if categories else None,
            ))

    # Passing `columns` to the constructor keeps the schema for an empty result;
    # assigning .columns afterwards fails with a length mismatch in that case.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [119]:
# One Foursquare lookup per row of df, using the helper's default radius.
venues = getNearbyVenues(
    names=df['Neighborhood'],
    latitudes=df['Latitude'],
    longitudes=df['Longitude'],
)

Parkwoods
39
Victoria Village
46
Regent Park, Harbourfront
100
Lawrence Manor, Lawrence Heights
100
Queen's Park, Ontario Provincial Government
100
Islington Avenue
16
Malvern, Rouge
35
Don Mills
95
Parkview Hill, Woodbine Gardens
43
Garden District, Ryerson
100
Glencairn
60
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
23
Rouge Hill, Port Union, Highland Creek
10
Flemingdon Park
88
Woodbine Heights
68
St. James Town
100
Humewood-Cedarvale
93
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
42
Guildwood, Morningside, West Hill
34
The Beaches
100
Berczy Park
100
Caledonia-Fairbanks
72
Woburn
34
Leaside
72
Central Bay Street
100
Christie
100
Cedarbrae
70
Hillcrest Village
52
Bathurst Manor, Wilson Heights, Downsview North
43
Thorncliffe Park
95
Richmond, Adelaide, King
100
Dufferin, Dovercourt Village
100
Scarborough Village
34
Fairview, Henry Farm, Oriole
65
Northwood Park, York University
43
The Danforth  East
85
Harbourfront East, Union Station, T

We queried venues info and put it into a dataframe. Let's explore the data received.

In [120]:
# Preview the first rows of the venue table.
venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,Parkwoods,43.753259,-79.329656,Donalda Golf & Country Club,43.752816,-79.342741,Golf Course
2,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
3,Parkwoods,43.753259,-79.329656,Tim Hortons,43.760668,-79.326368,Café
4,Parkwoods,43.753259,-79.329656,LCBO,43.757774,-79.314257,Liquor Store


In [121]:
# Non-null value count of every column, per neighborhood name.
venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,66,66,66,66,66,66
"Alderwood, Long Branch",47,47,47,47,47,47
"Bathurst Manor, Wilson Heights, Downsview North",43,43,43,43,43,43
Bayview Village,16,16,16,16,16,16
"Bedford Park, Lawrence Manor East",77,77,77,77,77,77
Berczy Park,100,100,100,100,100,100
"Birch Cliff, Cliffside West",16,16,16,16,16,16
"Brockton, Parkdale Village, Exhibition Place",100,100,100,100,100,100
CFB Toronto,27,27,27,27,27,27
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",69,69,69,69,69,69


In [122]:
# Number of distinct venue categories (a missing value would count as one).
category_count = venues['Venue Category'].nunique(dropna=False)
print('There are {} uniques categories.'.format(category_count))

There are 344 uniques categories.


In [123]:
# one hot encoding
venues_onehot = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
venues_onehot['Neighborhood'] = venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [venues_onehot.columns[-1]] + list(venues_onehot.columns[:-1])
venues_onehot = venues_onehot[fixed_columns]

venues_onehot.head()

Unnamed: 0,Zoo Exhibit,ATM,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,...,Volleyball Court,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [124]:
# Average the one-hot rows within each neighborhood; 'Neighborhood' stays a
# regular column rather than becoming the index.
venues_grouped = venues_onehot.groupby('Neighborhood', as_index=False).mean()
venues_grouped

Unnamed: 0,Neighborhood,Zoo Exhibit,ATM,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,...,Volleyball Court,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo
0,Agincourt,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.00,0.000,0.000000,0.000000,0.000000,0.000000,0.0
1,"Alderwood, Long Branch",0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.00,0.000,0.021277,0.000000,0.000000,0.000000,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.023256,0.0,...,0.000000,0.000000,0.0,0.00,0.000,0.000000,0.000000,0.000000,0.000000,0.0
3,Bayview Village,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.00,0.000,0.000000,0.000000,0.000000,0.000000,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.00,0.000,0.012987,0.000000,0.000000,0.000000,0.0
5,Berczy Park,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.020000,0.0,...,0.000000,0.000000,0.0,0.00,0.000,0.000000,0.000000,0.000000,0.000000,0.0
6,"Birch Cliff, Cliffside West",0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.00,0.000,0.000000,0.000000,0.000000,0.000000,0.0
7,"Brockton, Parkdale Village, Exhibition Place",0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.020000,0.0,...,0.000000,0.000000,0.0,0.00,0.000,0.000000,0.000000,0.000000,0.000000,0.0
8,CFB Toronto,0.0,0.000000,0.0,0.000000,0.0,0.037037,0.000000,0.000000,0.0,...,0.000000,0.037037,0.0,0.00,0.000,0.000000,0.000000,0.000000,0.000000,0.0
9,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.000000,0.0,0.000000,0.0,0.014493,0.014493,0.000000,0.0,...,0.000000,0.000000,0.0,0.00,0.000,0.000000,0.000000,0.000000,0.014493,0.0


In [125]:
# (number of grouped neighborhoods, number of columns including 'Neighborhood').
venues_grouped.shape

(100, 344)

Let's build a table of each neighborhood along with its top 10 most common venue categories

In [126]:
def return_most_common_venues(row, num_top_venues):
    """Return the index labels of the largest values in `row`.

    The first element of `row` is skipped (positional slice ``iloc[1:]``), the
    remaining values are sorted in descending order, and the labels of the
    leading `num_top_venues` entries are returned as a NumPy array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [128]:
import numpy as np  # kept in this cell: a later cell uses `np`
num_top_venues = 10


def _ordinal(position):
    """Return `position` with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    if 10 <= position % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(position % 10, 'th')
    return '{}{}'.format(position, suffix)


# Column headers: the neighborhood name followed by one column per rank.
# The suffix is computed explicitly instead of relying on a bare `except:`
# around a list lookup.
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# Rank the categories of every neighborhood, then build the table in one step
# rather than assigning into an empty frame row by row.
top_venues = [
    return_most_common_venues(venues_grouped.iloc[ind, :], num_top_venues)
    for ind in range(venues_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(
    top_venues, columns=columns[1:], index=venues_grouped.index
)
neighborhoods_venues_sorted.insert(0, 'Neighborhood', venues_grouped['Neighborhood'])

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Chinese Restaurant,Shopping Mall,Coffee Shop,Cantonese Restaurant,Breakfast Spot,Asian Restaurant,Caribbean Restaurant,Department Store,Bakery,Gym / Fitness Center
1,"Alderwood, Long Branch",Discount Store,Coffee Shop,Café,Light Rail Station,Restaurant,Bank,Clothing Store,Toy / Game Store,Grocery Store,Park
2,"Bathurst Manor, Wilson Heights, Downsview North",Park,Coffee Shop,Gas Station,Pizza Place,Bank,Ski Chalet,Baseball Field,Fried Chicken Joint,Sushi Restaurant,French Restaurant
3,Bayview Village,Japanese Restaurant,Gas Station,Park,Trail,Bank,Intersection,Grocery Store,Skating Rink,Café,Restaurant
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Bakery,Italian Restaurant,Sushi Restaurant,Café,Bagel Shop,Pizza Place,Pub,Asian Restaurant,Ice Cream Shop
5,Berczy Park,Coffee Shop,Hotel,Café,Japanese Restaurant,Restaurant,Italian Restaurant,Beer Bar,Park,Bakery,Gastropub
6,"Birch Cliff, Cliffside West",Park,Skating Rink,Restaurant,General Entertainment,Golf Course,Café,Gym,Chinese Restaurant,Gym Pool,Diner
7,"Brockton, Parkdale Village, Exhibition Place",Café,Coffee Shop,Gift Shop,Restaurant,Bar,Bakery,Arts & Crafts Store,Soccer Stadium,Furniture / Home Store,Japanese Restaurant
8,CFB Toronto,Athletics & Sports,Gym / Fitness Center,Turkish Restaurant,Racetrack,Coffee Shop,Men's Store,Food Court,Beer Store,Basketball Court,Liquor Store
9,"CN Tower, King and Spadina, Railway Lands, Har...",Park,Coffee Shop,Café,Restaurant,Harbor / Marina,Gym,Boat or Ferry,Pizza Place,Hotel,Brewery


Now we cluster our data

In [129]:
# set number of clusters
kclusters = 7

venues_grouped_clustering = venues_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(venues_grouped_clustering)

In [130]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

venues_merged = df

venues_merged = venues_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

venues_merged

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,4,Pharmacy,Coffee Shop,Bus Stop,Fast Food Restaurant,Bank,Gas Station,Park,Supermarket,Liquor Store,Beer Store
1,M4A,North York,Victoria Village,43.725882,-79.315572,4,Coffee Shop,Gym,Grocery Store,Middle Eastern Restaurant,Fast Food Restaurant,Rental Car Location,Gym / Fitness Center,Pizza Place,Portuguese Restaurant,Bakery
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636,0,Coffee Shop,Café,Restaurant,Park,Pub,Italian Restaurant,Bar,Bakery,Thai Restaurant,Theater
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,3,Clothing Store,Restaurant,Coffee Shop,Fast Food Restaurant,Dessert Shop,Sandwich Place,Furniture / Home Store,Vietnamese Restaurant,Greek Restaurant,Fried Chicken Joint
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,Japanese Restaurant,Park,Italian Restaurant,Pizza Place,Gastropub,Ramen Restaurant,Café,Restaurant,Steakhouse
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242,2,Bank,Skating Rink,Bus Line,Liquor Store,Golf Course,Café,Grocery Store,Shopping Mall,Park,Bus Stop
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,5,Zoo Exhibit,Fast Food Restaurant,Pizza Place,Restaurant,Coffee Shop,Gas Station,Caribbean Restaurant,Liquor Store,Other Great Outdoors,Chinese Restaurant
7,M3B,North York,Don Mills,43.745906,-79.352188,3,Coffee Shop,Japanese Restaurant,Restaurant,Bank,Pizza Place,Italian Restaurant,Burger Joint,Fried Chicken Joint,Liquor Store,Sandwich Place
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,4,Pharmacy,Fast Food Restaurant,Pizza Place,Park,Bakery,Convenience Store,Brewery,Gym / Fitness Center,Coffee Shop,Gastropub
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Coffee Shop,Japanese Restaurant,Restaurant,Café,Gastropub,Theater,Clothing Store,Cosmetics Shop,Plaza,American Restaurant


Now we visualize our clusters on the map

In [None]:

import math

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(venues_merged['Latitude'], venues_merged['Longitude'], venues_merged['Neighborhood'], venues_merged['Cluster Labels']):
    # Rows without a cluster label (NaN) are skipped before any marker
    # objects are created.
    if math.isnan(cluster):
        continue
    cluster_id = int(cluster)  # the label column holds floats when NaN is present
    # Index colours directly by label; `cluster - 1` sent label 0 to the
    # last colour through negative indexing.
    cluster_color = rainbow[cluster_id]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster_id), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters