# This Notebook is for the IBM Data Science Professional Certificate

### Installing and Importing packages needed

In [3]:
!pip install selenium

Collecting selenium
  Using cached selenium-3.141.0-py2.py3-none-any.whl (904 kB)
Installing collected packages: selenium
Successfully installed selenium-3.141.0


In [3]:
import pandas as pd
import numpy as np
from selenium import webdriver
from time import sleep

### Getting Data from Wikipedia page into Data Frame

In [4]:
# creating lists
postal_codes = []
boroughs = []
neighborhoods = []

# initializing chrome driver and opening wikipedia page
browser = webdriver.Chrome()
browser.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
sleep(5)

# locating table columns by xpath
postal = browser.find_elements_by_xpath ("//*[@id='mw-content-text']/div[1]/table[1]/tbody/tr/td[1]")
borough = browser.find_elements_by_xpath ("//*[@id='mw-content-text']/div[1]/table[1]/tbody/tr/td[2]")
neighbor = browser.find_elements_by_xpath ("//*[@id='mw-content-text']/div[1]/table[1]/tbody/tr/td[3]")

# copying data of fist column to postal_codes list
for i in postal:
    postal_codes.append(i.text)

# copying data of second column to boroughs list
for i in borough:
    boroughs.append(i.text)

# copying data of third column to neighborhoods list
for i in neighbor:
    neighborhoods.append(i.text)

browser.close()

# combining lists created above
lst = zip(postal_codes,boroughs,neighborhoods)

# creating dataframe containing list data
df = pd.DataFrame(lst, columns = ["PostalCodes", "Boroughs", "Neighborhoods"])
df


Unnamed: 0,PostalCodes,Boroughs,Neighborhoods
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### Removing Missing Values from Data Frame

In [6]:
# removing "Not Assigned" Boroughs
df.drop(df[df['Boroughs'] == "Not assigned"].index, inplace = True)
df = df.reset_index(drop = True)

# replacing "Not Assigned" Neighborhoods with Borough name
df.loc[df.Neighborhoods == "Not assigned", 'Neighborhoods'] = df['Boroughs']

print("The size of the table after dropping Not Assigned boroughs and renaming neighborhoods: ")
df.shape

The size of the table after dropping Not Assigned boroughs and renaming neighborhoods: 


(103, 3)

### Getting Latitude and Longitude data

In [30]:
# reading csv and transforming into data frame
lat_long = pd.read_csv("Geospatial_Coordinates.csv")

# formatting column names and merging columns
lat_long.columns = ["PostalCodes", "Latitude", "Longitude"]
df = pd.merge(df, lat_long, on = "PostalCodes", how = "inner")
df

# Reducing dataframe into the necessary columns
df = df[["PostalCodes", "Boroughs", "Neighborhoods", "Latitude", "Longitude"]]
df

Unnamed: 0,PostalCodes,Boroughs,Neighborhoods,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


### Exploring Neighborhoods in Toronto

In [33]:
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

import requests # library to handle requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [58]:
from itertools import chain

# return list from series of comma-separated strings
def chainer(s):
    return list(chain.from_iterable(s.str.split(',')))

# get only Boroughs that are in Toronto
toronto = df[df['Boroughs'].str.contains('Toronto')]

# calculate lengths of splits
lens = toronto['Neighborhoods'].str.split(',').map(len)

# create new dataframe, repeating all values except for neighborhoods which are chained
split_t = pd.DataFrame({'PostalCodes': np.repeat(toronto['PostalCodes'], lens),
                    'Boroughs': np.repeat(toronto['Boroughs'], lens),
                    'Neighborhoods': chainer(toronto['Neighborhoods']),
                    'Latitude': np.repeat(toronto['Latitude'], lens),
                    'Longitude': np.repeat(toronto['Longitude'], lens)})

# resulting dataframe neighborhoods that are in Toronto only and has one row per neighborhood
split_t = split_t.reset_index(drop=True)
split_t

Unnamed: 0,PostalCodes,Boroughs,Neighborhoods,Latitude,Longitude
0,M5A,Downtown Toronto,Regent Park,43.654260,-79.360636
1,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
2,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
3,M7A,Downtown Toronto,Ontario Provincial Government,43.662301,-79.389494
4,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
...,...,...,...,...,...
73,M5X,Downtown Toronto,First Canadian Place,43.648429,-79.382280
74,M5X,Downtown Toronto,Underground city,43.648429,-79.382280
75,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
76,M7Y,East Toronto,Business reply mail Processing Centre,43.662744,-79.321558


In [59]:
address = 'Toronto, Ontario'

# gets latitude and longitude values for Toronto
geolocator = Nominatim(user_agent="ca_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, Ontario are {}, {}.'.format(latitude, longitude))

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(split_t['Latitude'], split_t['Longitude'], split_t['Boroughs'], split_t['Neighborhoods']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

The geograpical coordinate of Toronto, Ontario are 43.6534817, -79.3839347.


In [60]:
# @hidden cell
CLIENT_ID = 'RD3GK2KL021H3ZNCPR1R0XA3FP3MIC1KYTKBF3NTNO43NL0X' # your Foursquare ID
CLIENT_SECRET = 'ZH5IAHSASZ1HDBAOLN4BO2KAOSB2IPACQLXOXJW2BP5PJRWK' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: RD3GK2KL021H3ZNCPR1R0XA3FP3MIC1KYTKBF3NTNO43NL0X
CLIENT_SECRET:ZH5IAHSASZ1HDBAOLN4BO2KAOSB2IPACQLXOXJW2BP5PJRWK


In [43]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [None]:
# gets 100 nearby venues in each neighborhood
toronto_venues = getNearbyVenues(names=split_t['Neighborhoods'],
                                   latitudes= split_t['Latitude'],
                                   longitudes=split_t['Longitude']
                                  )

In [96]:
print(toronto_venues.shape)
toronto_venues.head()

(3187, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Regent Park,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,Regent Park,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Regent Park,43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,Regent Park,43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
4,Regent Park,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa


In [68]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
toronto_onehot.shape

(3187, 236)

In [70]:
# gets frequency of venue per neighborhood as an average value of venue type
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,Adelaide,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.020619,...,0.0,0.000000,0.0,0.000000,0.00,0.010309,0.000000,0.000000,0.000000,0.0
1,Bathurst Quay,0.000000,0.0,0.0,0.058824,0.058824,0.117647,0.176471,0.117647,0.000000,...,0.0,0.000000,0.0,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.0
2,Cabbagetown,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.0
3,Chinatown,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.00,0.064516,0.000000,0.048387,0.016129,0.0
4,Deer Park,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.071429,...,0.0,0.000000,0.0,0.000000,0.00,0.000000,0.000000,0.071429,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,The Annex,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.0
73,The Beaches,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.250000,0.00,0.000000,0.000000,0.000000,0.000000,0.0
74,The Danforth West,0.023256,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.023256,...,0.0,0.023256,0.0,0.023256,0.00,0.000000,0.000000,0.000000,0.000000,0.0
75,Toronto Dominion Centre,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.030000,...,0.0,0.000000,0.0,0.000000,0.01,0.010000,0.000000,0.000000,0.010000,0.0


In [74]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

# resulting data frame contains each neighborhood and their 10 most common venue types
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Café,Restaurant,Hotel,Gym,Thai Restaurant,Deli / Bodega,Burrito Place,Pizza Place,Bakery
1,Bathurst Quay,Airport Service,Airport Lounge,Airport Terminal,Coffee Shop,Bar,Plane,Rental Car Location,Sculpture Garden,Boutique,Boat or Ferry
2,Cabbagetown,Coffee Shop,Bakery,Pizza Place,Pub,Chinese Restaurant,Café,Restaurant,Italian Restaurant,Butcher,Jewelry Store
3,Chinatown,Café,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Coffee Shop,Mexican Restaurant,Bar,Park,Arts & Crafts Store,Grocery Store,Gaming Cafe
4,Deer Park,Coffee Shop,Sushi Restaurant,Pizza Place,Bagel Shop,Fried Chicken Joint,Bank,Restaurant,Pub,American Restaurant,Liquor Store


### Clustering Neighborhoods in Toronto

In [93]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 2, 0, 0, 0, 0, 0, 0, 4, 0], dtype=int32)

In [94]:
toronto_merged = split_t

# merge toronto_grouped with split_t to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhoods')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCodes,Boroughs,Neighborhoods,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636,0,Coffee Shop,Pub,Bakery,Park,Café,Breakfast Spot,Theater,Wine Shop,Event Space,Restaurant
1,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,0,Coffee Shop,Pub,Bakery,Park,Café,Breakfast Spot,Theater,Wine Shop,Event Space,Restaurant
2,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,0,Coffee Shop,Sushi Restaurant,Yoga Studio,College Cafeteria,Beer Bar,Smoothie Shop,Sandwich Place,Burrito Place,Café,Portuguese Restaurant
3,M7A,Downtown Toronto,Ontario Provincial Government,43.662301,-79.389494,0,Coffee Shop,Sushi Restaurant,Yoga Studio,College Cafeteria,Beer Bar,Smoothie Shop,Sandwich Place,Burrito Place,Café,Portuguese Restaurant
4,M5B,Downtown Toronto,Garden District,43.657162,-79.378937,0,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Japanese Restaurant,Middle Eastern Restaurant,Bubble Tea Shop,Diner,Ramen Restaurant,Bookstore


In [95]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhoods'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters