# Third Week Assignment - ALexander Soto

## First task

Read html table from wikipedia and clean the data

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from unicodedata import normalize

In [2]:
# Install missing lib to parse webpages and reload the kernel
!pip3 install lxml



### Read data from wikipedia - scrapping

In [3]:
# Read table from wikipedia
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', match="Postal Code")[0]
df.rename(columns={'Postal Code': 'PostalCode'}, inplace=True)
df.shape

(180, 3)

### Clean Data

In [4]:
# Remove not assigned Borough
df = df[df.Borough != 'Not assigned']
df.shape

(103, 3)

In [5]:
# Assign not assigned Neighbourhood to Borough value
not_assigned_neighbourhood = df[df.Neighbourhood=='Not assigned'].index
for x in not_assigned_neighbourhood:
    df.loc[x].Neighbourhood = df.loc[x].Borough

In [6]:
# Merge duplicated PostalCode
repeated_postal_code = df['PostalCode'].value_counts() > 1
repeated_postal_code = repeated_postal_code[repeated_postal_code].index
for zipc in repeated_postal_code:
    row = df[df.PostalCode == zipc]
    borough = row.Borough.values[0]
    neighbourhood = ', '.join(row.Neighbourhood.values.tolist())
    df = df[df.PostalCode != zipc]
    df.append({'PostalCode': zipc, 'Borough': borough, 'Neighbourhood': neighbourhood}, ignore_index=True)
    

In [7]:
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [8]:
df.shape

(103, 3)

## Second task - Import lat and long

In [9]:
pd_lat_long = pd.read_csv("http://cocl.us/Geospatial_data")

In [10]:
pd_lat_long.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
df = df.merge(pd_lat_long, left_on='PostalCode', right_on='Postal Code').drop('Postal Code', axis=1)

In [12]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Third task

Explore the data and visualize your findings

In [13]:
# Import needed lib

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

zsh:1: command not found: conda
zsh:1: command not found: conda
Libraries imported.


#### Visualize map of the Neighbourhood

In [14]:
def visualize_on_map(center_lat, center_long, df):
    latitude, longitude = [center_lat, center_long]
    generated_map = folium.Map(location=[latitude, longitude], zoom_start=10)

    # add markers to map
    for lat, lng, borough, neighborhood, zipcode in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood'], df['PostalCode']):
        label = '{}, {}, {}'.format(neighborhood, borough, zipcode)
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False).add_to(generated_map)  

    return(generated_map)



In [15]:
generated_map = visualize_on_map(43.753259, -79.329656, df)
generated_map

#### Select a limited set of data

For the sake of this task, data will be limited to use boroughs containing Toronto on their name. We will compare which of them are similar using K-n-clustering based on the venues they have nearby

In [16]:
df_limited_toronto = df[df['Borough'].str.contains("Toronto")]
df_limited_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [17]:
df_limited_toronto.shape

(39, 5)

### Visualize select sample

In [18]:
generated_map = visualize_on_map(df_limited_toronto['Latitude'].values[0], df_limited_toronto['Longitude'].values[0], df_limited_toronto)
generated_map

### Getting the venues for each neihboorhood

We will use foursquare api to get the venue and then extract their categories to then compare each neighborhood based on that.

In [51]:
# Define credentials for foursquare
CLIENT_ID = '-' # your Foursquare ID
CLIENT_SECRET = '0' # your Foursquare Secret
ACCESS_TOKEN = '-' # your FourSquare Access Token
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [20]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&oauth_token={}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, lat, lng,ACCESS_TOKEN, VERSION, radius, LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['PostalCode', 
                  'PostalCode Latitude', 
                  'PostalCode Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [21]:
venues_toronto = getNearbyVenues(df_limited_toronto['PostalCode'], df_limited_toronto.Latitude, df_limited_toronto.Longitude)

M5A
M7A
M5B
M5C
M4E
M5E
M5G
M6G
M5H
M6H
M5J
M6J
M4K
M5K
M6K
M4L
M5L
M4M
M4N
M5N
M4P
M5P
M6P
M4R
M5R
M6R
M4S
M5S
M6S
M4T
M5T
M4V
M5V
M4W
M5W
M4X
M5X
M4Y
M7Y


In [22]:
venues_toronto.head()

Unnamed: 0,PostalCode,PostalCode Latitude,PostalCode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M5A,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,M5A,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,M5A,43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
3,M5A,43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
4,M5A,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa


In [31]:
venues_categories_toronto = pd.get_dummies(venues_toronto[['Venue Category']], prefix="", prefix_sep="")

In [32]:
venues_categories_toronto

Unnamed: 0,ATM,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2184,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2185,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2186,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2187,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
venues_categories_toronto.insert(0, 'PostalCode', venues_toronto['PostalCode'], True)

In [34]:
venues_categories_toronto

Unnamed: 0,PostalCode,ATM,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2184,M7Y,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2185,M7Y,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2186,M7Y,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2187,M7Y,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
toronto_grouped = venues_categories_toronto.groupby('PostalCode').mean().reset_index()
toronto_grouped

Unnamed: 0,PostalCode,ATM,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.011111,0.011111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011111
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018519
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M4P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M4R,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.025641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025641
7,M4S,0.019608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.019608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,M4T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,M4V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0


### Clustering

Now with all the data we are going to compare each postal code to the other and see which ones are similar to each other and then visualize that on the map


In [36]:
# set number of clusters
kclusters = 5

toronto_cluster = toronto_grouped.drop('PostalCode', 1)
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_cluster)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 2, 0, 0, 0, 1, 0], dtype=int32)

In [37]:
# Assign each label to the groups
toronto_grouped.insert(0, 'Cluster Labels', kmeans.labels_)

In [45]:
single_df = toronto_grouped[['PostalCode', 'Cluster Labels']]
single_df

Unnamed: 0,PostalCode,Cluster Labels
0,M4E,0
1,M4K,0
2,M4L,0
3,M4M,0
4,M4N,2
5,M4P,0
6,M4R,0
7,M4S,0
8,M4T,1
9,M4V,0


In [46]:
# lets merge all the data with the first dataframe to later visualize on the map

df_with_clusters = df_limited_toronto.join(single_df.set_index('PostalCode'), on='PostalCode')

In [47]:
df_with_clusters

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0
19,M4E,East Toronto,The Beaches,43.676357,-79.293031,0
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,0
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564,0
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568,0
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259,0


### Visualize the result of the clusters

We are going to graph in the map with colors each cluster. To see how the downtown toronto is distrubuted.

In [49]:

map_clusters = folium.Map(location=[df_with_clusters['Latitude'].values[0], df_with_clusters['Longitude'].values[0]], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_with_clusters['Latitude'], df_with_clusters['Longitude'], df_with_clusters['PostalCode'], df_with_clusters['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Conclusion

As we see on the map basically the whole center of Toronto is a big cluster, is really similiar in terms of venues, we have 3 other clusters but they consist of just 1 element, while the center (Downtown area) is totally 1 cluster.