# Toronto Data Assignment 

#### Answer quick link.

[Jump to Answer 1 for box 1](#step1)

[Jump to Answer 2 for box 2](#step2)

[Jump to Answer 3 for box 3](#step3)

<a id="step1"></a>

# Step 1

### Libraries

In [1]:
### Conda Installs
#!conda install -c conda-forge geopy --yes 
#!conda install -c conda-forge folium=0.5.0 --yes
#!conda install -c anaconda pandas --yes 
#!conda install -c anaconda wget --yes 
#!conda install -c conda-forge matplotlib
#!conda install -c anaconda beautifulsoup4

In [35]:
import numpy as np
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import requests
from bs4 import BeautifulSoup
import matplotlib
import matplotlib.cm as cm # Matplotlib and associated plotting modules
import matplotlib.colors as colors # Matplotlib and associated plotting modules
import lxml 
import geocoder
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import json # library to handle JSON files
import config
from sklearn.cluster import KMeans # import k-means from clustering stage
import folium # map rendering library

### Import website data. Find table.

In [3]:
weblink = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(weblink.content,'lxml')
table = soup.find_all('tbody')[0]
#print(table) 


#### Select table with data needed.

In [4]:
table = soup.find('table', attrs={'class':'wikitable sortable'})
table_rows = table.find_all('tr')

In [None]:
### Data for table
data = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    data.append(row)
print(data)
#print(df[0].to_json(orient='records'))

In [None]:
### Convert imported data to pandas DataFrame
df = pd.DataFrame(data, columns=["Postcode", "Borough", "Neighbourhood"])
df

### Clean Data

In [None]:
# Remove \n
df = df.replace(r'\n','', regex=True)
df.columns = df.columns.str.strip()
df

In [8]:
# verify column headers
df.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')

In [None]:
# Drop row with value 'Not assigned'
df = df[df.Borough != 'Not assigned']
df

In [None]:
# Group columns by postal code and borough, removing duplicate postcode value
# and moving the duplicate data in borough to Neighbourhood.
df = df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(', '.join).reset_index()
df.head

In [None]:
df.rename(columns ={'Postcode': 'PostalCode'}, inplace=True)
df.rename(columns ={'Neighbourhood': 'Neighborhood'}, inplace=True)
df

In [12]:
# Verify neighbourhood has values in all rows.
df.isin(['Not Available']).any().any()

False

### Dataframe Shape

In [13]:
# Shape of DataFrame
df.shape

(103, 3)

<a id="step2"></a>

# Step 2

### Generating Lat & Lng by Postal Code and Borough

In [16]:
# Main list to run
postal_code = df['PostalCode'].tolist()

In [None]:
## Test for 1 postalcode
#geoCodeUrl = "https://maps.googleapis.com/maps/api/geocode/json?components=postal_code:M1R|country:CA&key={}".format(
#api_key)

In [None]:
#  List all values for latlng. example shown below.
#for i in latlng.values():
#    print(i)

In [None]:
#  Geocode
#postal_code = df['Postcode'].tolist()
#geoCodeUrl = "https://maps.googleapis.com/maps/api/geocode/json?components=postal_code:{}|country:CA&key={}".format(
#           list1[i],
#            api_key)

In [None]:
# Test list
#list1 = ['M2M', 'M2N', 'M2P', 'M2R']

### Call all postcodes, retreive Lat & Lng at once, Save in new JSON file.

In [17]:
# Main list
list1 = postal_code

In [None]:
# Test list
#list1 = ['M2M', 'M2N', 'M2P', 'M2R']
#list1[0]

In [None]:
postal_code

In [None]:
# Getting length of list 
api_key = config.api_key
i = 0
# Iterating using while loop 
while i < len(list1):  
    element = list1[i]
    geoCodeUrl = "https://maps.googleapis.com/maps/api/geocode/json?components=postal_code:{}|country:CA&key={}".format(
        element,
        api_key) 
    
    # make the GET request
    lookup = requests.get(geoCodeUrl)
    data = lookup.json()  
    
    # If no results are found for postal code, skip and move on.
    if (data['status'] == 'ZERO_RESULTS'):
        i += 1
        continue
    # Instantiate an empty dict
    latlng = {}
    # latlng information
    latlng['PostalCode'] = data['results'][0]['address_components'][0]['long_name']
    latlng['Longname'] = data['results'][0]['address_components'][1]['long_name']
    latlng['Latitude'] = data['results'][0]['geometry']['location']['lat']
    latlng['Longitude'] = data['results'][0]['geometry']['location']['lng']

    with open('data.json', 'r') as j:
        json_data = json.load(j)
    # convert data to list if not
        if type(json_data) is dict:
            json_data = [json_data]

    # use append() to add to list
    json_data.append(latlng)    

    #write list to file
    with open('data.json', 'w') as outfile:
        json.dump(json_data, outfile)
    # Normalize data. Flatten JSON.
    data_normalized = pd.json_normalize(json_data) # flatten JSON
    i += 1 
       
    if i == len(list1):
        break

print(data_normalized)

In [21]:
# Open new json file created above.
with open('data.json', 'r') as j:
    json_data = json.load(j)

In [22]:
# Rename longname to borough on new generated dataframe
data_df = pd.DataFrame(json_data)
data_df.rename(columns = {'Longname':'Borough'}, inplace=True)
data_df.head()

Unnamed: 0,PostalCode,Borough,Latitude,Longitude
0,M1J,Scarborough,43.744734,-79.239476
1,M1B,Scarborough,43.806686,-79.194353
2,M1C,Scarborough,43.784535,-79.160497
3,M1E,Scarborough,43.763573,-79.188711
4,M1G,Scarborough,43.770992,-79.216917


In [None]:
#######################################
#### Combine Dataframes step1+ste2 ####
#######################################

In [23]:
# Drop duplicate rows
# Useful if while loop is ran multiple times.
df = df.drop_duplicates()
data_df = data_df.drop_duplicates()

In [24]:
# Merge df and df_data
# Postal code with 
df_merge_col = pd.merge(df, data_df, on='PostalCode')

In [None]:
# Clean df_merge_col
df_merge_col.rename(columns = {'Borough_x':'Borough'}, inplace=True)
df_merge_col.rename(columns = {'Neighbourhood':'Neighborhood'}, inplace=True)
df_merge_col = df_merge_col.drop(['Borough_y'], axis=1)
df_merge_col.head()

### Completed Merged DataFrame w/ latlng

In [26]:
df_merge_col.columns

Index(['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude'], dtype='object')

<a id="step3"></a>

# Step 3

### Cluster neighborhoods in Toronto. Generate map and visualize clusters.

In [27]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = df_merge_col.drop(['PostalCode', 'Borough', 'Neighborhood'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 3, 3, 3, 3, 3, 3, 0, 3, 0])

In [28]:
# add clustering labels to full dataset
df_merge_col.insert(0, 'Cluster Labels', kmeans.labels_)

In [49]:
location = geolocator.geocode(address)

NameError: name 'geolocator' is not defined

In [32]:
latitude = data['results'][0]['geometry']['location']['lat']
latitude

43.706876

In [33]:
longitude = data['results'][0]['geometry']['location']['lng']
longitude

-79.5181884

In [36]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_merge_col['Latitude'], df_merge_col['Longitude'], df_merge_col['Neighborhood'], df_merge_col['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine Clusters

#### Cluster 1

In [37]:
df_merge_col.loc[df_merge_col['Cluster Labels'] == 0, df_merge_col.columns[[1] + list(range(5, df_merge_col.shape[1]))]]

Unnamed: 0,PostalCode,Longitude
7,M1L,-79.284577
9,M1N,-79.264848
11,M1R,-79.295849
13,M1T,-79.304302
15,M1W,-79.318389
17,M2H,-79.363452
18,M2J,-79.346556
19,M2K,-79.385975
20,M2L,-79.374714
24,M3A,-79.329656


#### Cluster 1

In [38]:
df_merge_col.loc[df_merge_col['Cluster Labels'] == 1, df_merge_col.columns[[1] + list(range(5, df_merge_col.shape[1]))]]

Unnamed: 0,PostalCode,Longitude
40,M4K,-79.352188
42,M4M,-79.340923
44,M4P,-79.390197
45,M4R,-79.405678
46,M4S,-79.38879
47,M4T,-79.38316
48,M4V,-79.400049
49,M4W,-79.377529
50,M4X,-79.367675
51,M4Y,-79.38316


#### Cluster 2

In [39]:
df_merge_col.loc[df_merge_col['Cluster Labels'] == 2, df_merge_col.columns[[1] + list(range(5, df_merge_col.shape[1]))]]

Unnamed: 0,PostalCode,Longitude
80,M6N,-79.487262
83,M6S,-79.48445
85,M8V,-79.501321
86,M8W,-79.543484
87,M8X,-79.506944
88,M8Y,-79.498509
89,M8Z,-79.520999
90,M9A,-79.532242
91,M9B,-79.554724
92,M9C,-79.577201


#### Cluster 3

In [40]:
df_merge_col.loc[df_merge_col['Cluster Labels'] == 3, df_merge_col.columns[[1] + list(range(5, df_merge_col.shape[1]))]]

Unnamed: 0,PostalCode,Longitude
0,M1B,-79.194353
1,M1C,-79.160497
2,M1E,-79.188711
3,M1G,-79.216917
4,M1H,-79.239476
5,M1J,-79.239476
6,M1K,-79.262029
8,M1M,-79.239476
10,M1P,-79.273304
12,M1S,-79.262029


#### Cluster 4

In [41]:
df_merge_col.loc[df_merge_col['Cluster Labels'] == 4, df_merge_col.columns[[1] + list(range(5, df_merge_col.shape[1]))]]

Unnamed: 0,PostalCode,Longitude
21,M2N,-79.408493
22,M2P,-79.400049
23,M2R,-79.442259
27,M3H,-79.442259
28,M3J,-79.487262
29,M3K,-79.464763
30,M3L,-79.506944
31,M3M,-79.495697
32,M3N,-79.520999
61,M5M,-79.41975


#### Cluster 5

In [42]:
df_merge_col.loc[df_merge_col['Cluster Labels'] == 1, df_merge_col.columns[[1] + list(range(5, df_merge_col.shape[1]))]]

Unnamed: 0,PostalCode,Longitude
40,M4K,-79.352188
42,M4M,-79.340923
44,M4P,-79.390197
45,M4R,-79.405678
46,M4S,-79.38879
47,M4T,-79.38316
48,M4V,-79.400049
49,M4W,-79.377529
50,M4X,-79.367675
51,M4Y,-79.38316
