# Toronto Clustering Project - Alex Newman

## Question 1 - Create Postal Code Dataset

In [54]:
# install pgeocode to generate post codes for Canada

!pip install pgeocode



In [55]:
#import required packages

import requests
import pandas as pd
import numpy as np

# grab information from wikipedia

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#identify tables and create a dataframe

html = requests.get(url).content
df_list = pd.read_html(html)
df = df_list[0]

#print dataframe

print(df)


    Postal Code           Borough  \
0           M1A      Not assigned   
1           M2A      Not assigned   
2           M3A        North York   
3           M4A        North York   
4           M5A  Downtown Toronto   
5           M6A        North York   
6           M7A  Downtown Toronto   
7           M8A      Not assigned   
8           M9A         Etobicoke   
9           M1B       Scarborough   
10          M2B      Not assigned   
11          M3B        North York   
12          M4B         East York   
13          M5B  Downtown Toronto   
14          M6B        North York   
15          M7B      Not assigned   
16          M8B      Not assigned   
17          M9B         Etobicoke   
18          M1C       Scarborough   
19          M2C      Not assigned   
20          M3C        North York   
21          M4C         East York   
22          M5C  Downtown Toronto   
23          M6C              York   
24          M7C      Not assigned   
25          M8C      Not assigned   
2

In [56]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [57]:
# Data Clean up

# identify Borough information that has not been assigned

df['Borough'].replace("Not assigned", np.nan, inplace=True)

# drop rows that don't have the correct information

df.dropna(subset=["Borough"], axis=0, inplace=True)

# identify Neighbourhoods that haven't been assigned

df.loc[df["Neighborhood"]== 'Not assigned']

# reset indexes

df.reset_index(drop=True, inplace=True)

In [58]:
# check null values have been removed

missing_data = df.isnull()
missing_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False


In [59]:
# print shape of dataframe

df.shape

(103, 3)

## Question 2 - Append Post Codes

In [60]:
#import package to geocode post codes

import pgeocode

#set country code to Canada

nomi = pgeocode.Nominatim('ca')

#Create 2 new columns for dataframe

df['Latitude'] = np.nan
df['Longitude'] = np.nan

#identify the length of the dataframe

N = len(df)

# use a for loop to populate the latitude and longitude attributes passing post code info

for i in range(N):
    #print(df['Postal Code'].values[i])
    df['Latitude'].values[i] = nomi.query_postal_code(df['Postal Code'].values[i]).latitude
    df['Longitude'].values[i] = nomi.query_postal_code(df['Postal Code'].values[i]).longitude
    
#print head
    
    df.head()

In [61]:
#clean up data

df.dropna(subset=["Latitude"], axis=0, inplace=True)

#check for missing data

for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("")  

Postal Code
False    103
Name: Postal Code, dtype: int64

Borough
False    103
Name: Borough, dtype: int64

Neighborhood
False    103
Name: Neighborhood, dtype: int64



In [62]:
#print dataframe

df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.3300
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889
5,M9A,Etobicoke,Islington Avenue,43.6662,-79.5282
6,M1B,Scarborough,"Malvern, Rouge",43.8113,-79.1930
7,M3B,North York,Don Mills,43.7450,-79.3590
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.7063,-79.3094
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783
