# Coursera: Applied Data Science Capstone

In this notebook I document all the steps I took to complete the Coursera Applied Data Science Capstone.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Print the course greeting.
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


# Segmenting and Clustering Neighborhoods in Toronto

## Part 1: Get postal codes and neighbourhoods

In [3]:
# Wikipedia page listing the Canadian postal codes that start with "M";
# this URL is the input of the pd.read_html call in the next cell.
website_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [4]:
# Read the postal-code table from Wikipedia.
# `match="Borough"` restricts the result to tables containing that text, which
# the table used below must have (the next cells filter on its Borough column).
# Selecting by content instead of trusting that the wanted table is the very
# first <table> on the page means a table inserted above it cannot silently
# replace the data; read_html raises ValueError if no table matches.
postal_codes = pd.read_html(website_url, match="Borough")[0]

In [5]:
# Keep only the rows whose borough is known, then renumber the index from 0.
postal_codes = (
    postal_codes
    .loc[postal_codes["Borough"].ne("Not assigned")]
    .reset_index(drop=True)
)

In [6]:
# Collapse the table to one row per (Postcode, Borough) pair, joining all
# neighbourhoods of a pair into a single comma-separated string.
#
# Aggregating the named "Neighbourhood" column and then calling reset_index()
# yields the columns Postcode, Borough, Neighbourhood directly. This avoids
# wrapping an apply(list) result in pd.DataFrame and renaming the positional
# column label 0, a construction whose output layout can differ between
# pandas versions.
df_postal = (
    postal_codes
    .groupby(["Postcode", "Borough"])["Neighbourhood"]
    # str() keeps the join safe if a neighbourhood value is not a string
    .agg(lambda names: ", ".join(str(name) for name in names))
    .reset_index()
)

In [7]:
# Rows whose neighbourhood is still "Not assigned" take the name of their
# borough instead. The previous version of this cell hard-coded "Queen's Park"
# because that was the only affected entry; copying the borough gives the same
# result for that row (NOTE(review): assumes its borough is "Queen's Park" —
# confirm) and keeps working if further postcodes lose their neighbourhood.
# Only the Neighbourhood column is touched, rather than replacing the marker
# string wherever it occurs in the frame.
is_unassigned = df_postal["Neighbourhood"] == "Not assigned"
df_postal = df_postal.assign(
    Neighbourhood=df_postal["Neighbourhood"].mask(is_unassigned, df_postal["Borough"])
)

### Results 

In [8]:
# Display the cleaned table: one row per postcode/borough pair.
df_postal

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [9]:
# (number of rows, number of columns) of the cleaned table
df_postal.shape

(103, 3)

## Part 2: Get geo coordinates

In [15]:
# Load the coordinates from a CSV file in the working directory. As the output
# below shows, it has the columns "Postal Code", "Latitude" and "Longitude".
coords = pd.read_csv("Geospatial_Coordinates.csv")
coords

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [21]:
# Attach latitude and longitude to every postcode row. The left join keeps all
# rows of df_postal; the key column contributed by `coords` ("Postal Code")
# duplicates "Postcode" and is dropped right after the merge.
df_coords = (
    df_postal
    .merge(coords, how="left", left_on="Postcode", right_on="Postal Code")
    .drop(columns="Postal Code")
)
df_coords

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437


In [22]:
# Persist the merged table to the working directory.
# NOTE(review): to_csv writes the DataFrame index by default, so the file gets
# an extra unnamed first column; a reader needs index_col=0 (or this call needs
# index=False) to avoid it — confirm what the consumer of this file expects.
df_coords.to_csv("df_coords.csv")

## Part 3: