# Capstone Project | Coursera

## Introduction

This notebook will be mainly used for the capstone project.

## Setup

In [1]:
import pandas as pd
import numpy as np

In [2]:
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


## Clustering

In [4]:
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen

# access wiki
html = urlopen(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
    ).read().decode("utf-8")

soup = BeautifulSoup(html, features = "lxml")

In [5]:
# dataframe will consist of three columns: PostalCode, Borough, and Neighborhood

# define the dataframe columns
column_names = ['PostalCode', 'Borough', 'Neighborhood'] 

# instantiate the dataframe
df = pd.DataFrame(columns=column_names)
df

Unnamed: 0,PostalCode,Borough,Neighborhood


In [6]:
tab = soup.find('table')
tab_items = tab.find_all('td')

postcodes = []
boroughs = []
neighbors = []

for index, item in enumerate(tab_items):
    if index%3==0:
        postcodes.append(item.text.strip())
    elif index%3==1:
        boroughs.append(item.text.strip())
    else:
        neighbors.append(item.text.strip())
    
print("Length: postcodes-{}, boroughs-{}, neighbors-{}".format(len(postcodes), len(boroughs), len(neighbors)))

Length: postcodes-287, boroughs-287, neighbors-287


In [7]:
df.PostalCode = postcodes
df.Borough = boroughs
df.Neighborhood = neighbors
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


In [58]:
# ignore cells with a borough that is Not assigned
df_filtered = df[df.Borough != 'Not assigned']
df_filtered.reset_index(inplace=True)
df_filtered.drop(['index'], axis=1, inplace=True)
df_filtered

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Not assigned
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [69]:
# if more than one neighborhood exists in one postal code area, 
#   these two rows will be combined into one row with the neighborhoods separated with a comma
df_combined = df_filtered

postal_hash = {}
for index, row in df_filtered.iterrows():
    if row['PostalCode'] in postal_hash:
        df_combined.drop(index, inplace=True)
        df_combined.loc[postal_hash[row['PostalCode']], 'Neighborhood'] += ", " + row['Neighborhood']
    else:
        postal_hash[row['PostalCode']] = index
        
df_combined

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Queen's Park,Queen's Park
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [70]:
# if a cell has a borough but a Not assigned neighborhood, 
#   then the neighborhood will be the same as the borough
#df_combined.loc[df_combined.Neighborhood == 'Not assigned', 'Neighborhood'] = df_combined[df_combined.Neighborhood == 'Not assigned']['Borough']
df_clean = df_combined[df_combined.Neighborhood == 'Not assigned']
df_combined.loc[df_clean.index, 'Neighborhood'] = df_combined.loc[df_clean.index, 'Borough']
df_combined

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Queen's Park,Queen's Park
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [71]:
# reset index

df_combined.reset_index(inplace=True)
df_combined.drop(['index'], axis=1, inplace=True)
#df_combined.drop(['level_0'], axis=1, inplace=True)
df_combined

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Queen's Park,Queen's Park
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"
