# Week 3: Segmenting and Clustering Neighborhoods in Toronto

## Part 1.

### 1. Import the necessary libraries

In [10]:
from io import StringIO

import numpy as np
import pandas as pd
import requests # Library handles requests
from IPython.display import FileLink, FileLinks

### 2. Scrape the given Wikipedia page 

In [3]:
# Download the Wikipedia page once; stop with an error on a bad HTTP status
# instead of trying to parse an error page.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
wiki_data = requests.get(url, timeout = 30)
wiki_data.raise_for_status()

# Parse the HTML that was already downloaded (no second fetch of the URL).
# read_html returns every table found in the markup; the first one is used,
# with its first row as the column header.
df = pd.read_html(StringIO(wiki_data.text), header = 0)[0]
df.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### 3. Remove rows whose borough is "Not assigned"

In [4]:
# Keep only the rows whose borough has actually been assigned
has_borough = df["Borough"].ne("Not assigned")
df = df.loc[has_borough]

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### 4. Combine neighborhoods that share a postal code (and borough) into a single comma-separated row

In [5]:
def _join_names(values):
    """Return the given strings joined into one comma-separated string."""
    return ", ".join(values)

# One row per (postal code, borough) pair; the remaining column(s) of each
# group are collapsed into a single comma-separated string.
by_postal_code = df.groupby(["Postal Code", "Borough"], as_index = False)
df = by_postal_code.agg(_join_names)

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### 5. Use the borough name as the neighborhood when the neighborhood is "Not assigned"

In [6]:
# Where the neighborhood is "Not assigned", fall back to the borough name.
# The replacement is done with a boolean mask and .loc so it is written into
# df itself: assigning to the row yielded by iterrows() is not guaranteed to
# write back to the frame.
unassigned = df["Neighborhood"] == "Not assigned"
df.loc[unassigned, "Neighborhood"] = df.loc[unassigned, "Borough"]

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### 6. Show the shape (rows, columns) of the dataframe

In [7]:
# Dimensions of df as a (rows, columns) tuple
df.shape

(103, 3)

### 7. Recreate the dataframe initially provided

In [60]:
# Postal codes to pull out of df, in the order they should appear
PC = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# Select the row(s) for each code and stack them with a single concat call.
# This keeps the order of PC and the row labels of df, avoids re-copying the
# frame on every iteration, and does not rely on DataFrame.append (deprecated
# in pandas 1.4 and removed in 2.0).
original_df = pd.concat([df[df["Postal Code"] == code] for code in PC])

original_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
40,M5G,Downtown Toronto,Central Bay Street
46,M2H,North York,Hillcrest Village
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
54,M1J,Scarborough,Scarborough Village
39,M4G,East York,Leaside


### 8. Export dataframe into csv file (to be used in Part 2)

In [15]:
# Write the cleaned frame to disk with a header row and without the row index
df.to_csv(
    path_or_buf = "Toronto_df.csv",
    header = True,
    index = False,
)