# Segmenting and Clustering Neighborhoods in Toronto

## Part 1

1. Importing the libraries necessary for the assignment

In [1]:
#Import Libraries
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

2. Scraping data from Wikipedia

In [2]:
#Get Data from Website
# Fetch the Wikipedia page listing the "M" postal codes.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of parsing an error page.
res = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')
# NOTE(review): assumes the postal-code table is the first <table> on the page - confirm
# if the page layout changes.
table = soup.find_all('table')[0]
# Wrap the markup in StringIO: read_html takes a file-like object, and passing a raw
# HTML string directly is deprecated in newer pandas releases.
df = pd.read_html(StringIO(str(table)))[0]

3. Creating a dataframe

In [3]:
#Transform Data into DataFrame
# Extract the three scraped columns as plain Python lists.
PostalCode, Neighborhood_Borough, Neighborhood_Name = (
    df[scraped_col].tolist()
    for scraped_col in ('Postcode', 'Borough', 'Neighbourhood')
)

# One (postal code, borough, neighborhood) tuple per scraped row.
Columns = list(zip(PostalCode, Neighborhood_Borough, Neighborhood_Name))

# Rebuild the table under the column names the following cells work with.
Trt_df = pd.DataFrame(Columns, columns=['PostalCode', 'Borough', 'Neighborhood'])
Trt_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


4. Removing rows where the borough is not assigned, combining neighborhoods that share a postal code, and replacing unassigned neighborhoods with their borough name.

In [4]:
#Cleaning Data
# Step 1: keep only the rows whose borough is assigned.
# Step 2: where the neighborhood itself is 'Not assigned', fall back to the borough name.
#         This is done row by row with mask() *before* grouping. Series.replace() with a
#         Series as the replacement value (plus inplace=True on a column selection) is not
#         a reliable row-aligned substitution and raises a ValueError on recent pandas.
# Step 3: collapse to one row per (postal code, borough), joining neighborhoods with ', '.
Trt_df = (
    Trt_df[Trt_df['Borough'] != 'Not assigned']
    .assign(
        Neighborhood=lambda rows: rows['Neighborhood'].mask(
            rows['Neighborhood'] == 'Not assigned', rows['Borough']
        )
    )
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)
Trt_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [5]:
# Sanity check: (rows, columns) of the cleaned table.
Trt_df.shape

(103, 3)

## Part 2

5. Importing geospatial data from a CSV file

In [6]:
# Load the latitude/longitude table (one row per postal code) straight from its URL.
geo_csv_url = 'http://cocl.us/Geospatial_data/Geospatial_Coordinates.csv'
Geo_df = pd.read_csv(geo_csv_url)
Geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [7]:
# Attach coordinates to each postal code: inner join of the cleaned table with the
# geospatial table, with the result ordered by the join key (sort=True).
df = (
    Trt_df
    .merge(Geo_df, how='inner', left_on='PostalCode', right_on='Postal Code', sort=True)
    # After the join 'Postal Code' just repeats 'PostalCode', so it is dropped.
    .drop(columns='Postal Code')
)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
