# Segmenting and Clustering Neighborhoods in Toronto

## Importing Libraries

In [27]:
# Set up import statements for all of the packages that plan to use.

import numpy as np
import pandas as pd
import random # library for random number generation
import os
import folium 
import requests
from bs4 import BeautifulSoup

import matplotlib.pyplot as plt # plotting library
# backend for rendering plots within the browser
%matplotlib inline 

from sklearn.cluster import KMeans 
from sklearn.datasets.samples_generator import make_blobs

from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors


print('Libraries imported.')


Libraries imported.


## Import Postal Code Lists from Wikipedia

In [2]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) rather than handing an error page
# to the parser in the next cell.
response.raise_for_status()
source = response.text

## Parse the Page and Create an Empty DataFrame

In [3]:
# The downloaded document is an HTML page, so parse it with an HTML parser.
# 'html.parser' ships with Python; the 'xml' parser needs lxml and is meant
# for XML documents rather than HTML.
soup = BeautifulSoup(source, 'html.parser')

# Take the first <table> on the page as the postal code table.
table = soup.find('table')
if table is None:
    # Fail with a clear message instead of an AttributeError in a later cell.
    raise ValueError('No <table> element found in the downloaded page.')

# Column layout of the frame that the scraped rows are loaded into.
column_names=['Postalcode','Borough','Neighborhood']
df = pd.DataFrame(columns=column_names)

In [4]:
# Display the frame before any rows are loaded; only the column headers exist.
df

Unnamed: 0,Postalcode,Borough,Neighborhood


## Import Data into the DataFrame

In [5]:
# Collect the text of every table row that has exactly one cell per column,
# then build the frame in a single step. Appending with df.loc[len(df)] grows
# the frame row by row (quadratic) and adds every row again if the cell is
# re-run; building from a list makes the cell idempotent.
rows = []
for tr_cell in table.find_all('tr'):
    data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    # Rows without a full set of <td> cells are skipped.
    if len(data) == len(column_names):
        rows.append(data)

df = pd.DataFrame(rows, columns=column_names)


In [6]:
# Display the scraped table; it still contains the 'Not assigned' rows.
df

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


## Clean Data

In [7]:
# Keep only the rows whose Borough has been assigned.
is_unassigned = df['Borough'] == 'Not assigned'
df = df[~is_unassigned]


In [21]:
# For each postal code, concatenate its Neighborhood values into a single
# comma-separated string and expose the result as 'Neighborhood_joined'.
join_df = (
    df.groupby('Postalcode')['Neighborhood']
    .apply(', '.join)
    .reset_index()
    .rename(columns={'Neighborhood': 'Neighborhood_joined'})
)

In [22]:
# Display the filtered frame; the rows keep their original index labels.
df

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [24]:
# Attach the joined neighborhood string to each row, swap it in for the
# original Neighborhood column, and remove any resulting duplicate rows.
merge_df = (
    pd.merge(df, join_df, on='Postalcode')
    .drop(columns=['Neighborhood'])
    .drop_duplicates()
    .rename(columns={'Neighborhood_joined': 'Neighborhood'})
)

merge_df.head()


Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [26]:
# Report the (rows, columns) size of the cleaned, merged table.
merge_df.shape


(103, 3)