## Segmenting and Clustering Neighborhoods in Toronto

In [1]:
# Importing all dependencies we'll need
import json
import warnings
from io import StringIO  # lets pd.read_html parse HTML that is already in memory

import numpy as np
import pandas as pd

# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

warnings.filterwarnings('ignore')

print('Libraries imported.')

Libraries imported.


In [2]:
import bs4 # BeautifulSoup Package
from bs4 import BeautifulSoup

#### From the Wikipedia page: Toronto is made up of 140 neighbourhoods. We are going to create a dataset with all 140 neighbourhoods alongside their postal codes.

In [3]:
# Source page: Wikipedia's list of Canadian postal codes beginning with "M".
wikipedia_page = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
results = requests.get(wikipedia_page, timeout=30)
# Fail here on an HTTP error (4xx/5xx) rather than parsing an error page later.
results.raise_for_status()

In [4]:
# Decoded body of the HTTP response, i.e. the page's HTML as a string.
wikipedia_html = results.text

In [5]:
# Parse the downloaded HTML with the standard-library parser backend.
soup = BeautifulSoup(wikipedia_html, 'html.parser')

# Print the CSS class list of every <table> on the page (None when a table
# has no class attribute) to see which one holds the data.
html_tables = soup.find_all('table')
for table in html_tables:
    css_classes = table.get('class')
    print(css_classes)

['wikitable', 'sortable']
['multicol']
None
['navbox']
None


In [6]:
# Reuse the HTML that was already downloaded and parsed, instead of letting
# pandas fetch the same URL a second time.
neighborhood_html_table = soup.find_all('table')[0]  # first table on the page

# pd.read_html returns a list of DataFrames; wrapping the markup in StringIO
# hands it over as a file-like object. header=0 uses the first row as column names.
df = pd.read_html(StringIO(str(neighborhood_html_table)), header=0)

# Preview only: display.max_rows is None, so a bare df[0] would print every row.
df[0].head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [7]:
# pd.read_html returned a list of tables; take the first one as the working frame.
toronto_df = df[0]

### Remove Rows Where the Borough Is Not Assigned

In [8]:
# Keep only rows with a real borough. The explicit .copy() makes the result an
# independent frame, so the later .loc assignment to "Neighbourhood" writes to
# toronto_df itself and does not raise SettingWithCopyWarning.
toronto_df = toronto_df[toronto_df.Borough != "Not assigned"].copy()

### Explore Postal Codes with More than one Neighbourhood

In [9]:
# Count rows per postal code; the first entries are the codes that appear on
# the most rows, i.e. those shared by the most neighbourhoods.
postcode_counts = toronto_df["Postcode"].value_counts()
postcode_counts.head()

M9V    8
M8Y    8
M5V    7
M9B    5
M4V    5
Name: Postcode, dtype: int64

### Combine All the Neighbourhoods with the Same Postal Code

In [10]:
# Collapse the frame to one row per (Postcode, Borough), joining the
# neighbourhood names into a single comma-separated string.
# - The result is stored under its own name so it is not thrown away after
#   being displayed; toronto_df itself is left untouched.
# - .unique() drops duplicates while keeping order of appearance, so the joined
#   string is the same on every run (set() gives an arbitrary order).
# NOTE: "Not assigned" neighbourhoods are only replaced in a later cell, so any
# such placeholder still appears in this grouped frame.
toronto_grouped = (
    toronto_df
    .groupby(["Postcode", "Borough"])["Neighbourhood"]
    .agg(lambda names: ', '.join(names.unique()))
    .reset_index()
)
toronto_grouped.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"West Hill, Morningside, Guildwood"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, East Birchmount Park, Ionview"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffside, Scarborough Village West, Cliffcrest"
9,M1N,Scarborough,"Cliffside West, Birch Cliff"


### Replace all Not Assigned Neighbourhoods with the value of the Borough

In [11]:
# Show the rows whose neighbourhood is still the 'Not assigned' placeholder.
is_unassigned = toronto_df["Neighbourhood"].eq('Not assigned')
toronto_df[is_unassigned]

Unnamed: 0,Postcode,Borough,Neighbourhood
8,M7A,Queen's Park,Not assigned


In [12]:
# Replace every 'Not assigned' neighbourhood with the borough from the same row.
# A boolean mask is used instead of a hard-coded row label, so this keeps
# working if the source table changes or more rows are affected.
unassigned_mask = toronto_df["Neighbourhood"] == 'Not assigned'
toronto_df.loc[unassigned_mask, "Neighbourhood"] = toronto_df.loc[unassigned_mask, "Borough"]

# Should now display an empty frame.
toronto_df[toronto_df["Neighbourhood"] == 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighbourhood


In this first notebook I have grabbed the HTML table using pandas `read_html()` and made various transformations to the DataFrame.