<h1>Exploring the Neighborhoods in the city of Toronto</h1>

<h3>1. Installing and Importing necessary libraries</h3>

In [53]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [54]:
pip install geocoder

Note: you may need to restart the kernel to use updated packages.


In [55]:
!conda install -c conda-forge folium=0.5.0 --yes

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [56]:
pip install geopy

Note: you may need to restart the kernel to use updated packages.


In [57]:
# All third-party and standard-library imports for the notebook live in this
# one cell. BeautifulSoup, pandas and requests are used by the scraping and
# cleaning cells below; the remaining imports are not used in the visible
# cells and are presumably needed by later sections — verify before removing.
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests
import json
import csv
from geopy.geocoders import Nominatim 
# NOTE(review): pandas >= 1.0 exposes this function as pandas.json_normalize
# and deprecates the pandas.io.json import path — confirm against the
# installed pandas version.
from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# k-means implementation, imported here for a later clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

# Printed only if every import above succeeded.
print('Libraries imported Successfully!!!')

Libraries imported Successfully!!!


In [58]:
# Lift pandas' display limits so DataFrames are shown with every column and
# every row instead of being truncated.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

<h3>2. Scrape data from Wikipedia into a DataFrame</h3>

In [59]:
# Download the Wikipedia page that lists the Canadian postal codes starting
# with "M". The timeout keeps the cell from hanging on a stalled connection,
# and raise_for_status() stops the notebook on an HTTP error instead of letting
# an error page be parsed as if it were the postal-code table.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
data = response.text

In [60]:
# Build a BeautifulSoup tree from the downloaded HTML using Python's built-in parser
soup = BeautifulSoup(markup=data, features="html.parser")

<h3>3. Creating a DataFrame from table contents</h3>

In [61]:
# One accumulator list per table column, filled while scanning the table rows
postcode, borough, neighborhood = [], [], []

In [62]:
# Walk the rows of the first table on the page and collect the first three
# cells of each data row into the postcode / borough / neighborhood lists.
table = soup.find('table')
if table is None:
    raise ValueError("No <table> element found in the downloaded page")

# Start from empty lists so that re-running this cell does not append the
# same rows a second time.
postcode, borough, neighborhood = [], [], []

for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Rows without <td> cells (e.g. a header row) or with fewer than three
    # cells cannot supply all three columns, so skip them.
    if len(cells) < 3:
        continue
    # Drop the trailing newline that ends each cell's text.
    postcode.append(cells[0].text.rstrip('\n'))
    borough.append(cells[1].text.rstrip('\n'))
    neighborhood.append(cells[2].text.rstrip('\n'))

In [63]:
# Assemble the three scraped lists into one table, one column per list
table_columns = {
    "Postcode": postcode,
    "Borough": borough,
    "Neighborhood": neighborhood,
}
toronto_df = pd.DataFrame(data=table_columns)
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


<h3>4. Drop rows in which the Borough is Not assigned</h3>

In [64]:
# Keep only the rows whose borough is assigned, renumbering the index from 0
has_borough = toronto_df["Borough"].ne("Not assigned")
toronto_drop = toronto_df.loc[has_borough].reset_index(drop=True)
toronto_drop.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


<h3>5. Grouping Neighborhoods that share the same Postcode and Borough</h3>

In [65]:
# Collapse rows that share a postcode and borough into one row, joining their
# neighborhood strings with commas
by_postcode_borough = toronto_drop.groupby(by=['Postcode', 'Borough'], as_index=False)
toronto_group = by_postcode_borough.agg(",".join)
toronto_group.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<h3>6. If a Neighborhood is Not assigned, set it to the same value as its Borough</h3>

In [66]:
# Where the neighborhood is "Not assigned", use the borough name instead.
# The assignment goes through a boolean mask and .loc because the rows yielded
# by DataFrame.iterrows() may be copies, so writing to them is not guaranteed
# to change the DataFrame.
not_assigned = toronto_group["Neighborhood"] == "Not assigned"
toronto_group.loc[not_assigned, "Neighborhood"] = toronto_group.loc[not_assigned, "Borough"]
toronto_group.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<h3>7. Checking whether all the requirements are met</h3>

In [67]:
# Spot-check the cleaned table: pull the rows for a handful of known postcodes
# and show them in the order in which the postcodes are listed.
column_names = ["Postcode", "Borough", "Neighborhood"]

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# Collect one slice per postcode and concatenate once: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop copies it on every
# iteration. The comprehension variable is named `code` so it does not
# overwrite the `postcode` list built while scraping.
matching_rows = [toronto_group[toronto_group["Postcode"] == code] for code in test_list]
test_toronto_group = pd.concat(matching_rows, ignore_index=True)[column_names]

test_toronto_group

Unnamed: 0,Postcode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


<h3>8. Number of rows of DataFrame</h3>

In [68]:
# Show the (rows, columns) dimensions of the grouped DataFrame
toronto_group.shape

(103, 3)