<h1>Exploring the Neighboorhoods in the city of Toronto</h1>

<h3>1. Installing and Importing necessary libraries</h3>

In [1]:
pip install beautifulsoup4

Collecting beautifulsoup4
[?25l  Downloading https://files.pythonhosted.org/packages/66/25/ff030e2437265616a1e9b25ccc864e0371a0bc3adb7c5a404fd661c6f4f6/beautifulsoup4-4.9.1-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 6.5MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2 (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/6f/8f/457f4a5390eeae1cc3aeab89deb7724c965be841ffca6cfca9197482e470/soupsieve-2.0.1-py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.9.1 soupsieve-2.0.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests
import json
import csv
print('Libraries imported Successfully!!!')

Libraries imported Successfully!!!


<h3>2. Scrap data from wikipedia into dataframe</h3>

In [2]:
#sending get request
data = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text

In [3]:
#Setting column width for better view
pd.set_option('max_colwidth', 800)

In [4]:
#Parse data from beautifulsoup object 
soup = BeautifulSoup(data, 'html.parser')

<h3>3. Creating a DataFrame from table contents

In [5]:
#Create three list to store table data
postcode = []
borough = []
neighborhood = []

In [6]:
for rows in soup.find('table').find_all('tr'):
    columns = rows.find_all('td')
    if (len(columns)>0):
        #avoid new line in cell
        postcode.append(columns[0].text.rstrip('\n'))
        borough.append(columns[1].text.rstrip('\n'))
        neighborhood.append(columns[2].text.rstrip('\n'))

In [7]:
#Create new dataframe from three list
toronto_df = pd.DataFrame({"Postcode": postcode,
                           "Borough": borough,
                           "Neighborhood": neighborhood})
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


<h3>4. Drop cells in which Boroughs are Not assigned</h3>

In [8]:
#Drop cells that are not assigned
toronto_drop = toronto_df[toronto_df.Borough != 'Not assigned'].reset_index(drop=True)
toronto_drop.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


<h3>5. Grouping Neighborhood with same Borough</h3>

In [9]:
#Group Neighborhoods in same Borough
toronto_group = toronto_drop.groupby(['Postcode','Borough'], as_index = False).agg(lambda x: ",".join(x))
toronto_group.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<h3>6.  If Neighborhood = Not Assigned | Assign them same as Borough</h3>

In [10]:
#For Neighborhood = Not assigned make it same as Borough
for index, row in toronto_group.iterrows():
    row['Neighborhood']=='Not assigned'
    row['Neighborhood']=row['Borough']
toronto_group.head()
    

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,Scarborough
1,M1C,Scarborough,Scarborough
2,M1E,Scarborough,Scarborough
3,M1G,Scarborough,Scarborough
4,M1H,Scarborough,Scarborough


<h3>7. Checking whether all the requirements are meet</h3>

In [11]:
#Checking whether all requirements are meet
column_names = ["Postcode", "Borough", "Neighborhood"]
test_df = pd.DataFrame(columns=column_names)

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

for postcode in test_list:
    test_df = test_df.append(toronto_group[toronto_group["Postcode"]==postcode], ignore_index=True)
    
test_df

Unnamed: 0,Postcode,Borough,Neighborhood
0,M5G,Downtown Toronto,Downtown Toronto
1,M2H,North York,North York
2,M4B,East York,East York
3,M1J,Scarborough,Scarborough
4,M4G,East York,East York
5,M4M,East Toronto,East Toronto
6,M1R,Scarborough,Scarborough
7,M9V,Etobicoke,Etobicoke
8,M9L,North York,North York
9,M5V,Downtown Toronto,Downtown Toronto


<h3>8. Number of rows of DataFrame</h3>

In [12]:
toronto_group.shape

(103, 3)