# Capstone Project - Week 3

## Importing libraries

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup # for scraping data from wikipedia

## Let's scrape the data

In [2]:
import requests

In [3]:
# Download the Wikipedia page that lists Canadian postal codes starting with "M".
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of letting an
# error page be parsed as if it were the postal-code table.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
# `data` holds only the HTML text; the Response object keeps its own name.
data = response.text

In [4]:
# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(markup=data, features='html.parser')

In [5]:
# Column-oriented store for the scraped table: one independent, initially
# empty list per column name.
dc = {column: [] for column in ("PostalCode", "Borough", "Neighborhood")}

In [6]:
# Copy the first table on the page into the `dc` column lists.
# The lists are emptied first so that re-running this cell does not append
# the same rows a second time.
for values in dc.values():
    values.clear()

# [1:] skips the first <tr>, which is the header row of the table.
for row in soup.find('table').find_all('tr')[1:]:
    cells = row.find_all('td')
    # Skip rows that do not carry the three expected <td> cells rather than
    # failing with an IndexError.
    if len(cells) < 3:
        continue
    # Strip surrounding whitespace (including a trailing '\n') from every cell,
    # so comparisons such as == 'Not assigned' and the later merge on
    # PostalCode match on the exact text.
    dc["PostalCode"].append(cells[0].text.strip())
    dc["Borough"].append(cells[1].text.strip())
    dc["Neighborhood"].append(cells[2].text.strip())
    

## Create Dataframe from the scraped table

In [7]:
# Build the DataFrame from the scraped columns, fixing the column order
# at construction time, then preview the first rows.
cols = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(dc, columns=cols)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Removing the rows with missing values

In [8]:
# Keep only the rows whose borough is assigned and renumber them from 0.
df1 = df.loc[df['Borough'].ne('Not assigned')].reset_index(drop=True)
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


## Combining neighborhoods that share a postal code, and filling in missing neighborhoods

In [9]:
# Collapse rows that share a postal code and borough into a single row whose
# Neighborhood is the comma-separated list of that group's neighborhoods.
df_group = (
    df1
    .groupby(['PostalCode', 'Borough'], as_index=False)
    .agg(lambda names: ', '.join(names))
)
df_group.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [10]:
# Where the neighborhood is 'Not assigned', fall back to the borough name.
# iterrows() hands out a separate Series for each row, so assigning to that
# row is not guaranteed to change df_group; a boolean mask with .loc writes
# to the frame itself. The assignment is idempotent, so the cell can be
# re-run safely.
missing_neighborhood = df_group['Neighborhood'] == 'Not assigned'
df_group.loc[missing_neighborhood, 'Neighborhood'] = df_group.loc[missing_neighborhood, 'Borough']
df_group.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Shape of dataframe

In [11]:
# Display (number of rows, number of columns) of the grouped frame.
df_group.shape

(103, 3)

## Geospatial Coordinates

In [12]:
# Load the latitude/longitude table and rename its 'Postal Code' column to
# 'PostalCode' in the same expression.
cord = (
    pd.read_csv('https://cocl.us/Geospatial_data')
    .rename(columns={'Postal Code': 'PostalCode'})
)
cord.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Merging both dataframes

In [13]:
# Attach the coordinates to each postal code; rows are matched on the shared
# "PostalCode" column.
df_final = pd.merge(df_group, cord, on="PostalCode")
df_final.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Making the dataframe look like the one in the Coursera assignment

In [14]:
# Build a frame holding the rows of df_final for the postal codes below,
# in exactly this order.
col_list = df_final.columns.tolist()  # kept defined in case other cells read it
row_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# re-copies it on every iteration. Collect the matching slices and concatenate
# them once instead. Slices of df_final keep its columns and dtypes, and a
# postal code with no match simply contributes an empty slice.
df_coursera = pd.concat(
    [df_final[df_final["PostalCode"] == pc] for pc in row_list],
    ignore_index=True,
)
df_coursera

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Maryvale, Wexford",43.750072,-79.295849
7,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442


## Shape of Dataframe

In [15]:
# Display (number of rows, number of columns) of the merged frame.
df_final.shape

(103, 5)