# GitHub: Segmenting and Clustering Neighbourhoods in Toronto #

1. Start by creating a new Notebook for this assignment.

### Notebook created ###

In [None]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis
import requests # library for HTTP requests (used below to fetch the page to scrape)

print('Libraries imported.')

### Web page scraped ###

In [None]:
pip install "ipython-beautifulsoup[bs4]"

In [None]:
pip install "requests"

In [None]:
import requests  # already imported in the first code cell; repeating it is harmless
# NOTE(review): the standard-library `parser` module was removed in Python 3.10,
# so this import raises ModuleNotFoundError on newer kernels. Nothing in the
# cells visible here uses it — confirm and consider dropping it.
import parser
from urllib.request import urlopen
from bs4 import BeautifulSoup  # HTML parser used on the scraped page
import ssl  # used in the next code cell to build an SSL context
import csv
print('BeautifulSoup  & csv imported.')

In [None]:
# Ignore SSL certificate errors
# Builds an SSL context with hostname checking and certificate verification
# switched off. check_hostname has to be cleared before verify_mode is set to
# CERT_NONE, so keep the two assignments in this order.
# NOTE(review): in the cells visible here `ctx` is never passed to requests.get
# or urlopen, so it may have no effect on the download — confirm. Disabling
# verification also removes protection against tampered responses.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

print('SSL certificate errors ignored.')

In [None]:
# Fetch the Wikipedia page that lists the Toronto ("M") postal codes.
link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(link, timeout=30)
# Fail here on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
source = response.text
# Name the parser explicitly: the bare 'html' feature lets BeautifulSoup pick
# whichever HTML parser happens to be installed, which can vary by machine.
soup = BeautifulSoup(source, 'html.parser')
print('soup ready')

#### Creating table ####

In [None]:
# First table on the page whose class attribute is "wikitable sortable".
table = soup.find('table', class_='wikitable sortable')

In [None]:
# Collect every <tr> element found inside the table.
table_rows = table.find_all(name='tr')

#### Looking inside the table ####

##### 3.1. The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood #####

In [None]:
# One list of stripped <td> texts per table row; a row without any <td>
# cells contributes an empty list.
data = [
    [cell.text.strip() for cell in row.find_all('td')]
    for row in table_rows
]

df = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])
# Rows with a missing PostalCode are not removed here; the next code cell
# rebuilds the frame and filters them out.

#### Data transformed into a pandas DataFrame ####

#### Data cleaning and annotation ####

#### We can ignore cells with 'Not assigned' boroughs, like those in rows 1 & 2. ####

In [None]:
# Locate the postal-code table again so this cell can be run on its own
# once `soup` exists.
table = soup.find('table',{'class':'wikitable sortable'})
if table is None:
    # Give a clear error instead of an AttributeError on `None.find_all`.
    raise ValueError("No table with class 'wikitable sortable' was found on the page.")
table_rows = table.find_all('tr')

# Keep only rows that actually contain <td> cells. Rows without any (for
# example a header row made of <th> cells) would otherwise become rows with a
# missing PostalCode that have to be filtered out afterwards.
data = []
for row in table_rows:
    cells = [t.text.strip() for t in row.find_all('td')]
    if cells:
        data.append(cells)

df = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])
# Show a preview rather than dumping the whole frame into the notebook.
df.head(12)

#### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned. ####

In [None]:
# Keep every row whose borough is anything other than "Not assigned"
# (original index labels are preserved).
df = df[df['Borough'] != "Not assigned"]


In [None]:
# Renumber the remaining rows from 0 and discard the old index labels.
df1 = df.reset_index(drop=True)

#### More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table. ####

In [None]:
# One row per postal code: the values of every other column are joined with
# commas, so both Borough and Neighbourhood become comma-separated strings.
df2 = df1.groupby('PostalCode').agg(','.join)

#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park. ####

In [None]:
# Wherever the neighbourhood is exactly "Not assigned", use the borough
# value from the same row instead; all other neighbourhoods are unchanged.
df2['Neighbourhood'] = df2['Neighbourhood'].mask(
    df2['Neighbourhood'] == "Not assigned",
    df2['Borough'],
)

In [None]:
# Move PostalCode out of the index and back into a regular column.
df3 = df2.reset_index(drop=False)

In [None]:
# After the group-by join, a Borough cell repeats the borough once per
# neighbourhood (e.g. "Scarborough,Scarborough"). Collapse the repeats.
#
# The previous regex-based clean-up also stripped every whitespace character,
# turning multi-word names such as "North York" into "NorthYork", and its
# set() round-trip gave an arbitrary order when values differed. Its patterns
# were also passed without regex=True, so they are treated literally on
# pandas >= 2.0.
# dict.fromkeys keeps the first occurrence of each name in its original order;
# empty pieces and literal "nan" tokens are discarded as before.
df3['Borough'] = df3['Borough'].str.split(',').apply(
    lambda parts: ','.join(
        dict.fromkeys(
            name.strip()
            for name in parts
            if name.strip() and name.strip() != 'nan'
        )
    )
)

In [31]:
# Preview the first 12 rows of the cleaned frame.
df3.head(12)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [None]:
# Print a summary of df3's columns (dtypes and non-null counts).
df3.info()

#### Clean your Notebook and add Markdown cells to explain your work and any assumptions you are making ####

##### <b>Work:</b> We scraped the page from Wikipedia and converted the table into a pandas DataFrame. We ignored rows <br /> whose Borough has no assigned value and combined all neighbourhoods that share the same postal code. Finally, we created a DataFrame with three columns: <br /> "PostalCode", "Borough", "Neighbourhood" #####
##### <b>Assumption:</b> A "Not assigned" neighbourhood has the same name as its borough. #####

##### In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe. #####

In [32]:
# (number of rows, number of columns) of the final frame.
df3.shape

(103, 3)

## Install the Geopy & Geocoder packages ##

In [None]:
pip install geopy


In [None]:
pip install geocoder

## Download the coordinates of Toronto postal codes from Geospatial_data ##

In [None]:
# Download the coordinates CSV with `requests` (already used for the scrape)
# instead of shelling out to wget: the cell then also runs where wget is not
# installed, and a failed download raises here instead of being followed by
# the success message.
geo_response = requests.get('http://cocl.us/Geospatial_data', timeout=30)
geo_response.raise_for_status()
# Keep writing the same local file the wget call produced.
with open('toronto_coordinates.csv', 'wb') as out_file:
    out_file.write(geo_response.content)
print('Coordinates downloaded!')
coors = pd.read_csv('toronto_coordinates.csv')

In [None]:
# Print the (rows, columns) of the coordinates table, then preview its first rows.
print(coors.shape)
coors.head()

In [30]:
# Index both frames by postal code so they can be aligned side by side.
toronto_df_temp = df3.set_index('PostalCode')
coors_temp = coors.set_index('Postal Code')

# Inner join on the index keeps only postal codes present in both frames;
# the index is then named and turned back into the PostalCode column.
toronto_df_coors = (
    pd.concat([toronto_df_temp, coors_temp], axis=1, join='inner')
    .rename_axis('PostalCode')
    .reset_index()
)

print(toronto_df_coors.shape)
toronto_df_coors.head(12)

(103, 5)


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848
