# Coursera - IBM Data Science Professional Certificate
## Applied Data Science Capstone
### Segmenting and Clustering Neighborhoods in Toronto
#### by: ljzlaomi

## ----------------------------------------Part 1----------------------------------------

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Confirmation message shown once every import above has succeeded
print("Library imported!")

Import Finished!


### Parse the page and find the tables

In [2]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of silently parsing an error page.
res = requests.get(r'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
res.raise_for_status()

# Parse the HTML and collect every <table> element on the page
bs = BeautifulSoup(res.text, features='lxml')
tables = bs.find_all('table')

print("There are %d table(s) on this page!" % len(tables))
if len(tables) < 1:
	raise Exception("No tables on this page!")

There are 3 table(s) on this page!


### Select the right table with header: ["Postal code", "Borough", "Neighborhood"]

In [3]:
# Pick the first table whose header cells match the expected column names exactly.
table = None
columns = ["Postal code", "Borough", "Neighborhood"]
for tab in tables:
	# Stripped text of every <th> cell of this candidate, in document order
	table_header_verify = [th.get_text().strip() for th in tab.find_all('th')]
	if table_header_verify == columns:
		table = tab
		break
else:
	# for/else: this branch runs only when the loop finished without `break`,
	# i.e. none of the tables had the expected header
	raise Exception ('No table with header: %s' % str(columns))

print("table with header %s is successfully located!" % str(columns))

table with header ['Postal code', 'Borough', 'Neighborhood'] is successfully located!


### Iterate through the table, convert it to DataFrame

In [4]:
# Collect the stripped text of the <td> cells of every table row.
# Rows that contain no <td> cells (e.g. a header-only row) give an empty list and are skipped.
rows = []
for tr in table.find_all('tr'):
	line = [td.get_text().strip() for td in tr.find_all('td')]
	if line:
		rows.append(line)

# Build the frame once from the collected rows: DataFrame.append was removed in
# pandas 2.0, and appending inside the loop copied the whole frame on every row.
# The rows are numbered 0..n-1 here (the append-based version labelled every row 0).
df = pd.DataFrame(rows).astype(str)
df.head(10)

Unnamed: 0,0,1,2
0,M1A,Not assigned,
0,M2A,Not assigned,
0,M3A,North York,Parkwoods
0,M4A,North York,Victoria Village
0,M5A,Downtown Toronto,Regent Park / Harbourfront
0,M6A,North York,Lawrence Manor / Lawrence Heights
0,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
0,M8A,Not assigned,
0,M9A,Etobicoke,Islington Avenue
0,M1B,Scarborough,Malvern / Rouge


### The dataframe will consist of three columns: Postal Code, Borough, and Neighborhood

In [5]:
# Label the three positional columns, then renumber the rows from 0
columns = ['Postal Code', 'Borough', 'Neighborhood']
df.columns = columns
df = df.reset_index(drop=True)

df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
7,M8A,Not assigned,
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,Malvern / Rouge


### Only process the cells that have an assigned borough; ignore cells whose borough is "Not assigned"

In [6]:
# Keep only the rows whose borough is assigned, then renumber the rows from 0
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough].reset_index(drop=True)

df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,Malvern / Rouge
11,M3B,North York,Don Mills
12,M4B,East York,Parkview Hill / Woodbine Gardens
13,M5B,Downtown Toronto,"Garden District, Ryerson"


### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [7]:
# A neighborhood counts as unassigned when it is the literal 'Not assigned' or
# blank: the scraped table leaves some neighborhood cells empty, and those
# would slip past a plain == 'Not assigned' comparison.
neighborhood_text = df['Neighborhood'].astype(str).str.strip()
unassigned = neighborhood_text.isin(['Not assigned', ''])

# Fall back to the borough name for those rows
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']
df = df.reset_index(drop=True)

### More than one neighborhood can exist in one postal code area. For example, M5A has two neighborhoods: Regent Park and Harbourfront. The Wikipedia table lists them in a single cell separated by " / "; here the separator is replaced with a comma, so the cell reads "Regent Park, Harbourfront".

In [8]:
# Turn every " /" separator into "," using a literal (non-regex) string replacement
df["Neighborhood"] = df["Neighborhood"].str.replace(" /", ",", regex=False)

df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


### In the last cell of your notebook, use the `.shape` attribute to show the number of rows (and columns) of your dataframe.

In [9]:
# (number of rows, number of columns) of the cleaned frame
df.shape

(103, 3)

## -------------------------------------End of Part 1-------------------------------------

## ----------------------------------------Part 2----------------------------------------

In [10]:
# CSV with one row per postal code and its coordinates
geo_data_url = "http://cocl.us/Geospatial_data"
geo_data = pd.read_csv(geo_data_url)

geo_data.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


### Add Latitude and Longitude columns to the DataFrame by looking up each postal code in geo_data

In [11]:
# Look up the coordinates of every postal code in one vectorized pass instead
# of filtering geo_data twice for each row of df.
# If geo_data repeats a postal code, its first entry is used.
coords_by_code = geo_data.drop_duplicates(subset='Postal Code').set_index('Postal Code')

# Postal codes with no match in geo_data get NaN rather than a text placeholder,
# so both columns stay numeric.
df['Latitude'] = df['Postal Code'].map(coords_by_code['Latitude'])
df['Longitude'] = df['Postal Code'].map(coords_by_code['Longitude'])

df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
8,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
9,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
11,M3B,North York,Don Mills,43.745906,-79.352188
12,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
13,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


## -------------------------------------End of Part 2-------------------------------------

## ----------------------------------------Part 3----------------------------------------

In [12]:
import folium

### Toronto's Geographical Coordinates (From Google)

In [13]:
# Reference point for Toronto, reused below as the centre of the map
latitude, longitude = 43.6532, -79.3832

toronto_coords = (latitude, longitude)
print('The geographical coordinate of Toronto are %s, %s.' % toronto_coords)

The geographical coordinate of Toronto are 43.6532, -79.3832.


### Create map of Toronto using latitude and longitude values above:

In [14]:
# Empty base map centred on the Toronto coordinates defined above
map_geo = folium.Map(
    location=[latitude, longitude],
    zoom_start=11,
)

### Adding markers to map

In [15]:
# Coerce the coordinates to numbers: anything that is not a number (a missing
# value or a text placeholder such as 'Unknown Latitude') becomes NaN, and rows
# without a usable coordinate pair are skipped instead of making folium raise.
lat_values = pd.to_numeric(df['Latitude'], errors='coerce')
lng_values = pd.to_numeric(df['Longitude'], errors='coerce')
has_coords = lat_values.notna() & lng_values.notna()

# One circle marker per neighborhood; clicking it shows the neighborhood name
for lat, lng, neighborhood in zip(lat_values[has_coords],
                                  lng_values[has_coords],
                                  df.loc[has_coords, 'Neighborhood']):
    folium.CircleMarker(
        (lat, lng),
        radius=5,
        # parse_html=True makes the popup treat the name as text, so names
        # containing quotes or other special characters display correctly
        popup=folium.Popup(neighborhood, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_geo)

### Display map

In [16]:
# Leaving the map as the last expression of the cell renders it inline
map_geo


## -------------------------------------End of Part 3-------------------------------------