# PART 1

## 1. Import the libraries

In [112]:
# Import the libraries

import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
print('Done')

Done


## 2. Scrap and transform the data to a Dataframe

In [113]:
# Tutorial on how to scrap data from Wikipedia here: https://simpleanalytical.com/how-to-web-scrape-wikipedia-python-urllib-beautiful-soup-pandas
# Get the webpage where to extract the data

URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = urllib.request.urlopen(URL)

In [114]:
# Scrap the data

soup = BeautifulSoup(page, "lxml")
all_tables=soup.find_all("table")
right_table=soup.find('table', class_='wikitable sortable')


# Analyse the source code and classsify the data in columns

A=[]
B=[]
C=[]

for row in right_table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))

In [115]:
# Transfrom the data into a readable dataframe with pandas

df=pd.DataFrame(A,columns=["Postal Code"])
df["Borough"]=B
df["Neighborhood"]=C
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## 3. Remove the empty or "Not assigned" rows

In [116]:
# drop empty or "Not assigned" rows

df1=df[df['Neighborhood'].str.strip().astype(bool)]
df1.reset_index(drop=True, inplace=True) # it is crucial to reset the index when making a new dataframe
df1.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## 4. Check the type and the shape of the Dataframe

In [117]:
df1.dtypes

Postal Code     object
Borough         object
Neighborhood    object
dtype: object

In [118]:
df1.shape

(103, 3)

# PART 2

## 1. Import the CSV file with geographical coordinates

In [119]:
# Importing and reading CSV file with geographical coordinates

url2='http://cocl.us/Geospatial_data'
df2=pd.read_csv(url2)
df2.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## 2. Merge the 2 Dataframes 

In [120]:
# Merge the 2 dataframes 

df3=pd.concat([df1,df2],sort=True, ignore_index=True, axis=1, verify_integrity=True)
df3.head()

Unnamed: 0,0,1,2,3,4,5
0,M3A,North York,Parkwoods,M1B,43.806686,-79.194353
1,M4A,North York,Victoria Village,M1C,43.784535,-79.160497
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",M1E,43.763573,-79.188711
3,M6A,North York,"Lawrence Manor, Lawrence Heights",M1G,43.770992,-79.216917
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M1H,43.773136,-79.239476


In [121]:
#Name the columns and delete undesirable columns

df3.columns=["Postal Code","Borough","Neighborhood","ToErase","Longitude","Latitude"]
df3.drop("ToErase",axis=1, inplace=True)
df3.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Longitude,Latitude
0,M3A,North York,Parkwoods,43.806686,-79.194353
1,M4A,North York,Victoria Village,43.784535,-79.160497
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.770992,-79.216917
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.773136,-79.239476


## 3. Check the type and the shape of the new Dataframe

In [122]:
df3.dtypes

Postal Code      object
Borough          object
Neighborhood     object
Longitude       float64
Latitude        float64
dtype: object

In [123]:
df3.shape

(103, 5)

In [124]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(df3['Borough'].unique()),
        df3.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


# PART 3

## 1. Import the libraries

In [150]:
# Import some libraries to cluster and display maps

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
print("Matplot done")

# import k-means from clustering stage
from sklearn.cluster import KMeans
print("Sklearn done")

# !conda install -c conda-forge geopy --yes # uncomment if not installed
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
print ("Geopy done")


# !conda install -c conda-forge folium=0.5.0 --yes #uncomment if not installed
import folium # map rendering library
print("Folium done")

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
print ("Requests json done")

print('All Libraries imported.')

Matplot done
Sklearn done
Geopy done
Folium done
Requests json done
All Libraries imported.


## 2. Define an area and create specific dataframes

In [126]:
# Define the area

address = 'Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [127]:
# Segment and cluster only the neighborhoods in Downtown Toronto

dttoronto=df3[df3["Borough"].str.contains("Downtown Toronto")]
dttoronto.reset_index(drop=True, inplace=True)
dttoronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Longitude,Latitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.773136,-79.239476
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.692657,-79.264848
3,M5C,Downtown Toronto,St. James Town,43.799525,-79.318389
4,M5E,Downtown Toronto,Berczy Park,43.75749,-79.374714
