# Segmenting and Clustering Neighbourhoods in Toronto, Part 2

In [1]:
!pip install geocoder



In [18]:
#import libraries

import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup
import geocoder
print("imported")

imported


#### below is stuff from part 1, scroll down further to find the part 2

### Web Scraping

In [3]:
source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup =BeautifulSoup(source.text,'lxml')

In [4]:
data = []
table = soup.find(class_='wikitable')
index=0
for tr in table.find_all('tr'):
    index += 1
    if index>1:
        section = [] 
        for td in tr.find_all('td'):
            section.append(td.text.rstrip())
        data.append(section)

In [5]:
df = pd.DataFrame(data = data,columns=['Postcode','Borough','Neighbourhood'])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Data Cleaning

In [6]:
#drop the rows that have 'not assigned' in the borough column

df = df[df.Borough!='Not assigned']

In [7]:
df['Neighbourhood'].replace("not assigned",df['Borough'],inplace=True)

In [8]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [9]:
df.shape

(103, 3)

In [10]:
def get_latilong(postal_code):
    lati_long_coords = None
    while(lati_long_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lati_long_coords = g.latlng
    return lati_long_coords
    
get_latilong('M4G')

[43.70909000000006, -79.36409999999995]

In [13]:
# retrieving postal code co-ordinates

postal_codes = df['Postcode']    
coords = [ get_latilong(postal_code) for postal_code in postal_codes.tolist() ]

In [14]:
# Adding Columns Latitude & Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
df['Latitude'] = df_coords['Latitude']
df['Longitude'] = df_coords['Longitude']

In [15]:
df[df.Postcode == 'M5G']

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
40,M5G,Downtown Toronto,Central Bay Street,43.73903,-79.46732


In [16]:
df.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.65514,-79.36265
3,M4A,North York,Victoria Village,43.72321,-79.45141
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.66449,-79.39302
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.66277,-79.52831
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.81153,-79.19552
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.70794,-79.3116
9,M1B,Scarborough,"Malvern, Rouge",43.65736,-79.37818
11,M3B,North York,Don Mills,43.65279,-79.55406
12,M4B,East York,"Parkview Hill, Woodbine Gardens",43.78564,-79.15871
13,M5B,Downtown Toronto,"Garden District, Ryerson",43.72184,-79.3434


In [17]:
df.to_csv('toronto_part2.csv',index=False)