# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto - Adding Geographical Coordinates


## Get the page with the Toronto neighborhoods from Wikipedia

https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M


In [26]:
import pandas as pd # library to process data as dataframes
import numpy as np # library to handle data in a vectorized manner

#read in the Wikipedia article to get a local copy

import urllib.request

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
req = urllib.request.urlopen(url)
article = req.read().decode()

with open('ISO_3166-1_alpha-2.html', 'w') as fo:
    fo.write(article)

## This section will screen scrape the Wikipedia page

In [27]:
!pip install beautifulsoup4 # install beautifulsoup4 to scrape website tables
from bs4 import BeautifulSoup



In [28]:
# Load article, turn into soup and get the <table>s.
article = open('ISO_3166-1_alpha-2.html').read()
soup = BeautifulSoup(article, 'html.parser')
tables = soup.find_all('table', class_='sortable')

In [29]:
# Search through the tables for the table with the headings we want.
for table in tables:
    ths = table.find_all('th')
    headings = [th.text.strip() for th in ths]
    if headings[:5] == ['Postcode', 'Borough', 'Neighborhood']:
        break

In [30]:
# Extract the columns we want and write to a semicolon-delimited text file.
with open('iso_3166-1_alpha-2_codes.txt', 'w') as fo:
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            continue
        postcode, borough, neighborhood = [td.text.strip() for td in tds[:3]]
        print('; '.join([postcode, borough, neighborhood]), file=fo) # write the data to a text file

## Put the table into a dataframe

In [31]:
# read the text file containing the table into a dataframe
df = pd.read_csv('iso_3166-1_alpha-2_codes.txt', sep=';', header=None, names=['PostalCode', 'Borough', 'Neighborhood'])

In [32]:
df.head() #check to ensure there is data in the dataframe and it is in the correct format

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [33]:
df.tail() #check to ensure there is data in the dataframe and it is in the correct format

Unnamed: 0,PostalCode,Borough,Neighborhood
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor
286,M9Z,Not assigned,Not assigned


In [34]:
df.info() #get info about the dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287 entries, 0 to 286
Data columns (total 3 columns):
PostalCode      287 non-null object
Borough         287 non-null object
Neighborhood    287 non-null object
dtypes: object(3)
memory usage: 6.9+ KB


In [35]:
list(df.columns) #list the dataframe columns

['PostalCode', 'Borough', 'Neighborhood']

## Dataframe cleanup

### Drop the rows where 'Borough' is 'Not assigned'

In [36]:
# drop the rows where 'Borough' is 'Not assigned' and put the remaining rows into a new dataframe
df_assigned_borough = df[df['Borough'].str.contains('Not assigned') == False].copy()


In [37]:
df_assigned_borough.shape # get the shape of the dataframe after removing rows with 'Not assigned' Borough

(210, 3)

In [38]:
df_assigned_borough.head() #look at the first 5 rows after removing rows the 'Not Assigned' Borough

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


### If Neighborhood is "Not assigned", put the value of Borough into Neighborhood

In [39]:
#Show the value of Queen's Park Neighborhood before changing the value
df_assigned_borough[df_assigned_borough['Borough'] == " Queen's Park"]

Unnamed: 0,PostalCode,Borough,Neighborhood
7,M7A,Queen's Park,Not assigned
9,M9A,Queen's Park,Queen's Park


In [40]:
# This changes Neighborhood to the value of Borough if Neighborhood = " Not assigned"
df_assigned_borough['Neighborhood'] = np.where((df_assigned_borough['Neighborhood'] == " Not assigned"), df_assigned_borough['Borough'],df_assigned_borough['Neighborhood'])

In [41]:
#Show that Queen's Park has modified the Neighborhood
df_assigned_borough[df_assigned_borough['Borough'] == " Queen's Park"]

Unnamed: 0,PostalCode,Borough,Neighborhood
7,M7A,Queen's Park,Queen's Park
9,M9A,Queen's Park,Queen's Park


### Group the neighborhoods having the same postcode and borough

In [42]:
#Group the neighborhoods having the same postcode and borough
df_grouped_neighborhoods = df_assigned_borough.groupby(['PostalCode','Borough'])['Neighborhood'].apply(lambda x: "%s" % ', '.join(x)).to_frame().reset_index()
df_grouped_neighborhoods.head() #Look at the first 5 rows after the groupby


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [43]:
#sample some data and compare it with the assignment requirements
df_grouped_neighborhoods[df_grouped_neighborhoods['Borough'] == ' Downtown Toronto']

Unnamed: 0,PostalCode,Borough,Neighborhood
50,M4W,Downtown Toronto,Rosedale
51,M4X,Downtown Toronto,"Cabbagetown, St. James Town"
52,M4Y,Downtown Toronto,Church and Wellesley
53,M5A,Downtown Toronto,Harbourfront
54,M5B,Downtown Toronto,"Ryerson, Garden District"
55,M5C,Downtown Toronto,St. James Town
56,M5E,Downtown Toronto,Berczy Park
57,M5G,Downtown Toronto,Central Bay Street
58,M5H,Downtown Toronto,"Adelaide, King, Richmond"
59,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union S..."


In [44]:
#Get the shape of the dataframe
print("The shape is: ", df_grouped_neighborhoods.shape)


The shape is:  (103, 3)


## This section will add the geographical coordinates to the dataframe

### Read in the file with the geographical coordinates

In [45]:
coordinates = pd.read_csv("https://cocl.us/Geospatial_data") #read in the geographical coordinates file

In [46]:
coordinates.head() #check to ensure the coordinates dataframe looks ok

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge the df_grouped_neighborhoods dataframe with the coordinates dataframe

In [47]:
df_merged_with_coordinates = pd.merge(df_grouped_neighborhoods, coordinates, left_on='PostalCode', right_on='Postal Code').copy()

In [48]:
del df_merged_with_coordinates['Postal Code'] #remove the redundant Postal Code column

### Show the merged dataframe that includes the geographic coordinates

In [49]:
df_merged_with_coordinates.head(10) #show the merged dataframe

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village ...",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [None]:
df_merged_with_coordinates.to_csv('Week3_Part2_df_merged_with_coordinates.csv', sep=',', encoding='utf-8') #copy dataframe to csv

In [50]:
#Get the shape of the dataframe
print("The shape is: ", df_grouped_neighborhoods.shape)

The shape is:  (103, 3)
