### Exploring the Neighborhoods In Toronto with Python

In [1]:
from bs4 import BeautifulSoup
import requests

Scrape the page at the URL with BeautifulSoup and find the table that contains the neighborhood data.

In [2]:
# Download the Wikipedia page that lists the postal codes starting with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
data = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
data.raise_for_status()
soup = BeautifulSoup(data.text, 'html.parser')
# Locate the postal-code table by its CSS classes.
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    # Without this check the failure would only surface later as an
    # AttributeError on None when the rows are extracted.
    raise ValueError("No 'wikitable sortable' table found at " + url)


Get all rows in the table. Put the cell strings of the 1st (header) row into a list to use as column names.

In [3]:
# Every <tr> in the table; the first one holds the <th> header cells.
rows = table.find_all('tr')
row1cells = rows[0].find_all('th')
# get_text() is used instead of .string because .string is None for a header
# cell with mixed children, and calling .rstrip() on None would raise.
# rstrip() removes the trailing newline left by the HTML source.
colnames = [cell.get_text().rstrip() for cell in row1cells]


Loop through the data rows and extract the three cell values from each. If the 2nd cell (Borough) is 'Not assigned', discard the row; if the 3rd cell (Neighbourhood) is 'Not assigned', use the Borough value in its place. 
If a cell has no direct string, take the string from its 'a' tag instead.

In [4]:
# Parse every data row (rows[0] is the header) into a list of cell values.
allRows = []
for row in rows[1:]:
    rowVals = []
    discardRow = False
    for ind, cell in enumerate(row.find_all('td')):
        text = cell.string
        if text and text.startswith('Not assigned'):
            if ind == 1:
                # Second cell unassigned: the whole row is dropped.
                discardRow = True
                break
            elif ind == 2:
                # Third cell unassigned: reuse the second cell's value.
                rowVals.append(rowVals[1])
        elif not text:
            # No single direct string (e.g. the text is wrapped in a link).
            # Prefer the first <a>'s string; fall back to the cell's full text
            # when there is no link or the link has no single string, instead
            # of raising AttributeError on None.
            link = cell.a
            if link is not None and link.string:
                rowVals.append(link.string.rstrip())
            else:
                rowVals.append(cell.get_text().rstrip())
        else:
            rowVals.append(text.rstrip())
    if not discardRow:
        allRows.append(rowVals)


Convert allRows to DataFrame

In [5]:
import pandas as pd

# Build the frame from the parsed rows, labelling the columns with the
# header names scraped from the table.
df = pd.DataFrame.from_records(data=allRows, columns=colnames)


Group df by Postcode and Borough, concatenate the Neighbourhoods of each group into one comma-separated string, then reset the index.

In [18]:
# One row per (Postcode, Borough) pair; the neighbourhood names of each group
# are joined into a single ', '-separated string, and the group keys are
# turned back into ordinary columns.
tor_df = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)
tor_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
# Dimensions of the grouped frame as a (rows, columns) tuple.
tor_df.shape

(103, 3)

Import Geospatial_Coordinates.csv

In [19]:
# Load the coordinates table from a CSV file in the working directory.
geocoords = pd.read_csv('Geospatial_Coordinates.csv')
# Columns are renamed by position so the key column is called 'Postcode',
# the name used for the merge with tor_df.
# NOTE(review): assumes the CSV column order is postal code, latitude,
# longitude — confirm against the file.
geocoords.columns = ['Postcode','Latitude', 'Longitude']
geocoords.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Left-join geocoords onto tor_df on Postcode, so every row of tor_df is kept and gains Latitude and Longitude columns.

In [20]:
# Left join keeps every row of tor_df and attaches its coordinates.
# Any coordinate columns left over from a previous run of this cell are
# dropped first; otherwise re-running it would merge onto an already-merged
# frame and yield suffixed duplicates (Latitude_x, Latitude_y, ...).
tor_df = (
    tor_df.drop(columns=['Latitude', 'Longitude'], errors='ignore')
    .merge(geocoords, how='left', on='Postcode')
)
tor_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
