**Here, I import the libraries that we'll need for all 3 parts of the assignment.**

In [1]:
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans

**In the next few cells, I will create a data frame based on the table from 
the Wikipedia article.**

In [2]:
# Download the Wikipedia article. The timeout stops the cell from hanging on a
# stalled connection, and raise_for_status() fails fast on an HTTP error
# instead of letting an error page be parsed as if it were the article.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source,'lxml')
td_elements = soup.find('table').find_all('td')

# Keep only the text before the first newline of every table cell
content = [el.text.split('\n')[0] for el in td_elements]

# The cells are consumed three at a time (Postcode, Borough, Neighborhood).
# If the count is not a multiple of 3 the table layout is not what this code
# expects, and every following row would be misaligned.
if len(content) % 3 != 0:
    raise ValueError(
        'Expected the table cells to come in groups of 3, got {} cells'.format(len(content))
    )

# Create rough draft of data frame
data = []
for postcode, borough, neighborhood in zip(content[0::3], content[1::3], content[2::3]):

    # Rows without an assigned Borough are skipped entirely
    if borough == 'Not assigned':
        continue

    # Set Neighborhood to Borough if Neighborhood is not assigned
    if neighborhood == 'Not assigned':
        neighborhood = borough

    data.append({'Postcode':postcode,'Borough':borough,'Neighborhood':neighborhood})

df = pd.DataFrame(data)

print('This is the first 5 rows of the first draft of the dataframe:')
df.head()

This is the first 5 rows of the first draft of the dataframe:


Unnamed: 0,Borough,Neighborhood,Postcode
0,North York,Parkwoods,M3A
1,North York,Victoria Village,M4A
2,Downtown Toronto,Harbourfront,M5A
3,Downtown Toronto,Regent Park,M5A
4,North York,Lawrence Heights,M6A


In [3]:
# Change order of the columns of the data frame to the right order.
# Selecting the columns by name reorders them directly; .copy() makes the result
# an independent frame so later cells can modify it without pandas treating it
# as a view of the first draft.
df = df[['Postcode', 'Borough', 'Neighborhood']].copy()

print('This is the first 5 rows of the second draft of the dataframe:')
df.head()

This is the first 5 rows of the second draft of the dataframe:


Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


**Let's streamline the data frame by grouping neighborhoods that share a postal code into a single row. Then, let's order the rows alphabetically by postal code.**

In [4]:
# Group neighborhoods with same postal code together

# One row per postal code:
#   * Borough      -> taken from the row where the postal code first appears
#   * Neighborhood -> all neighborhoods of that postal code joined with ', ',
#                     last-listed first (the same order the row-by-row
#                     concatenation used to produce)
# groupby merges every row of a postal code, not only rows that happen to be
# adjacent, and builds a new frame instead of writing into df element by
# element, so re-running the cell gives the same result.
df = (
    df.groupby('Postcode', sort=True)
      .agg({
          'Borough': 'first',
          'Neighborhood': lambda names: ', '.join(reversed(names.tolist())),
      })
      .reset_index()
)

# Rows are already ordered alphabetically by Postcode (sort=True above) and
# reset_index() gives a fresh 0..n-1 index; fix the column order explicitly.
df = df[['Postcode', 'Borough', 'Neighborhood']].copy()

print('This is the first 11 rows of the third draft of the data frame')
df.head(11)

This is the first 11 rows of the third draft of the data frame


Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"West Hill, Morningside, Guildwood"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Oakridge, Golden Mile, Clairlea"
8,M1M,Scarborough,"Scarborough Village West, Cliffside, Cliffcrest"
9,M1N,Scarborough,"Cliffside West, Birch Cliff"


In [5]:
# Report the number of rows currently in df
print('Number of rows: ', len(df.index))

Number of rows:  103


**Let's add the latitude and longitude of each postal code using the provided CSV file of geospatial coordinates.**

In [6]:
# Code that was written to get the coordinates without the csv file,
# but it did not work.
# It is wrapped in a string literal, so none of it is executed; because the
# string is the last expression of the cell, it is only echoed back as the
# cell's output.
# Note: the inner while loop has no retry limit -- it keeps calling the
# geocoder for as long as g.latlng is None.

"""
# Add latitude and longitude columns to describe the Postcode
import geocoder
latitudes = []
longitudes = []

for i in range(0,len(df)):
    
    lat_lng_coords = None
    
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(df['Postcode'][i]))
        lat_lng_coords = g.latlng
        print('i:',i)
        
    latitudes.append(lat_lng_coords[0])
    longitudes.append(lat_lng_coords[1])
    
df['Latitude'] = latitudes
df['Longitude'] = longitudes
"""

"\n# Add latitude and longitude columns to describe the Postcode\nimport geocoder\nlatitudes = []\nlongitudes = []\n\nfor i in range(0,len(df)):\n    \n    lat_lng_coords = None\n    \n    while(lat_lng_coords is None):\n        g = geocoder.google('{}, Toronto, Ontario'.format(df['Postcode'][i]))\n        lat_lng_coords = g.latlng\n        print('i:',i)\n        \n    latitudes.append(lat_lng_coords[0])\n    longitudes.append(lat_lng_coords[1])\n    \ndf['Latitude'] = latitudes\ndf['Longitude'] = longitudes\n"

In [7]:
# Create dataframe from given csv file.
# The file location can be overridden with the GEOSPATIAL_COORDINATES_CSV
# environment variable so the notebook can run on a machine other than the
# author's; when the variable is not set, the original path is used.
lat_lng_csv_path = os.environ.get(
    'GEOSPATIAL_COORDINATES_CSV',
    '/Users/asamarakone/Geospatial_Coordinates.csv',
)
lat_lng_df = pd.read_csv(lat_lng_csv_path)

print('First 5 rows of lat_lng_df:')

lat_lng_df.head()

First 5 rows of lat_lng_df:


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [8]:
# Make empty columns for latitude and longitude.
# NaN is used as the "not filled in yet" marker (instead of an empty string)
# so that both columns are numeric from the start rather than object columns
# mixing strings and floats.
df['Latitude'] = np.nan
df['Longitude'] = np.nan

In [9]:
# Populate latitude and longitude columns.
# Build a lookup table indexed by postal code and map every Postcode in df
# through it. If a postal code is listed more than once in the csv, the first
# listing is used. A Postcode that is missing from the csv ends up as NaN.
# Whole columns are assigned at once, so this does not depend on df having a
# 0..n-1 index and does not write into df one element at a time.
coords_by_postcode = (
    lat_lng_df.drop_duplicates('Postal Code')
              .set_index('Postal Code')
)
df['Latitude'] = df['Postcode'].map(coords_by_postcode['Latitude'])
df['Longitude'] = df['Postcode'].map(coords_by_postcode['Longitude'])

print('First 11 rows of the final draft of the data frame:')
df.head(11)

First 11 rows of the final draft of the data frame:


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.8067,-79.1944
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.7845,-79.1605
2,M1E,Scarborough,"West Hill, Morningside, Guildwood",43.7636,-79.1887
3,M1G,Scarborough,Woburn,43.771,-79.2169
4,M1H,Scarborough,Cedarbrae,43.7731,-79.2395
5,M1J,Scarborough,Scarborough Village,43.7447,-79.2395
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.7279,-79.262
7,M1L,Scarborough,"Oakridge, Golden Mile, Clairlea",43.7111,-79.2846
8,M1M,Scarborough,"Scarborough Village West, Cliffside, Cliffcrest",43.7163,-79.2395
9,M1N,Scarborough,"Cliffside West, Birch Cliff",43.6927,-79.2648
