__Segmenting and Clustering Neighborhoods in Toronto__

__Extracting table data:__

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
#!pip install geocoder
import geocoder

The Required table is available in object: __tbody__. Using __BeautifulSoup__ Library we extract the table values:

In [2]:
#Get the table body object:
html = urlopen("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
bsObj = BeautifulSoup(html.read(), 'html.parser')

In [3]:
tablebody_bsObj = bsObj.find("tbody")
#print(tablebody_bsObj)

We take in all the values into a list for better handling

In [4]:
toronto_data = tablebody_bsObj.get_text()
toronto_data = toronto_data.split('\n\n')
toronto_data[0:20]

['\nPostal Code',
 'Borough',
 'Neighbourhood',
 '\nM1A',
 'Not assigned',
 'Not assigned',
 '\nM2A',
 'Not assigned',
 'Not assigned',
 '\nM3A',
 'North York',
 'Parkwoods',
 '\nM4A',
 'North York',
 'Victoria Village',
 '\nM5A',
 'Downtown Toronto',
 'Regent Park, Harbourfront',
 '\nM6A',
 'North York']

In [5]:
#Get the Table data formatted as a List
toronto_data_formated = []
for val in toronto_data:
  val = val.replace('\n','')
  toronto_data_formated.append(val)
toronto_data_formated[0:20]

['Postal Code',
 'Borough',
 'Neighbourhood',
 'M1A',
 'Not assigned',
 'Not assigned',
 'M2A',
 'Not assigned',
 'Not assigned',
 'M3A',
 'North York',
 'Parkwoods',
 'M4A',
 'North York',
 'Victoria Village',
 'M5A',
 'Downtown Toronto',
 'Regent Park, Harbourfront',
 'M6A',
 'North York']

Based on Position in List, the column values are extracted into seperate lists. 

In [6]:
#Extract column values as seperate list:
count = 0
postalcode = []
borough = []
neighborhood = []
for table_element in toronto_data_formated:
  #print(count)
  #print(pc)
  count = count + 1
  if count % 3 == 1 :
    postalcode.append(table_element)
  elif count % 3 == 2:
    borough.append(table_element)
  elif count % 3 == 0:
    neighborhood.append(table_element)

#print(postalcode)
#print(borough)
#print(neighborhood)

In [7]:
#Finalized values to feed into dataframe:
postalcode = postalcode[1:]
borough = borough[1:]
neighborhood = neighborhood[1:]

print(postalcode)
print(borough)
print(neighborhood)

['M1A', 'M2A', 'M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M8A', 'M9A', 'M1B', 'M2B', 'M3B', 'M4B', 'M5B', 'M6B', 'M7B', 'M8B', 'M9B', 'M1C', 'M2C', 'M3C', 'M4C', 'M5C', 'M6C', 'M7C', 'M8C', 'M9C', 'M1E', 'M2E', 'M3E', 'M4E', 'M5E', 'M6E', 'M7E', 'M8E', 'M9E', 'M1G', 'M2G', 'M3G', 'M4G', 'M5G', 'M6G', 'M7G', 'M8G', 'M9G', 'M1H', 'M2H', 'M3H', 'M4H', 'M5H', 'M6H', 'M7H', 'M8H', 'M9H', 'M1J', 'M2J', 'M3J', 'M4J', 'M5J', 'M6J', 'M7J', 'M8J', 'M9J', 'M1K', 'M2K', 'M3K', 'M4K', 'M5K', 'M6K', 'M7K', 'M8K', 'M9K', 'M1L', 'M2L', 'M3L', 'M4L', 'M5L', 'M6L', 'M7L', 'M8L', 'M9L', 'M1M', 'M2M', 'M3M', 'M4M', 'M5M', 'M6M', 'M7M', 'M8M', 'M9M', 'M1N', 'M2N', 'M3N', 'M4N', 'M5N', 'M6N', 'M7N', 'M8N', 'M9N', 'M1P', 'M2P', 'M3P', 'M4P', 'M5P', 'M6P', 'M7P', 'M8P', 'M9P', 'M1R', 'M2R', 'M3R', 'M4R', 'M5R', 'M6R', 'M7R', 'M8R', 'M9R', 'M1S', 'M2S', 'M3S', 'M4S', 'M5S', 'M6S', 'M7S', 'M8S', 'M9S', 'M1T', 'M2T', 'M3T', 'M4T', 'M5T', 'M6T', 'M7T', 'M8T', 'M9T', 'M1V', 'M2V', 'M3V', 'M4V', 'M5V', 'M6V', 'M7V', 'M8V'

__Creating Pandas Dataframe from extracted Table:__ 

In [8]:
toronto_df = pd.DataFrame({'PostalCode': postalcode, 
                           'Borough': borough,
                          'Neighborhood' : neighborhood})
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Rows without borough values are not considered

In [9]:
#Ignore without a borough
#toronto_df.drop(toronto_df.loc[toronto_df['Borough'] == 'Not assigned'])
toronto_df = toronto_df[toronto_df['Borough'] != 'Not assigned'].reset_index(drop=True)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [34]:
toronto_df.shape

(103, 3)

In [10]:
#Unable to implement below - Unstable
'''
# import geocoder

# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format('M5G'))
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]
'''

"\n# import geocoder\n\n# initialize your variable to None\nlat_lng_coords = None\n\n# loop until you get the coordinates\nwhile(lat_lng_coords is None):\n  g = geocoder.google('{}, Toronto, Ontario'.format('M5G'))\n  lat_lng_coords = g.latlng\n\nlatitude = lat_lng_coords[0]\nlongitude = lat_lng_coords[1]\n"

Read __Coordinates__ details from CSV into a dataframe. In order to execute a merge we rename the common column to same value as toronto dataset:

In [21]:
coordinates_df = pd.read_csv('/Users/baraths/Documents/DataScience_Course_Online/Data Science - Full Courses/Jupyter Notebook files/Geospatial_Coordinates.csv')

coordinates_df.rename(columns = {'Postal Code':'PostalCode'}, inplace = True)
coordinates_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [24]:
toronto_df = toronto_df.merge(coordinates_df, on='PostalCode')
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
