# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

## pt.1 Scraping wiki table to get the first DataFrame

### We start importing libraries in order to scrape the table

In [7]:
import pandas as pd
print('pandas imported')

pandas imported


### We now read HTML as list and convert it into DF

In [8]:
raw_list = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
raw_df = pd.DataFrame(raw_list[0],columns =['Postal Code','Borough','Neighborhood'])
print(raw_df.head(),'\r\n nice...')

  Postal Code           Borough               Neighborhood
0         M1A      Not assigned               Not assigned
1         M2A      Not assigned               Not assigned
2         M3A        North York                  Parkwoods
3         M4A        North York           Victoria Village
4         M5A  Downtown Toronto  Regent Park, Harbourfront 
 nice...


### Lets' drop boroughs marked as 'not assigned'

In [9]:
can_df = raw_df.loc[raw_df['Borough'] != 'Not assigned'].reset_index(drop=True)

### Let's now check the dataframe through "head" and "shape" methods

In [10]:
print('This is what the dataframe looks like now:\r\n')
can_df.head()

This is what the dataframe looks like now:



Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [16]:
print('This is the shape of the Toronto Dataframe: ',can_df.shape)

This is the shape of the Toronto Dataframe:  (103, 3)


## pt.2 Get Latitude and Logitude for each of the Postal Codes

### I found this method researching for a while
#### credits -> https://amaral.northwestern.edu/blog/getting-long-lat-list-cities

#### we start by importing OpenCageGeocode and sign in with the key provided upon sign up (free)

In [11]:
from opencage.geocoder import OpenCageGeocode
key = 'b15e50d653d74a1c81ada85bacc6f5f9'  # get api key from:  https://opencagedata.com
geocoder = OpenCageGeocode(key)

#### let's create two empty lists

In [12]:
list_lat = []   # create empty lists

list_long = []

#### Let's try a query with a postal code and take a look at the result...

In [3]:
# testing....
query = 'M6A'  
	
results = geocoder.geocode(query)
	
print (results)

[{'annotations': {'DMS': {'lat': "53° 47' 38.99112'' N", 'lng': "1° 45' 7.22088'' W"}, 'MGRS': '30UWE8220661343', 'Maidenhead': 'IO93ct90so', 'Mercator': {'x': -195032.394, 'y': 7096766.491}, 'OSM': {'edit_url': 'https://www.openstreetmap.org/edit?node=496729463#map=16/53.79416/-1.75201', 'note_url': 'https://www.openstreetmap.org/note/new#map=16/53.79416/-1.75201&layers=N', 'url': 'https://www.openstreetmap.org/?mlat=53.79416&mlon=-1.75201#map=16/53.79416/-1.75201'}, 'UN_M49': {'regions': {'EUROPE': '150', 'GB': '826', 'NORTHERN_EUROPE': '154', 'WORLD': '001'}, 'statistical_groupings': ['MEDC']}, 'callingcode': 44, 'currency': {'alternate_symbols': [], 'decimal_mark': '.', 'html_entity': '&#x00A3;', 'iso_code': 'GBP', 'iso_numeric': '826', 'name': 'British Pound', 'smallest_denomination': 1, 'subunit': 'Penny', 'subunit_to_unit': 100, 'symbol': '£', 'symbol_first': 1, 'thousands_separator': ','}, 'flag': '🇬🇧', 'geohash': 'gcwf00ujxr1bc62m9p6g', 'qibla': 119.26, 'roadinfo': {'drive_on'

#### Looks like we need to append ', Canada' to the postal code query in order to filter correctly.

In [4]:
### try again
# testing....
query = 'M6A, Canada'  
	
results = geocoder.geocode(query)
	
print (results)


[{'annotations': {'DMS': {'lat': "43° 43' 20.28000'' N", 'lng': "79° 27' 1.44000'' W"}, 'MGRS': '17TPJ2481642196', 'Maidenhead': 'FN03gr53wi', 'Mercator': {'x': -8844378.071, 'y': 5393024.958}, 'OSM': {'note_url': 'https://www.openstreetmap.org/note/new#map=16/43.72230/-79.45040&layers=N', 'url': 'https://www.openstreetmap.org/?mlat=43.72230&mlon=-79.45040#map=16/43.72230/-79.45040'}, 'UN_M49': {'regions': {'AMERICAS': '019', 'CA': '124', 'NORTHERN_AMERICA': '021', 'WORLD': '001'}, 'statistical_groupings': ['MEDC']}, 'callingcode': 1, 'currency': {'alternate_symbols': ['C$', 'CAD$'], 'decimal_mark': '.', 'disambiguate_symbol': 'C$', 'html_entity': '$', 'iso_code': 'CAD', 'iso_numeric': '124', 'name': 'Canadian Dollar', 'smallest_denomination': 5, 'subunit': 'Cent', 'subunit_to_unit': 100, 'symbol': '$', 'symbol_first': 1, 'thousands_separator': ','}, 'flag': '🇨🇦', 'geohash': 'dpz88p3uz7ryt5c41mqw', 'qibla': 54.53, 'roadinfo': {'drive_on': 'right', 'speed_in': 'km/h'}, 'sun': {'rise': {

#### NICE! We can now start and loop through our DF to get lat and lon data.

In [13]:
#looping through

for index, row in can_df.iterrows(): # iterate over rows in dataframe

    PostalCode = row['Postal Code']
    Borough = row['Borough']     
    Neighborhood = row['Neighborhood']
    query = str(PostalCode)+', Canada'

    results = geocoder.geocode(query)   
    lat = results[0]['geometry']['lat']
    long = results[0]['geometry']['lng']

    list_lat.append(lat)
    list_long.append(long)

	
# create new columns from lists    

can_df['lat'] = list_lat   

can_df['lon'] = list_long

#### Let's take a look at it.

In [14]:
### this is the Dataframe with lat and lon data
can_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,lat,lon
0,M3A,North York,Parkwoods,36.433833,28.233327
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",36.433833,28.233327
