<center>
    <h1>Segmenting and Clustering Neighborhoods in Toronto</h1>
</center>

## Importing Libraries

In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
%matplotlib inline

import requests
from geopy.geocoders import Nominatim
from bs4 import BeautifulSoup

from sklearn.cluster import KMeans

import folium

In [93]:
## Storing web content
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever if the server never answers,
# and raise_for_status() stops the notebook on an HTTP error instead of
# letting an error page flow into the parser.
page = requests.get(url, timeout = 30)
page.raise_for_status()

In [94]:
## Building a BeautifulSoup tree from the raw response bytes with Python's built-in HTML parser
soup = BeautifulSoup(markup = page.content, features = 'html.parser')

In [95]:
## Previewing the parsed page
# Only the beginning of the document is printed: dumping the whole page
# floods the notebook output without adding information.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"a86e7902-e676-4496-a01d-dbd4be3ad5ba","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":960187814,"wgRevisionId":960187814,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Communications in Ontario","Postal codes in Canada","Toron

In [96]:
## Scraping Toronto Neighborhoods
toronto_table = soup.find('table', attrs = {'class': 'wikitable sortable'})
if toronto_table is None:
    # Fail with a clear message instead of an AttributeError on None below
    raise ValueError("Could not find a 'wikitable sortable' table in the page")

## Fetching Toronto table columns
# All rows are fetched once and reused for both the header and the data
toronto_table_col = toronto_table.find_all('tr')

# Header cells: replace embedded newlines and trim surrounding whitespace
col = [head.text.replace('\n', ' ').strip()
       for head in toronto_table_col[0].find_all('th')]

# List to collect the data of the Toronto neighborhoods
table_data = []

# Fetching each row of the Toronto neighborhood table
for tr in toronto_table_col:
    cells = tr.find_all('td')

    # Rows without <td> cells (such as the header row) carry no data, so they
    # are skipped instead of being appended as empty records and removed later
    if not cells:
        continue

    # Pair every cell with its column name, cleaning the text the same way as the header
    table_data.append({th: td.text.replace('\n', ' ').strip()
                       for td, th in zip(cells, col)})

# Creating DataFrame of Toronto Data
toronto_df = pd.DataFrame(table_data)
toronto_df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [97]:
# Keep only the rows whose borough has been assigned, then renumber the index from 0
has_borough = toronto_df['Borough'] != 'Not assigned'
toronto_df = toronto_df.loc[has_borough].reset_index(drop = True)

In [98]:
# Display toronto_df (the last expression of a cell is rendered as a table)
toronto_df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


## Using geopy to fetch Toronto Coordinates

### The geocoder sometimes raises a `GeocoderTimedOut` exception. We'll handle this by retrying the request with the following function

In [99]:
from geopy.exc import GeocoderTimedOut

def do_geocode(address, max_retries = 5, delay = 1.0):
    """Geocode ``address`` with Nominatim, retrying when the request times out.

    Parameters
    ----------
    address : str
        Free-form address or place name to look up.
    max_retries : int, optional
        Number of additional attempts made after a timed-out request.
        Negative values are treated as 0 (a single attempt).
    delay : float, optional
        Seconds to wait before each retry.

    Returns
    -------
    Whatever ``Nominatim.geocode`` returns for ``address``. Callers should be
    prepared for ``None`` when the service finds no match.

    Raises
    ------
    GeocoderTimedOut
        If every attempt times out. Any other exception propagates
        immediately instead of being retried.
    """
    import time  # local import: only needed for the pause between retries

    geolocator = Nominatim(user_agent = 'tor_agent')
    attempts = max(int(max_retries), 0) + 1

    for attempt in range(attempts):
        try:
            return geolocator.geocode(address)
        except GeocoderTimedOut:
            # Give up after the last attempt so a persistent outage surfaces
            # as an error rather than an endless retry loop
            if attempt == attempts - 1:
                raise
            time.sleep(delay)

In [100]:
# Geocode the city itself and keep its latitude / longitude
address_tor = 'Toronto, ON'
location = do_geocode(address_tor)
lat_tor, lng_tor = location.latitude, location.longitude
print(lat_tor, lng_tor)

43.6534817 -79.3839347


### Fetching location of all the neighborhoods in Toronto Region

In [101]:
# Geocode every neighborhood name and collect one latitude / longitude per row
col = ['Latitude', 'Longitude']
neigh_df = pd.DataFrame(columns = col)
neigh_df['Neighborhood'] = toronto_df['Neighborhood']

lat_list = []
lng_list = []
for hood in toronto_df['Neighborhood']:
    address = str(hood) + ', ON'
    location = do_geocode(address)

    # A lookup without a match yields None; record missing coordinates
    # explicitly instead of hiding every possible error behind a bare except
    if location is None:
        lat, lng = None, None
    else:
        lat, lng = location.latitude, location.longitude
    lat_list.append(lat)
    lng_list.append(lng)

# Store the results so the coordinate columns of neigh_df are actually filled
neigh_df['Latitude'] = lat_list
neigh_df['Longitude'] = lng_list

print(lat_list)
print(lng_list)

[43.7587999, 43.732658, 43.64076885, 43.7152827, None, 43.6684992, 43.8091955, 43.775347, None, None, 43.7087117, None, None, 43.775347, 43.6999302, 43.6694032, None, None, 43.7534804, 43.6710244, 43.64798435, None, 43.7598243, 43.7047983, 22.2839934, 43.6641106, 43.75646655, 43.7996637, None, 43.704553, None, 43.6602019, 43.7437422, None, 43.7697182, None, None, None, None, 43.7691966, 43.7492988, None, None, None, None, 43.75191235, 43.7492988, 43.6685545, 44.6085054, None, 43.7600778, None, None, 43.7492988, 51.8323893, None, None, None, None, 43.7739798, 43.7492988, 43.729199, 41.1417024, 43.6655802, 43.7001608, None, 43.7440391, 43.697936, None, None, 42.8636751, None, 43.7739798, None, None, 43.6412811, None, None, 43.7853531, 43.697936, 43.6640959, 43.6517776, None, 43.6861645, None, None, None, None, None, None, 43.7986103, 43.6783556, None, 43.6017173, None, 43.8049304, None, None, None, 43.6655242, None, None, None]
[-79.3201966, -79.3111892, -79.37989177980148, -79.4439143, 

<b>As mentioned in the assignment, the geocoder was not able to fetch the locations of some neighborhoods. <br>
Hence, we'll use the provided coordinates data and merge the neighborhood locations into the Toronto DataFrame.</b>

In [102]:
# Load the postal-code coordinates from the CSV file (path is relative to the working directory)
neigh_loc = pd.read_csv('Geospatial_Coordinates.csv')
# Display the loaded table
neigh_loc

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [103]:
### Attaching the coordinates to each row by joining on 'Postal Code'
toronto_df = toronto_df.merge(neigh_loc, on = 'Postal Code')

In [104]:
# Display toronto_df, which now also carries the Latitude and Longitude columns
toronto_df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


## Mapping Toronto Neighborhoods

In [119]:
# Creating the Toronto map
# The centre is the geocoded city point shifted 0.05 degrees of latitude north
# NOTE(review): the offset presumably frames the markers better - confirm
tor_map = folium.Map(location = [lat_tor + 0.05, lng_tor], zoom_start = 11)

# Adding one circle marker per row of toronto_df
for lat, lng, pc, borough, neigh in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Postal Code'],
                                       toronto_df['Borough'], toronto_df['Neighborhood']):

    # Popup text: neighborhood, postal code, borough
    popup_text = '{}, {}, {}'.format(neigh, pc, borough)
    label = folium.Popup(popup_text, parse_html = True)

    # parse_html is an option of Popup (set above), not of CircleMarker,
    # so it is no longer passed to the marker
    folium.CircleMarker([lat, lng],
                        radius = 3,
                        popup = label,
                        color = 'blue',
                        fill = True,
                        fill_color = '#3186cc',
                        fill_opacity = 0.7
                        ).add_to(tor_map)
tor_map