# Segmenting and Clustering Neighborhoods in Toronto, Canada

# I. Generate DataFrame from Wiki

## 1. Import BeautifulSoup and request for scrapy data from Wiki

In [1]:
from bs4 import BeautifulSoup #you need install BeautifulSoup before using !conda install -c anaconda beautifulsoup4

In [2]:
import requests

In [3]:
url = r'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [4]:
html_content = requests.get(url).text

In [5]:
soup = BeautifulSoup(html_content, 'lxml')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XmNawApAIC0AAEfUsD4AAABO","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":942851379,"wgRevisionId":942851379,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communi

## 2. Find all 'tr' tag and get data 

In [6]:
tr_data = soup.find_all('tr')
canada_df = []
headings = ['Postcode', 'Borough', 'Neighbourhood']
for tr in tr_data:
    t_row = {}
    for td, th in zip(tr.find_all('td'),headings):
        t_row[th] = td.text.replace('\n', '').strip()
        
    if 'Postcode' in t_row and t_row['Postcode'].strip().upper() != 'M' and len(t_row['Postcode'].strip()) != 3:
        continue
        
    if 'Borough' in t_row and t_row['Borough'].strip().upper() == 'Not Assigned'.upper():
        continue
        
    if 'Borough' in t_row and t_row['Borough'].strip().upper() != 'Not Assigned'.upper() and \
       'Neighbourhood' in t_row and t_row['Neighbourhood'].strip().upper() == 'Not Assigned'.upper():
        t_row['Neighbourhood'] = t_row['Borough']
    if  t_row:
        canada_df.append(t_row)
        
canada_df

[{'Postcode': 'M3A', 'Borough': 'North York', 'Neighbourhood': 'Parkwoods'},
 {'Postcode': 'M4A',
  'Borough': 'North York',
  'Neighbourhood': 'Victoria Village'},
 {'Postcode': 'M5A',
  'Borough': 'Downtown Toronto',
  'Neighbourhood': 'Harbourfront'},
 {'Postcode': 'M6A',
  'Borough': 'North York',
  'Neighbourhood': 'Lawrence Heights'},
 {'Postcode': 'M6A',
  'Borough': 'North York',
  'Neighbourhood': 'Lawrence Manor'},
 {'Postcode': 'M7A',
  'Borough': 'Downtown Toronto',
  'Neighbourhood': "Queen's Park"},
 {'Postcode': 'M9A',
  'Borough': 'Etobicoke',
  'Neighbourhood': 'Islington Avenue'},
 {'Postcode': 'M1B', 'Borough': 'Scarborough', 'Neighbourhood': 'Rouge'},
 {'Postcode': 'M1B', 'Borough': 'Scarborough', 'Neighbourhood': 'Malvern'},
 {'Postcode': 'M3B',
  'Borough': 'North York',
  'Neighbourhood': 'Don Mills North'},
 {'Postcode': 'M4B',
  'Borough': 'East York',
  'Neighbourhood': 'Woodbine Gardens'},
 {'Postcode': 'M4B', 'Borough': 'East York', 'Neighbourhood': 'Parkvie

## 3. Use pandas to create DataFrame to process data more easier.

In [7]:
import pandas as pd

In [8]:
can_df = pd.DataFrame(canada_df)
can_df.head()
can_df.dtypes

Borough          object
Neighbourhood    object
Postcode         object
dtype: object

## 4. Group by data by Postcode and Borough

In [9]:
can_df.groupby(['Postcode', 'Borough'], as_index = False).agg({'Neighbourhood': ', '.join})

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [10]:
can_df.shape

(210, 3)

# II. Explore and cluster the neighborhood in Toronto

## 1. Import library and read latitude, longitude data from CSV file.

In [21]:
from geopy.geocoders import Nominatim
import folium

In [22]:
can_latlngs = pd.read_csv('http://cocl.us/Geospatial_data')

In [23]:
can_latlngs.rename(columns={'Postal Code': 'Postcode'}, inplace=True)
can_latlngs.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## 2. Merge data with Canada Toronto Borough

In [24]:
result = pd.merge(left=can_df, right=can_latlngs, how='left', left_on='Postcode', right_on='Postcode')

In [17]:
result.head()

Unnamed: 0,Borough,Neighbourhood,Postcode,Latitude,Longitude
0,North York,Parkwoods,M3A,43.753259,-79.329656
1,North York,Victoria Village,M4A,43.725882,-79.315572
2,Downtown Toronto,Harbourfront,M5A,43.65426,-79.360636
3,North York,Lawrence Heights,M6A,43.718518,-79.464763
4,North York,Lawrence Manor,M6A,43.718518,-79.464763


## 3. Load data into map

In [19]:
address = 'Toronto City, CA-ON'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.653963, -79.387207.


In [25]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(result['Latitude'], result['Longitude'], result['Borough'], result['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto