# Segmenting and Clustering Neighborhoods in Toronto

# 1) Create dataframe of three columns: PostalCode, Borough, and Neighborhood

## Import libraries and packages

In [5]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd 

## indicate the HTML page to read

In [6]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of passing an error page
# on to the parser.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

## convert html using lxml package

In [7]:
# Parse the downloaded HTML text into a searchable tree, using the lxml parser.
soup = BeautifulSoup(source, features='lxml')

## Extract data from website 

In [8]:
# Take the first <table> element in the document (assumed to be the postal-code
# listing -- confirm against the live page if its layout changes).
table = soup.find('table')

# find() returns None when nothing matches; fail with a clear message rather than
# an AttributeError further down.
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

## Read data and create pandas dataframe (exclude 'Not assigned' and blanks)

In [9]:
# Text of every <td> in the table, in document order. The cells are consumed three
# at a time as (Postcode, Borough, Neighbourhood); a trailing partial group, if
# any, is ignored.
cell_texts = [td.text for td in table.find_all('td')]

# Collect plain dicts and build the frame once at the end. Growing a DataFrame row
# by row with DataFrame.append copies the whole frame on every call, and that
# method was removed in pandas 2.0.
records = []
for postcode, borough, neighbourhood in zip(cell_texts[0::3], cell_texts[1::3], cell_texts[2::3]):
    # Skip postal codes whose borough is 'Not assigned'.
    if borough.strip() == 'Not assigned':
        continue

    # A postal code with a borough but no neighbourhood takes the borough's name.
    if neighbourhood.strip() == 'Not assigned':
        neighbourhood = borough

    records.append({'Postcode': postcode, 'Borough LIST': borough, 'Neighbourhood': neighbourhood})

tpd = pd.DataFrame(records, columns=['Postcode', 'Borough LIST', 'Neighbourhood'])

# The original loop left these module-level scratch names defined and empty; keep
# them so that any other cell reading them still finds the same values.
x = 1
Postcode = ''
Borough = ''
Neighbourhood = ''

## Replace '\n' with a space

In [10]:
# Replace each newline character in the Neighbourhood text with a single space.
# regex=False pins a literal (non-regex) replacement, so the result does not depend
# on the installed pandas version's default for `regex`.
tpd['Neighbourhood'] = tpd['Neighbourhood'].str.replace('\n', ' ', regex=False)

## Group data frame by Postcode (aggregating Borough and Neighbourhood) and clean data

In [11]:
# Collapse the scraped rows to one row per postal code. groupby() makes Postcode
# the index of the result. (The original passed as_index=False to .agg(), where it
# is not a parameter and had no effect, so Postcode was already the index.)
by_postcode = tpd.groupby('Postcode')

tpd_group = pd.concat(
    [
        # All neighbourhoods that share a postal code, joined into one string.
        by_postcode['Neighbourhood'].agg(', '.join),
        # The first (non-null) borough recorded for the postal code.
        by_postcode['Borough LIST'].first().rename('Borough'),
    ],
    axis=1,
)

# Show a preview rather than dumping the whole frame into the notebook.
tpd_group.head(10)


Unnamed: 0_level_0,Neighbourhood,Borough
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,"Rouge , Malvern",Scarborough
M1C,"Highland Creek , Rouge Hill , Port Union",Scarborough
M1E,"Guildwood , Morningside , West Hill",Scarborough
M1G,Woburn,Scarborough
M1H,Cedarbrae,Scarborough
M1J,Scarborough Village,Scarborough
M1K,"East Birchmount Park , Ionview , Kennedy Park",Scarborough
M1L,"Clairlea , Golden Mile , Oakridge",Scarborough
M1M,"Cliffcrest , Cliffside , Scarborough Village W...",Scarborough
M1N,"Birch Cliff , Cliffside West",Scarborough


# 2) Geospatial section

## Retrieve geographical coordinates from a CSV file

In [12]:
# Load the coordinates CSV, rename its 'Postal Code' column to 'Postcode', and
# index the frame by that column.
df_geo = (
    pd.read_csv('https://cocl.us/Geospatial_data')
    .rename(columns={'Postal Code': 'Postcode'})
    .set_index('Postcode')
)

## Merge neighborhoods from Toronto areas with geographical coordinates

In [50]:
# Attach the columns of df_geo to tpd_group. No `on` key is given, so the frames
# are matched index-on-index, and the left join keeps every row of tpd_group.
df_all_toronto = tpd_group.join(df_geo, how='left')
df_all_toronto.head()

Unnamed: 0_level_0,Neighbourhood,Borough,Latitude,Longitude
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,"Rouge , Malvern",Scarborough,43.806686,-79.194353
M1C,"Highland Creek , Rouge Hill , Port Union",Scarborough,43.784535,-79.160497
M1E,"Guildwood , Morningside , West Hill",Scarborough,43.763573,-79.188711
M1G,Woburn,Scarborough,43.770992,-79.216917
M1H,Cedarbrae,Scarborough,43.773136,-79.239476


## Get Geospatial libraries and packages

In [22]:
# Nominatim is the geopy geocoder class; a later cell uses it to look up the
# coordinates of Toronto.
from geopy.geocoders import Nominatim

# Shell command: install folium pinned to version 0.5.0 from the conda-forge
# channel (--yes skips the confirmation prompt), then import it for map rendering.
!conda install -c conda-forge folium=0.5.0 --yes 
import folium 

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  50.86 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  36.43 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  37.93 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  48.93 MB/s
The geograpical coordinate of Toronto are 43.653963, -79.387207.


## Get Toronto location

In [48]:
# Free-text address to geocode.
address = 'Toronto, Ontario'

# Nominatim is given an identifying user_agent string for this client.
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the address cannot be resolved; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


## Filter only boroughs that contain the word Toronto 

In [49]:
# Boolean mask: True for rows whose Borough text contains "Toronto".
has_toronto_in_borough = df_all_toronto['Borough'].str.contains("Toronto")

# Keep only the matching rows and preview them.
df_toronto = df_all_toronto[has_toronto_in_borough]
df_toronto.head()

Unnamed: 0_level_0,Neighbourhood,Borough,Latitude,Longitude
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M4E,The Beaches,East Toronto,43.676357,-79.293031
M4K,"The Danforth West , Riverdale",East Toronto,43.679557,-79.352188
M4L,"The Beaches West , India Bazaar",East Toronto,43.668999,-79.315572
M4M,Studio District,East Toronto,43.659526,-79.340923
M4N,Lawrence Park,Central Toronto,43.72802,-79.38879


## Map to visualize the neighborhoods with word Toronto in Borough

In [52]:
# create map of Toronto centred on the geocoded latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=13)

# add one circle marker per row of df_toronto
for lat, lng, borough, neighbourhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighbourhood']):
    # Build the popup text from this row's loop variables. The capitalised names
    # Neighbourhood and Borough are separate module-level strings, not row values,
    # and using them gave every marker the same label.
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

## Map to visualize All neighborhoods in Toronto

In [59]:
# create map of Toronto centred on the geocoded latitude and longitude values
map_all_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row of df_all_toronto
for lat, lng, borough, neighbourhood in zip(df_all_toronto['Latitude'], df_all_toronto['Longitude'], df_all_toronto['Borough'], df_all_toronto['Neighbourhood']):
    # Build the popup text from this row's loop variables. The capitalised names
    # Neighbourhood and Borough are separate module-level strings, not row values,
    # and using them gave every marker the same label.
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='',
        fill=True,
        fill_color='#cc3139',
        fill_opacity=0.7,
        parse_html=False).add_to(map_all_toronto)

map_all_toronto

## Print the number of rows 

In [17]:
# Summarise tpd_group: the number of distinct Borough values and the number of
# rows in the frame (the row count is what the message reports as Neighborhoods).
total_borough_count = len(tpd_group['Borough'].unique())
total_row_count = tpd_group.shape[0]

total_summary = 'The total dataframe has {} Boroughs and {} Neighborhoods.'.format(
    total_borough_count, total_row_count
)
print(total_summary)

The dataframe has 11 Boroughs and 103 Neighborhoods.


In [41]:
# Summarise df_toronto: the number of distinct Borough values and the number of
# rows in the frame (the row count is what the message reports as Neighborhoods).
toronto_borough_count = len(df_toronto['Borough'].unique())
toronto_row_count = df_toronto.shape[0]

toronto_summary = 'The Toronto dataframe has {} Boroughs and {} Neighborhoods.'.format(
    toronto_borough_count, toronto_row_count
)
print(toronto_summary)

The Toronto dataframe has 4 Boroughs and 38 Neighborhoods.
