# Part-1 | Explore and cluster the neighborhoods in Toronto

Import the required libraries.

In [4]:
import pandas as pd
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup

Add a utility function that fetches and parses the HTML content of the Wikipedia page - https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [5]:
def getHTMLContent(link):
    """Download a web page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    link : str
        URL of the page to fetch.

    Returns
    -------
    bs4.BeautifulSoup
        The page content parsed with the built-in 'html.parser'.
    """
    # The context manager closes the HTTP response as soon as parsing is done,
    # rather than leaving the connection open until garbage collection.
    with urlopen(link) as response:
        return BeautifulSoup(response, 'html.parser')

Get the content of the page and, from everything retrieved, keep only the `table` elements.

In [6]:
# Download and parse the postal-code page, then gather every <table> element in it.
content = getHTMLContent('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
tables = content.find_all(name='table')

# Print the indented markup of each table so the one of interest can be identified.
for table in tables:
    pretty_markup = table.prettify()
    print(pretty_markup)

<table class="wikitable sortable">
 <tbody>
  <tr>
   <th>
    Postcode
   </th>
   <th>
    Borough
   </th>
   <th>
    Neighborhood
   </th>
  </tr>
  <tr>
   <td>
    M1A
   </td>
   <td>
    Not assigned
   </td>
   <td>
    Not assigned
   </td>
  </tr>
  <tr>
   <td>
    M2A
   </td>
   <td>
    Not assigned
   </td>
   <td>
    Not assigned
   </td>
  </tr>
  <tr>
   <td>
    M3A
   </td>
   <td>
    <a href="/wiki/North_York" title="North York">
     North York
    </a>
   </td>
   <td>
    <a href="/wiki/Parkwoods" title="Parkwoods">
     Parkwoods
    </a>
   </td>
  </tr>
  <tr>
   <td>
    M4A
   </td>
   <td>
    <a href="/wiki/North_York" title="North York">
     North York
    </a>
   </td>
   <td>
    <a href="/wiki/Victoria_Village" title="Victoria Village">
     Victoria Village
    </a>
   </td>
  </tr>
  <tr>
   <td>
    M5A
   </td>
   <td>
    <a href="/wiki/Downtown_Toronto" title="Downtown Toronto">
     Downtown Toronto
    </a>
   </td>
   <td>
    <a href="/

As seen from the output, the table we are interested in has the class 'wikitable sortable'. Hence, extract that table from the retrieved content.

In [7]:
# Pick the table by its class attribute, then collect all of its <tr> rows.
table = content.find('table', attrs={'class': 'wikitable sortable'})
rows_from_wiki = table.find_all(name='tr')

In [8]:
# Collect the <td> cells of every table row; rows without any <td>
# (such as the header row, which only has <th>) are left out.
rows = [
    cells
    for cells in (tr.find_all('td') for tr in rows_from_wiki)
    if cells
]

# Apply the Part 1 validations to each row:
#   * skip rows whose borough is 'Not assigned'
#   * when only the neighborhood is 'Not assigned', use the borough name for it
lst = []
for cells in rows:
    postalcode, borough, neighborhood = (cells[k].text.rstrip() for k in range(3))
    if borough == 'Not assigned':
        continue
    if neighborhood == 'Not assigned':
        neighborhood = borough
    lst.append([postalcode, borough, neighborhood])

In [9]:
lst

[['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', 'Downtown Toronto', "Queen's Park"],
 ['M9A', "Queen's Park", "Queen's Park"],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M9B', 'Etobicoke', 'Cloverdale'],
 ['M9B', 'Etobicoke', 'Islington'],
 ['M9B', 'Etobicoke', 'Martin Grove'],
 ['M9B', 'Etobicoke', 'Princess Gardens'],
 ['M9B', 'Etobicoke', 'West Deane Park'],
 ['M1C', 'Scarborough', 'Highland Creek'],
 ['M1C', 'Scarborough', 'Rouge Hill'],
 ['M1C', 'Scarborough', 'Port Union'],
 ['M3C', 'North York', 'Flemingdon Park'],
 ['M3C', 'North

In [10]:
# Add  columns to the above list and convert it into a data frame
# Name the three scraped fields and turn the list of rows into a DataFrame.
cols = ['PostalCode', 'Borough', 'Neighborhood']
dataset = pd.DataFrame(data=lst, columns=cols)

In [11]:
dataset

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [12]:
print(dataset.shape)

(210, 3)


### ---------------------------------------------------------------End of Part 1----------------------------------------------------------------------------- 

# Part-2 |  Dataset with the latitude and the longitude coordinates of each neighborhood.

In [13]:
# Load the geospatial CSV from the remote URL into a DataFrame.
dataset_with_coordinates = pd.read_csv('http://cocl.us/Geospatial_data')
# Overwrite the column labels; 'PostalCode' is the key used when this frame
# is merged with `dataset`. Assumes the CSV has exactly three columns in this
# order (a length mismatch would raise) - TODO confirm against the file.
dataset_with_coordinates.columns = ['PostalCode', 'Latitude', 'Longitude']

In [42]:
# Inner join on the postal code: keep only codes present in both frames
# and attach the latitude/longitude columns to each neighborhood row.
toronto_dataset = dataset.merge(dataset_with_coordinates, how='inner', on='PostalCode')

In [43]:
toronto_dataset.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
4,M6A,North York,Lawrence Manor,43.718518,-79.464763


In [44]:
toronto_dataset.shape

(210, 5)

### ---------------------------------------------------------------End of Part 2----------------------------------------------------------------------------- 

# Part-3 | Explore and cluster the neighborhoods in Toronto

In [1]:
import folium # map rendering library

In [2]:
from geopy.geocoders import Nominatim # converts an address into latitude and longitude values

In [3]:
address = 'Toronto, Canada'

# Nominatim expects an identifying user agent: geopy emits a warning when it
# is omitted, and newer geopy releases refuse to construct the geocoder.
geolocator = Nominatim(user_agent='toronto_neighborhood_explorer')
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when no match is found; fail with a clear message
    # instead of an AttributeError on the attribute lookups below.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of the City of Toronto are {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of the City of Toronto are 43.653963, -79.387207.


In [18]:
# create map of Toronto using latitude and longitude values
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of toronto_dataset; the popup text is
# "<neighborhood>, <borough>".
marker_fields = zip(
    toronto_dataset['Latitude'],
    toronto_dataset['Longitude'],
    toronto_dataset['Borough'],
    toronto_dataset['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_fields:
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

# Last expression of the cell: render the map.
map_toronto

**Folium** is a great visualization library. Feel free to zoom into the above map, and click on each circle mark to reveal the name of the neighborhood and its respective borough.

However, for illustration purposes, let's simplify the above map and segment and cluster only the neighborhoods in Toronto. So let's slice the original dataframe and create a new dataframe of the Toronto data.

In [45]:
# str.find gives the position of 'Toronto' inside each borough name, or -1 when it is absent.
toronto_dataset["ContainsToronto"]= toronto_dataset["Borough"].str.find('Toronto') 

In [46]:
toronto_dataset.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,ContainsToronto
0,M3A,North York,Parkwoods,43.753259,-79.329656,-1
1,M4A,North York,Victoria Village,43.725882,-79.315572,-1
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,9
3,M6A,North York,Lawrence Heights,43.718518,-79.464763,-1
4,M6A,North York,Lawrence Manor,43.718518,-79.464763,-1


**Note** - A value of -1 indicates that the word 'Toronto' does not exist in the Borough name. Hence, we will keep only those records where the value of the ContainsToronto column is not -1.

In [47]:
# Keep only the rows whose borough name contains 'Toronto' (find position is not -1).
toronto_dataset = toronto_dataset.loc[toronto_dataset['ContainsToronto'].ne(-1)]

In [48]:
toronto_dataset.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,ContainsToronto
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,9
5,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,9
12,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937,9
13,M5B,Downtown Toronto,Garden District,43.657162,-79.378937,9
26,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,9


In [49]:
toronto_dataset.shape

(74, 6)

So, we have 74 records whose Borough name contains 'Toronto'.

In [40]:
# create map of the above dataset using latitude and longitude values
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of the current toronto_dataset; the popup text is
# "<neighborhood>, <borough>".
marker_fields = zip(
    toronto_dataset['Latitude'],
    toronto_dataset['Longitude'],
    toronto_dataset['Borough'],
    toronto_dataset['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_fields:
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

# Last expression of the cell: render the map.
map_toronto

Hence, the map above shows the areas/neighbourhoods whose borough name contains 'Toronto'.