# Clustering Neighbourhoods in Toronto: Part 1
##### This notebook contains the code that will be submitted to obtain the IBM Data Science Professional Certificate.

In [None]:
from bs4 import BeautifulSoup 
import pandas as pd 
import requests  

## Part 1: First we will scrape the data about the Toronto neighbourhoods from Wikipedia

In [None]:
# Download the Wikipedia page listing the "M" postal codes and parse it.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of silently
# parsing an error page further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

# html5lib is used as the parser backend for BeautifulSoup.
soup = BeautifulSoup(data, "html5lib")

In [None]:
# Find all HTML tables in the web page (an HTML table is the <table> tag).
tables = soup.find_all('table')

# Remember the index of the table whose markup mentions "Toronto".
# The loop does not break, so the LAST matching table wins.
table_index = None
for index, table in enumerate(tables):
    if "Toronto" in str(table):
        table_index = index

# Without this check a page with no matching table would surface as a
# NameError on the print below rather than a meaningful message.
if table_index is None:
    raise ValueError('No table mentioning "Toronto" was found at {}'.format(url))

print(table_index)

0


In [None]:
# Show the selected table's markup, indented, to inspect its row/cell structure.
matched_table_html = tables[table_index].prettify()
print(matched_table_html)

<table cellpadding="2" cellspacing="0" rules="all" style="width:100%; border-collapse:collapse; border:1px solid #ccc;">
 <tbody>
  <tr>
   <td style="width:11%; vertical-align:top; color:#ccc;">
    <p>
     <b>
      M1A
     </b>
     <br/>
     <span style="font-size:85%;">
      <i>
       Not assigned
      </i>
     </span>
    </p>
   </td>
   <td style="width:11%; vertical-align:top; color:#ccc;">
    <p>
     <b>
      M2A
     </b>
     <br/>
     <span style="font-size:85%;">
      <i>
       Not assigned
      </i>
     </span>
    </p>
   </td>
   <td style="width:11%; vertical-align:top;">
    <p>
     <b>
      M3A
     </b>
     <br/>
     <span style="font-size:85%;">
      <a href="/wiki/North_York" title="North York">
       North York
      </a>
      <br/>
      (
      <a href="/wiki/Parkwoods" title="Parkwoods">
       Parkwoods
      </a>
      )
     </span>
    </p>
   </td>
   <td style="width:11%; vertical-align:top;">
    <p>
     <b>
      M4A
     </b>
 

In [None]:
# Build the postal-code table from the scraped HTML.
#
# Each <td> holds one postal code in <b>. Cells for unassigned codes contain
# an <i> element and are skipped. For assigned codes the first <a> is taken
# as the borough and any remaining <a> texts are joined with ", " as the
# neighbourhood; when there are none, the neighbourhood falls back to the
# borough name.
#
# Records are collected in a list and the DataFrame is created once:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copies the whole frame on every iteration.
records = []

# Use the index located earlier rather than a hard-coded 0.
for row in tables[table_index].tbody.find_all("tr"):
    for col in row.find_all("td"):
        if col.i is not None:
            # <i> marks a "Not assigned" cell.
            continue

        postalcode = col.b.text
        link_texts = [anchor.text for anchor in col.find_all("a")]

        borough = link_texts[0] if link_texts else None
        if len(link_texts) > 1:
            neighbourhood = ", ".join(link_texts[1:])
        else:
            neighbourhood = borough

        records.append({
            "PostalCode": postalcode,
            "Borough": borough,
            "Neighbourhood": neighbourhood,
        })

canada_data = pd.DataFrame(records, columns=["PostalCode", "Borough", "Neighbourhood"])
canada_data

Unnamed: 0,Borough,Neighbourhood,PostalCode
0,North York,Parkwoods,M3A
1,North York,Victoria Village,M4A
2,Downtown Toronto,"Regent Park, Harbourfront",M5A
3,North York,"Lawrence Manor, Lawrence Heights",M6A
4,Queen's Park,Queen's Park,M7A
...,...,...,...
98,Etobicoke,"The Kingsway, Old Mill",M8X
99,Downtown Toronto,Church and Wellesley,M4Y
100,Business reply mail,Business reply mail,M7Y
101,Etobicoke,"Old Mill, Sunnylea, Humber Bay, Mimico, The Qu...",M8Y


In [None]:
# (rows, columns) of the scraped table: one row per assigned postal code.
canada_data.shape

(103, 3)

## Part 2: We now proceed to obtain the geospatial data and combine this with the neighbourhood data

In [None]:
# NOTE(review): geocoder is not imported in any cell visible here; the later
# cells import geopy and folium instead — confirm this install is still needed.
%pip install geocoder



In [None]:
# Load the coordinates file; the path is relative, so the CSV must be
# reachable from the notebook's working directory.
geo_df = pd.read_csv('Geospatial_Coordinates.csv')
# Preview the first rows.
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [None]:
# Align the key column name with canada_data and index the frame by it.
# Written without inplace=True and with a guard so the cell can be re-run:
# the original set_index(..., inplace=True) raised KeyError on a second run
# because 'PostalCode' had already been moved into the index.
geo_df = geo_df.rename(columns={'Postal Code': 'PostalCode'})
if geo_df.index.name != 'PostalCode':
    geo_df = geo_df.set_index('PostalCode')
geo_df.head()

Unnamed: 0_level_0,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [None]:
# Join the coordinates with the neighbourhood records on the postal code.
new_df = geo_df.merge(canada_data, on='PostalCode')

# Rearrange the columns into the order used in the rest of the notebook.
columns = ["PostalCode", "Borough", "Neighbourhood", "Latitude", "Longitude"]
new_df = new_df.loc[:, columns]

new_df

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Etobicoke,43.696319,-79.532242
100,M9R,Etobicoke,Kingsview Village,43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Jamestown, Mount O...",43.739416,-79.588437


## Part 3: We now proceed by clustering this data and displaying a map

Here we keep things simple by working only with boroughs whose name contains the word "Toronto".

In [None]:
import folium
from geopy.geocoders import Nominatim

In [None]:
# Look up the coordinates of Toronto; they are used to centre the map.
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)

# geocode() yields None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [None]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row of new_df, with a "<neighbourhood>, <borough>" popup.
for lat, lng, borough, neighborhood in zip(new_df['Latitude'], new_df['Longitude'], new_df['Borough'], new_df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # NOTE(review): parse_html=True is kept on the Popup as in the original;
    # presumably it makes the label render as plain text — confirm in folium docs.
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup argument, not a CircleMarker/Leaflet path option,
    # so the stray parse_html=False previously passed here has been dropped.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto