In [None]:
!pip install beautifulsoup4

In [None]:
!pip install lxml

In [None]:
!pip install html5lib

In [None]:
!pip install requests

In [None]:
from bs4 import  BeautifulSoup
import requests
from urllib.request import urlopen
import pandas as pd

--> Scrape the Wikipedia page

In [None]:
html = urlopen("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(html, 'lxml')
print(soup.prettify())

--> Copy data from the table in the Wikipedia page into a DataFrame

In [None]:
table = soup.find('table')
#print(table)
table_col = table.find_all('th')
table_rows = table.find_all('tr')
#print(table_rows)

df = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        df.append(row)

df = pd.DataFrame(df, columns = ["Postcode", "Borough", "Neighbourhood"])
print(df)

--> Ignore cells with a borough that is Not assigned

In [None]:
df_2=df[df.Borough.str.contains("Not assigned") == False]
print(df_2)

--> Combine rows having the same Postcode into one row with the neighborhoods separated with a comma

In [None]:
df_3=df_2.groupby(['Postcode','Borough'])['Neighbourhood'].apply(','.join).reset_index()
print(df_3)

--> If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. 

In [None]:
for index, row in df_3.iterrows():
    if row['Neighbourhood'] == 'Not assigned':
        row['Neighbourhood']=row['Borough']
        print(row)

-->  Use the .shape method to print the number of rows of your dataframe

In [None]:
df_3.shape

--> Use the Geocoder package or the csv file to create the new dataframe

In [None]:
geo_cord=pd.read_csv("http://cocl.us/Geospatial_data")
print(geo_cord.head(5))

In [None]:
df_4=df_3.merge(geo_cord, left_on='Postcode', right_on='Postal Code', how='outer')
df_4.drop(["Postal Code"], axis=1, inplace=True)
print(df_4.head(5))

--> Select boroughs that contain the word Toronto

In [24]:
df_5=df_4[df_4['Borough'].str.contains("Toronto")]
print(df_5.head(5))
df_5.shape

   Postcode          Borough                  Neighbourhood   Latitude  \
37      M4E     East Toronto                    The Beaches  43.676357   
41      M4K     East Toronto    The Danforth West,Riverdale  43.679557   
42      M4L     East Toronto  The Beaches West,India Bazaar  43.668999   
43      M4M     East Toronto                Studio District  43.659526   
44      M4N  Central Toronto                  Lawrence Park  43.728020   

    Longitude  
37 -79.293031  
41 -79.352188  
42 -79.315572  
43 -79.340923  
44 -79.388790  


(38, 5)

--> Cluster the Toronto neighborhoods into 5 clusters

In [37]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

In [109]:
kclusters = 5

#toronto_clustering = df_5.drop('Neighbourhood', 1)
toronto_clustering=df_5
print(toronto_clustering)
#toronto_clustering.dtypes

lb = LabelEncoder()
toronto_clustering["Postcode"] = lb.fit_transform(toronto_clustering["Postcode"])
toronto_clustering["Borough"] = lb.fit_transform(toronto_clustering["Borough"])
toronto_clustering["Neighbourhood"] = lb.fit_transform(toronto_clustering["Neighbourhood"])
#print(toronto_clustering)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_clustering)
kmeans.labels_

    Postcode  Borough                                      Neighbourhood  \
37         0        2                                        The Beaches   
41         1        2                        The Danforth West,Riverdale   
42         2        2                      The Beaches West,India Bazaar   
43         3        2                                    Studio District   
44         4        0                                      Lawrence Park   
45         5        0                                   Davisville North   
46         6        0                                 North Toronto West   
47         7        0                                         Davisville   
48         8        0                         Moore Park,Summerhill East   
49         9        0  Deer Park,Forest Hill SE,Rathnelly,South Hill,...   
50        10        1                                           Rosedale   
51        11        1                         Cabbagetown,St. James Town   
52        12

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


array([2, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 0, 0, 3, 2, 2, 0, 0, 0, 3, 3, 3, 1,
       3, 1, 3, 4, 4, 1, 3, 4, 3, 1, 4, 1, 1, 1, 4], dtype=int32)

--> Generate maps to visualize your neighborhoods and how they cluster together

In [None]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium

In [111]:
toronto_merged = toronto_clustering
toronto_merged["Cluster Labels"] = kmeans.labels_
print(toronto_merged.head(5))

    Postcode  Borough  Neighbourhood   Latitude  Longitude  Cluster Labels
37         0        2             35  43.676357 -79.293031               2
41         1        2             37  43.679557 -79.352188               2
42         2        2             36  43.668999 -79.315572               2
43         3        2             33  43.659526 -79.340923               2
44         4        0             22  43.728020 -79.388790               2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [115]:
import numpy as np
from geopy.geocoders import Nominatim

--> Get coorindates of Toronto

In [121]:
address = 'Toronto'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [118]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [120]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters