# Exam : Segmenting and Clustering Neighborhoods in Toronto

## Scrapping and preparation of dataframe

In [1]:
!pip install lxml



In [2]:
!pip install bs4
import pandas as pd
from bs4 import BeautifulSoup
import requests


Collecting bs4
  Downloading https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Collecting beautifulsoup4 (from bs4)
[?25l  Downloading https://files.pythonhosted.org/packages/cb/a1/c698cf319e9cfed6b17376281bd0efc6bfc8465698f54170ef60a485ab5d/beautifulsoup4-4.8.2-py3-none-any.whl (106kB)
[K     |████████████████████████████████| 112kB 30.2MB/s eta 0:00:01
[?25hCollecting soupsieve>=1.2 (from beautifulsoup4->bs4)
  Downloading https://files.pythonhosted.org/packages/81/94/03c0f04471fc245d08d0a99f7946ac228ca98da4fa75796c507f61e688c2/soupsieve-1.9.5-py2.py3-none-any.whl
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/jupyterlab/.cache/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472
Successfully built bs4
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.8.2 bs

In [3]:
website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(website_url,'lxml')
#print(soup.prettify())
wiki_tab = soup.find('table',{'class':'wikitable sortable'})
#wiki_tab = wiki_tab.find_all('tr')
#print(wiki_tab)

Now that the wiki_table is extracted lets extract informations from colomns and line in order to create the dataframe


In [4]:
postcode=[]
borough=[]
neighbourhood=[]
#Initialize datafram
for col in range(len(wiki_tab.findAll('tr'))):
    for line in range(len(wiki_tab.findAll('tr')[col].findAll('td'))):
        if line == 0 :
            postcode.append(str(wiki_tab.findAll('tr')[col].findAll('td')[line].getText()))
        elif line == 1 :
            borough.append(str(wiki_tab.findAll('tr')[col].findAll('td')[line].getText()))
        elif line == 2 :
            neighbourhood.append(str(wiki_tab.findAll('tr')[col].findAll('td')[line].getText()).strip('\n'))



In [5]:
wiki_data={'PostalCode':postcode,'Borough':borough,'Neighbourhood':neighbourhood}
wiki_df=pd.DataFrame(data=wiki_data)
wiki_df


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


## Cleaning and requirements

1. Ignore cells with a borough that is Not assigned


In [6]:
wiki_data={'PostalCode':postcode,'Borough':borough,'Neighbourhood':neighbourhood}
wiki_df=pd.DataFrame(data=wiki_data)
not_assigned=wiki_df['Borough']!='Not assigned'
wiki_df=wiki_df[not_assigned]

In [7]:
print(wiki_df)

    PostalCode           Borough             Neighbourhood
2          M3A        North York                 Parkwoods
3          M4A        North York          Victoria Village
4          M5A  Downtown Toronto              Harbourfront
5          M6A        North York          Lawrence Heights
6          M6A        North York            Lawrence Manor
..         ...               ...                       ...
281        M8Z         Etobicoke  Kingsway Park South West
282        M8Z         Etobicoke                 Mimico NW
283        M8Z         Etobicoke        The Queensway West
284        M8Z         Etobicoke     Royal York South West
285        M8Z         Etobicoke            South of Bloor

[210 rows x 3 columns]


2. More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.

In [8]:
df_3=wiki_df
df_3=df_3.groupby(['PostalCode','Borough'])['Neighbourhood'].apply(lambda x: ','.join(x)).reset_index()
df_3

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


3.If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park.

In [9]:
not_assigned_neigh=df_3['Neighbourhood']=='Not assigned'
df_3.loc[not_assigned_neigh,'Borough']=df_3.loc[not_assigned_neigh,'Neighbourhood']
df_3
    

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


In [10]:
df_3.shape

(103, 3)

# Part2 Geo Data 

In [11]:
!wget -q -O 'geo_data.csv' http://cocl.us/Geospatial_data
print('Data downloaded!')

Data downloaded!


In [12]:
df=pd.read_csv('http://cocl.us/Geospatial_data')
df.shape


(103, 3)

In [13]:
df_geo=df_3
df_geo['Latitude']=df['Latitude']
df_geo['Longitude']= df['Longitude']
df_geo

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.739416,-79.588437


# Part 3 Clustering

In [14]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.2

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    scikit-learn-0.20.1        |   py36h22eb022_0         5.7 MB
    liblapack-3.8.0            |      11_openblas          10 KB  conda-forge
    numpy-1.18.1               |   py36h95a1406_0         5.2 MB  conda-forge
    liblapacke-3.8.0           |      11_openblas          10 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    libopenblas-0.3.6          |       h5a2b251_2         7.7 MB
    scipy-1.4.1                |   py36h921218d_0        

In [23]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(df_geo['Latitude'], df_geo['Longitude'], df_geo['Borough'], df_geo['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto


The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [24]:
import json
with open('creds.json') as f:
    data = json.load(f)
    CLIENT_ID = data['id']
    CLIENT_SECRET = data['secret']
VERSION = '20180605' # Foursquare API version

In [37]:
#Let's get the geographical coordinates of Downtown Toronto
address = 'Downtown Toronto, TO'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))



GeocoderServiceError: [Errno 99] Cannot assign requested address

In [34]:
radius=500
neighbourhood_name
neighbourhood_latitude,neighbourhood_longitude
LIMIT=50
url='https://api.foursquare.com/v2/venues/search?explore?&client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, radius, LIMIT)
results = requests.get(url).json()
results

NameError: name 'neighbourhood_name' is not defined