## Part I: obtain the dataset table from wikipedia

#### Install libraries and dependencies

In [2]:
!pip install bs4

Collecting bs4
  Downloading https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Collecting beautifulsoup4 (from bs4)
[?25l  Downloading https://files.pythonhosted.org/packages/d1/41/e6495bd7d3781cee623ce23ea6ac73282a373088fcd0ddc809a047b18eae/beautifulsoup4-4.9.3-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 23.6MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2; python_version >= "3.0" (from beautifulsoup4->bs4)
  Downloading https://files.pythonhosted.org/packages/36/69/d82d04022f02733bf9a72bc3b96332d360c0c5307096d76f6bb7489f7e57/soupsieve-2.2.1-py3-none-any.whl
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/jupyterlab/.cache/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472
Successfully built bs4
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed bea

In [3]:
# Import Libraries

from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a web page
import pandas as pd

#### Import the wikipedia page and get the raw table from the url

In [4]:
# This is the wikipedia url with the data
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
data  = requests.get(url).text 
soup = BeautifulSoup(data,"html5lib")

In [5]:
tables = soup.find_all('table')
len(tables)

3

In [6]:
for index, table in enumerate(tables):
    if ("M3A" in str(table)):
        table_index = index
print(table_index)

0


In [7]:
toronto_data = pd.DataFrame(columns=["col1", "col2", "col3", "col4", "col5", "col6", "col7", "col8", "col9"])

for row in tables[0].tbody.find_all("tr"):
    col = row.find_all("td")
    if (col != []):
        col1 = col[0].text
        col2 = col[1].text
        col3 = col[2].text
        col4 = col[3].text
        col5 = col[4].text
        col6 = col[5].text
        col7 = col[6].text
        col8 = col[7].text
        col9 = col[8].text
        toronto_data = toronto_data.append({"col1":col1, "col2":col2, "col3":col3, "col4":col4, "col5":col5, "col6":col6,"col7":col7, "col8":col8, "col9":col9}, ignore_index=True)

#toronto_data.head()

### Rework the table to be able to work with it

In [8]:
toronto_data_new = toronto_data

# Make all 9 columns into one column:

df1 = toronto_data_new["col1"]
df2 = toronto_data_new["col2"]
df3 = toronto_data_new["col3"]
df4 = toronto_data_new["col4"]
df5 = toronto_data_new["col5"]
df6 = toronto_data_new["col6"]
df7 = toronto_data_new["col7"]
df8 = toronto_data_new["col8"]
df8 = toronto_data_new["col9"]

pieces = (df1,df2,df3,df4,df5,df6,df7,df8)

data_final = pd.concat(pieces, ignore_index = True)

data_final = pd.DataFrame(data_final)
data_final.rename(columns={0:'StrData'}, inplace=True)

#data_final.head()

In [9]:
# Remove all data points where the borough is not assigned

data_final = data_final[~data_final.StrData.str.contains("Not assigned")]
#data_final.head()

In [10]:
# Extract the postal code

data_final["PostCode"] = data_final['StrData'].str[1:4]
#data_final.head()

In [11]:
# Create the Borough column

data_final["Borough"] = data_final.StrData.str.extract(r'(?P<Str>[^(]+)')
data_final["Borough"] = data_final['Borough'].str[4:len(data_final['Borough'])]

#data_final.head()

In [12]:
# Create the Neighbourhood column

data_final["Neighbourhood"] = data_final.StrData.str.replace(r'[^(]*\(|\)[^)]*', '')
#data_final.head()

toronto_dataset = data_final
toronto_dataset.drop(columns=["StrData"], inplace = True)

#toronto_dataset.head()

In [13]:
# Replace the slash with a column

toronto_dataset = toronto_dataset.replace(r' /',',', regex=True)
toronto_dataset.head()

Unnamed: 0,PostCode,Borough,Neighbourhood
1,M1B,Scarborough,"Malvern, Rouge"
2,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae


### Dataframe obtained and now checked the no. of rows and columns

In [14]:
toronto_dataset.shape

(98, 3)

In [15]:
print(tables[0].prettify())

# M1A
# Not assigned

<table cellpadding="2" cellspacing="0" rules="all" style="width:100%; border-collapse:collapse; border:1px solid #ccc;">
 <tbody>
  <tr>
   <td style="width:11%; vertical-align:top; color:#ccc;">
    <p>
     <b>
      M1A
     </b>
     <br/>
     <span style="font-size:85%;">
      <i>
       Not assigned
      </i>
     </span>
    </p>
   </td>
   <td style="width:11%; vertical-align:top; color:#ccc;">
    <p>
     <b>
      M2A
     </b>
     <br/>
     <span style="font-size:85%;">
      <i>
       Not assigned
      </i>
     </span>
    </p>
   </td>
   <td style="width:11%; vertical-align:top;">
    <p>
     <b>
      M3A
     </b>
     <br/>
     <span style="font-size:85%;">
      <a href="/wiki/North_York" title="North York">
       North York
      </a>
      <br/>
      (
      <a href="/wiki/Parkwoods" title="Parkwoods">
       Parkwoods
      </a>
      )
     </span>
    </p>
   </td>
   <td style="width:11%; vertical-align:top;">
    <p>
     <b>
      M4A
     </b>
 

## Part II: obtain the longitude/latitude information for each postcode in the dataset

In [15]:
# Read in the geo data from the csv file

geo_data = pd.read_csv('Geospatial_Coordinates.csv')
#geo_data.head()

In [16]:
# merge the geo_data dataset with the toronto_dataset based on the postal code
toronto_data = toronto_dataset
toronto_data.rename(columns={'PostCode': 'Postal Code'}, inplace = True)

toronto_data = pd.merge(toronto_data, geo_data, on="Postal Code")
toronto_data.head()


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part III: Explore and cluster neighbourhoods in Toronto

#### The "Creative" plan: generate a map that can answer which neighbourhood one would want to live in
#### if a person like me required the following (sports and coffee):
##### 1. swimming pool AND
##### 2. cafes nearby


In [17]:
# Import libraries
import requests # library to handle requests
import random # library for random number generation


!pip install geopy
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize


! pip install folium==0.5.0
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Collecting geopy
[?25l  Downloading https://files.pythonhosted.org/packages/0c/67/915668d0e286caa21a1da82a85ffe3d20528ec7212777b43ccd027d94023/geopy-2.1.0-py3-none-any.whl (112kB)
[K     |████████████████████████████████| 112kB 10.4MB/s eta 0:00:01
[?25hCollecting geographiclib<2,>=1.49 (from geopy)
  Downloading https://files.pythonhosted.org/packages/8b/62/26ec95a98ba64299163199e95ad1b0e34ad3f4e176e221c40245f211e425/geographiclib-1.50-py3-none-any.whl
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-2.1.0
Folium installed
Libraries imported.


In [18]:
toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [19]:
# Select boroughs that are only in Toronto

#toronto_data.Borough.unique()

toronto_data_subset = toronto_data[toronto_data['Borough'].str.contains("Toronto")]
toronto_data_subset.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
40,M4J,East YorkEast Toronto,The Danforth East,43.685347,-79.338106
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923


In [27]:
# Here obtain the geographical coordinates for Toronto

address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, Canada {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, Canada 43.6534817, -79.3839347.


In [30]:
# Make a map to visualize where the neighbourhoods in Toronto are that we're intereseted in

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)
# Adding markers to map
for lat, lng, neighborhood in zip(toronto_data_subset['Latitude'],  toronto_data_subset['Longitude'], toronto_data_subset['Neighbourhood']):
 label = '{}'.format(neighborhood)
 label = folium.Popup(label, parse_html=True)
 folium.CircleMarker([lat, lng],radius=5,popup=label,color='blue',fill=True,fill_color='#3186cc',fill_opacity=0.7).add_to(map_toronto)
map_toronto

In [31]:
# Include the FourSquare Credentials

CLIENT_ID = '5FQZACKYU0DLZYLIFLOB4I1FGTILI2M3RI32RPIC054FCM53' # your Foursquare ID
CLIENT_SECRET = 'W4FCR0PLPDV2P1HT5JHYSUQCOBB0PMASJJRIXB450LU2XB3V' # your Foursquare Secret
ACCESS_TOKEN = 'ZROUUEPHT0UOZ2NNDCQDKGIGCAVWC30DGYX4C34M1SGCHMWX' # your FourSquare Access Token
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 5FQZACKYU0DLZYLIFLOB4I1FGTILI2M3RI32RPIC054FCM53
CLIENT_SECRET:W4FCR0PLPDV2P1HT5JHYSUQCOBB0PMASJJRIXB450LU2XB3V


In [32]:
# Search for swimming pools

search_query = 'Pool'
radius = 500
print(search_query + ' .... OK!')

Pool .... OK!


In [33]:
# Define the url
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&oauth_token={}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude,ACCESS_TOKEN, VERSION, search_query, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/search?client_id=5FQZACKYU0DLZYLIFLOB4I1FGTILI2M3RI32RPIC054FCM53&client_secret=W4FCR0PLPDV2P1HT5JHYSUQCOBB0PMASJJRIXB450LU2XB3V&ll=43.6534817,-79.3839347&oauth_token=ZROUUEPHT0UOZ2NNDCQDKGIGCAVWC30DGYX4C34M1SGCHMWX&v=20180604&query=Pool&radius=500&limit=30'

In [34]:
# Send the GET request and examine the results
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '605de387f4d76f5939b5c503'},
 'notifications': [{'type': 'notificationTray', 'item': {'unreadCount': 0}}],
 'response': {'venues': [{'id': '515e0d16e4b0e99f5f4764a8',
    'name': 'Pool',
    'location': {'address': 'Intercontinental',
     'lat': 43.65097492328021,
     'lng': -79.38405339932878,
     'labeledLatLngs': [{'label': 'display',
       'lat': 43.65097492328021,
       'lng': -79.38405339932878}],
     'distance': 279,
     'cc': 'CA',
     'state': 'Ontario',
     'country': 'Canada',
     'formattedAddress': ['Intercontinental', 'Ontario', 'Canada']},
    'categories': [{'id': '4bf58dd8d48988d15e941735',
      'name': 'Pool',
      'pluralName': 'Pools',
      'shortName': 'Pool',
      'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/parks_outdoors/pool_',
       'suffix': '.png'},
      'primary': True}],
    'referralId': 'v-1616765831',
    'hasPerk': False},
   {'id': '51c0b4bf498e78941e4f6a9d',
    'name': 'Swimming Pool'

In [35]:
# assign relevant part of JSON to venues
venues = results['response']['venues']

# tranform venues into a dataframe
dataframe = json_normalize(venues)
dataframe.head()

  """


Unnamed: 0,id,name,categories,referralId,hasPerk,location.address,location.lat,location.lng,location.labeledLatLngs,location.distance,location.cc,location.state,location.country,location.formattedAddress,location.crossStreet,location.city,location.postalCode,location.neighborhood
0,515e0d16e4b0e99f5f4764a8,Pool,"[{'id': '4bf58dd8d48988d15e941735', 'name': 'P...",v-1616765831,False,Intercontinental,43.650975,-79.384053,"[{'label': 'display', 'lat': 43.65097492328021...",279,CA,Ontario,Canada,"[Intercontinental, Ontario, Canada]",,,,
1,51c0b4bf498e78941e4f6a9d,Swimming Pool,"[{'id': '4bf58dd8d48988d132951735', 'name': 'H...",v-1616765831,False,123 Queen St W,43.652266,-79.384815,"[{'label': 'display', 'lat': 43.652266, 'lng':...",152,CA,ON,Canada,[123 Queen St W (between Bay St & University A...,between Bay St & University Ave,Toronto,,
2,4fe8ce26e4b08e1d007d1088,Doubletree Pool & Spa,"[{'id': '4bf58dd8d48988d15e941735', 'name': 'P...",v-1616765831,False,108 Chestnut Street,43.65458,-79.386118,"[{'label': 'display', 'lat': 43.65457985232015...",214,CA,ON,Canada,"[108 Chestnut Street (Dundas), Toronto ON M5G ...",Dundas,Toronto,M5G 1R3,
3,4bf46261cad2c9289bdc9b99,Deck 27 Pool & Fitness Facility,"[{'id': '4bf58dd8d48988d132951735', 'name': 'H...",v-1616765831,False,33 Gerrard St W,43.65817,-79.383062,"[{'label': 'display', 'lat': 43.65816982438936...",526,CA,ON,Canada,[33 Gerrard St W (in Delta Chelsea on 27th Flo...,in Delta Chelsea on 27th Floor,Toronto,,
4,5133f6d0e4b08f94a76512ab,The Pool at Shangri-La,"[{'id': '4bf58dd8d48988d132951735', 'name': 'H...",v-1616765831,False,188 University Ave,43.649251,-79.385598,"[{'label': 'display', 'lat': 43.64925135414941...",489,CA,ON,Canada,"[188 University Ave (Adelaide St. W), Toronto ...",Adelaide St. W,Toronto,M5H 0A3,Financial District


In [37]:
# Filter the dataframe to only keep points of interest

# keep only columns that include venue name, and anything that is associated with location
filtered_columns = ['name', 'categories'] + [col for col in dataframe.columns if col.startswith('location.')] + ['id']
dataframe_filtered = dataframe.loc[:, filtered_columns]

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# filter the category for each row
dataframe_filtered['categories'] = dataframe_filtered.apply(get_category_type, axis=1)

# clean column names by keeping only last term
dataframe_filtered.columns = [column.split('.')[-1] for column in dataframe_filtered.columns]

dataframe_filtered.head()

Unnamed: 0,name,categories,address,lat,lng,labeledLatLngs,distance,cc,state,country,formattedAddress,crossStreet,city,postalCode,neighborhood,id
0,Pool,Pool,Intercontinental,43.650975,-79.384053,"[{'label': 'display', 'lat': 43.65097492328021...",279,CA,Ontario,Canada,"[Intercontinental, Ontario, Canada]",,,,,515e0d16e4b0e99f5f4764a8
1,Swimming Pool,Hotel Pool,123 Queen St W,43.652266,-79.384815,"[{'label': 'display', 'lat': 43.652266, 'lng':...",152,CA,ON,Canada,[123 Queen St W (between Bay St & University A...,between Bay St & University Ave,Toronto,,,51c0b4bf498e78941e4f6a9d
2,Doubletree Pool & Spa,Pool,108 Chestnut Street,43.65458,-79.386118,"[{'label': 'display', 'lat': 43.65457985232015...",214,CA,ON,Canada,"[108 Chestnut Street (Dundas), Toronto ON M5G ...",Dundas,Toronto,M5G 1R3,,4fe8ce26e4b08e1d007d1088
3,Deck 27 Pool & Fitness Facility,Hotel Pool,33 Gerrard St W,43.65817,-79.383062,"[{'label': 'display', 'lat': 43.65816982438936...",526,CA,ON,Canada,[33 Gerrard St W (in Delta Chelsea on 27th Flo...,in Delta Chelsea on 27th Floor,Toronto,,,4bf46261cad2c9289bdc9b99
4,The Pool at Shangri-La,Hotel Pool,188 University Ave,43.649251,-79.385598,"[{'label': 'display', 'lat': 43.64925135414941...",489,CA,ON,Canada,"[188 University Ave (Adelaide St. W), Toronto ...",Adelaide St. W,Toronto,M5H 0A3,Financial District,5133f6d0e4b08f94a76512ab


In [46]:
# Now that we have the list of pools in Toronto, I would like to see them on the map

dataframe_filtered["name_categories"] = dataframe_filtered["name"] + "+" + dataframe_filtered["categories"]


pools_map = folium.Map(location=[latitude, longitude], zoom_start=15) # generate map centred around Toronto

# Adding neighbourhood name markers to map
for lat, lng, neighborhood in zip(toronto_data_subset['Latitude'],  toronto_data_subset['Longitude'], toronto_data_subset['Neighbourhood']):
 label = '{}'.format(neighborhood)
 label = folium.Popup(label, parse_html=True)
 folium.CircleMarker([lat, lng],radius=5,popup=label,color='blue',fill=True,fill_color='#3186cc',fill_opacity=0.7).add_to(pools_map)


# add pools to the map
for lat, lng, label in zip(dataframe_filtered.lat, dataframe_filtered.lng, dataframe_filtered.name_categories):
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        fill=True,
        color='black',
        fill_color='red',
        fill_opacity=0.6
        ).add_to(pools_map)

# display map
pools_map