In [85]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

import requests #for getting url request
from bs4 import BeautifulSoup #need for parsing html
import re #need regular expressions

Libraries imported.


Calling the webpage and parsing with BeautifulSoup

In [86]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url)

#parsing using BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')

Using "find all" to just select the table

In [87]:
tables = soup.find_all("table", {"class": 'wikitable sortable'})

Code that iterates through the "tables" file to get 
the items from the soup file in to lists to make the data frame.

In [89]:
PostalCode = []
Borough = []
Neighborhood = []

for table in tables:
    rows = table.find_all('tr')
    
    for row in rows:
        cells = row.find_all('td')
        
        
        if len(cells) > 2:
            postalcodes = cells[0]
            PostalCode.append(postalcodes.text.strip())
            
            boroughs = cells[1]
            Borough.append(boroughs.text.strip())
            
            neighborhoods = cells[2]
            Neighborhood.append(neighborhoods.text.strip())

Creating the data frame with the needed fields.

In [90]:
df = pd.DataFrame({'PostalCode':PostalCode,'Borough':Borough,'Neighborhood':Neighborhood})
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Remove rows where Borough is "Not assigned"

In [91]:
df = df[~df['Borough'].isin(['Not assigned'])]

Showing the number of rows. There are 103.

In [84]:
df.shape

(103, 3)

I was not able to get the geocoder to work, so I am downloading the "Geospatial_Coordinates.csv" file and creating a data from and then merging it with the data frame scraped from the Wikipedia page

In [92]:
df2 = pd.read_csv("Geospatial_Coordinates.csv")
df2.rename(columns={'Postal Code':'PostalCode'},inplace=True)
df2.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merging the original data frame "df" with the "df2" data frame that has the
lat / long for each row.

In [93]:
df3 = pd.merge(df,df2,on=['PostalCode'])
df3.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


Creating a data frame that has the mean lat / long for each borough so
I can make a map showing the Toronto boroughs

In [94]:
df_borough = df3.groupby(by=df['Borough'],axis=0).mean().reset_index()
df_borough.head()

Unnamed: 0,Borough,Latitude,Longitude
0,Central Toronto,43.646001,-79.552846
1,Downtown Toronto,43.709822,-79.378349
2,East Toronto,43.676374,-79.410248
3,East York,43.726694,-79.382409
4,Etobicoke,43.707683,-79.375538


Creating a map of Toronto Boroughs

In [95]:
map_boroughs = folium.Map(location=[43.653, -79.3156], zoom_start=10)

# add markers to map
for lat, lng, label in zip(df_borough['Latitude'], df_borough['Longitude'], df_borough['Borough']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_boroughs)  
    
map_boroughs

Creating a map of Toronto showing all neighborhoods

In [96]:
map_neighborhoods = folium.Map(location=[43.653, -79.3156], zoom_start=9)

# add markers to map
for lat, lng, label in zip(df3['Latitude'], df3['Longitude'], df3['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='darkred',
        fill=True,
        fill_color='red',
        fill_opacity=0.7,
        parse_html=False).add_to(map_neighborhoods)  
    
map_neighborhoods

Getting neighborhoods from the East York borough to make a map

In [97]:
east_york_data = df3[df3['Borough'] == 'East York'].reset_index(drop=True)
east_york_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
1,M4C,East York,Woodbine Heights,43.695344,-79.318389
2,M4G,East York,Leaside,43.70906,-79.363452
3,M4H,East York,Thorncliffe Park,43.705369,-79.349372
4,M4J,East York,"East Toronto, Broadview North (Old East York)",43.685347,-79.338106


Creating a map showing the East York neighborhoods

In [98]:
map_east_york = folium.Map(location=[43.7267, -79.3824], zoom_start=12)

# add markers to map
for lat, lng, label in zip(east_york_data['Latitude'], east_york_data['Longitude'], east_york_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='darkgreen',
        fill=True,
        fill_color='green',
        fill_opacity=0.7,
        parse_html=False).add_to(map_east_york)  
    
map_east_york

Getting thing set up to use the Foursquare API

In [99]:
CLIENT_ID = 'OB1AVHQTRTGOLNQE15NAX2NAYWHE4BZN0UBRWG0FWOXR4OQH' # your Foursquare ID
CLIENT_SECRET = '20XNQQRY5WMVT0VFOSUEM3UM5KP0GC4MCJCKHL2SJUJMYD0L' # your Foursquare Secret
VERSION = '20180604' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: OB1AVHQTRTGOLNQE15NAX2NAYWHE4BZN0UBRWG0FWOXR4OQH
CLIENT_SECRET:20XNQQRY5WMVT0VFOSUEM3UM5KP0GC4MCJCKHL2SJUJMYD0L


Getting a neighborhood to do a venue search

In [100]:
east_york_data.loc[1, 'Neighborhood']

'Woodbine Heights'

Getting lat / long for Woodbine Heights

In [101]:
neighborhood_latitude = east_york_data.loc[1, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = east_york_data.loc[1, 'Longitude'] # neighborhood longitude value

neighborhood_name = east_york_data.loc[1, 'Neighborhood'] # neighborhood name

Doing a search of 10 venues in the Woodbine Heights neighborhood

In [102]:
LIMIT = 10 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL
results = requests.get(url).json()

Function that extracts the category of each venue

In [103]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Clean the json and structure it into a pandas dataframe. And then showing
5 venues with it's category in East York.

In [104]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = pd.json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,East York Memorial Arena,Skating Rink,43.697224,-79.315397
1,East York Curling Club,Curling Ice,43.696827,-79.313658
2,The Beer Store,Beer Store,43.693731,-79.316759
3,Stan Wadlow Park,Park,43.697836,-79.314303
4,Woodbine & Cosburn,Intersection,43.696456,-79.316614
