# Applied Data Science Capstone

## Segmenting and clustering neighborhoods in Toronto

### Week 3

In [1]:
from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a web page
import json # library to handle JSON files
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# scraping wikipedia using beautifulsoup

In [3]:
#creating link to the webpage
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#request and store the data
data = requests.get(url).text

#create a BeautifulSoup object
soup = BeautifulSoup(data,"html5lib")

#find the html tabel
table = soup.find('table')

In [4]:
#transform data

In [5]:
toronto_data = pd.DataFrame(columns =['PostalCode','Borough'])
for row in table.tbody.find_all("p"):
        for c1, c2 in zip(row.find_all("b"), row.find_all("span")):
            PostalCode = c1.text
            Borough = c2.text
            toronto_data = toronto_data.append({'PostalCode':PostalCode,'Borough':Borough}, ignore_index=True)

toronto_data = toronto_data[toronto_data['Borough']!="Not assigned"]
toronto_data[['Borough','Neighborhood','temp']] = toronto_data['Borough'].str.split('(', expand=True)
toronto_data['Neighborhood'] = toronto_data['Neighborhood'].str.replace(')',' ')
toronto_data['Neighborhood'] = toronto_data['Neighborhood']+' / '+toronto_data['temp'].str.replace(')','').fillna('')
toronto_data.drop(labels='temp', axis=1, inplace=True)
toronto_data['Neighborhood'] = toronto_data['Neighborhood'].str.replace(r'/ $','')
toronto_data['Neighborhood'] = toronto_data['Neighborhood'].str.replace(' /',',').str.strip()
toronto_data = toronto_data.reset_index(drop=True)

toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [6]:
print('#of rows in the dataframe: ', toronto_data.shape[0])

#of rows in the dataframe:  103


In [7]:
#adding geo codes to neighbourhoods

In [8]:
#loading dataset and adding coordinates to data

In [9]:
import io

geo_data_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv"
content = requests.get(geo_data_url).content
geo_data = pd.read_csv(io.StringIO(content.decode('utf-8')))
geo_data.head()
geo_data.rename(columns={'Postal Code':'PostalCode'}, inplace=True)
toronto_data = pd.merge(toronto_data,geo_data,on='PostalCode',how='left')
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


In [10]:
print('NULL values after joining? :', toronto_data.isnull().values.any())
print('#of rows in the dataframe: ', toronto_data.shape[0])

NULL values after joining? : False
#of rows in the dataframe:  103


In [11]:
#exploring the neighborhoods

In [12]:
#loading libraries

In [13]:
import numpy as np # library to handle data in a vectorized manner
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip install folium

import folium # map rendering library

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 4.7 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1


In [14]:
address = 'Toronto City, ON'

geolocator = Nominatim(user_agent="on_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.6534817, -79.3839347.


In [15]:
#Explore dataset

In [16]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

neighborhoods = toronto_data

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [17]:
toronto = neighborhoods[toronto_data['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [18]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto['Latitude'], toronto['Longitude'], toronto['Borough'], toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [19]:
#Using foursquare api to explore and segment


In [20]:
import os


CLIENT_ID = os.getenv('HPGIU0LLNQIUQV02H5ZH3TTX5FA1DZUNB5JAZAEQTXMD033O')
CLIENT_SECRET = os.getenv('XICDZGDDOGGTDSTB3ACFDSUO1LVMFCGXJKJMZAVPUCNV0PKP')
VERSION = '20210611' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [21]:
toronto.loc[0, 'Neighborhood']

'Regent Park, Harbourfront'

In [22]:
neighborhood_latitude = toronto.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Regent Park, Harbourfront are 43.6542599, -79.3606359.


In [23]:
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id=HPGIU0LLNQIUQV02H5ZH3TTX5FA1DZUNB5JAZAEQTXMD033O&client_secret=XICDZGDDOGGTDSTB3ACFDSUO1LVMFCGXJKJMZAVPUCNV0PKP&v=20210611&ll=43.6542599,-79.3606359&radius=500&limit=45'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

In [24]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '60cba5a81f70f05311cf6e1a'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 44,
  'suggestedBounds': {'ne': {'lat': 43.6587599045, 'lng': -79.3544279001486},
   'sw': {'lat': 43.6497598955, 'lng': -79.36684389985142}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '53b8466a498e83df908c3f21',
       'name': 'Tandem Coffee',
       'location': {'address': '368 King St E',
        'crossStreet': 'at Trinity St',
        'lat': 43.65355870959944,
        'lng': -79.36180945913513,
        'labeledLatLngs': [{'label': 'display',
 

In [25]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [26]:

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()


  app.launch_new_instance()


Unnamed: 0,name,categories,lat,lng
0,Tandem Coffee,Coffee Shop,43.653559,-79.361809
1,Roselle Desserts,Bakery,43.653447,-79.362017
2,Cooper Koo Family YMCA,Distribution Center,43.653249,-79.358008
3,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Impact Kitchen,Restaurant,43.656369,-79.35698


In [27]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))


44 venues were returned by Foursquare.


## explore neighborhoods in Toronto

In [31]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        result = requests.get(url).json()
        results= result["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [32]:
m_name = toronto['Neighborhood']
m_lat = toronto['Latitude']
m_lng = toronto['Longitude']
toronto_venues = getNearbyVenues(m_name,m_lat,m_lng)

Regent Park, Harbourfront


KeyError: 'groups'

In [33]:
print(toronto_venues.shape)
toronto_venues.head()

NameError: name 'toronto_venues' is not defined