# Where To Open A New Coffee Shop In Toronto?

**Introduction:** An international coffee chain is looking to expand into Canada and wants to open its first location in Toronto. To make sure this first location is a success, it is looking for an area that meets the following criteria:

- **Large customer base:** Densely populated area with many residents, offices and schools
- **Underdeveloped coffee scene**: Not many existing coffee shops in the area

This analysis aims to find Toronto neighbourhoods that satisfy these criteria.

**Data**
This analysis will be performed by integrating the following data sets:
- Neighbourhood information for the City of Toronto that will be obtained from Wikipedia
- Venue related information for the City of Toronto that will be obtained from the Foursquare API

The data will be converted into a final pandas data frame for analysis, and Folium will be used to visualize it. Finally, k-means clustering will be used to discover the neighbourhoods that are best suited to opening a new coffee shop.

In [16]:
#Prepare Environment + Load packages
from bs4 import BeautifulSoup
import html5lib
import requests
import lxml
import pandas as pd
import numpy as np
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
!pip install folium
import folium # map rendering library
print('Libraries imported.')

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/a4/f0/44e69d50519880287cc41e7c8a6acc58daa9a9acf5f6afc52bcc70f69a6d/folium-0.11.0-py2.py3-none-any.whl (93kB)
[K     |████████████████████████████████| 102kB 8.1MB/s ta 0:00:011
[?25hCollecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/13/fb/9eacc24ba3216510c6b59a4ea1cd53d87f25ba76237d7f4393abeaf4c94e/branca-0.4.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0
Libraries imported.


In [17]:
# Extract Toronto city data: download the Wikipedia list of "M" postal codes,
# parse the HTML, and keep the first <table> on the page for later conversion.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
source = response.text
soup = BeautifulSoup(source, 'lxml')
print(soup.title)

all_tables = soup.find_all('table')
df_table = all_tables[0]

<title>List of postal codes of Canada: M - Wikipedia</title>


In [18]:
# Transform the scraped HTML table into a data frame.
# Every <tr> becomes one row holding the stripped text of its <td> cells.
# The header row contains only <th> cells, so it contributes an empty list and
# therefore shows up as an all-missing first row in the frame.
table = [
    [cell.text.strip() for cell in row.find_all('td')]
    for row in df_table.find_all('tr')
]

df = pd.DataFrame(table)
df.head(5)

Unnamed: 0,0,1,2
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [19]:
# Rename columns (the scraped frame only has positional column labels 0, 1, 2)
df = df.rename(columns={0: 'Postcode', 1: 'Borough', 2: 'Neighbourhood'})

# Drop the all-missing row produced by the table's header row. Filtering on the
# missing postcode (instead of "drop the first row") keeps this cell safe to
# re-run: a second run no longer deletes a real postcode.
df = df.dropna(subset=['Postcode'])

# Ignore cells with unassigned Borough
df = df[df['Borough'] != 'Not assigned'].copy()

# Cell has a borough but a Not assigned neighborhood, then the neighborhood will
# be the same as the borough. This is applied to every such row rather than to
# one hard-coded row label, and before aggregation so that the literal text
# "Not assigned" can never end up inside a joined neighbourhood list.
needs_borough_name = df['Neighbourhood'] == 'Not assigned'
df.loc[needs_borough_name, 'Neighbourhood'] = df.loc[needs_borough_name, 'Borough']

# More than one neighborhood can exist in one postal code area:
# collapse to one row per postcode, joining the neighbourhood names.
df = (
    df.groupby('Postcode')
      .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
      .reset_index()
)

df.shape

(103, 3)

In [20]:
# Merge geographical coordinates
# The CSV carries one latitude/longitude pair per postal code; its key column
# is renamed so it matches the 'Postcode' column of the neighbourhood table.
file = 'http://cocl.us/Geospatial_data'
geocode = pd.read_csv(file).rename(columns={'Postal Code': 'Postcode'})

# Combine both data frames (inner join on the shared 'Postcode' column)
dfg = df.merge(geocode, on='Postcode')
dfg.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [21]:
#Identify the City of Toronto
address = 'Toronto City'

# Nominatim needs an identifying user agent: calling Nominatim() without one is
# deprecated in older geopy releases and rejected by newer ones.
geolocator = Nominatim(user_agent='toronto_coffee_shop_analysis')
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; fail here with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinate of Toronto City are 43.6534817, -79.3839347.


In [24]:
# create map of Toronto, centred on the geocoded city coordinates
map_to = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per postcode, labelled "<neighbourhood(s)>, <borough>"
for lat, lng, borough, neighbourhood in zip(dfg['Latitude'], dfg['Longitude'],
                                            dfg['Borough'], dfg['Neighbourhood']):
    # parse_html=True makes folium treat the label as plain text
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_to)

map_to

In [25]:
#Connecting to Foursquare API
# Credentials are read from the environment so that secrets are never stored in
# the notebook or its outputs. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded in this cell has been
# published with the notebook and should be revoked/regenerated.
import os

CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [26]:
# Look up the row label(s) of postal code 'M2C' in the cleaned postcode table.
# NOTE(review): the recorded output of this cell is an empty index, i.e. 'M2C'
# is not present in df - confirm which postcode (and which frame) was intended.
df[df['Postcode']=='M2C'].index

Int64Index([], dtype='int64')

In [27]:
#Getting the latitude and longitude values of the postcode explored in detail
# The row is located by its postcode instead of a hard-coded row label, so the
# selection keeps pointing at the same area if the table's row order changes.
TARGET_POSTCODE = 'M5A'
target_rows = dfg.index[dfg['Postcode'] == TARGET_POSTCODE]
if len(target_rows) == 0:
    raise KeyError('Postcode {!r} not found in dfg'.format(TARGET_POSTCODE))
target_row = target_rows[0]

lat = dfg.loc[target_row, 'Latitude'] # neighborhood latitude value
long = dfg.loc[target_row, 'Longitude'] # neighborhood longitude value
name = dfg.loc[target_row, 'Postcode'] # neighborhood name

In [29]:
#Exploring Toronto using Foursquare API
LIMIT = 50      # maximum number of venues requested per call
radius =1000    # search radius passed to the API's `radius` parameter

# The request URL embeds the client id and secret, so it is deliberately not
# displayed or printed.
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    lat, 
    long, 
    radius, 
    LIMIT)

# Without a timeout requests can wait indefinitely; raise_for_status() turns an
# HTTP error (bad credentials, exhausted quota, ...) into an exception here
# rather than a confusing KeyError when the JSON is unpacked later.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

{'meta': {'code': 200, 'requestId': '5edd76465fb726001b9607ca'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 117,
  'suggestedBounds': {'ne': {'lat': 43.66325990900001,
    'lng': -79.3482199002972},
   'sw': {'lat': 43.64525989099999, 'lng': -79.37305189970282}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '54ea41ad498e9a11e9e13308',
       'name': 'Roselle Desserts',
       'location': {'address': '362 King St E',
        'crossStreet': 'Trinity St',
        'lat': 43.653446723052674,
        'lng': -79.3620167174383,
        'labeledLatLngs': [{'label':

In [30]:
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue record.

    Parameters
    ----------
    row : mapping-like (e.g. dict or pandas Series)
        Must provide the venue's category list under the key 'categories' or,
        failing that, under 'venue.categories'. Each category is a mapping
        with a 'name' entry.

    Returns
    -------
    The 'name' of the first category, or None when the category list is empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error propagates
        # instead of being silently swallowed.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

# Clean the JSON response and build a pandas data frame of venues for postcode M5A
venues = results['response']['groups'][0]['items']

# Flatten the nested records and keep only name, category list and coordinates
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
flattened = json_normalize(venues)
nearby_venues = flattened.loc[:, filtered_columns]

# Replace each category list by the name of its first category
first_category = nearby_venues.apply(get_category_type, axis=1)
nearby_venues['venue.categories'] = first_category

# Keep only the last dotted component of each column name
short_names = []
for col in nearby_venues.columns:
    short_names.append(col.split(".")[-1])
nearby_venues.columns = short_names

venue_count = nearby_venues.shape[0]
print('{} venues were returned by Foursquare.'.format(venue_count))

nearby_venues.head()

50 venues were returned by Foursquare.


Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Impact Kitchen,Restaurant,43.656369,-79.35698
3,The Distillery Historic District,Historic Site,43.650244,-79.359323
4,Corktown Common,Park,43.655618,-79.356211


In [31]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000, limit=None):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location to explore (here: postcodes and
        their latitude/longitude).
    radius : number, default 1000
        Value sent as the API's `radius` parameter.
    limit : int, optional
        Maximum number of venues requested per location. Defaults to the
        notebook-level LIMIT constant.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Postcode',
        'Postcode Latitude', 'Postcode Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. 'Venue Category' is None for a
        venue that has no category. The frame is empty (but keeps its columns)
        when no venue is returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    if limit is None:
        limit = LIMIT

    columns = ['Postcode',
               'Postcode Latitude',
               'Postcode Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL (contains the credentials - do not print it)
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            limit)

        # make the GET request; bound the wait and surface HTTP errors
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # a venue may come without categories; indexing [0] would fail then
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing the column names to the constructor also works for zero rows,
    # where assigning `.columns` afterwards would raise a length mismatch.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return nearby_venues

# Collect the venues around every postcode of the merged postcode/coordinate table
to_venues = getNearbyVenues(dfg['Postcode'], dfg['Latitude'], dfg['Longitude'])

M1B
M1C
M1E
M1G
M1H
M1J
M1K
M1L
M1M
M1N
M1P
M1R
M1S
M1T
M1V
M1W
M1X
M2H
M2J
M2K
M2L
M2M
M2N
M2P
M2R
M3A
M3B
M3C
M3H
M3J
M3K
M3L
M3M
M3N
M4A
M4B
M4C
M4E
M4G
M4H
M4J
M4K
M4L
M4M
M4N
M4P
M4R
M4S
M4T
M4V
M4W
M4X
M4Y
M5A
M5B
M5C
M5E
M5G
M5H
M5J
M5K
M5L
M5M
M5N
M5P
M5R
M5S
M5T
M5V
M5W
M5X
M6A
M6B
M6C
M6E
M6G
M6H
M6J
M6K
M6L
M6M
M6N
M6P
M6R
M6S
M7A
M7R
M7Y
M8V
M8W
M8X
M8Y
M8Z
M9A
M9B
M9C
M9L
M9M
M9N
M9P
M9R
M9V
M9W


In [32]:
# Preview the first rows of the per-venue table returned by getNearbyVenues.
to_venues.head()

Unnamed: 0,Postcode,Postcode Latitude,Postcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,43.806686,-79.194353,Harvey's,43.80002,-79.198307,Restaurant
1,M1B,43.806686,-79.194353,Wendy's,43.802008,-79.19808,Fast Food Restaurant
2,M1B,43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
3,M1B,43.806686,-79.194353,RBC Royal Bank,43.798782,-79.19709,Bank
4,M1B,43.806686,-79.194353,Caribbean Wave,43.798558,-79.195777,Caribbean Restaurant


In [33]:
# Count the non-missing entries of every column per postcode, which shows how
# many venues were returned for each postcode.
to_venues.groupby('Postcode').count()

Unnamed: 0_level_0,Postcode Latitude,Postcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M1B,17,17,17,17,17,17
M1C,5,5,5,5,5,5
M1E,24,24,24,24,24,24
M1G,10,10,10,10,10,10
M1H,31,31,31,31,31,31
M1J,12,12,12,12,12,12
M1K,27,27,27,27,27,27
M1L,28,28,28,28,28,28
M1M,12,12,12,12,12,12
M1N,14,14,14,14,14,14


In [34]:
# Report how many distinct venue categories appear across all postcodes
category_count = to_venues['Venue Category'].unique().size
print('There are {} uniques categories.'.format(category_count))

There are 309 uniques categories.


In [35]:
# One-hot encode the venue category: one indicator column per category,
# named after the category itself (no prefix).
to_onehot = pd.get_dummies(to_venues[['Venue Category']], prefix="", prefix_sep="")

# Append the postcode of each venue, then rotate that last column to the front
to_onehot['Postcode'] = to_venues['Postcode']
column_names = list(to_onehot.columns)
fixed_columns = column_names[-1:] + column_names[:-1]
to_onehot = to_onehot[fixed_columns]
to_onehot.head()

Unnamed: 0,Postcode,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Antique Shop,Aquarium,Art Gallery,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# Average the category indicators within each postcode, giving the share of
# each venue category among that postcode's venues (one row per postcode).
to_grouped = to_onehot.groupby('Postcode', as_index=False).mean()
to_grouped.shape

(102, 310)

In [38]:
# Print, for every postcode, its most frequent venue categories
num_top_venues=5

for hood in to_grouped['Postcode']:
    print("----"+hood+"----")

    # Turn the postcode's single row into a two-column (venue, freq) table;
    # the first transposed row holds the postcode itself and is skipped.
    freq_table = to_grouped[to_grouped['Postcode'] == hood].T.reset_index()
    freq_table.columns = ['venue','freq']
    freq_table = freq_table.iloc[1:]
    freq_table = freq_table.assign(freq=freq_table['freq'].astype(float).round(2))

    ranked = freq_table.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----M1B----
                  venue  freq
0           Coffee Shop  0.12
1            Restaurant  0.12
2                 Trail  0.12
3  Fast Food Restaurant  0.12
4  Caribbean Restaurant  0.06


----M1C----
                venue  freq
0      Breakfast Spot   0.2
1        Burger Joint   0.2
2                Park   0.2
3          Playground   0.2
4  Italian Restaurant   0.2


----M1E----
           venue  freq
0    Pizza Place  0.12
1  Grocery Store  0.08
2           Bank  0.08
3    Coffee Shop  0.08
4     Restaurant  0.08


----M1G----
                venue  freq
0                Park   0.2
1         Coffee Shop   0.2
2            Pharmacy   0.1
3   Mobile Phone Shop   0.1
4  Chinese Restaurant   0.1


----M1H----
               venue  freq
0        Coffee Shop  0.10
1             Bakery  0.10
2  Indian Restaurant  0.06
3               Bank  0.06
4        Gas Station  0.06


----M1J----
               venue  freq
0     Ice Cream Shop  0.17
1      Grocery Store  0.08
2      Train Station 

In [40]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first element of `row` is skipped (the caller passes rows whose first
    entry is the postcode); the remaining entries are sorted in descending
    order and the index labels of the first `num_top_venues` are returned as
    an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    labels = ranked.index.values
    return labels[:num_top_venues]

num_top_venues = 5


def _ordinal(n):
    """Return n followed by its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    if 10 <= n % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return '{}{}'.format(n, suffix)


# create columns according to number of top venues
# (explicit suffix logic replaces a bare `except` around a list lookup and
# stays correct beyond the first few ranks, e.g. "21st" rather than "21th")
columns = ['Postcode'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# create a new dataframe with one row per postcode
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Postcode'] = to_grouped['Postcode']

# fill each row with that postcode's highest-frequency venue categories
for ind in np.arange(to_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(to_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M1B,Trail,Coffee Shop,Restaurant,Fast Food Restaurant,Bakery
1,M1C,Breakfast Spot,Playground,Burger Joint,Park,Italian Restaurant
2,M1E,Pizza Place,Grocery Store,Fast Food Restaurant,Bank,Restaurant
3,M1G,Park,Coffee Shop,Chinese Restaurant,Pharmacy,Indian Restaurant
4,M1H,Coffee Shop,Bakery,Gas Station,Indian Restaurant,Bank


In [42]:
# set number of clusters
kclusters = 5

# Features are the per-postcode category frequencies; the postcode label itself
# is not a feature. `columns=` replaces the positional axis argument
# (`drop('Postcode', 1)`), which newer pandas versions no longer accept.
to_grouped_clustering = to_grouped.drop(columns='Postcode')

# run k-means clustering
# n_init is pinned so the result does not depend on the scikit-learn version's
# default; random_state fixes the initialisation.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(to_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([3, 2, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 0, 0,
       3, 3, 3, 0, 0, 3, 3, 3, 3, 4, 3, 3, 3, 3, 0, 3, 3, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 3, 3, 3, 3, 0, 0, 0, 0, 3, 3, 3, 0, 0, 0, 0, 0, 3, 3,
       3, 0, 2, 0, 2, 3, 3, 2, 3, 3, 3, 3, 3, 3], dtype=int32)

In [43]:
# One coordinate pair per postcode. Only the numeric coordinate columns are
# averaged; averaging the text columns (borough, neighbourhood) is an error on
# recent pandas versions.
dfg_group = dfg.groupby('Postcode')[['Latitude', 'Longitude']].mean().reset_index()

# keep the postcodes for which venue data exists
to_merge = pd.merge(dfg_group, to_grouped, on='Postcode')
# explicit copy: later column additions never touch a view of to_merge, so the
# chained-assignment warning does not need to be disabled globally
to_merge1 = to_merge[['Postcode', 'Latitude', 'Longitude']].copy()

# add clustering labels
# kmeans was fitted on the rows of to_grouped, so label i belongs to the i-th
# postcode of to_grouped; joining on the postcode makes that pairing explicit
# instead of relying on both frames sharing the same row order.
cluster_labels = pd.DataFrame({
    'Postcode': to_grouped['Postcode'].values,
    'Cluster Labels': kmeans.labels_,
})
to_merge1 = to_merge1.merge(cluster_labels, on='Postcode')

# add the most common venue categories of each postcode
to_merge1 = to_merge1.join(neighborhoods_venues_sorted.set_index('Postcode'), on='Postcode')

to_merge1.head() # check the last columns!

Unnamed: 0,Postcode,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M1B,43.806686,-79.194353,3,Trail,Coffee Shop,Restaurant,Fast Food Restaurant,Bakery
1,M1C,43.784535,-79.160497,2,Breakfast Spot,Playground,Burger Joint,Park,Italian Restaurant
2,M1E,43.763573,-79.188711,3,Pizza Place,Grocery Store,Fast Food Restaurant,Bank,Restaurant
3,M1G,43.770992,-79.216917,3,Park,Coffee Shop,Chinese Restaurant,Pharmacy,Indian Restaurant
4,M1H,43.773136,-79.239476,3,Coffee Shop,Bakery,Gas Station,Indian Restaurant,Bank


In [44]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per label
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add markers to the map
for lat, lon, poi, cluster in zip(to_merge1['Latitude'], to_merge1['Longitude'], to_merge1['Postcode'], to_merge1['Cluster Labels']):
    label = folium.Popup(str(poi) + ' cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters-1, so they index the palette directly
    # (the former `cluster-1` only worked for label 0 through negative-index
    # wraparound).
    marker_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [46]:
#Identifying neighbourhoods that have few coffee shops and cafes
# A postcode is kept only if none of its five most common venue categories is a
# coffee shop or a café.
array=['Coffee Shop','Café']
top_venue_columns = [
    '1st Most Common Venue',
    '2nd Most Common Venue',
    '3rd Most Common Venue',
    '4th Most Common Venue',
    '5th Most Common Venue',
]
has_cafe_in_top = to_merge1[top_venue_columns].isin(array).any(axis=1)
NoCafe = to_merge1[~has_cafe_in_top]
NoCafe.shape

(29, 9)

In [47]:
# Rendering these neighborhoods on Map for easy visualization and understanding
# create map
nocafe_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced "ocean" colour per label
ocean = [colors.rgb2hex(rgba) for rgba in cm.ocean(np.linspace(0, 1, kclusters))]

# add markers to the map
for lat, lon, poi, cluster in zip(NoCafe['Latitude'], NoCafe['Longitude'], NoCafe['Postcode'], NoCafe['Cluster Labels']):
    label = folium.Popup(str(poi) + ' cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters-1, so they index the palette directly
    # (the former `cluster-1` only worked for label 0 through negative-index
    # wraparound).
    marker_color = ocean[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(nocafe_clusters)

nocafe_clusters
       
nocafe_clusters