# Capstone Project

In [1]:
import json
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

## Prepare the neighborhood geographical info

### Scrape data from the Wikipedia page

In [2]:
# scrape the HTML page that lists the postal codes
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)
# Fail here with a clear HTTP error: previously a non-200 answer left `soup`
# undefined and the problem only surfaced as a NameError in a later cell.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'lxml')

In [3]:
# collect the text of every <b> tag in the first table: these hold the postal codes
PostalCode = [bold_tag.get_text() for bold_tag in soup.find('table').find_all('b')]

In [4]:
# collect the text of every <span> tag in the first table: each holds the
# borough, optionally followed by its neighborhoods in parentheses
Borough = [span_tag.get_text() for span_tag in soup.find('table').find_all('span')]

In [5]:
# assemble the scraped lists into one table; 'Neighborhood' starts out empty
# and is filled in when the 'Borough' text is split during cleaning
column_names = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(columns=column_names)
df['PostalCode'] = PostalCode
df['Borough'] = Borough

In [6]:
# keep only the rows whose borough is assigned, then renumber the index from 0
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough].reset_index(drop=True)

In [7]:
# Split the raw 'Borough' text of the form "Borough(Neighborhood)" into its two parts.
# This is done column-wise: the previous element-wise writes (`df.Borough[i] = ...`)
# were chained assignments, which emit SettingWithCopyWarning and do not modify
# `df` at all when pandas Copy-on-Write is active.
borough_parts = df['Borough'].str.split('(')

# text before the first '(' is the borough name
borough_name = borough_parts.str[0]

# text after the first '(' is the neighborhood; `.str[1]` yields NaN when there
# is no '(' in the text, in which case the borough name is reused
neighborhood_name = borough_parts.str[1].fillna(borough_name)

# drop a single trailing ')' if present (endswith is also safe on an empty string)
neighborhood_name = neighborhood_name.map(
    lambda text: text[:-1] if text.endswith(')') else text
)

df = df.assign(Borough=borough_name, Neighborhood=neighborhood_name)

In [8]:
# Drop every row whose borough still contains a '/', in one vectorised pass.
# The previous loop rebuilt the whole frame once per matching row.
borough_has_slash = df['Borough'].str.contains('/', regex=False)
df = df[~borough_has_slash].reset_index(drop=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M9A,Etobicoke,Islington Avenue


In [9]:
# sanity check: (number of rows, number of columns) of the cleaned table
df.shape

(102, 3)

### Add coordinates

In [10]:
# load the coordinates table from a CSV file in the working directory;
# the merge in the next cell joins on its 'Postal Code' column
coor = pd.read_csv('Geospatial_Coordinates.csv')

In [11]:
# attach the coordinates to each postal code; a left join keeps every row of `df`
df = df.merge(coor, left_on='PostalCode', right_on='Postal Code', how='left')
# The join key from `coor` is now redundant. `columns=` is used because passing
# the axis positionally (`drop('Postal Code', 1)`) raises TypeError on pandas >= 2.0.
df = df.drop(columns='Postal Code')

In [12]:
# display the merged table (postal code, borough, neighborhood, latitude, longitude)
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.654260,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
...,...,...,...,...,...
97,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North,43.653654,-79.506944
98,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
99,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L,43.662744,-79.321558
100,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...,43.636258,-79.498509


## Explore the restaurants around each neighborhood

### Get venues info around each postal code

In [13]:
# Foursquare API credentials are read from the environment so that secrets are
# never stored in the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel; a missing variable
# raises KeyError here instead of producing failed API calls later.
# NOTE(review): the credentials that used to be hardcoded in this cell have been
# exposed in the notebook history and should be revoked/rotated.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']
VERSION = '20180605'  # sent as the `v` query parameter of every request


def getNearbyVenues(postcodes, latitudes, longitudes, radius=2000, LIMIT = 50):
    """Query the Foursquare "explore" endpoint once per postal code.

    Parameters
    ----------
    postcodes, latitudes, longitudes : iterables
        Parallel sequences; element i of each describes one location.
        They are consumed together with ``zip``.
    radius : int, default 2000
        Sent as the ``radius`` query parameter.
    LIMIT : int, default 50
        Sent as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue, with the columns 'PostalCode',
        'Venue ID' and 'Venue Category' (the name of the venue's first
        category). Venues that carry no category are skipped. The frame is
        empty, but still has these three columns, when nothing is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    endpoint = 'https://api.foursquare.com/v2/venues/explore'
    column_names = ['PostalCode', 'Venue ID', 'Venue Category']

    rows = []
    for pc, lat, lng in zip(postcodes, latitudes, longitudes):
        # let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        response = requests.get(endpoint, params=params, timeout=30)
        # surface HTTP failures directly instead of as a KeyError on the payload
        response.raise_for_status()
        results = response.json()['response']['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue['categories']
            if not categories:
                # nothing to classify this venue by; indexing [0] would raise
                continue
            rows.append((pc, venue['id'], categories[0]['name']))

    # passing `columns=` keeps the schema stable even when `rows` is empty
    return pd.DataFrame(rows, columns=column_names)

In [14]:
# fetch nearby venues for every postal code, using the default radius and limit
venues = getNearbyVenues(
    postcodes=df['PostalCode'],
    latitudes=df['Latitude'],
    longitudes=df['Longitude'],
)

In [15]:
# display the venue table: one row per (postal code, venue) pair
venues

Unnamed: 0,PostalCode,Venue ID,Venue Category
0,M3A,4b8991cbf964a520814232e3,Caribbean Restaurant
1,M3A,4bd4846a6798ef3bd0c5618d,Golf Course
2,M3A,4e8d9dcdd5fbbbb6b3003c7b,Park
3,M3A,4b8ec91af964a520053733e3,Event Space
4,M3A,4ccec87654f0b1f7f32824ca,Supermarket
...,...,...,...
4924,M8Z,50a9657ee4b0170b321d3a75,Turkish Restaurant
4925,M8Z,4b8ef6fcf964a520954133e3,Restaurant
4926,M8Z,4b69ce03f964a5204eb52be3,Gas Station
4927,M8Z,4b3cf645f964a520a38a25e3,Restaurant


### Extract the restaurant types and add ratings

In [16]:
# Keep only the venues whose category name contains 'Restaurant'.
# A single boolean mask replaces the previous loop, which rebuilt the whole
# frame once for every non-restaurant row.
is_restaurant = venues['Venue Category'].str.contains('Restaurant', regex=False)
venues_extract = venues[is_restaurant].reset_index(drop=True)
venues_extract.head()

Unnamed: 0,PostalCode,Venue ID,Venue Category
0,M3A,4b8991cbf964a520814232e3,Caribbean Restaurant
1,M3A,4b149ea4f964a52029a523e3,Middle Eastern Restaurant
2,M3A,54b55e81498e6b087da5f439,Mediterranean Restaurant
3,M3A,5737698bcd10aa51361abad8,Seafood Restaurant
4,M3A,4b0aed06f964a520202a23e3,Caribbean Restaurant


### Numbers, diversity, and average ratings of restaurants nearby

In [17]:
# number of restaurant rows found for each postal code
res_count = (
    venues_extract
    .groupby('PostalCode')
    .size()
    .reset_index(name='Restaurant Counts')
)
res_count

Unnamed: 0,PostalCode,Restaurant Counts
0,M1B,10
1,M1C,3
2,M1E,5
3,M1G,13
4,M1H,14
...,...,...
96,M9N,6
97,M9P,5
98,M9R,10
99,M9V,13


In [18]:
# Number of distinct restaurant categories for each postal code.
# `nunique` computes this directly; the previous version stored arrays of
# unique values and overwrote them one by one through chained assignment
# (`res_div[col][i] = ...`), which is a no-op under pandas Copy-on-Write and
# left the column with object dtype.
res_div = (
    venues_extract
    .groupby('PostalCode')['Venue Category']
    .nunique()
    .reset_index(name='Restaurant Diversity')
)
res_div

Unnamed: 0,PostalCode,Restaurant Diversity
0,M1B,5
1,M1C,3
2,M1E,4
3,M1G,7
4,M1H,10
...,...,...
96,M9N,5
97,M9P,4
98,M9R,6
99,M9V,5


In [19]:
# combine the count and diversity tables into one row per postal code
res = pd.merge(res_count, res_div, on='PostalCode')
res

Unnamed: 0,PostalCode,Restaurant Counts,Restaurant Diversity
0,M1B,10,5
1,M1C,3,3
2,M1E,5,4
3,M1G,13,7
4,M1H,14,10
...,...,...,...
96,M9N,6,5
97,M9P,5,4
98,M9R,10,6
99,M9V,13,5


### Cluster neighborhoods

In [20]:
# Select the clustering features by name. This replaces `res.drop('PostalCode', 1)`,
# whose positional axis argument raises TypeError on pandas >= 2.0, and keeps the
# cell idempotent: a 'Cluster' column added to `res` by a later cell is not fed
# back in as a feature when this cell is re-run.
feature_columns = ['Restaurant Counts', 'Restaurant Diversity']
res_clustering = res[feature_columns]
# standardise both features so that neither dominates the distance computation
res_clustering = StandardScaler().fit_transform(res_clustering)

# run k-means clustering
k = 5
# n_init is pinned explicitly because its default differs between scikit-learn
# releases; 10 is the value older releases used by default
db = KMeans(n_clusters=k, n_init=10, random_state=0).fit(res_clustering)

In [21]:
# attach each postal code's cluster label, then bring in borough,
# neighborhood and coordinates from `df`
res = res.assign(Cluster=db.labels_)
nei_cluster = pd.merge(res, df, on='PostalCode')
nei_cluster

Unnamed: 0,PostalCode,Restaurant Counts,Restaurant Diversity,Cluster,Borough,Neighborhood,Latitude,Longitude
0,M1B,10,5,3,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,3,3,1,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,5,4,1,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,13,7,0,Scarborough,Woburn,43.770992,-79.216917
4,M1H,14,10,0,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...,...,...,...
96,M9N,6,5,1,York,Weston,43.706876,-79.518188
97,M9P,5,4,1,Etobicoke,Westmount,43.696319,-79.532242
98,M9R,10,6,3,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...,43.688905,-79.554724
99,M9V,13,5,3,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...,43.739416,-79.588437


### Visualize the clusters

In [22]:
# create the base map (fixed centre coordinates and zoom level)
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# one evenly spaced rainbow colour per cluster label 0 .. k-1
# (the former `x` / `ys` arrays were only ever used for their length, which is k)
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add one circle marker per postal code, coloured by its cluster
for lat, lon, poi, cluster in zip(nei_cluster['Latitude'], nei_cluster['Longitude'], nei_cluster['PostalCode'], nei_cluster['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # index with the label itself: `cluster - 1` sent label 0 to index -1,
    # i.e. it wrapped around to the last colour of the palette
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)


map_clusters

## Analyze each cluster

### Cluster 1

In [23]:
# rows with k-means label 0 (reported as "Cluster 1"), first three columns only
nei_cluster.loc[nei_cluster['Cluster'].eq(0), nei_cluster.columns[:3]]

Unnamed: 0,PostalCode,Restaurant Counts,Restaurant Diversity
3,M1G,13,7
4,M1H,14,10
16,M2H,14,9
17,M2J,13,7
18,M2K,14,9
19,M2L,11,9
22,M2P,13,10
23,M2R,15,9
26,M3C,15,10
27,M3H,10,9


### Cluster 2

In [24]:
# rows with k-means label 1 (reported as "Cluster 2"), first three columns only
nei_cluster.loc[nei_cluster['Cluster'].eq(1), nei_cluster.columns[:3]]

Unnamed: 0,PostalCode,Restaurant Counts,Restaurant Diversity
1,M1C,3,3
2,M1E,5,4
5,M1J,6,4
9,M1N,4,4
30,M3L,4,1
58,M5J,6,5
79,M6M,5,4
91,M9A,4,4
93,M9C,2,2
95,M9M,6,5


### Cluster 3

In [25]:
# rows with k-means label 2 (reported as "Cluster 3"), first three columns only
nei_cluster.loc[nei_cluster['Cluster'].eq(2), nei_cluster.columns[:3]]

Unnamed: 0,PostalCode,Restaurant Counts,Restaurant Diversity
11,M1R,24,15
12,M1S,24,13
13,M1T,25,16
20,M2M,24,11


### Cluster 4

In [26]:
# rows with k-means label 3 (reported as "Cluster 4"), first three columns only
nei_cluster.loc[nei_cluster['Cluster'].eq(3), nei_cluster.columns[:3]]

Unnamed: 0,PostalCode,Restaurant Counts,Restaurant Diversity
0,M1B,10,5
6,M1K,12,5
7,M1L,10,8
8,M1M,8,6
21,M2N,11,7
31,M3M,9,4
32,M3N,8,5
34,M4B,7,6
35,M4C,8,6
51,M4Y,10,8


### Cluster 5

In [27]:
# rows with k-means label 4 (reported as "Cluster 5"), first three columns only
nei_cluster.loc[nei_cluster['Cluster'].eq(4), nei_cluster.columns[:3]]

Unnamed: 0,PostalCode,Restaurant Counts,Restaurant Diversity
10,M1P,16,14
14,M1V,20,11
15,M1W,22,11
24,M3A,17,12
25,M3B,18,12
47,M4T,18,10
48,M4V,18,11
49,M4W,15,13
61,M5M,18,12
72,M6C,16,10


## Recommendation

__Cluster 4__ has medium restaurant counts and relatively low restaurant diversity, which suggests that there is enough traffic in these areas and that opening kinds of restaurants that are not yet represented can be a good business opportunity. Among the neighborhoods in cluster 4, __M9V__ looks promising, with 13 restaurants but only 5 kinds. Let's see what other venues are located in this area.

In [28]:
# all venues returned for postal code M9V, tallied by category
M9V = venues.loc[venues['PostalCode'].eq('M9V')]
M9V['Venue Category'].value_counts()

Coffee Shop                                 7
Indian Restaurant                           6
Pizza Place                                 5
Fast Food Restaurant                        4
Grocery Store                               3
Skating Rink                                2
Bank                                        2
Chinese Restaurant                          1
Pharmacy                                    1
Park                                        1
Flea Market                                 1
Fried Chicken Joint                         1
Spa                                         1
Discount Store                              1
Gas Station                                 1
Juice Bar                                   1
Sandwich Place                              1
Ice Cream Shop                              1
Clothing Store                              1
Burger Joint                                1
Beer Store                                  1
Greek Restaurant                  

M9V is a rather commercial neighborhood with many shops, food stands, and restaurants, but its restaurants could be more diversified. People who want to seek opportunities in the food industry in this neighborhood can try restaurant types other than Coffee Shop, Indian Restaurant, Pizza Place, and Fast Food Restaurant.