# Applied Data Science Capstone - Week 5

##  _Open a Coffee Shop in Seattle, WA_

__Contents:__
1. Build a dataframe of neighborhoods in Seattle, WA
2. Get the coordinates of the neighborhoods
3. Obtain the venue data from Foursquare API
4. Explore and cluster the neighborhoods
5. Find the best cluster
6. Conclusion

### 1. Import libraries

In [1]:
import numpy as np
import pandas as pd
import json
import requests
import geocoder
from pandas.io.json import json_normalize
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium

print('Imported!')

Imported!


### 2. Scrape data from Wikipedia

In [2]:
# Wikipedia category page that lists Seattle's neighborhoods.
url = 'https://en.wikipedia.org/wiki/Category:Neighborhoods_in_Seattle'

# Use a timeout so the cell cannot hang indefinitely, and fail loudly on an
# HTTP error instead of handing an error page to the HTML parser.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

In [3]:
# parse the downloaded page with Python's built-in HTML parser
soup = BeautifulSoup(markup=data, features='html.parser')

In [4]:
# the second "mw-category" block on the page holds the list entries
category_items = soup.find_all("div", class_="mw-category")[1].find_all("li")
neigh_list = [item.text for item in category_items]

# build the dataframe, discard its first row and renumber the remaining rows
df = pd.DataFrame({"Neighborhood": neigh_list})
df = df.drop(index=df.index[0]).reset_index(drop=True)

df

Unnamed: 0,Neighborhood
0,"Adams, Seattle"
1,"Alki Point, Seattle"
2,"Arbor Heights, Seattle"
3,"Atlantic, Seattle"
4,The Ave
...,...
101,"Westlake, Seattle"
102,"Westwood, Seattle"
103,"Whittier Heights, Seattle"
104,"Windermere, Seattle"


### 3. Get the coordinates

In [5]:
# define a function to get coordinates
def get_latlng(neighborhood, max_attempts=5):
    """Return the ``[latitude, longitude]`` ArcGIS reports for a place name.

    Args:
        neighborhood: Place name passed unchanged to ``geocoder.arcgis``.
        max_attempts: Maximum number of lookups to try before giving up.

    Returns:
        The ``latlng`` value of the first lookup that is not ``None``.

    Raises:
        ValueError: If no attempt produced coordinates.
    """
    # Bounded retry: an unbounded loop would spin forever on a name the
    # service cannot resolve (or while the service is unreachable).
    for _ in range(max_attempts):
        lat_lng_coords = geocoder.arcgis(neighborhood).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
    raise ValueError(
        'Could not geocode {!r} after {} attempts'.format(neighborhood, max_attempts))

In [10]:
# geocode every neighborhood name, keeping the dataframe's row order
coords = list(map(get_latlng, df["Neighborhood"].tolist()))

# show both ends of the list as a quick sanity check
print(coords[:5], coords[-5:], sep="\n")

[[47.67297000000008, -122.38759999999996], [47.582354365185026, -122.37328706145979], [47.50861000000003, -122.37592999999998], [47.596010000000035, -122.30222999999995], [30.269460000000038, -97.74225999999999]]
[[47.63297000000006, -122.34172999999998], [47.73118032131357, -122.28598448289429], [47.684510000000046, -122.37143999999995], [47.66999000000004, -122.26626999999996], [47.60155000000003, -122.31408999999996]]


#### Create a dataframe to merge the coordinates.

In [11]:
# one row per geocoded neighborhood, in the same order as df
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

# copy each coordinate column onto the neighborhood dataframe
for coord_column in ('Latitude', 'Longitude'):
    df[coord_column] = df_coords[coord_column]

df

Unnamed: 0,Neighborhood,Latitude,Longitude
0,"Adams, Seattle",47.672970,-122.387600
1,"Alki Point, Seattle",47.582354,-122.373287
2,"Arbor Heights, Seattle",47.508610,-122.375930
3,"Atlantic, Seattle",47.596010,-122.302230
4,The Ave,30.269460,-97.742260
...,...,...,...
101,"Westlake, Seattle",47.632970,-122.341730
102,"Westwood, Seattle",47.731180,-122.285984
103,"Whittier Heights, Seattle",47.684510,-122.371440
104,"Windermere, Seattle",47.669990,-122.266270


In [12]:
# persist the neighborhoods and their coordinates (row index omitted)
df.to_csv(path_or_buf="df.csv", index=False)

### 4. Create a map of Seattle with neighborhoods

In [13]:
# get the coordinates of Seattle
address = 'Seattle'

geolocator = Nominatim(user_agent="my-app")
location = geolocator.geocode(address)

# geocode() yields None when nothing matches; stop with a clear message
# rather than an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim could not geocode {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Seattle is {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Seattle is 47.6038321, -122.3300624.


In [14]:
# base map centred on the Seattle coordinates found above
map_seattle = folium.Map(zoom_start=10, location=[latitude, longitude])

# drop one blue circle per neighborhood, with its name as the popup
marker_rows = zip(df['Latitude'], df['Longitude'], df['Neighborhood'])
for lat, lng, neighborhood in marker_rows:
    marker_popup = folium.Popup('{}'.format(neighborhood), parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=marker_popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_seattle)

map_seattle

In [15]:
# write the interactive map to a standalone HTML file
map_seattle.save(outfile='map_seattle.html')

### 5. Explore the neighborhoods using the Foursquare API

In [53]:
import os

# define Foursquare Credentials and Version
# Prefer environment variables so real keys never need to be typed into the
# notebook; the placeholders are used when the variables are not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'YOUR FOURSQUARE ID') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'YOUR FOURSQUARE SECRET') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# mask the secret so it does not end up in the saved notebook output
print('CLIENT_SECRET:' + '*' * len(CLIENT_SECRET))

Your credentails:
CLIENT_ID: YOUR FOURSQUARE ID
CLIENT_SECRET:YOUR FOURSQUARE SECRET


In [17]:
# get the top 100 venues within a radius of 1000m
radius = 1000
LIMIT = 100

# the endpoint is fixed; only the query parameters change per neighborhood
url = "https://api.foursquare.com/v2/venues/explore"

venues = []

for lat, lng, neighborhood in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):

    # let requests build and URL-encode the query string
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, lng),
        'radius': radius,
        'limit': LIMIT,
    }

    # make the GET request; time out instead of hanging and surface HTTP
    # errors (bad credentials, quota) before trying to read the payload
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    results = response.json()["response"]['groups'][0]['items']

    # return only relevant information for each nearby venue
    for item in results:
        venue = item['venue']
        # a venue with an empty category list gets None instead of
        # aborting the whole download with an IndexError
        categories = venue['categories']
        venues.append((
            neighborhood,
            lat,
            lng,
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            categories[0]['name'] if categories else None))

In [18]:
# convert the venues list into a new DataFrame, naming the columns up front
# so that an empty `venues` list still yields a correctly shaped (0, 7) frame
# instead of failing on a column-count mismatch
df_venues = pd.DataFrame(
    venues,
    columns=['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory'])

print(df_venues.shape)
df_venues.head()

(6109, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,"Adams, Seattle",47.67297,-122.3876,Cafe Besalu,47.671971,-122.387755,Bakery
1,"Adams, Seattle",47.67297,-122.3876,Tall Grass Bakery,47.671982,-122.38769,Bakery
2,"Adams, Seattle",47.67297,-122.3876,Rupee Bar,47.674828,-122.38784,Sri Lankan Restaurant
3,"Adams, Seattle",47.67297,-122.3876,Copine,47.675741,-122.387404,French Restaurant
4,"Adams, Seattle",47.67297,-122.3876,Venture Coffee,47.67143,-122.387849,Coffee Shop


In [19]:
# number of non-null entries per column for every neighborhood
df_venues.groupby("Neighborhood").count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adams, Seattle",83,83,83,83,83,83
"Alki Point, Seattle",54,54,54,54,54,54
"Arbor Heights, Seattle",6,6,6,6,6,6
"Atlantic, Seattle",64,64,64,64,64,64
"Ballard, Seattle",100,100,100,100,100,100
...,...,...,...,...,...,...
"Westlake, Seattle",59,59,59,59,59,59
"Westwood, Seattle",30,30,30,30,30,30
"Whittier Heights, Seattle",81,81,81,81,81,81
"Windermere, Seattle",6,6,6,6,6,6


In [20]:
# how many distinct venue categories were returned overall
category_count = len(df_venues['VenueCategory'].unique())
print('There are {} unique categories.'.format(category_count))

There are 370 unique categories.


In [21]:
# preview the first ten distinct categories, in order of first appearance
unique_categories = df_venues['VenueCategory'].unique()
unique_categories[:10]

array(['Bakery', 'Sri Lankan Restaurant', 'French Restaurant',
       'Coffee Shop', 'Bar', 'Burger Joint', 'Ice Cream Shop', 'Park',
       'Thai Restaurant', 'Design Studio'], dtype=object)

In [22]:
# confirm that at least one venue is categorised as "Coffee Shop"
"Coffee Shop" in set(df_venues['VenueCategory'])

True

#### Analyze each neighborhood.

In [23]:
# one indicator column per venue category, named by the bare category
onehot = pd.get_dummies(df_venues[['VenueCategory']], prefix="", prefix_sep="")

# append the neighborhood of each venue as the last column
onehot['Neighborhoods'] = df_venues['Neighborhood']

# rotate that last column to the front
*category_columns, neighborhood_column = onehot.columns
fixed_columns = [neighborhood_column, *category_columns]
onehot = onehot[fixed_columns]

print(onehot.shape)
onehot.head()

(6109, 371)


Unnamed: 0,Neighborhoods,ATM,Accessories Store,African Restaurant,Airport,American Restaurant,Amphitheater,Antique Shop,Arcade,Argentinian Restaurant,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,"Adams, Seattle",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Adams, Seattle",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Adams, Seattle",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Adams, Seattle",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Adams, Seattle",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# average the indicator columns per neighborhood: each value becomes the
# share of that neighborhood's venues that fall in the category
neighborhood_means = onehot.groupby("Neighborhoods").mean()
df_grouped = neighborhood_means.reset_index()

print(df_grouped.shape)
df_grouped

(106, 371)


Unnamed: 0,Neighborhoods,ATM,Accessories Store,African Restaurant,Airport,American Restaurant,Amphitheater,Antique Shop,Arcade,Argentinian Restaurant,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,"Adams, Seattle",0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,"Alki Point, Seattle",0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.018519,0.0,0.0,0.0,0.0,0.018519,0.0,0.0
2,"Arbor Heights, Seattle",0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,"Atlantic, Seattle",0.000000,0.0,0.015625,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.015625,0.0,0.0
4,"Ballard, Seattle",0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.010000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,"Westlake, Seattle",0.016949,0.0,0.000000,0.016949,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
102,"Westwood, Seattle",0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
103,"Whittier Heights, Seattle",0.000000,0.0,0.000000,0.000000,0.012346,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.012346,0.0,0.0
104,"Windermere, Seattle",0.000000,0.0,0.000000,0.000000,0.166667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [25]:
# number of neighborhoods with at least one coffee shop among their venues
int((df_grouped["Coffee Shop"] > 0).sum())

87

#### Create a dataframe for Coffee Shop.

In [26]:
# keep only the neighborhood name and its coffee-shop frequency
df_coffee = df_grouped.loc[:, ["Neighborhoods", "Coffee Shop"]]
df_coffee

Unnamed: 0,Neighborhoods,Coffee Shop
0,"Adams, Seattle",0.072289
1,"Alki Point, Seattle",0.092593
2,"Arbor Heights, Seattle",0.000000
3,"Atlantic, Seattle",0.078125
4,"Ballard, Seattle",0.050000
...,...,...
101,"Westlake, Seattle",0.118644
102,"Westwood, Seattle",0.033333
103,"Whittier Heights, Seattle",0.049383
104,"Windermere, Seattle",0.000000


### 6. Cluster Neighborhoods

By using k-means

In [27]:
# set number of clusters
kclusters = 5

# keep only the numeric feature column; `columns=` is used because passing
# the axis positionally is no longer accepted by pandas 2.x
clustering = df_coffee.drop(columns=["Neighborhoods"])

# run k-means clustering; n_init is pinned to 10 so the result does not
# depend on which default the installed scikit-learn version uses
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 4, 0, 4, 1, 4, 1, 3, 0, 1])

#### Create a new dataframe that includes the cluster label for each neighborhood.

In [28]:
# rename() returns a new frame, so df_coffee itself is left untouched;
# assign() then appends the k-means label of each row as the last column
df_merged = (
    df_coffee
    .rename(columns={"Neighborhoods": "Neighborhood"})
    .assign(**{"Cluster Labels": kmeans.labels_})
)
df_merged.head()

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels
0,"Adams, Seattle",0.072289,1
1,"Alki Point, Seattle",0.092593,4
2,"Arbor Heights, Seattle",0.0,0
3,"Atlantic, Seattle",0.078125,4
4,"Ballard, Seattle",0.05,1


#### Merge `df_merged` with the Seattle data `df` to add latitude/longitude.

In [30]:
# look up each neighborhood's coordinates by name and attach them
coordinates_by_neighborhood = df.set_index("Neighborhood")
df_merged = df_merged.join(coordinates_by_neighborhood, on="Neighborhood")
df_merged

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Latitude,Longitude
0,"Adams, Seattle",0.072289,1,47.672970,-122.387600
1,"Alki Point, Seattle",0.092593,4,47.582354,-122.373287
2,"Arbor Heights, Seattle",0.000000,0,47.508610,-122.375930
3,"Atlantic, Seattle",0.078125,4,47.596010,-122.302230
4,"Ballard, Seattle",0.050000,1,47.668670,-122.384530
...,...,...,...,...,...
101,"Westlake, Seattle",0.118644,2,47.632970,-122.341730
102,"Westwood, Seattle",0.033333,3,47.731180,-122.285984
103,"Whittier Heights, Seattle",0.049383,1,47.684510,-122.371440
104,"Windermere, Seattle",0.000000,0,47.669990,-122.266270


#### Sort the results by Cluster Labels.

In [31]:
# order the rows by cluster label so each cluster's members sit together
df_merged = df_merged.sort_values(by="Cluster Labels")
df_merged

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Latitude,Longitude
41,"Harbor Island, Seattle",0.000000,0,47.579810,-122.352760
60,"Matthews Beach, Seattle",0.000000,0,47.699340,-122.278350
42,Harvard-Belmont Landmark District,0.000000,0,49.664160,-96.819980
58,"Magnolia, Seattle",0.000000,0,47.633480,-122.387026
17,"Cascade, Seattle",0.000000,0,47.742753,-122.365332
...,...,...,...,...,...
14,"Broadway District, Seattle",0.100000,4,47.610210,-122.320772
78,"Ravenna, Seattle",0.074074,4,47.673930,-122.299140
13,Broadway (Seattle),0.080000,4,47.624960,-122.320920
57,"Madrona, Seattle",0.080000,4,47.612930,-122.289680


### 7. Visualize the resulting clusters

In [32]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters: one evenly spaced rainbow colour each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Neighborhood'], df_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    # labels run from 0 to kclusters-1, so they index the palette directly
    # (no reliance on negative-index wraparound for cluster 0)
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### 8. Examine clusters

#### Cluster 0

In [33]:
# rows whose k-means label is 0
df_merged[df_merged['Cluster Labels'].eq(0)]

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Latitude,Longitude
41,"Harbor Island, Seattle",0.0,0,47.57981,-122.35276
60,"Matthews Beach, Seattle",0.0,0,47.69934,-122.27835
42,Harvard-Belmont Landmark District,0.0,0,49.66416,-96.81998
58,"Magnolia, Seattle",0.0,0,47.63348,-122.387026
17,"Cascade, Seattle",0.0,0,47.742753,-122.365332
56,"Madrona Valley, Seattle",0.0,0,47.73963,-122.370259
76,"Rainier Beach, Seattle",0.0,0,47.51235,-122.26277
104,"Windermere, Seattle",0.0,0,47.66999,-122.26627
79,"Renton Hill, Seattle",0.0,0,47.497063,-122.246169
12,"Broadview, Seattle",0.0,0,47.72238,-122.36498


#### Cluster 1

In [35]:
# rows whose k-means label is 1
df_merged[df_merged['Cluster Labels'].eq(1)]

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Latitude,Longitude
62,"Meridian, Seattle",0.064103,1,47.66937,-122.32933
45,"Industrial District, Seattle",0.06,1,47.58616,-122.32738
59,"Maple Leaf, Seattle",0.049383,1,47.70013,-122.31765
43,"Hawthorne Hills, Seattle",0.04878,1,41.68323,-69.94955
0,"Adams, Seattle",0.072289,1,47.67297,-122.3876
67,"North Admiral, Seattle",0.070175,1,47.57933,-122.38863
68,"Northgate, Seattle",0.061538,1,47.7131,-122.3193
69,"Northlake, Seattle",0.07,1,47.655033,-122.320814
72,"Pinehurst, Seattle",0.068966,1,47.71894,-122.314
74,Portage Bay,0.064516,1,47.6469,-122.32028


#### Cluster 2

In [36]:
# rows whose k-means label is 2
df_merged[df_merged['Cluster Labels'].eq(2)]

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Latitude,Longitude
25,"Delridge, Seattle",0.153846,2,47.56451,-122.36337
26,"Denny Triangle, Seattle",0.12,2,47.616505,-122.337623
27,"Denny-Blaine, Seattle",0.111111,2,47.620213,-122.28063
61,"Meadowbrook, Seattle",0.115385,2,47.70841,-122.29586
99,West Seattle,0.119048,2,47.57123,-122.38514
87,"South Park, Seattle",0.15,2,47.52722,-122.31445
86,"South Lake Union, Seattle",0.14,2,47.62341,-122.33435
101,"Westlake, Seattle",0.118644,2,47.63297,-122.34173
39,"Greenwood, Seattle",0.130952,2,47.69082,-122.35529


#### Cluster 3

In [37]:
# rows whose k-means label is 3
df_merged[df_merged['Cluster Labels'].eq(3)]

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Latitude,Longitude
89,"Squire Park, Seattle",0.030303,3,32.92997,-96.89423
71,"Phinney Ridge, Seattle",0.032609,3,47.67596,-122.35436
70,"Olympic Hills, Seattle",0.045455,3,47.72656,-122.3026
7,"Bitter Lake, Seattle",0.019231,3,47.71868,-122.3503
11,"Broadmoor, Seattle",0.018182,3,47.63282,-122.28827
18,"Cedar Park, Seattle",0.023256,3,47.72645,-122.28801
40,"Haller Lake, Seattle",0.039216,3,47.7232,-122.3387
64,"Montlake, Seattle",0.03125,3,47.64085,-122.30207
93,"University Village, Seattle",0.035294,3,47.66428,-122.29853
96,"Wallingford, Seattle",0.040404,3,47.65555,-122.3265


#### Cluster 4

In [38]:
# rows whose k-means label is 4
df_merged[df_merged['Cluster Labels'].eq(4)]

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Latitude,Longitude
92,"University District, Seattle",0.09,4,47.66127,-122.31307
94,"Victory Heights, Seattle",0.086957,4,47.7103,-122.3072
98,"Wedgwood, Seattle",0.096774,4,47.68701,-122.29494
3,"Atlantic, Seattle",0.078125,4,47.59601,-122.30223
1,"Alki Point, Seattle",0.092593,4,47.582354,-122.373287
5,"Beacon Hill, Seattle",0.085714,4,47.57686,-122.31271
20,"Central Waterfront, Seattle",0.08,4,47.60357,-122.32945
80,"Roosevelt, Seattle",0.08642,4,47.68156,-122.31676
38,"Green Lake, Seattle",0.094118,4,47.68508,-122.33232
44,"Hillman City, Seattle",0.08,4,47.60357,-122.32945


### 9. Conclusion

As observed from the results, neighborhoods in Cluster 2 have the highest share of coffee shops among their venues, whereas neighborhoods in Cluster 0 have very few or no coffee shops. Cluster 0 therefore represents a great opportunity to open a new coffee shop, since there is very little competition from existing shops. On the other hand, coffee shops in Cluster 2 are likely suffering from intense competition due to oversupply and the high concentration of coffee shops. Therefore, this project recommends opening new coffee shops in the neighborhoods of Cluster 0 in order to avoid competition.