This notebook will be mainly used for the Coursera capstone project.

In [108]:
import os
import time

import folium
import geocoder
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold

I got a list of San Francisco's neighborhoods from https://data.sfgov.org/Geographic-Locations-and-Boundaries/Planning-Neighborhood-Groups-Map/iacs-ws63 and put them into a CSV file.

In [3]:
# Load the neighborhood list from the local CSV; the geocoding cell below
# iterates over its 'Neighborhood' column.
df = pd.read_csv('sf_neighborhoods.csv')

In [4]:
geolocator = Nominatim(user_agent="sf_explorer")

def get_geocode(address, max_retries=3, backoff_seconds=1.0):
    """Geocode ``address`` with Nominatim, retrying on timeouts a bounded number of times.

    Args:
        address: Free-form address string passed to ``geolocator.geocode``.
        max_retries: How many additional attempts to make after the first
            one times out.
        backoff_seconds: Base pause between attempts; the pause grows
            linearly with the attempt number.

    Returns:
        Whatever ``geolocator.geocode`` returns for the address (the caller
        treats ``None`` as "not found").

    Raises:
        GeocoderTimedOut: If every attempt timed out. Re-raising keeps a
            network failure distinguishable from a genuine "not found".
    """
    attempts = max(1, max_retries + 1)
    for attempt in range(attempts):
        try:
            return geolocator.geocode(address)
        except GeocoderTimedOut:
            # The previous implementation recursed without limit here, which
            # ends in RecursionError when the service stays unreachable.
            if attempt == attempts - 1:
                raise
            time.sleep(backoff_seconds * (attempt + 1))

df_rows = []

# Geocode every neighborhood. A row for a neighborhood the geocoder could not
# resolve carries only the name, with no latitude/longitude entries.
for neigh in df['Neighborhood']:
    location = get_geocode('{}, San Francisco, California'.format(neigh))
    row = [neigh]
    if location is not None:
        row.extend((location.latitude, location.longitude))
    df_rows.append(row)

In [5]:
# Build the coordinate table from the geocoded rows, then key it by
# neighborhood name.
columns = ['Neighborhood', 'Latitude', 'Longitude']
neigh_df = pd.DataFrame(data=df_rows, columns=columns)
neigh_df = neigh_df.set_index('Neighborhood')

Nominatim couldn't find some of the neighborhoods (listed below), so I looked up their coordinates on Wikipedia:

In [6]:
# Show the rows in which at least one coordinate column is null.
neigh_df[neigh_df.isnull().any(axis=1)]

Unnamed: 0_level_0,Latitude,Longitude
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1
Diamond Heights,,
Portrero Hill,,
West of Twin Peaks,,


In [7]:
# Hand-entered (latitude, longitude) pairs for the neighborhoods that came
# back without coordinates.
manual_coordinates = {
    'Diamond Heights': (37.745764, -122.441638),
    'Portrero Hill': (37.75716, -122.39986),
    'West of Twin Peaks': (37.739689, -122.466942),
}

for name, (lat, lon) in manual_coordinates.items():
    neigh_df.loc[name] = lat, lon

In [8]:
# The geocoder's results for Outer Sunset and Lakeshore were noticeably off,
# so overwrite them with hand-picked (latitude, longitude) pairs.
corrected_coordinates = [
    ('Outer Sunset', 37.753427, -122.495402),
    ('Lakeshore', 37.723698, -122.480287),
]

for name, lat, lon in corrected_coordinates:
    neigh_df.loc[name] = lat, lon

In [153]:
# Coordinates used to center the map.
sf_lat = 37.7792808
sf_lon = -122.4192363

map_sf = folium.Map(location=[sf_lat, sf_lon], zoom_start=12)

# Add one circle marker per neighborhood, with the name as its popup.
for neighborhood, coordinates in neigh_df.iterrows():
    # Read coordinates by column label: positional access such as
    # `coordinates[0]` on a label-indexed Series is deprecated in recent pandas.
    lat, lon = coordinates['Latitude'], coordinates['Longitude']

    label = folium.Popup(neighborhood, parse_html=True)
    # `parse_html` is a Popup option; it was dropped from the marker call,
    # where it is not a styling argument.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_sf)

In [154]:
# Render the neighborhood marker map as the cell output.
map_sf

Now, let's use Foursquare restaurant price data to cluster these neighborhoods.

In [127]:
# Foursquare credentials are read from the environment so that they are never
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before running this cell; a missing variable raises KeyError naming it.
# NOTE(review): the key pair that used to be hard-coded here was saved in
# clear text and should be revoked and regenerated.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']
CATEGORY_ID = '4d4b7105d754a06374d81259' # Restaurants
VERSION = '20180605'
# NOTE(review): the explore endpoint may cap results per request below this
# value, which would saturate the counts in dense areas -- confirm.
LIMIT = 500

radius = 500

url_base = 'https://api.foursquare.com/v2/venues/explore?'

PRICE_TIERS = range(1, 5)   # Foursquare price tiers, '$' through '$$$$'
REQUEST_TIMEOUT = 10        # seconds before an unresponsive request is abandoned


def count_venues_at_price(lat, lon, price):
    """Return the number of venues the explore endpoint lists for one price tier.

    Args:
        lat: Latitude of the search center.
        lon: Longitude of the search center.
        price: Price tier to filter on (one of ``PRICE_TIERS``).

    Returns:
        The number of items in the first result group, or 0 when the
        response contains no groups.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, lon),
        'radius': radius,
        'price': price,
        'limit': LIMIT,
        'categoryId': CATEGORY_ID,
    }
    # TLS verification stays on (the previous verify=False exposed the
    # credentials to interception); requests encodes the query string.
    response = requests.get(url_base, params=params, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    groups = response.json().get('response', {}).get('groups', [])
    if not groups:
        return 0
    return len(groups[0].get('items', []))


price_freq_df_rows = []

for neighborhood, coordinates in neigh_df.iterrows():
    # Label-based access; positional `coordinates[0]` is deprecated in recent pandas.
    lat, lon = coordinates['Latitude'], coordinates['Longitude']

    price_buckets = [count_venues_at_price(lat, lon, price) for price in PRICE_TIERS]

    # Convert counts to shares of the neighborhood total. A neighborhood with
    # no venues at all keeps its all-zero row to avoid dividing by zero.
    total = sum(price_buckets)
    if total > 0:
        price_buckets = [count / total for count in price_buckets]

    price_freq_df_rows.append([neighborhood] + price_buckets)














In [133]:
# Turn the per-neighborhood price shares into a table keyed by neighborhood.
columns = ['Neighborhood', '$', '$$', '$$$', '$$$$']
price_freq_df = pd.DataFrame(data=price_freq_df_rows, columns=columns)
price_freq_df = price_freq_df.set_index('Neighborhood')

In [134]:
# Combine coordinates and price shares, matching rows on the shared
# 'Neighborhood' key (inner join, the merge default).
neigh_price_df = pd.merge(neigh_df, price_freq_df, how='inner', on='Neighborhood')

In [135]:
# Display the merged coordinates and price-share table.
neigh_price_df

Unnamed: 0_level_0,Latitude,Longitude,$,$$,$$$,$$$$
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bayview,37.728889,-122.3925,0.75,0.2,0.0,0.05
Bernal Heights,37.741001,-122.414214,0.5,0.392857,0.107143,0.0
Castro,37.760856,-122.434957,0.40678,0.491525,0.084746,0.016949
Chinatown,37.794301,-122.406376,0.404959,0.404959,0.140496,0.049587
Crocker Amazon,37.709378,-122.438587,1.0,0.0,0.0,0.0
Diamond Heights,37.745764,-122.441638,0.375,0.5,0.125,0.0
Civic Center,37.779594,-122.416794,0.488889,0.444444,0.066667,0.0
Excelsior,37.721794,-122.435382,0.795455,0.181818,0.022727,0.0
Financial District,37.793647,-122.398938,0.473684,0.347368,0.147368,0.031579
Glen Park,37.733104,-122.433805,0.6,0.4,0.0,0.0


In [136]:
# set number of clusters
kclusters = 4

# cluster on the four price-share columns only; coordinates are left out
sf_clustering = neigh_price_df[['$', '$$', '$$$', '$$$$']]

# run k-means clustering; k-means starts from random centroids, so a fixed
# random_state (with an explicit n_init) keeps the cluster labels the same on
# every run instead of reshuffling them each time the cell executes
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(sf_clustering)

# check cluster labels generated for the first ten rows in the dataframe
kmeans.labels_[0:10]

array([1, 3, 0, 0, 1, 0, 3, 1, 3, 3])

In [137]:
# Attach each neighborhood's cluster label as the last column. Plain column
# assignment is used instead of DataFrame.insert so the cell can be re-run:
# insert raises ValueError once the column exists, assignment overwrites it.
neigh_price_df['Cluster Label'] = kmeans.labels_

In [138]:
# Display the table again, now including the 'Cluster Label' column.
neigh_price_df

Unnamed: 0_level_0,Latitude,Longitude,$,$$,$$$,$$$$,Cluster Label
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Bayview,37.728889,-122.3925,0.75,0.2,0.0,0.05,1
Bernal Heights,37.741001,-122.414214,0.5,0.392857,0.107143,0.0,3
Castro,37.760856,-122.434957,0.40678,0.491525,0.084746,0.016949,0
Chinatown,37.794301,-122.406376,0.404959,0.404959,0.140496,0.049587,0
Crocker Amazon,37.709378,-122.438587,1.0,0.0,0.0,0.0,1
Diamond Heights,37.745764,-122.441638,0.375,0.5,0.125,0.0,0
Civic Center,37.779594,-122.416794,0.488889,0.444444,0.066667,0.0,3
Excelsior,37.721794,-122.435382,0.795455,0.181818,0.022727,0.0,1
Financial District,37.793647,-122.398938,0.473684,0.347368,0.147368,0.031579,3
Glen Park,37.733104,-122.433805,0.6,0.4,0.0,0.0,3


In [155]:
# Base map for the clustered view, centered like the earlier map.
map_clusters = folium.Map(location=[sf_lat, sf_lon], zoom_start=12)

# Build one rainbow color per cluster as hex strings.
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = list(map(colors.rgb2hex, colors_array))

# Place one colored circle per neighborhood.
markers_colors = []
for poi, row in neigh_price_df.iterrows():
    cluster = int(row['Cluster Label'])
    # Index is cluster - 1, so cluster 0 wraps around to the last color.
    marker_color = rainbow[cluster - 1]
    popup_text = '{} Cluster {}'.format(poi, cluster)

    folium.CircleMarker(
        [row['Latitude'], row['Longitude']],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
    ).add_to(map_clusters)

map_clusters

Cluster 0: Red

Cluster 1: Purple

Cluster 2: Blue

Cluster 3: Green

In [156]:
# Average every numeric column per cluster. The frame is grouped directly, so
# the neighborhood names stay in the index instead of becoming a text column,
# and numeric_only=True keeps any non-numeric column out of the mean (recent
# pandas raises TypeError rather than silently dropping such columns).
neigh_price_df.groupby('Cluster Label').mean(numeric_only=True)

Unnamed: 0_level_0,Latitude,Longitude,$,$$,$$$,$$$$
Cluster Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,37.775236,-122.429272,0.354482,0.483892,0.13847,0.023156
1,37.742682,-122.432516,0.825421,0.166498,0.002525,0.005556
2,37.788541,-122.486916,0.0,0.0,0.0,0.0
3,37.763964,-122.443504,0.534154,0.419504,0.032182,0.01416
