In [111]:
# standard library
import json
import os
import warnings

# data analysis libraries
import pandas as pd
import numpy as np

# website scraping library
import requests
from bs4 import BeautifulSoup

# lat and long library
from uszipcode import SearchEngine

# mapping libraries
import folium
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors

# clustering libraries
from sklearn.cluster import KMeans

In [79]:
## get SF neighborhoods and zip codes
# Download the page that lists SF zip codes with their neighborhoods.
response = requests.get("http://www.healthysf.org/bdi/outcomes/zipmap.htm", timeout=30)
# Stop here on an HTTP error instead of parsing an error page further down.
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")
table = soup.find_all("table")
# read_html returns one DataFrame per <table> element found on the page;
# the table at position 4 is the one used for the rest of the notebook.
html_tables = pd.read_html(str(table))
df = pd.DataFrame(html_tables[4])

In [80]:
# Promote the first row of df to the column headers, then drop that header
# row and the last row. The last row is addressed relative to the end of
# the frame rather than by a hard-coded position, and the original index
# labels are kept on the remaining rows.
df.columns = df.iloc[0]
df = df.drop(df.index[[0, -1]])

In [81]:
# add lat and long to each neighborhood
search = SearchEngine(simple_zipcode=True)

# Per-row coordinate lists. They deliberately do not use the names
# `latitude` / `longitude`, which a later cell assigns to the city-centre
# floats; sharing the names made the map cells depend on execution order.
zip_latitudes = []
zip_longitudes = []

for zip_code in df["Zip Code"]:
    match = search.by_zipcode(zip_code)
    # Guard against a lookup that yields nothing: record missing coordinates
    # for that row instead of failing on `.to_dict()`.
    zip_info = match.to_dict() if match is not None else {}
    zip_latitudes.append(zip_info.get("lat"))
    zip_longitudes.append(zip_info.get("lng"))

df["Latitude"] = zip_latitudes
df["Longitude"] = zip_longitudes

df

Unnamed: 0,Zip Code,Neighborhood,Population (Census 2000),Latitude,Longitude
1,94102,Hayes Valley/Tenderloin/North of Market,28991,37.78,-122.42
2,94103,South of Market,23016,37.78,-122.41
3,94107,Potrero Hill,17368,37.77,-122.39
4,94108,Chinatown,13716,37.791,-122.409
5,94109,Polk/Russian Hill (Nob Hill),56322,37.79,-122.42
6,94110,Inner Mission/Bernal Heights,74633,37.75,-122.42
7,94112,Ingelside-Excelsior/Crocker-Amazon,73104,37.72,-122.44
8,94114,Castro/Noe Valley,30574,37.76,-122.44
9,94115,Western Addition/Japantown,33115,37.79,-122.44
10,94116,Parkside/Forest Hill,42958,37.74,-122.48


In [82]:
# Manual coordinate overrides for the Outer Richmond and Marina rows,
# keyed by the row's index label in df.
coordinate_overrides = {
    13: (37.781, -122.498),  # Outer Richmond
    15: (37.802, -122.438),  # Marina
}
for row_label, (fixed_lat, fixed_lng) in coordinate_overrides.items():
    df.at[row_label, 'Latitude'] = fixed_lat
    df.at[row_label, 'Longitude'] = fixed_lng

In [86]:
# find lat and long of SF
address = 'San Francisco, CA, USA'

geolocator = Nominatim(user_agent = "san_francisco_explorer")
location = geolocator.geocode(address)
# geocode() gives back None when the address cannot be resolved; raise a
# clear error instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of San Francisco are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of San Francisco are 37.7790262, -122.4199061.


In [88]:
# Base map centred on the coordinates stored in `latitude` / `longitude`.
sf_map = folium.Map(location = [latitude, longitude], zoom_start=12)

# One blue circle marker per row of df; the popup shows the neighborhood name.
marker_rows = zip(df['Latitude'], df['Longitude'], df['Neighborhood'])
for lat, lng, neighborhood in marker_rows:
    marker = folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = folium.Popup('{}'.format(neighborhood), parse_html = True),
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7,
        parse_html = False)
    marker.add_to(sf_map)

sf_map

## Pull Data from Foursquare

In [89]:
## foursquare credentials and version
# The credentials are read from the environment so that secrets are not
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not CLIENT_ID or not CLIENT_SECRET:
    # Warn early; otherwise the problem only shows up as a failed API call.
    warnings.warn(
        'Foursquare credentials are not set; export FOURSQUARE_CLIENT_ID and '
        'FOURSQUARE_CLIENT_SECRET before requesting venues.'
    )

### Getting venues for each SF neighborhood from Foursquare

In [96]:
LIMIT = 200
radius = 500

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect Foursquare "explore" venues around each given location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        One label and one coordinate pair per location; they are consumed
        in parallel with ``zip``.
    radius : number, default 500
        Value sent as the ``radius`` query parameter of the API request.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue was returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of
        # formatting the URL by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        response.raise_for_status()

        # A successful reply without any group is treated as "no venues".
        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # Some venues may carry no category; record None for them
            # instead of failing on the [0] lookup.
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero rows,
    # where assigning `.columns` afterwards raises a length mismatch.
    return pd.DataFrame(venue_rows, columns=venue_columns)

In [98]:
# Request the nearby venues for every neighborhood row in df.
sf_venues = getNearbyVenues(df['Neighborhood'], df['Latitude'], df['Longitude'])

Hayes Valley/Tenderloin/North of Market
South of Market
Potrero Hill
Chinatown
Polk/Russian Hill (Nob Hill)
Inner Mission/Bernal Heights
Ingelside-Excelsior/Crocker-Amazon
Castro/Noe Valley
Western Addition/Japantown
Parkside/Forest Hill
Haight-Ashbury
Inner Richmond
Outer Richmond
Sunset
Marina
Bayview-Hunters Point
St. Francis Wood/Miraloma/West Portal
Twin Peaks-Glen Park
Lake Merced
North Beach/Chinatown
Visitacion Valley/Sunnydale


In [102]:
## Non-null entries per column for each neighborhood, i.e. the number of venues returned for it
venues_per_neighborhood = sf_venues.groupby('Neighborhood').count()
venues_per_neighborhood

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bayview-Hunters Point,7,7,7,7,7,7
Castro/Noe Valley,64,64,64,64,64,64
Chinatown,100,100,100,100,100,100
Haight-Ashbury,31,31,31,31,31,31
Hayes Valley/Tenderloin/North of Market,100,100,100,100,100,100
Ingelside-Excelsior/Crocker-Amazon,38,38,38,38,38,38
Inner Mission/Bernal Heights,63,63,63,63,63,63
Inner Richmond,63,63,63,63,63,63
Lake Merced,20,20,20,20,20,20
Marina,78,78,78,78,78,78


In [104]:
## How many venues fall into each venue category, most frequent first
category_counts = sf_venues['Venue Category'].value_counts()
category_counts

Coffee Shop                   44
Park                          28
Café                          27
Bakery                        27
Pizza Place                   25
Vietnamese Restaurant         24
American Restaurant           24
Chinese Restaurant            23
Mexican Restaurant            23
Thai Restaurant               21
Sandwich Place                21
Sushi Restaurant              20
Bar                           19
Gym / Fitness Center          19
Italian Restaurant            19
Wine Bar                      18
Cocktail Bar                  15
Gym                           15
Deli / Bodega                 15
French Restaurant             14
Yoga Studio                   13
Grocery Store                 12
Japanese Restaurant           12
Diner                         11
Hotel                         11
Food Truck                    11
Clothing Store                11
Bubble Tea Shop               11
Theater                        9
Pharmacy                       9
          

In [105]:
## Count of distinct venue categories (a missing value, if present, counts as one)
sf_venues['Venue Category'].nunique(dropna=False)

221

### Analyze each neighborhood

In [106]:
# One indicator column per venue category, one row per venue.
venue_dummies = pd.get_dummies(sf_venues[['Venue Category']], prefix="", prefix_sep="")

# Put the venue's neighborhood in the first column. An indicator column that
# is itself called 'Neighborhood' is replaced by the neighborhood name.
sf_onehot = venue_dummies.drop(columns='Neighborhood', errors='ignore')
sf_onehot.insert(0, 'Neighborhood', sf_venues['Neighborhood'])
sf_onehot.head()

Unnamed: 0,Neighborhood,ATM,Adult Boutique,Afghan Restaurant,Alternative Healer,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Thrift / Vintage Store,Tiki Bar,Toy / Game Store,Trail,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Hayes Valley/Tenderloin/North of Market,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Hayes Valley/Tenderloin/North of Market,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Hayes Valley/Tenderloin/North of Market,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Hayes Valley/Tenderloin/North of Market,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Hayes Valley/Tenderloin/North of Market,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [107]:
# Average the indicator columns per neighborhood: each value is the share of
# that neighborhood's venues that belong to the category.
sf_g = sf_onehot.groupby('Neighborhood', as_index=False).mean()
sf_g

Unnamed: 0,Neighborhood,ATM,Adult Boutique,Afghan Restaurant,Alternative Healer,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Thrift / Vintage Store,Tiki Bar,Toy / Game Store,Trail,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Bayview-Hunters Point,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Castro/Noe Valley,0.0,0.015625,0.0,0.0,0.015625,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015625,0.0,0.0,0.0,0.03125,0.015625,0.03125
2,Chinatown,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.01
3,Haight-Ashbury,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.032258,0.0,0.0,0.0,0.0,0.032258,0.0,0.032258
4,Hayes Valley/Tenderloin/North of Market,0.0,0.0,0.0,0.0,0.02,0.0,0.01,0.0,0.01,...,0.0,0.01,0.0,0.0,0.0,0.02,0.04,0.02,0.01,0.0
5,Ingelside-Excelsior/Crocker-Amazon,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0
6,Inner Mission/Bernal Heights,0.0,0.0,0.0,0.0,0.0,0.031746,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015873,0.0,0.0,0.0,0.0,0.0,0.015873
7,Inner Richmond,0.015873,0.0,0.0,0.0,0.0,0.015873,0.0,0.0,0.015873,...,0.0,0.0,0.015873,0.0,0.015873,0.0,0.031746,0.015873,0.031746,0.0
8,Lake Merced,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Marina,0.0,0.0,0.0,0.012821,0.038462,0.0,0.0,0.012821,0.0,...,0.012821,0.0,0.0,0.0,0.0,0.0,0.012821,0.038462,0.0,0.012821


In [108]:
# print each neighborhood along with the top 5 most common venues
num_top_venues = 5

# Walk the rows of sf_g directly instead of re-filtering and transposing the
# frame for every neighborhood.
for _, hood_row in sf_g.iterrows():
    print("----"+hood_row['Neighborhood']+"----")
    # Everything after the first field (the neighborhood name) is a
    # category frequency.
    category_freqs = hood_row.iloc[1:]
    # Build the two-column table in one step; assigning into an `.iloc`
    # slice would raise SettingWithCopyWarning.
    freq_table = pd.DataFrame({
        'venue': category_freqs.index,
        'freq': category_freqs.astype(float).round(2).values,
    })
    print(freq_table.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bayview-Hunters Point----
                        venue  freq
0             Motorcycle Shop  0.14
1                 Art Gallery  0.14
2  Construction & Landscaping  0.14
3              Lighting Store  0.14
4                      Bakery  0.14


----Castro/Noe Valley----
            venue  freq
0         Gay Bar  0.08
1            Park  0.05
2  Clothing Store  0.05
3      Playground  0.03
4   Grocery Store  0.03


----Chinatown----
                 venue  freq
0                Hotel  0.07
1          Coffee Shop  0.06
2  American Restaurant  0.04
3         Cocktail Bar  0.04
4          Men's Store  0.03


----Haight-Ashbury----
           venue  freq
0    Coffee Shop  0.13
1  Grocery Store  0.06
2           Park  0.06
3   Tennis Court  0.06
4       Boutique  0.06


----Hayes Valley/Tenderloin/North of Market----
                   venue  freq
0            Coffee Shop  0.05
1  Vietnamese Restaurant  0.04
2           Cocktail Bar  0.04
3               Beer Bar  0.03
4                The

In [109]:
# put into df with top 10 venues for each neighborhood
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of `row`.

    The first entry of `row` is skipped, the remaining values are ordered
    from largest to smallest, and the index labels of the first
    `num_top_venues` of them are returned as a NumPy array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
# The first three ranks take their suffix from `indicators`; every later
# rank gets 'th'. This is an explicit test rather than a bare `except`,
# which would also hide unrelated errors.
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
# Gather the top categories of every row of sf_g first and build the frame
# once, instead of writing into it row by row.
top_venue_rows = [
    return_most_common_venues(sf_g.iloc[ind, :], num_top_venues)
    for ind in range(sf_g.shape[0])
]
hood_venue_sorted = pd.DataFrame(top_venue_rows, columns=columns[1:])
hood_venue_sorted.insert(0, 'Neighborhood', sf_g['Neighborhood'].values)

hood_venue_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bayview-Hunters Point,Building,Construction & Landscaping,Coffee Shop,Bakery,Art Gallery,Motorcycle Shop,Lighting Store,Fast Food Restaurant,Field,Filipino Restaurant
1,Castro/Noe Valley,Gay Bar,Clothing Store,Park,Yoga Studio,Grocery Store,Coffee Shop,Thai Restaurant,Playground,Wine Bar,Pharmacy
2,Chinatown,Hotel,Coffee Shop,American Restaurant,Cocktail Bar,Men's Store,Boutique,Church,Clothing Store,Hotel Bar,Beer Bar
3,Haight-Ashbury,Coffee Shop,Park,Boutique,Tennis Court,Grocery Store,Yoga Studio,Bakery,Burrito Place,Salon / Barbershop,Restaurant
4,Hayes Valley/Tenderloin/North of Market,Coffee Shop,Cocktail Bar,Vietnamese Restaurant,Performing Arts Venue,Theater,Boutique,Beer Bar,Café,French Restaurant,Southern / Soul Food Restaurant


### Cluster Neighborhoods

In [115]:
# set number of clusters
kclusters = 3

# Keep only the numeric feature columns for clustering. `columns=` is used
# because pandas 2.0 no longer accepts the axis as a positional argument.
sf_g_c = sf_g.drop(columns='Neighborhood')

# run k-means clustering
kmeans = KMeans(n_clusters = kclusters, random_state=0).fit(sf_g_c)

# check cluster labels generated for each row in df
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [119]:
# add clustering labels
# Attach the labels to a copy of the top-venue table so the cell can be
# re-run: a 'Cluster Labels' column left over from an earlier run is dropped
# before inserting, and hood_venue_sorted itself is not modified.
hood_clusters = hood_venue_sorted.drop(columns='Cluster Labels', errors='ignore')
hood_clusters.insert(0, 'Cluster Labels', kmeans.labels_)

# join the labelled top-venue table onto df (zip code, population, latitude/longitude)
# using the neighborhood name as the key
sf_merged = df.join(hood_clusters.set_index('Neighborhood'), on='Neighborhood')

sf_merged.head()

Unnamed: 0,Zip Code,Neighborhood,Population (Census 2000),Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,94102,Hayes Valley/Tenderloin/North of Market,28991,37.78,-122.42,0,Coffee Shop,Cocktail Bar,Vietnamese Restaurant,Performing Arts Venue,Theater,Boutique,Beer Bar,Café,French Restaurant,Southern / Soul Food Restaurant
2,94103,South of Market,23016,37.78,-122.41,0,Coffee Shop,Sandwich Place,Theater,American Restaurant,Café,Bakery,Vietnamese Restaurant,Pizza Place,Performing Arts Venue,Marijuana Dispensary
3,94107,Potrero Hill,17368,37.77,-122.39,0,Food Truck,Gym,Coffee Shop,Café,Pharmacy,Pizza Place,Park,Pier,Street Food Gathering,Bank
4,94108,Chinatown,13716,37.791,-122.409,0,Hotel,Coffee Shop,American Restaurant,Cocktail Bar,Men's Store,Boutique,Church,Clothing Store,Hotel Bar,Beer Bar
5,94109,Polk/Russian Hill (Nob Hill),56322,37.79,-122.42,0,Bar,Grocery Store,Sushi Restaurant,Diner,Coffee Shop,Wine Bar,Vietnamese Restaurant,Massage Studio,Mexican Restaurant,Bakery


In [120]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
# Rows without a cluster label (no match in the join) cannot be colored, so
# they are left off the map instead of failing on the color lookup.
clustered = sf_merged.dropna(subset=['Cluster Labels'])
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Neighborhood'], clustered['Cluster Labels']):
    # The label column turns into floats once the join introduces missing
    # values; list indexing needs a plain int.
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # `cluster-1` is kept from the original color assignment: cluster 0 wraps
    # around to the last color, so every cluster still gets its own color.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters