# Applied Data Science Capstone: Open a restaurant in Manhattan

### Import useful libraries

In [2]:
pip install sodapy

Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


### Explore neighborhood location data from **Neighborhood Names GIS dataset**

In [4]:
import pandas as pd
from sodapy import Socrata

# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("data.cityofnewyork.us", None)

# Example authenticated client (needed for non-public datasets):
# client = Socrata(data.cityofnewyork.us,
#                  MyAppToken,
#                  userame="user@example.com",
#                  password="AFakePassword")

# First 2000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get("xyye-rtrs", limit=2000)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)



In [5]:
man_df = results_df.set_index('borough')
man_df = man_df.loc[['Manhattan']]
man_df.head()

Unnamed: 0_level_0,the_geom,objectid,name,stacked,annoline1,annoline2,annoline3,annoangle
borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Manhattan,"{'type': 'Point', 'coordinates': [-73.91065965...",7,Marble Hill,2,Marble,Hill,,0.0
Manhattan,"{'type': 'Point', 'coordinates': [-73.99427936...",103,Chinatown,1,Chinatown,,,0.0
Manhattan,"{'type': 'Point', 'coordinates': [-73.93690027...",104,Washington Heights,2,Washington,Heights,,0.0
Manhattan,"{'type': 'Point', 'coordinates': [-73.92121042...",105,Inwood,1,Inwood,,,0.0
Manhattan,"{'type': 'Point', 'coordinates': [-73.94968791...",106,Hamilton Heights,2,Hamilton,Heights,,0.0


In [6]:
latitude = []
longitude = []
neighborhood = []

for x in range(0,39):
    lat = man_df['the_geom'][x]['coordinates'][1]
    long = man_df['the_geom'][x]['coordinates'][0]
    neigh = man_df['name'][x]
    latitude.append(lat)
    longitude.append(long)
    neighborhood.append(neigh)
    
man_neigh = pd.DataFrame(list(zip(neighborhood,latitude,longitude)),columns=['Neighborhood','Latitude','Longitude'])
print(man_neigh.shape)
man_neigh.head()

(39, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Marble Hill,40.876551,-73.91066
1,Chinatown,40.715618,-73.994279
2,Washington Heights,40.851903,-73.9369
3,Inwood,40.867684,-73.92121
4,Hamilton Heights,40.823604,-73.949688


### Explore neighborhood food venues information from **Foursquare Location Data**

In [7]:
import config1 as cfg
import requests

#### Explore venue data

In [8]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            cfg.credentials['CLIENT_ID'], 
            cfg.credentials['CLIENT_SECRET'],  
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [9]:
VERSION = '20200101'
LIMIT = 500

manhattan_venues = getNearbyVenues(names=man_neigh['Neighborhood'],
                                   latitudes=man_neigh['Latitude'],
                                   longitudes=man_neigh['Longitude']
                                  )
manhattan_venues.shape

Marble Hill
Chinatown
Washington Heights
Inwood
Hamilton Heights
Manhattanville
Central Harlem
East Harlem
Upper East Side
Yorkville
Lenox Hill
Roosevelt Island
Upper West Side
Lincoln Square
Clinton
Midtown
Murray Hill
Chelsea
Greenwich Village
East Village
Lower East Side
Tribeca
Little Italy
Soho
West Village
Manhattan Valley
Morningside Heights
Gramercy
Battery Park City
Financial District
Carnegie Hill
Noho
Civic Center
Midtown South
Sutton Place
Turtle Bay
Tudor City
Stuyvesant Town
Flatiron


(3084, 7)

In [10]:
manhattan_venues.groupby('Neighborhood').count().sort_values(by=['Venue'],ascending=False)

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Yorkville,100,100,100,100,100,100
Flatiron,100,100,100,100,100,100
Sutton Place,100,100,100,100,100,100
Midtown South,100,100,100,100,100,100
Midtown,100,100,100,100,100,100
Turtle Bay,100,100,100,100,100,100
Little Italy,100,100,100,100,100,100
Lenox Hill,100,100,100,100,100,100
Soho,100,100,100,100,100,100
Greenwich Village,100,100,100,100,100,100


In [11]:
print('There are {} uniques categories.'.format(len(manhattan_venues['Venue Category'].unique())))

There are 327 uniques categories.


#### Explore category data

In [12]:
radius = 500
VERSION = '20200101'

url1 = 'https://api.foursquare.com/v2/venues/categories?&client_id={}&client_secret={}&v={}'.format(
            cfg.credentials['CLIENT_ID'], 
            cfg.credentials['CLIENT_SECRET'],  
            VERSION
            )
            
        # make the GET request

results1 = requests.get(url1).json()["response"]['categories']

In [13]:
# Filter venues of category 'Food'

tlist = {}

for x in range(0,len(results1)-1):
    title = results1[x]['name']
    tlist[title] = {}

    for y in range(0,len(results1[x]['categories'])-1):
        cat = results1[x]['categories'][y]['name']
        tlist[title][y+1] = cat

food_venue_list = list(tlist['Food'].values())
print(len(food_venue_list))
food_venue_list


91


['Afghan Restaurant',
 'African Restaurant',
 'American Restaurant',
 'Asian Restaurant',
 'Australian Restaurant',
 'Austrian Restaurant',
 'BBQ Joint',
 'Bagel Shop',
 'Bakery',
 'Bangladeshi Restaurant',
 'Belgian Restaurant',
 'Bistro',
 'Breakfast Spot',
 'Bubble Tea Shop',
 'Buffet',
 'Burger Joint',
 'Cafeteria',
 'Café',
 'Cajun / Creole Restaurant',
 'Caribbean Restaurant',
 'Caucasian Restaurant',
 'Coffee Shop',
 'Comfort Food Restaurant',
 'Creperie',
 'Czech Restaurant',
 'Deli / Bodega',
 'Dessert Shop',
 'Diner',
 'Donut Shop',
 'Dumpling Restaurant',
 'Dutch Restaurant',
 'Eastern European Restaurant',
 'English Restaurant',
 'Falafel Restaurant',
 'Fast Food Restaurant',
 'Fish & Chips Shop',
 'Fondue Restaurant',
 'Food Court',
 'Food Stand',
 'Food Truck',
 'French Restaurant',
 'Fried Chicken Joint',
 'Friterie',
 'Gastropub',
 'German Restaurant',
 'Gluten-free Restaurant',
 'Greek Restaurant',
 'Halal Restaurant',
 'Hawaiian Restaurant',
 'Hot Dog Joint',
 'Hungar

In [282]:
manhattan_food_venues = manhattan_venues.loc[manhattan_venues['Venue Category'].isin(food_venue_list)]
print(manhattan_food_venues.shape)
manhattan_food_venues = manhattan_food_venues.reset_index(drop=True)
manhattan_food_venues.head()

(1260, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Marble Hill,40.876551,-73.91066,Arturo's,40.874412,-73.910271,Pizza Place
1,Marble Hill,40.876551,-73.91066,Tibbett Diner,40.880404,-73.908937,Diner
2,Marble Hill,40.876551,-73.91066,Starbucks,40.877531,-73.905582,Coffee Shop
3,Marble Hill,40.876551,-73.91066,Dunkin',40.877136,-73.906666,Donut Shop
4,Marble Hill,40.876551,-73.91066,Starbucks,40.873755,-73.908613,Coffee Shop


In [280]:
not_in_man = set(food_venue_list) - set(manhattan_food_venues['Venue Category'].unique())
not_in_man

{'Bangladeshi Restaurant',
 'Belgian Restaurant',
 'Buffet',
 'Cajun / Creole Restaurant',
 'Dutch Restaurant',
 'Fish & Chips Shop',
 'Fondue Restaurant',
 'Friterie',
 'Halal Restaurant',
 'Hungarian Restaurant',
 'Mac & Cheese Joint',
 'Pakistani Restaurant',
 'Polish Restaurant',
 'Portuguese Restaurant',
 'Poutine Place',
 'Russian Restaurant',
 'Scottish Restaurant',
 'Slovak Restaurant',
 'Sri Lankan Restaurant',
 'Truck Stop',
 'Ukrainian Restaurant'}

#### Look into distribution of different types of food venues

In [15]:
# one hot encoding
manhattan_onehot = pd.get_dummies(manhattan_food_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
manhattan_onehot['Neighborhood'] = manhattan_food_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [manhattan_onehot.columns[-1]] + list(manhattan_onehot.columns[:-1])
manhattan_onehot = manhattan_onehot[fixed_columns]

print(manhattan_onehot.shape)
manhattan_onehot.head()

(1260, 71)


Unnamed: 0,Neighborhood,Afghan Restaurant,African Restaurant,American Restaurant,Asian Restaurant,Australian Restaurant,Austrian Restaurant,BBQ Joint,Bagel Shop,Bakery,Bistro,Breakfast Spot,Bubble Tea Shop,Burger Joint,Cafeteria,Café,Caribbean Restaurant,Caucasian Restaurant,Coffee Shop,Comfort Food Restaurant,Creperie,Czech Restaurant,Deli / Bodega,Dessert Shop,Diner,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,English Restaurant,Falafel Restaurant,Fast Food Restaurant,Food Court,Food Stand,Food Truck,French Restaurant,Fried Chicken Joint,Gastropub,German Restaurant,Gluten-free Restaurant,Greek Restaurant,Hawaiian Restaurant,Hot Dog Joint,Indian Restaurant,Irish Pub,Italian Restaurant,Jewish Restaurant,Juice Bar,Kebab Restaurant,Latin American Restaurant,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Modern European Restaurant,Molecular Gastronomy Restaurant,Pet Café,Pizza Place,Restaurant,Salad Place,Sandwich Place,Scandinavian Restaurant,Seafood Restaurant,Snack Place,Soup Place,Southern / Soul Food Restaurant,Spanish Restaurant,Steakhouse,Swiss Restaurant,Tea Room,Theme Restaurant,Turkish Restaurant,Vegetarian / Vegan Restaurant
0,Marble Hill,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Marble Hill,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Marble Hill,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Marble Hill,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Marble Hill,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [283]:
manhattan_grouped = manhattan_onehot.groupby('Neighborhood').mean().reset_index()
print(manhattan_grouped.shape)
manhattan_grouped.head()

(39, 71)


Unnamed: 0,Neighborhood,Afghan Restaurant,African Restaurant,American Restaurant,Asian Restaurant,Australian Restaurant,Austrian Restaurant,BBQ Joint,Bagel Shop,Bakery,Bistro,Breakfast Spot,Bubble Tea Shop,Burger Joint,Cafeteria,Café,Caribbean Restaurant,Caucasian Restaurant,Coffee Shop,Comfort Food Restaurant,Creperie,Czech Restaurant,Deli / Bodega,Dessert Shop,Diner,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,English Restaurant,Falafel Restaurant,Fast Food Restaurant,Food Court,Food Stand,Food Truck,French Restaurant,Fried Chicken Joint,Gastropub,German Restaurant,Gluten-free Restaurant,Greek Restaurant,Hawaiian Restaurant,Hot Dog Joint,Indian Restaurant,Irish Pub,Italian Restaurant,Jewish Restaurant,Juice Bar,Kebab Restaurant,Latin American Restaurant,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Modern European Restaurant,Molecular Gastronomy Restaurant,Pet Café,Pizza Place,Restaurant,Salad Place,Sandwich Place,Scandinavian Restaurant,Seafood Restaurant,Snack Place,Soup Place,Southern / Soul Food Restaurant,Spanish Restaurant,Steakhouse,Swiss Restaurant,Tea Room,Theme Restaurant,Turkish Restaurant,Vegetarian / Vegan Restaurant
0,Battery Park City,0.0,0.0,0.058824,0.0,0.0,0.0,0.058824,0.0,0.0,0.058824,0.0,0.0,0.117647,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.058824,0.058824,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0
1,Carnegie Hill,0.0,0.0,0.03125,0.0,0.0,0.0,0.0,0.03125,0.0625,0.0,0.0,0.0,0.03125,0.0,0.125,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.03125,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.03125,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.0,0.0625,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125
2,Central Harlem,0.0,0.136364,0.090909,0.0,0.0,0.0,0.045455,0.045455,0.0,0.0,0.0,0.0,0.0,0.045455,0.045455,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.090909,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.045455,0.0,0.0,0.0,0.090909,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Chelsea,0.0,0.0,0.117647,0.0,0.0,0.0,0.0,0.029412,0.088235,0.0,0.0,0.0,0.029412,0.0,0.088235,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.088235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.088235,0.0,0.0,0.0,0.0,0.029412,0.029412,0.029412,0.0,0.0,0.0,0.029412,0.029412,0.0,0.029412,0.0,0.029412,0.0,0.0,0.0,0.0,0.029412,0.0,0.0,0.0,0.0,0.0
4,Chinatown,0.0,0.0,0.088235,0.058824,0.0,0.029412,0.0,0.0,0.176471,0.0,0.0,0.088235,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.117647,0.029412,0.0,0.0,0.0,0.029412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.029412,0.0,0.0,0.058824,0.0,0.0,0.029412,0.0,0.0,0.029412,0.0,0.0,0.029412,0.0,0.0,0.029412


### Look at the most common venue categories in each neighborhood

In [17]:
num_top_venues = 5

for hood in manhattan_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = manhattan_grouped[manhattan_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Battery Park City----
                 venue  freq
0          Coffee Shop  0.24
1         Burger Joint  0.12
2           Food Court  0.12
3  American Restaurant  0.06
4          Pizza Place  0.06


----Carnegie Hill----
                venue  freq
0         Coffee Shop  0.25
1                Café  0.12
2  Italian Restaurant  0.12
3   French Restaurant  0.06
4         Pizza Place  0.06


----Central Harlem----
                 venue  freq
0   African Restaurant  0.14
1  Fried Chicken Joint  0.09
2    French Restaurant  0.09
3   Seafood Restaurant  0.09
4  American Restaurant  0.09


----Chelsea----
                 venue  freq
0          Coffee Shop  0.24
1  American Restaurant  0.12
2                 Café  0.09
3   Italian Restaurant  0.09
4    French Restaurant  0.09


----Chinatown----
                 venue  freq
0               Bakery  0.18
1         Dessert Shop  0.12
2  American Restaurant  0.09
3      Bubble Tea Shop  0.09
4          Coffee Shop  0.06


----Civic Center----


In [18]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [290]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = manhattan_grouped['Neighborhood']

for ind in np.arange(manhattan_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(manhattan_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Battery Park City,Coffee Shop,Burger Joint,Food Court,American Restaurant,Bistro,Mediterranean Restaurant,Mexican Restaurant,Italian Restaurant,Pizza Place,Sandwich Place
1,Carnegie Hill,Coffee Shop,Italian Restaurant,Café,French Restaurant,Bakery,Pizza Place,Vegetarian / Vegan Restaurant,Burger Joint,Food Truck,Hot Dog Joint
2,Central Harlem,African Restaurant,Fried Chicken Joint,American Restaurant,French Restaurant,Seafood Restaurant,Restaurant,Dessert Shop,Juice Bar,Caribbean Restaurant,Cafeteria
3,Chelsea,Coffee Shop,American Restaurant,Café,French Restaurant,Italian Restaurant,Bakery,Sandwich Place,Mediterranean Restaurant,Middle Eastern Restaurant,Burger Joint
4,Chinatown,Bakery,Dessert Shop,American Restaurant,Bubble Tea Shop,Sandwich Place,Mexican Restaurant,Greek Restaurant,Coffee Shop,Asian Restaurant,Austrian Restaurant


In [260]:
n_cat = [[]]

for n in manhattan_food_venues['Neighborhood'].unique():
    cat = manhattan_food_venues[manhattan_food_venues['Neighborhood']== n].groupby('Venue Category').count()
    n_cat.append([n,cat['Venue']])

n_cat[1]

['Marble Hill',
 Venue Category
 American Restaurant    1
 Coffee Shop            2
 Deli / Bodega          1
 Diner                  1
 Donut Shop             1
 Pizza Place            1
 Sandwich Place         3
 Seafood Restaurant     1
 Steakhouse             1
 Name: Venue, dtype: int64]

In [257]:
df = {}

for n in range(1,len(n_cat)-1):
    neigh_name = n_cat[n][0]
    col = list(n_cat[n][1].keys())
    num = list(n_cat[n][1].values)
    
    what = pd.DataFrame([col,num]).T
    what.columns=('Venue Category','Count')
    what['Percentage']=(what['Count']/what['Count'].sum())*100
    what.set_index('Venue Category',inplace=True)
    
    df[neigh_name] = what

In [258]:
df['Marble Hill'].loc[['Diner'],'Percentage'][0]

8.333333333333332

In [281]:
df['Marble Hill']

Unnamed: 0_level_0,Count,Percentage
Venue Category,Unnamed: 1_level_1,Unnamed: 2_level_1
American Restaurant,1,8.33333
Coffee Shop,2,16.6667
Deli / Bodega,1,8.33333
Diner,1,8.33333
Donut Shop,1,8.33333
Pizza Place,1,8.33333
Sandwich Place,3,25.0
Seafood Restaurant,1,8.33333
Steakhouse,1,8.33333


### Cluster Neighborhoods

In [288]:
# set number of clusters
kclusters = 5

manhattan_grouped_clustering = manhattan_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(manhattan_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 3, 1, 3, 1, 3, 0, 1, 1, 0], dtype=int32)

In [291]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

manhattan_merged = man_neigh

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
manhattan_merged = manhattan_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

manhattan_merged.head() # check the last columns!

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Marble Hill,40.876551,-73.91066,4,Sandwich Place,Coffee Shop,Donut Shop,Deli / Bodega,Pizza Place,Diner,Seafood Restaurant,Steakhouse,American Restaurant,English Restaurant
1,Chinatown,40.715618,-73.994279,1,Bakery,Dessert Shop,American Restaurant,Bubble Tea Shop,Sandwich Place,Mexican Restaurant,Greek Restaurant,Coffee Shop,Asian Restaurant,Austrian Restaurant
2,Washington Heights,40.851903,-73.9369,1,Café,Bakery,Mexican Restaurant,Donut Shop,Latin American Restaurant,Coffee Shop,Sandwich Place,Deli / Bodega,Italian Restaurant,Spanish Restaurant
3,Inwood,40.867684,-73.92121,1,Mexican Restaurant,Café,Restaurant,American Restaurant,Deli / Bodega,Spanish Restaurant,Caribbean Restaurant,Bakery,Pizza Place,Diner
4,Hamilton Heights,40.823604,-73.949688,1,Pizza Place,Coffee Shop,Mexican Restaurant,Café,Deli / Bodega,Indian Restaurant,Bakery,Sandwich Place,Caribbean Restaurant,Food Truck


In [408]:
# create map
map_clusters = folium.Map(location=[40.78343, -73.96625], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(manhattan_merged['Latitude'], manhattan_merged['Longitude'], manhattan_merged['Neighborhood'], manhattan_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [327]:
clus0 = list(manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 0]['Neighborhood'])
clus0

['Upper East Side',
 'Yorkville',
 'Lenox Hill',
 'Upper West Side',
 'Lincoln Square',
 'Clinton',
 'Greenwich Village',
 'Tribeca',
 'Soho',
 'West Village',
 'Gramercy',
 'Financial District',
 'Noho',
 'Sutton Place',
 'Turtle Bay']

In [326]:
clus1 = list(manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 1]['Neighborhood'])
clus1

['Chinatown',
 'Washington Heights',
 'Inwood',
 'Hamilton Heights',
 'Manhattanville',
 'Central Harlem',
 'East Harlem',
 'Murray Hill',
 'East Village',
 'Lower East Side',
 'Little Italy',
 'Morningside Heights',
 'Midtown South',
 'Tudor City',
 'Flatiron']

In [325]:
clus2 = list(manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 2]['Neighborhood'])
clus2

['Stuyvesant Town']

In [324]:
clus3 = list(manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 3]['Neighborhood'])
clus3

['Midtown',
 'Chelsea',
 'Manhattan Valley',
 'Battery Park City',
 'Carnegie Hill',
 'Civic Center']

In [328]:
clus4 = list(manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 4]['Neighborhood'])
clus4

['Marble Hill', 'Roosevelt Island']

### Look at food venues with exactly 3 branches

In [399]:
man_branch = manhattan_food_venues.groupby('Venue').count()
brch_venues = list(man_branch[man_branch['Neighborhood']==3]['Neighborhood'].keys())

In [406]:
for client in brch_venues:
    venue_location = manhattan_food_venues[manhattan_food_venues['Venue']==client]
    venue_neigh = list(venue_location['Neighborhood'])

    clus0_count = len(set(venue_neigh) & set(clus0))
    clus1_count = len(set(venue_neigh) & set(clus1))
    clus2_count = len(set(venue_neigh) & set(clus2))
    clus3_count = len(set(venue_neigh) & set(clus3))
    clus4_count = len(set(venue_neigh) & set(clus4))
    
    max_count = [clus0_count,clus1_count,clus2_count,clus3_count,clus4_count]
    all_neigh = '{0}, {1}, {2}.'.format(venue_neigh[0],venue_neigh[1],venue_neigh[2])
    print(client,'has branches in',all_neigh)
    print(max(max_count)/len(venue_neigh)*100,'% of ',client,"'s stores are in neighborhood in cluster ",max_count.index(max(max_count)),'.')
    print('-------------------------------------------------')

Buttercup Bake Shop has branches in Murray Hill, Midtown South, Turtle Bay.
66.66666666666666 % of  Buttercup Bake Shop 's stores are in neighborhood in cluster  1 .
-------------------------------------------------
Fields Good Chicken has branches in Murray Hill, Financial District, Midtown South.
66.66666666666666 % of  Fields Good Chicken 's stores are in neighborhood in cluster  1 .
-------------------------------------------------
Gotan has branches in Midtown, Tribeca, Civic Center.
66.66666666666666 % of  Gotan 's stores are in neighborhood in cluster  3 .
-------------------------------------------------
Irving Farm Coffee Roasters has branches in Upper West Side, Gramercy, Flatiron.
66.66666666666666 % of  Irving Farm Coffee Roasters 's stores are in neighborhood in cluster  0 .
-------------------------------------------------
JOE & THE JUICE has branches in Upper East Side, Murray Hill, Financial District.
66.66666666666666 % of  JOE & THE JUICE 's stores are in neighborhood