# **Capstone Project (Optimal Restaurant Location in Manhattan)**

## **Introduction / Busniess Problem**

Import numpy, pandas, json, geopy, matplotlib, and folium

In [92]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


## **Data Extraction / Data Preparation**

We will utilize the new york data given in the labs to construct our data set. 

In [93]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [94]:
neighborhoods_data = newyork_data['features']

In [95]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
ny_neighborhoods = pd.DataFrame(columns=column_names)

In [96]:
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    ny_neighborhoods = ny_neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [97]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

In [98]:
manhattan_data = ny_neighborhoods[ny_neighborhoods['Borough'] == 'Manhattan'].reset_index(drop=True)

In [99]:
# create map of New York using latitude and longitude values
map_manhattan = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(manhattan_data['Latitude'], manhattan_data['Longitude'], manhattan_data['Borough'], manhattan_data['Neighborhood']):
    label = '{}, {}'.format(map_manhattan, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_manhattan)  
    
map_manhattan

In [100]:
CLIENT_ID = 'V1YPB05DJJGQD12QBQNUQUAIKUJZL3NUZIHOIG4GTPLCTY0I' # your Foursquare ID
CLIENT_SECRET = 'OQX0ME53ARGSBG45YSRCZTHBX04FYGDIKIJ2L04XZ2LOYJ3L' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 200 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

In [101]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [102]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [103]:
ny_venues = getNearbyVenues(names=manhattan_data['Neighborhood'],
                                   latitudes=manhattan_data['Latitude'],
                                   longitudes=manhattan_data['Longitude']
                                  )

Marble Hill
Chinatown
Washington Heights
Inwood
Hamilton Heights
Manhattanville
Central Harlem
East Harlem
Upper East Side
Yorkville
Lenox Hill
Roosevelt Island
Upper West Side
Lincoln Square
Clinton
Midtown
Murray Hill
Chelsea
Greenwich Village
East Village
Lower East Side
Tribeca
Little Italy
Soho
West Village
Manhattan Valley
Morningside Heights
Gramercy
Battery Park City
Financial District
Carnegie Hill
Noho
Civic Center
Midtown South
Sutton Place
Turtle Bay
Tudor City
Stuyvesant Town
Flatiron
Hudson Yards


In [104]:
print(ny_venues.shape)
ny_venues.head()

(3309, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Marble Hill,40.876551,-73.91066,Arturo's,40.874412,-73.910271,Pizza Place
1,Marble Hill,40.876551,-73.91066,Bikram Yoga,40.876844,-73.906204,Yoga Studio
2,Marble Hill,40.876551,-73.91066,Tibbett Diner,40.880404,-73.908937,Diner
3,Marble Hill,40.876551,-73.91066,Dunkin',40.877136,-73.906666,Donut Shop
4,Marble Hill,40.876551,-73.91066,Starbucks,40.877531,-73.905582,Coffee Shop


Notice that some neighborhoods have less venues and some have greater number of venues.

In [105]:
ny_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Battery Park City,93,93,93,93,93,93
Carnegie Hill,100,100,100,100,100,100
Central Harlem,43,43,43,43,43,43
Chelsea,100,100,100,100,100,100
Chinatown,100,100,100,100,100,100
Civic Center,100,100,100,100,100,100
Clinton,100,100,100,100,100,100
East Harlem,43,43,43,43,43,43
East Village,100,100,100,100,100,100
Financial District,100,100,100,100,100,100


In [106]:
ny_venues.groupby('Venue Category').count()

Unnamed: 0_level_0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude
Venue Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Accessories Store,7,7,7,7,7,7
Adult Boutique,2,2,2,2,2,2
Afghan Restaurant,1,1,1,1,1,1
African Restaurant,3,3,3,3,3,3
American Restaurant,83,83,83,83,83,83
Antique Shop,1,1,1,1,1,1
Arcade,1,1,1,1,1,1
Arepa Restaurant,1,1,1,1,1,1
Argentinian Restaurant,4,4,4,4,4,4
Art Gallery,28,28,28,28,28,28


In [107]:
print('There are {} uniques venues.'.format(len(ny_venues['Venue Category'].unique())))

There are 343 uniques venues.


The following map shows all the venues in Manhattan given by Foursquare Query

In [108]:
# create map of New York using latitude and longitude values
map_venue = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood in zip(ny_venues['Venue Latitude'], ny_venues['Venue Longitude'], ny_venues['Neighborhood']):
    label = '{}'.format(map_venue)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_venue)  
    
map_venue

In [109]:
ny_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Marble Hill,40.876551,-73.91066,Arturo's,40.874412,-73.910271,Pizza Place
1,Marble Hill,40.876551,-73.91066,Bikram Yoga,40.876844,-73.906204,Yoga Studio
2,Marble Hill,40.876551,-73.91066,Tibbett Diner,40.880404,-73.908937,Diner
3,Marble Hill,40.876551,-73.91066,Dunkin',40.877136,-73.906666,Donut Shop
4,Marble Hill,40.876551,-73.91066,Starbucks,40.877531,-73.905582,Coffee Shop


## Data Preperation

Since the problem is to provide a recommedation of a particualar location, we will have the features being the venues type and the target varaible being the longitude and latitude location.

In [110]:
features = ny_venues[['Venue Longitude',  'Venue Longitude']]
target = ny_venues[{'Venue Category'}]

In [None]:
print(len(target[target['Venue Category'].str.contains('Restaurant')]))
print(len(target))

946
3309


Label all the venues that are restaurants as "Yes".

In [None]:
target[target['Venue Category'].str.contains('Restaurant')] = 'Yes'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.loc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_array(key, value)


Label the rest of the venues (i.e. the non-restaurants) as "No"

In [None]:
target[~target['Venue Category'].str.contains('Yes')] = 'No'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Create a map that distinguishes the restaurants from the non restaurants.

In [None]:
# create map of New York using latitude and longitude values
map_res = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, tar in zip(ny_venues['Venue Latitude'], ny_venues['Venue Longitude'], target['Venue Category']):
    label = '{}'.format(map_res)
    label = folium.Popup(label, parse_html=True)
    if(tar == 'Yes'):
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color='red',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False).add_to(map_res)
    else:
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False).add_to(map_res)
        
        
map_res

In [None]:
len(target[target['Venue Category'] == 'Yes'])

## **Model Evaluation**

### KNN (K Nearest Neighbors)

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import sys
X = preprocessing.StandardScaler().fit(features).transform(features)
y = target.values.ravel()
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [None]:
Ks = 20
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
for n in range(1,Ks):
    
    #Train Model and Predict  
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train,y_train)
    yhat=neigh.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)

    
#     std_acc[n-1]=np.std(yhat==y_test)/np.sqrt(yhat.shape[0])
neigh = KNeighborsClassifier(n_neighbors = mean_acc.argmax()+1).fit(X_train,y_train)
mean_acc

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# tree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
Ks = 10
mean_acc = np.zeros((Ks-1))
for n in range(1,Ks):
    
    #Train Model and Predict  
    tree = DecisionTreeClassifier(criterion="entropy", max_depth = Ks)
    tree.fit(X_train,y_train)
    predict = tree.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, predict)

mean_acc

In [None]:
print( "The best accuracy was with", mean_acc.max(), "with k=", mean_acc.argmax()+1, "(max_depth)") 
treec = DecisionTreeClassifier(criterion="entropy", max_depth = mean_acc.argmax()+1)
treec.fit(X_train,y_train)

### SVM

In [None]:
from sklearn import svm
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)

In [None]:
yhat_SVM = clf.predict(X_test)
print("The best accuracy was with", metrics.accuracy_score(y_test, yhat_SVM))

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,y_train)

In [None]:
yhat_LR = LR.predict(X_test)
print("The best accuracy was with", metrics.accuracy_score(y_test, yhat_LR))

### Model Evalution

In [None]:
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

In [None]:
columns = ['Algorithm','Jaccard', 'F1-score', 'LogLoss']
dfDisp = pd.DataFrame(columns=columns)
## KNN row 
y_KNN_eval = neigh.predict(X_test)
f = f1_score(y_test, y_KNN_eval, average='weighted') 
ja = jaccard_similarity_score(y_test, y_KNN_eval)

dfDisp = dfDisp.append({'Algorithm': 'KNN', 'Jaccard': str(ja), 'F1-score': str(f), 'LogLoss' : 'NA'}, ignore_index=True)
## Descision Tree
y_DT_eval = treec.predict(X_test)
f = f1_score(y_test, y_DT_eval, average='weighted') 
ja = jaccard_similarity_score(y_test, y_DT_eval)

dfDisp = dfDisp.append({'Algorithm': 'Decision Tree', 'Jaccard': str(ja), 'F1-score': str(f), 'LogLoss' : 'NA'}, ignore_index=True)
## SVM
y_SVM_eval = clf.predict(X_test)
f = f1_score(y_test, y_SVM_eval, average='weighted') 
ja = jaccard_similarity_score(y_test, y_SVM_eval)

dfDisp = dfDisp.append({'Algorithm': 'SVM', 'Jaccard': str(ja), 'F1-score': str(f), 'LogLoss' : 'NA'}, ignore_index=True)
## Logistic classification 
y_LC_eval = LR.predict(X_test)
f = f1_score(y_test, y_LC_eval, average='weighted') 
ja = jaccard_similarity_score(y_test, y_LC_eval)
ll = log_loss(y_test, LR.predict_proba(X_test))

dfDisp = dfDisp.append({'Algorithm': 'LogisticRegression', 'Jaccard': str(ja), 'F1-score': str(f), 'LogLoss' : str(ll)}, ignore_index=True)

dfDisp