# Project: the secrets behind popular restaurants

### Import libraries

In [213]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import export_graphviz
import pydot

### Load and clean the datasets

In [214]:
# Each data source is a pair of CSV files sharing a prefix:
#   <prefix>_raw_final.csv  - venue attributes
#   <prefix>_distance.csv   - provides the 'distance' column
# NOTE(review): the two files are joined side by side on their row index, so
# they are assumed to list the same venues in the same order - confirm.
suffix = '_raw_final.csv'
suffix_2 = '_distance.csv'
prefix = ['Austin_restaurant','Houston_grill','Houston_restaurant','CS_grill']

# Columns kept from every raw source before cleaning
raw_columns = ['name', 'verified','location.city','delivery.provider.name','photos.count','tips.count','listed.count','rating','price.tier','distance']


def load_source(pre, columns):
    """Read one <prefix> pair of CSV files and return the selected columns.

    Parameters
    ----------
    pre : str
        File-name prefix, e.g. 'CS_restaurant'.
    columns : list of str
        Columns to keep after the 'distance' column has been attached.

    Returns
    -------
    pandas.DataFrame
        Raw attributes plus 'distance', restricted to `columns`.
    """
    attributes = pd.read_csv(pre + suffix)
    distance = pd.read_csv(pre + suffix_2)['distance']
    combined = pd.concat([attributes, distance], axis=1, sort=False)
    return combined.loc[:, columns]


# Collect every source in a list and concatenate once. DataFrame.append was
# removed in pandas 2.0, and a single concat also avoids re-copying the
# accumulated frame on every iteration.
frames = [load_source('CS_restaurant', raw_columns)]
for pre in prefix:
    print(pre + suffix)
    frames.append(load_source(pre, raw_columns))
df_f = pd.concat(frames)

# Rows that agree on every attribute except name/distance are treated as the
# same venue returned by more than one query.
df_dp = df_f.drop_duplicates(subset = ['verified','location.city','delivery.provider.name','photos.count','tips.count','listed.count','rating','price.tier'])

# Keep only venues that have a rating (the prediction target)
df = (
    df_dp[df_dp['rating'].notnull()]
    .rename(columns={'delivery.provider.name': 'delivery','location.city':'city'})
    .reset_index(drop=True)
)

# Binary encodings. Anything other than an explicit False (including a
# missing value) is encoded as verified = 1.
df['verified'] = np.where(df['verified']==False,0,1)
df['delivery'] = np.where(df['delivery']=='grubhub',1,0)
# Missing values default to price tier 1 and to College Station
df['price.tier'] = df['price.tier'].fillna(1)
df['city'] = df['city'].fillna('College Station')

# One indicator column per metro area; neighbouring towns are folded into the
# metro they belong to. Computed from the data itself, so it works for any
# number of rows instead of a hard-coded row count.
city_groups = {
    'Austin': ['Austin', 'Sunset Valley'],
    'Houston': ['Houston', 'Bellaire', 'Pearland'],
    'College Station': ['College Station', 'Bryan'],
}
for metro, members in city_groups.items():
    df[metro] = df['city'].isin(members).astype('int64')

# Final modelling table: the raw city name is replaced by its indicators
filtered_columns = ['name', 'verified','delivery','photos.count','tips.count','listed.count','rating','price.tier','distance','Austin','Houston','College Station']
df_final = df.loc[:, filtered_columns]
df_final


Austin_restaurant_raw_final.csv
Houston_grill_raw_final.csv
Houston_restaurant_raw_final.csv
CS_grill_raw_final.csv


Unnamed: 0,name,verified,delivery,photos.count,tips.count,listed.count,rating,price.tier,distance,Austin,Houston,College Station
0,BJ's Restaurant & Brewhouse,1,1,133,39,25,8.5,2.0,3317,0,0,1
1,Jose's Mexican Restaurant,0,1,8,8,5,7.0,2.0,2472,0,0,1
2,La Riviera Restaurant & Bakery,0,0,9,3,5,6.3,2.0,2688,0,0,1
3,Centro American Restaurant & Pupuseria,0,0,3,7,0,7.8,1.0,1440,0,0,1
4,Chef Cao's Chinese ReStaurant,1,1,13,4,4,6.3,1.0,1656,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
140,Buffalo Wild Wings,1,1,48,21,8,7.1,2.0,2386,0,0,1
141,Johnny Carino's,1,0,23,14,7,7.4,2.0,2161,0,0,1
142,Golden Corral,1,0,22,7,0,6.0,1.0,1954,0,0,1
143,Fat Burger Grill,0,1,8,5,2,7.0,1.0,4332,0,0,1


### Data formatting

In [215]:
# Target vector: the venue rating
y = df_final['rating'].to_numpy(copy=True)

# Feature matrix: every remaining column except the venue name
feature_frame = df_final.drop(columns=['name','rating'])
feature_list = feature_frame.columns.tolist()
x = feature_frame.to_numpy(copy=True)

# Hold out 25% of the rows for testing; the fixed seed keeps the split stable
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.25, random_state = 1)

# Report the shape of each partition
for label, part in [
    ('Training Features Shape:', train_x),
    ('Training Labels Shape:', train_y),
    ('Testing Features Shape:', test_x),
    ('Testing Labels Shape:', test_y),
]:
    print(label, part.shape)

Training Features Shape: (108, 10)
Training Labels Shape: (108,)
Testing Features Shape: (37, 10)
Testing Labels Shape: (37,)


### Random forest model hyperparameter tuning

In [191]:
# Base estimator for the search; the fixed seed makes the search repeatable
rf = RandomForestRegressor(random_state=1)

# number of trees in random forest
n_estimators = [int(v) for v in np.linspace(start = 500, stop = 1500, num = 10)]

# number of features considered at every split.
# 1.0 means "all features", which is what 'auto' meant for a regressor;
# the 'auto' alias was removed in scikit-learn 1.3.
max_features = [1.0, 'sqrt']

# max depth (None lets each tree grow until its leaves are pure)
max_depth = [int(v) for v in np.linspace(100, 500, num = 2)]
max_depth.append(None)

# create random grid
random_grid = {
 'n_estimators': n_estimators,
 'max_features': max_features,
 'max_depth': max_depth
 }

# The grid is finite, so never ask for more samples than it has combinations
# (asking for more only triggers a warning and evaluates the full grid).
n_candidates = len(n_estimators) * len(max_features) * len(max_depth)
n_iter = min(100, n_candidates)

# Random search of parameters with 5-fold cross-validation on all CPU cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = n_iter, cv = 5, verbose=2, random_state=1, n_jobs = -1)

# Fit the model
rf_random.fit(train_x, train_y)

# print results
print(rf_random.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.1min finished


{'n_estimators': 1055, 'max_features': 'sqrt', 'max_depth': 500}


### Train and evaluate the model

In [216]:
# Refit a forest with fixed hyperparameters and a fixed seed
rf_params = {
    'n_estimators': 1055,
    'max_features': 'sqrt',
    'max_depth': 500,
    'random_state': 1,
}
rf = RandomForestRegressor(**rf_params)
rf.fit(train_x, train_y)

# Absolute error of each held-out prediction
predictions = rf.predict(test_x)
errors = np.abs(predictions - test_y)

# Per-sample absolute percentage error; its mean is the MAPE.
# "Accuracy" here is defined as 100 minus the MAPE.
mape = (errors / test_y) * 100
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 89.87 %.


### Plot one decision tree from the forest

In [217]:
# Take a single fitted tree (index 5) out of the forest for inspection
tree = rf.estimators_[5]

# Serialise that tree to Graphviz dot format, labelling splits by feature name
export_graphviz(
    tree,
    out_file = 'tree.dot',
    feature_names = feature_list,
    rounded = True,
    precision = 1,
)

# The dot file is expected to hold exactly one graph; unpacking enforces that
[graph] = pydot.graph_from_dot_file('tree.dot')

# Render the graph as a PNG image
graph.write_png('tree.png')

### Variable importances

In [218]:
# Importance score of each feature, in the same order as feature_list
importances = list(rf.feature_importances_)

# Pair every feature name with its importance rounded to two decimals
feature_importances = []
for feature, importance in zip(feature_list, importances):
    feature_importances.append((feature, round(importance, 2)))

# Most important first; ties on the rounded value keep feature_list order
feature_importances.sort(key = lambda pair: pair[1], reverse = True)

# One aligned line per feature
for var_name, var_importance in feature_importances:
    print('Variable: {:20} Importance: {}'.format(var_name, var_importance))

Variable: listed.count         Importance: 0.26
Variable: photos.count         Importance: 0.21
Variable: tips.count           Importance: 0.21
Variable: distance             Importance: 0.17
Variable: verified             Importance: 0.04
Variable: delivery             Importance: 0.04
Variable: price.tier           Importance: 0.03
Variable: Houston              Importance: 0.02
Variable: College Station      Importance: 0.02
Variable: Austin               Importance: 0.01
