<a href="https://colab.research.google.com/github/bhargavreddy111/Restaurant-Revenue-Prediction/blob/master/Restaurant%20Revenue%20Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install vecstack

from vecstack import stacking
import pandas as pd
import numpy as np


from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR
from sklearn import linear_model

import warnings
warnings.filterwarnings("ignore")

# Mount Google Drive at /gdrive; the train/test CSVs read later in this notebook
# live under "/gdrive/My Drive/...".
from google.colab import drive
drive.mount('/gdrive')
# Make the mounted Drive the current working directory (IPython line magic).
%cd /gdrive



In [0]:
# Load the training and test CSVs from the mounted Drive folder into DataFrames.
trainfile = r'/gdrive/My Drive/Colab Notebooks/Restaurantprediction/train.csv'
testfile = r'/gdrive/My Drive/Colab Notebooks/Restaurantprediction/test.csv'
trainData, testData = (pd.read_csv(csv_path) for csv_path in (trainfile, testfile))

# Report (rows, columns) for each frame, training set first.
print(trainData.shape, testData.shape, sep="\n")



(137, 43)
(100000, 42)


In [0]:
# Flag, per column, whether the training data contains any missing value.
print(trainData.isna().any())

In [0]:
# Flag, per column, whether the test data contains any missing value.
print(testData.isna().any())

In [0]:
# Separate the target, then remove the identifier, date and city columns
# (plus the target itself on the training side) from the feature frames.
y_train = trainData["revenue"]
X_train1 = trainData.drop(columns=["Id", "Open Date", "City", "revenue"])
X_test1 = testData.drop(columns=["Id", "Open Date", "City"])

# Print both shapes so the feature-column counts can be compared.
print(X_train1.shape, X_test1.shape, sep="\n")
X_train1.head()
X_test1.head()

(137, 39)
(100000, 39)


In [0]:
# Columns that are one-hot encoded below.
categoricalFeatures = ["City Group", "Type"]

# Encode train and test in a single frame so both receive the same dummy columns.
# The outer index key (0 = train, 1 = test) is what lets them be split apart again.
combined_Data = pd.get_dummies(
    pd.concat([X_train1, X_test1], keys=[0, 1]),
    columns=categoricalFeatures,
)
X_train, X_test = combined_Data.xs(0), combined_Data.xs(1)

X_test.head()
X_train.head()

In [0]:
# Keep the test-set identifiers so they can be written next to every prediction.
# The column is spelled "Id" (the same name that is dropped from testData when the
# feature frames are built); indexing with "ID" raises KeyError.
X_Test_id = testData["Id"]

In [0]:
#Decision Tree Regressor (default hyperparameters)
clf = DecisionTreeRegressor()
clf.fit(X_train, y_train)
clf_predict_Train = clf.predict(X_train)

# mean_squared_error returns the MSE; take the square root so the printed value is an RMSE.
print("RMSE:{0:10f}".format(np.sqrt(mean_squared_error(y_train, clf_predict_Train))))
clf_predict = pd.DataFrame(clf.predict(X_test), columns=["Prediction"])

# Write Id + prediction, then read the file back as a sanity check.
# NOTE(review): this folder differs from the one the input CSVs are read from
# ("Colab Notebooks/Restaurantprediction") - confirm it is intentional.
pd.concat([X_Test_id, clf_predict], axis=1).to_csv("/gdrive/My Drive/Restaurantprediction/Test_results1.csv", index=False)

res = pd.read_csv('/gdrive/My Drive/Restaurantprediction/Test_results1.csv')

#Hyperparameter tuning for the decision tree regressor (randomized search, 15 sampled settings)
parameters = {'max_depth': [5, 30, 20], 'min_samples_split': [2, 5, 15], 'random_state': [1, 0], 'min_samples_leaf': [2, 5, 8]}
clf_random = RandomizedSearchCV(clf, parameters, n_iter=15)
clf_random.fit(X_train, y_train)
grid_parm = clf_random.best_params_
print(grid_parm)

#Refit a DecisionTreeRegressor with the parameters selected by the search
clf1 = DecisionTreeRegressor(**grid_parm)
clf1.fit(X_train, y_train)
clf_predict_Train1 = clf1.predict(X_train)
clf_predict_Test = clf1.predict(X_test)
clf_predict1 = pd.DataFrame(clf_predict_Test, columns=["Prediction"])

pd.concat([X_Test_id, clf_predict1], axis=1).to_csv("/gdrive/My Drive/Restaurantprediction/Test_results2.csv", index=False)

res = pd.read_csv('/gdrive/My Drive/Restaurantprediction/Test_results2.csv')

print("RMSE:{0:10f}".format(np.sqrt(mean_squared_error(y_train, clf_predict_Train1))))

# cross_val_score with no `scoring` argument uses the estimator's own score method,
# which for regressors is R^2 - not AUC.
clf_cv_score = cross_val_score(clf1, X_train, y_train, cv=10)
print("=== All R^2 Scores ===")
print(clf_cv_score)
print('\n')
print("=== Mean R^2 Score ===")
print("Mean R^2 Score - Decision Tree: ", clf_cv_score.mean())

In [0]:
# Baseline MLP regressor: a single hidden layer of 1000 ReLU units.
mlp = MLPRegressor(hidden_layer_sizes=(1000,), activation='relu', max_iter=20000)
mlp.fit(X_train, y_train)
mlp_predict = pd.DataFrame({"Prediction": mlp.predict(X_test)})

# Write Id + prediction to Drive without the row index.
pd.concat([X_Test_id, mlp_predict], axis=1).to_csv(
    "/gdrive/My Drive/Restaurantprediction/Test_results3.csv",
    index=False,
)

# Read the file back and preview the first rows.
res = pd.read_csv('/gdrive/My Drive/Restaurantprediction/Test_results3.csv')
res.head()

In [0]:
# Search space for the MLP: hidden-layer width, iteration cap and activation function.
mlp_parameters = {
    'hidden_layer_sizes': [1000, 200, 500],
    'max_iter': [20000, 10000],
    'activation': ['relu', 'logistic'],
}
mlp1 = MLPRegressor()

# Randomized search: 10 sampled settings, each evaluated with 5-fold cross-validation.
mlp1_random = RandomizedSearchCV(mlp1, mlp_parameters, n_iter=10, cv=5)
mlp1_random.fit(X_train, y_train)

grid_parm_mlp1 = mlp1_random.best_params_
print(grid_parm_mlp1)

In [0]:
# Refit an MLP with the parameters selected by the randomized search.
mlp1 = MLPRegressor(**grid_parm_mlp1)
mlp1.fit(X_train, y_train)
mlp1_predict_Train = mlp1.predict(X_train)
mlp1_predict = pd.DataFrame(mlp1.predict(X_test), columns=["Prediction"])

# Cross-validate the tuned model. With no `scoring` argument cross_val_score uses the
# regressor's own score method, i.e. R^2 - not AUC.
mlp_cv_score = cross_val_score(mlp1, X_train, y_train, cv=5)
print("=== All R^2 Scores ===")
print(mlp_cv_score)
print('\n')
print("=== Mean R^2 Score ===")
print("Mean R^2 Score - MLP: ", mlp_cv_score.mean())

# Write Id + prediction, then read the file back and preview it.
pd.concat([X_Test_id, mlp1_predict], axis=1).to_csv("/gdrive/My Drive/Restaurantprediction/Test_resultsmh.csv", index=False)

res = pd.read_csv('/gdrive/My Drive/Restaurantprediction/Test_resultsmh.csv')
res.head()

In [0]:
# Baseline support vector regressor with an RBF kernel.
svr = SVR(kernel='rbf', C=1.0, degree=3)
svr.fit(X_train, y_train)
svr_predict = pd.DataFrame({"Prediction": svr.predict(X_test)})

# Write Id + prediction to Drive without the row index.
pd.concat([X_Test_id, svr_predict], axis=1).to_csv(
    "/gdrive/My Drive/Restaurantprediction/Test_results4.csv",
    index=False,
)

# Read the file back and preview the first rows.
res = pd.read_csv('/gdrive/My Drive/Restaurantprediction/Test_results4.csv')
res.head()


In [0]:
#Hyperparameter tuning for the SVR regressor
parameters = {'kernel': ['linear', 'rbf', 'sigmoid'], 'degree': range(1, 10, 1), 'max_iter': range(1, 100, 10)}

#do grid search with cross-validation
svr_grid = GridSearchCV(svr, parameters)
svr_grid.fit(X_train, y_train)
grid_parm_svr = svr_grid.best_params_
print(grid_parm_svr)

# Refit with the selected parameters; predict once and reuse the result for the CSV.
svr = SVR(**grid_parm_svr)
svr.fit(X_train, y_train)
svr_predict = svr.predict(X_test)
pred6 = pd.DataFrame(svr_predict, columns=["Prediction"])

pd.concat([X_Test_id, pred6], axis=1).to_csv("/gdrive/My Drive/Colab Notebooks/Restaurantprediction 1/results_SVR_HT.csv", index=False)

res1 = pd.read_csv('/gdrive/My Drive/Colab Notebooks/Restaurantprediction 1/results_SVR_HT.csv')

# Cross-validate the tuned model. With no `scoring` argument cross_val_score uses the
# regressor's own score method, i.e. R^2 - not AUC.
clf_cv_score = cross_val_score(svr, X_train, y_train, cv=10)
print("=== All R^2 Scores ===")
print(clf_cv_score)
print('\n')
print("=== Mean R^2 Score ===")
print("Mean R^2 Score - SVR: ", clf_cv_score.mean())

In [0]:
# RANDOM FOREST REGRESSOR (default hyperparameters)
rfc = RandomForestRegressor()
rfc.fit(X_train, y_train)
rfc_predict_Train = rfc.predict(X_train)

# mean_squared_error returns the MSE; take the square root so the printed value is an RMSE.
print("RMSE (training) for Random Forest:{0:10f}".format(np.sqrt(mean_squared_error(y_train, rfc_predict_Train))))

# Predict once and reuse the result for the submission frame.
rfc_predict_Test = rfc.predict(X_test)
pred2 = pd.DataFrame(rfc_predict_Test, columns=["Prediction"])

pd.concat([X_Test_id, pred2], axis=1).to_csv("/gdrive/My Drive/Colab Notebooks/Restaurantprediction/results_RFR.csv", index=False)

res = pd.read_csv('/gdrive/My Drive/Colab Notebooks/Restaurantprediction/results_RFR.csv')

In [0]:
#Hyperparameter tuning for the random forest regressor
parameters = {'min_samples_leaf': range(10, 100, 10), 'max_depth': range(1, 40, 2), 'max_features': [2, 3, 4], 'min_samples_split': [8, 10, 12], 'n_estimators': [20, 30, 40]}

#do grid search with cross-validation
rfc_grid = GridSearchCV(rfc, parameters)
rfc_grid.fit(X_train, y_train)
grid_parm_rfc1 = rfc_grid.best_params_
print(grid_parm_rfc1)

# Refit with the selected parameters; predict once and reuse the result for the CSV.
rfr1 = RandomForestRegressor(**grid_parm_rfc1)
rfr1.fit(X_train, y_train)
rfr1_predict = rfr1.predict(X_test)
pred1 = pd.DataFrame(rfr1_predict, columns=["Prediction"])

pd.concat([X_Test_id, pred1], axis=1).to_csv("/gdrive/My Drive/Colab Notebooks/Restaurantprediction 1/results_RFR_HT.csv", index=False)

# Read back the file that was just written (same "Restaurantprediction 1" folder);
# the previous read pointed at a different folder than the write.
res1 = pd.read_csv('/gdrive/My Drive/Colab Notebooks/Restaurantprediction 1/results_RFR_HT.csv')

# Cross-validate the *tuned* forest (rfr1), not the default-parameter rfc.
# With no `scoring` argument cross_val_score uses the regressor's own score method,
# i.e. R^2 - not AUC.
clf_cv_score = cross_val_score(rfr1, X_train, y_train, cv=10)
print("=== All R^2 Scores ===")
print(clf_cv_score)
print('\n')
print("=== Mean R^2 Score ===")
print("Mean R^2 Score - Random Forest: ", clf_cv_score.mean())

In [0]:
#Default Stochastic Gradient Descent Regressor
reg = linear_model.SGDRegressor()
reg.fit(X_train, y_train)
reg_predict_Train = reg.predict(X_train)

# mean_squared_error returns the MSE; take the square root so the printed value is an RMSE.
print("RMSE (training) for SGD Regressor:{0:10f}".format(np.sqrt(mean_squared_error(y_train, reg_predict_Train))))

# Predict once and reuse the result for the submission frame.
reg_predict_Test = reg.predict(X_test)
pred7 = pd.DataFrame(reg_predict_Test, columns=["Prediction"])

pd.concat([X_Test_id, pred7], axis=1).to_csv("/gdrive/My Drive/Colab Notebooks/Restaurantprediction/results_SGDR.csv", index=False)

res = pd.read_csv('/gdrive/My Drive/Colab Notebooks/Restaurantprediction/results_SGDR.csv')

In [0]:
#Hyperparameter tuning for the SGD regressor
parameters = {'early_stopping': [True], 'learning_rate': ['constant', 'adaptive', 'optimal', 'invscaling'], 'max_iter': range(100, 1500, 100)}

#do grid search with cross-validation
reg_grid = GridSearchCV(reg, parameters)
reg_grid.fit(X_train, y_train)
grid_parm_reg = reg_grid.best_params_
print(grid_parm_reg)

# Refit with the selected parameters; predict once and reuse the result for the CSV.
reg = linear_model.SGDRegressor(**grid_parm_reg)
reg.fit(X_train, y_train)
reg_predict = reg.predict(X_test)
pred8 = pd.DataFrame(reg_predict, columns=["Prediction"])

pd.concat([X_Test_id, pred8], axis=1).to_csv("/gdrive/My Drive/Colab Notebooks/Restaurantprediction/results_SGDR_HT.csv", index=False)

res1 = pd.read_csv('/gdrive/My Drive/Colab Notebooks/Restaurantprediction/results_SGDR_HT.csv')

# Cross-validate the tuned model. With no `scoring` argument cross_val_score uses the
# regressor's own score method, i.e. R^2 - not AUC.
clf_cv_score = cross_val_score(reg, X_train, y_train, cv=10)
print("=== All R^2 Scores ===")
print(clf_cv_score)
print('\n')
print("=== Mean R^2 Score ===")
print("Mean R^2 Score - SGD Regressor: ", clf_cv_score.mean())

In [0]:
#Gradient Boosting Regressor (default hyperparameters)
abc = GradientBoostingRegressor()
abc.fit(X_train, y_train)
abc_predict_Train = abc.predict(X_train)

# mean_squared_error returns the MSE; take the square root so the printed value is an RMSE.
print("RMSE (training) for Gradient Boosting:{0:10f}".format(np.sqrt(mean_squared_error(y_train, abc_predict_Train))))

# Predict once and reuse the result for the submission frame.
abc_predict_Test = abc.predict(X_test)
pred9 = pd.DataFrame(abc_predict_Test, columns=["Prediction"])

pd.concat([X_Test_id, pred9], axis=1).to_csv("/gdrive/My Drive/Colab Notebooks/Restaurantprediction/results_GBR.csv", index=False)

# The closing quote here must be a plain ASCII apostrophe; a typographic quote is a SyntaxError.
res = pd.read_csv('/gdrive/My Drive/Colab Notebooks/Restaurantprediction/results_GBR.csv')

#Hyperparameter tuning for the Gradient Boosting Regressor
parameters = {'n_estimators': range(50, 100, 10), 'learning_rate': [0.1, 0.2]}

#do grid search with cross-validation
abc_grid = GridSearchCV(abc, parameters)
abc_grid.fit(X_train, y_train)
grid_parm_abc = abc_grid.best_params_
print(grid_parm_abc)

# Refit with the selected parameters; predict once and reuse the result for the CSV.
abc = GradientBoostingRegressor(**grid_parm_abc)
abc.fit(X_train, y_train)
abc_predict = abc.predict(X_test)
pred10 = pd.DataFrame(abc_predict, columns=["Prediction"])

pd.concat([X_Test_id, pred10], axis=1).to_csv("/gdrive/My Drive/Colab Notebooks/Restaurantprediction/results_GBR_HT.csv", index=False)

res1 = pd.read_csv('/gdrive/My Drive/Colab Notebooks/Restaurantprediction/results_GBR_HT.csv')

# Cross-validate the tuned model. With no `scoring` argument cross_val_score uses the
# regressor's own score method, i.e. R^2 - not AUC.
clf_cv_score = cross_val_score(abc, X_train, y_train, cv=10)
print("=== All R^2 Scores ===")
print(clf_cv_score)
print('\n')
print("=== Mean R^2 Score ===")
print("Mean R^2 Score - Gradient Boosting: ", clf_cv_score.mean())

In [0]:
#STACKING MODELS =====================================================================
# NOTE(review): the banner below mentions GradientBoosting and "Classifier", but the
# first-level models actually stacked are SVR, RandomForestRegressor and DecisionTreeRegressor.
print("___________________________________________________________________________________________\nEnsemble Methods Predictions using GradientBoosting, RandomForest and Decision Tree Classifier\n")

# First-level models whose predictions become the inputs of the second-level model.
models = [
    SVR(kernel='sigmoid', epsilon=0.01),
    RandomForestRegressor(n_estimators=20, max_depth=7, max_features=4, min_samples_leaf=10, min_samples_split=12),
    DecisionTreeRegressor(max_depth=3, min_samples_split=90),
]

# Run vecstack's stacking over 4 folds in regression mode; S_Train / S_Test are the
# stacked outputs for the training and test sets.
S_Train, S_Test = stacking(
    models,
    X_train, y_train, X_test,
    regression=True,
    mode='oof_pred_bag',
    needs_proba=False,
    save_dir=None,
    n_folds=4,
    verbose=2,
)



___________________________________________________________________________________________
Ensemble Methods Predictions using GradientBoosting, RandomForest and Decision Tree Classifier

task:         [regression]
metric:       [mean_absolute_error]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [SVR]
    fold  0:  [1838705.94338269]
    fold  1:  [1370367.85300829]
    fold  2:  [1977308.32380417]
    fold  3:  [1231245.82394656]
    ----
    MEAN:     [1604406.98603543] + [311438.18720708]
    FULL:     [1606117.19740293]

model  1:     [RandomForestRegressor]
    fold  0:  [1863942.99993295]
    fold  1:  [1530604.03139794]
    fold  2:  [1952219.04085066]
    fold  3:  [1379466.05369821]
    ----
    MEAN:     [1681558.03146994] + [234823.51452315]
    FULL:     [1682889.30861200]

model  2:     [DecisionTreeRegressor]
    fold  0:  [1933023.13724896]
    fold  1:  [1716339.54114305]
    fold  2:  [1965478.05306838]
    fold  3:  [1780241.53757136]
    ----
    MEAN

In [0]:
# Second-level model: a random forest fitted on the stacked first-level outputs.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = RandomForestRegressor(
    n_estimators=20,
    max_depth=7,
    min_samples_leaf=10,
    min_samples_split=12,
).fit(S_Train, y_train)

# Predictions on the stacked training and test features.
y_pred_train, y_pred_test = model.predict(S_Train), model.predict(S_Test)

In [0]:
# Persist the stacked model's test predictions next to the test Ids.
preds = pd.DataFrame({"Prediction": model.predict(S_Test)})

pd.concat([X_Test_id, preds], axis=1).to_csv(
    "/gdrive/My Drive/Colab Notebooks/Restaurantprediction/results_ST.csv",
    index=False,
)

# Read the file back as a sanity check.
res1 = pd.read_csv('/gdrive/My Drive/Colab Notebooks/Restaurantprediction/results_ST.csv')

In [0]:
#Hyperparameter tuning for the second-level (stacked) random forest
parameters = {'min_samples_leaf': range(10, 100, 10), 'max_depth': range(1, 40, 2), 'max_features': range(2, 4, 1), 'min_samples_split': [8, 10, 12], 'n_estimators': [20, 30, 40]}

#do grid search with cross-validation
st_grid = GridSearchCV(model, parameters)
st_grid.fit(S_Train, y_train)
grid_parm_st = st_grid.best_params_
print(grid_parm_st)

# Refit with the selected parameters; predict once and reuse the result for the CSV.
st = RandomForestRegressor(**grid_parm_st)
st.fit(S_Train, y_train)
st_predict = st.predict(S_Test)
predst = pd.DataFrame(st_predict, columns=["Prediction"])

pd.concat([X_Test_id, predst], axis=1).to_csv("/gdrive/My Drive/Colab Notebooks/Restaurantprediction/results_ST_HT.csv", index=False)

res1 = pd.read_csv('/gdrive/My Drive/Colab Notebooks/Restaurantprediction/results_ST_HT.csv')

# Cross-validate the *tuned* meta-model (st), not the untuned `model`.
# With no `scoring` argument cross_val_score uses the regressor's own score method,
# i.e. R^2 - not AUC.
clf_cv_score = cross_val_score(st, S_Train, y_train, cv=10)
print("=== All R^2 Scores ===")
print(clf_cv_score)
print('\n')
print("=== Mean R^2 Score ===")
print("Mean R^2 Score - Stacked Random Forest: ", clf_cv_score.mean())