In [1]:
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np 
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns 
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import xgboost as xgb
from datetime import date
from statsmodels.stats.weightstats import ttest_ind
from scipy.stats import skew
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import RandomizedSearchCV

# Zomato Only

In [22]:
df_zom = pd.read_csv('data/3_clean_zomato_feat.csv')

df_zom['rest_price_idr'] = df_zom['rest_price_idr'] / 1000
df_zom['rest_price_idr'] = df_zom['rest_price_idr'].astype(int)

drop_column = ['url','index','rating','lat','long','review','new_code_res_type','new_code_fac']
target_column = 'rating'

# remove 0 rating
df_zom = df_zom[df_zom['rating'] > 0]

# scaled the values 
features_columns = df_zom.drop(drop_column,axis=1).columns
features_columns = list(features_columns) + [target_column]
scaler = MinMaxScaler()
df_input_scale = pd.DataFrame(scaler.fit_transform(df_zom[features_columns]), columns = features_columns)


X_train, X_test, y_train, y_test = train_test_split(df_input_scale.drop(target_column, axis=1), df_input_scale[target_column], test_size=0.12, random_state=42)

# append result 
result_zom = {}
result_zom['dataset'] = 'zomato'

performance_report = []


## RF 

In [23]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [30, 50, 100, 250],
    'max_features': [2, 3],
    'n_estimators': [250, 500, 1000]
}# Create a based model

scoring = {"MAE":'neg_mean_absolute_error',"MSE":'neg_mean_squared_error','r2':'r2'}

rf = RandomForestRegressor()
grid_search = RandomizedSearchCV(estimator = rf, param_distributions = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring='neg_mean_absolute_error', refit = scoring, n_iter= 50)

search = grid_search.fit(X_train, y_train)
best_param = search.best_params_

# performance report
y_pred = search.predict(X_test)

result_zom['rf'] = best_param
result_zom['mae_rf'] = mean_absolute_error(y_pred,y_test)
result_zom['mse_rf'] = mean_squared_error(y_pred,y_test)



Fitting 5 folds for each of 24 candidates, totalling 120 fits


## XGB

In [24]:
param_grid = {
    'n_estimators': [100, 500, 900, 1100, 1500],
    'max_depth':[2, 3, 5, 10, 15],
    'learning_rate':[0.05,0.1,0.15,0.20],
    'min_child_weight':[1,2,3,4],
    'booster':['gbtree','gblinear'],
}# Create a based model

scoring = {"MAE":'neg_mean_absolute_error',"MSE":'neg_mean_squared_error','r2':'r2'}

xgb = XGBRegressor()
grid_search = RandomizedSearchCV(estimator = xgb, param_distributions = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring='neg_mean_absolute_error', refit = scoring, n_iter= 50)

search = grid_search.fit(X_train, y_train)
best_param = search.best_params_
result_zom['xgb'] = best_param

# performance report
y_pred = search.predict(X_test)

result_zom['mae_xgb'] = mean_absolute_error(y_pred,y_test)
result_zom['mse_xgb'] = mean_squared_error(y_pred,y_test)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


## SVR 

In [25]:
c_range = np.logspace(-0, 4, 8)
gamma_range = np.logspace(-4, 0, 8)
param_grid = {'kernel' : ['poly'],'C': [0.1, 1, 10, 100, 1000],'gamma': [1, 0.1, 0.01]}


scoring = {"MAE":'neg_mean_absolute_error',"MSE":'neg_mean_squared_error','r2':'r2'}

svr = SVR()
grid_search = RandomizedSearchCV(estimator = svr, param_distributions = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring = 'neg_mean_absolute_error', refit = scoring, n_iter= 50)

search = grid_search.fit(X_train, y_train)
best_param = search.best_params_
result_zom['svr'] = best_param

# performance report
y_pred = search.predict(X_test)

result_zom['mae_svr'] = mean_absolute_error(y_pred,y_test)
result_zom['mse_svr'] = mean_squared_error(y_pred,y_test)



Fitting 5 folds for each of 15 candidates, totalling 75 fits


In [27]:
df_result_zom = pd.DataFrame(result_zom)  
df_result_zom

Unnamed: 0,dataset,rf,mae_rf,mse_rf,xgb,mae_xgb,mse_xgb,svr,mae_svr,mse_svr
n_estimators,zomato,250,0.07482,0.009359,1100,0.076567,0.009439,,0.082527,0.01058
max_features,zomato,2,0.07482,0.009359,,0.076567,0.009439,,0.082527,0.01058
max_depth,zomato,50,0.07482,0.009359,3,0.076567,0.009439,,0.082527,0.01058
bootstrap,zomato,True,0.07482,0.009359,,0.076567,0.009439,,0.082527,0.01058
min_child_weight,zomato,,0.07482,0.009359,1,0.076567,0.009439,,0.082527,0.01058
learning_rate,zomato,,0.07482,0.009359,0.05,0.076567,0.009439,,0.082527,0.01058
booster,zomato,,0.07482,0.009359,gbtree,0.076567,0.009439,,0.082527,0.01058
kernel,zomato,,0.07482,0.009359,,0.076567,0.009439,poly,0.082527,0.01058
gamma,zomato,,0.07482,0.009359,,0.076567,0.009439,1,0.082527,0.01058
C,zomato,,0.07482,0.009359,,0.076567,0.009439,10,0.082527,0.01058


# Zomato POI 250 

In [4]:
df_zom_poi = pd.read_csv('data/3_clean_zomato_gof_250.csv')

df_zom_poi['rest_price_idr'] = df_zom_poi['rest_price_idr'] / 1000
df_zom_poi['rest_price_idr'] = df_zom_poi['rest_price_idr'].astype(int)

drop_column = ['url','index','rating','lat','long','review','new_code_res_type','new_code_fac','geohash','encode']
target_column = 'rating'

# remove 0 rating
df_zom_poi = df_zom_poi[df_zom_poi['rating'] > 0]

# scaled the values 
features_columns = df_zom_poi.drop(drop_column,axis=1).columns
features_columns = list(features_columns) + [target_column]
scaler = MinMaxScaler()
df_input_scale = pd.DataFrame(scaler.fit_transform(df_zom_poi[features_columns]), columns = features_columns)


X_train, X_test, y_train, y_test = train_test_split(df_input_scale.drop(target_column, axis=1), df_input_scale[target_column], test_size=0.12, random_state=42)

# append result 
result_zom_poi_250 = {}
result_zom_poi_250['dataset'] = 'zomato poi 250'

performance_report = []


## RF

In [3]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [30, 50, 100, 250],
    'max_features': [2, 3],
    'n_estimators': [250, 500, 1000]
}# Create a based model

scoring = {"MAE":'neg_mean_absolute_error',"MSE":'neg_mean_squared_error','r2':'r2'}

rf = RandomForestRegressor()
grid_search = RandomizedSearchCV(estimator = rf, param_distributions = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring='neg_mean_absolute_error', refit = scoring, n_iter= 50)

search = grid_search.fit(X_train, y_train)
best_param = search.best_params_

# performance report
y_pred = search.predict(X_test)

result_zom_poi_250['rf'] = best_param
result_zom_poi_250['mae_rf'] = mean_absolute_error(y_pred,y_test)
result_zom_poi_250['mse_rf'] = mean_squared_error(y_pred,y_test)

NameError: name 'X_train' is not defined

## XGB 

In [17]:
param_grid = {
    'n_estimators': [100, 500, 900, 1100, 1500],
    'max_depth':[2, 3, 5, 10, 15],
    'learning_rate':[0.05,0.1,0.15,0.20],
    'min_child_weight':[1,2,3,4],
    'booster':['gbtree','gblinear'],
}# Create a based model

scoring = {"MAE":'neg_mean_absolute_error',"MSE":'neg_mean_squared_error','r2':'r2'}

xgb = XGBRegressor()
grid_search = RandomizedSearchCV(estimator = xgb, param_distributions = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring='neg_mean_absolute_error', refit = scoring, n_iter= 50)

search = grid_search.fit(X_train, y_train)
best_param = search.best_params_

# performance report
y_pred = search.predict(X_test)

result_zom_poi_250['rf'] = best_param
result_zom_poi_250['mae_xgb'] = mean_absolute_error(y_pred,y_test)
result_zom_poi_250['mse_xgb'] = mean_squared_error(y_pred,y_test)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


## SVR 

In [18]:
c_range = np.logspace(-0, 4, 8)
gamma_range = np.logspace(-4, 0, 8)
param_grid = {'kernel' : ['poly'],'C': [0.1, 1, 10, 100, 1000],'gamma': [1, 0.1, 0.01]}


scoring = {"MAE":'neg_mean_absolute_error',"MSE":'neg_mean_squared_error','r2':'r2'}

svr = SVR()
grid_search = RandomizedSearchCV(estimator = svr, param_distributions = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring = 'neg_mean_absolute_error', refit = scoring, n_iter= 50)

search = grid_search.fit(X_train, y_train)
best_param = search.best_params_

# performance report
y_pred = search.predict(X_test)

result_zom_poi_250['svr'] = best_param
result_zom_poi_250['mae_svr'] = mean_absolute_error(y_pred,y_test)
result_zom_poi_250['mse_svr'] = mean_squared_error(y_pred,y_test)



Fitting 5 folds for each of 15 candidates, totalling 75 fits


In [28]:
df_result_zom_250 = pd.DataFrame(result_zom_poi_250)

# Zomato POI 500 

In [2]:
df_zom_poi = pd.read_csv('data/3_clean_zomato_gof_500.csv')

df_zom_poi['rest_price_idr'] = df_zom_poi['rest_price_idr'] / 1000
df_zom_poi['rest_price_idr'] = df_zom_poi['rest_price_idr'].astype(int)

drop_column = ['url','index','rating','lat','long','review','new_code_res_type','new_code_fac','geohash','encode']
target_column = 'rating'

# remove 0 rating
df_zom_poi = df_zom_poi[df_zom_poi['rating'] > 0]

# scaled the values 
features_columns = df_zom_poi.drop(drop_column,axis=1).columns
features_columns = list(features_columns) + [target_column]
scaler = MinMaxScaler()
df_input_scale = pd.DataFrame(scaler.fit_transform(df_zom_poi[features_columns]), columns = features_columns)


X_train, X_test, y_train, y_test = train_test_split(df_input_scale.drop(target_column, axis=1), df_input_scale[target_column], test_size=0.12, random_state=42)

# append result 
result_zom_poi_500 = {}
result_zom_poi_500['dataset'] = 'zomato poi 500'

performance_report = []


## RF 

In [32]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [30, 50, 100, 250],
    'max_features': [2, 3],
    'n_estimators': [250, 500, 1000]
}# Create a based model

scoring = {"MAE":'neg_mean_absolute_error',"MSE":'neg_mean_squared_error','r2':'r2'}

rf = RandomForestRegressor()
grid_search = RandomizedSearchCV(estimator = rf, param_distributions = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring='neg_mean_absolute_error', refit = scoring, n_iter= 50)

search = grid_search.fit(X_train, y_train)
best_param = search.best_params_

# performance report
y_pred = search.predict(X_test)

result_zom_poi_500['rf'] = best_param
result_zom_poi_500['mae_rf'] = mean_absolute_error(y_pred,y_test)
result_zom_poi_500['mse_rf'] = mean_squared_error(y_pred,y_test)



Fitting 5 folds for each of 24 candidates, totalling 120 fits


## XGB 

In [33]:
param_grid = {
    'n_estimators': [100, 500, 900, 1100, 1500],
    'max_depth':[2, 3, 5, 10, 15],
    'learning_rate':[0.05,0.1,0.15,0.20],
    'min_child_weight':[1,2,3,4],
    'booster':['gbtree','gblinear'],
}# Create a based model

scoring = {"MAE":'neg_mean_absolute_error',"MSE":'neg_mean_squared_error','r2':'r2'}

xgb = XGBRegressor()
grid_search = RandomizedSearchCV(estimator = xgb, param_distributions = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring='neg_mean_absolute_error', refit = scoring, n_iter= 50)

search = grid_search.fit(X_train, y_train)
best_param = search.best_params_

# performance report
y_pred = search.predict(X_test)

result_zom_poi_500['rf'] = best_param
result_zom_poi_500['mae_xgb'] = mean_absolute_error(y_pred,y_test)
result_zom_poi_500['mse_xgb'] = mean_squared_error(y_pred,y_test)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


## SVR 

In [3]:
c_range = np.logspace(-0, 4, 8)
gamma_range = np.logspace(-4, 0, 8)
param_grid = {'kernel' : ['poly'],'C': [0.1, 1, 10, 100, 1000],'gamma': [1, 0.1, 0.01]}


scoring = {"MAE":'neg_mean_absolute_error',"MSE":'neg_mean_squared_error','r2':'r2'}

svr = SVR()
grid_search = RandomizedSearchCV(estimator = svr, param_distributions = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring = 'neg_mean_absolute_error', refit = scoring, n_iter= 50)

search = grid_search.fit(X_train, y_train)
best_param = search.best_params_

# performance report
y_pred = search.predict(X_test)

result_zom_poi_500['svr'] = best_param
result_zom_poi_500['mae_svr'] = mean_absolute_error(y_pred,y_test)
result_zom_poi_500['mse_svr'] = mean_squared_error(y_pred,y_test)



Fitting 5 folds for each of 15 candidates, totalling 75 fits


In [4]:
df_result_zom_500 = pd.DataFrame(result_zom_poi_500)

In [1]:
df_result_zom_500

NameError: name 'df_result_zom_500' is not defined

# Zomato POI 1000

In [5]:
df_zom_poi = pd.read_csv('data/3_clean_zomato_gof_1000.csv')

df_zom_poi['rest_price_idr'] = df_zom_poi['rest_price_idr'] / 1000
df_zom_poi['rest_price_idr'] = df_zom_poi['rest_price_idr'].astype(int)

drop_column = ['url','index','rating','lat','long','review','new_code_res_type','new_code_fac','geohash','encode']
target_column = 'rating'

# remove 0 rating
df_zom_poi = df_zom_poi[df_zom_poi['rating'] > 0]

# scaled the values 
features_columns = df_zom_poi.drop(drop_column,axis=1).columns
features_columns = list(features_columns) + [target_column]
scaler = MinMaxScaler()
df_input_scale = pd.DataFrame(scaler.fit_transform(df_zom_poi[features_columns]), columns = features_columns)


X_train, X_test, y_train, y_test = train_test_split(df_input_scale.drop(target_column, axis=1), df_input_scale[target_column], test_size=0.12, random_state=42)

# append result 
result_zom_poi_1000 = {}
result_zom_poi_1000['dataset'] = 'zomato poi 1000'

performance_report = []


## RF 

In [None]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [30, 50, 100, 250],
    'max_features': [2, 3],
    'n_estimators': [250, 500, 1000]
}# Create a based model

scoring = {"MAE":'neg_mean_absolute_error',"MSE":'neg_mean_squared_error','r2':'r2'}

rf = RandomForestRegressor()
grid_search = RandomizedSearchCV(estimator = rf, param_distributions = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring='neg_mean_absolute_error', refit = scoring, n_iter= 50)

search = grid_search.fit(X_train, y_train)
best_param = search.best_params_

# performance report
y_pred = search.predict(X_test)

result_zom_poi_1000['rf'] = best_param
result_zom_poi_1000['mae_rf'] = mean_absolute_error(y_pred,y_test)
result_zom_poi_1000['mse_rf'] = mean_squared_error(y_pred,y_test)

## XGB

In [6]:
param_grid = {
    'n_estimators': [100, 500, 900, 1100, 1500],
    'max_depth':[2, 3, 5, 10, 15],
    'learning_rate':[0.05,0.1,0.15,0.20],
    'min_child_weight':[1,2,3,4],
    'booster':['gbtree','gblinear'],
}# Create a based model

scoring = {"MAE":'neg_mean_absolute_error',"MSE":'neg_mean_squared_error','r2':'r2'}

xgb = XGBRegressor()
grid_search = RandomizedSearchCV(estimator = xgb, param_distributions = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring='neg_mean_absolute_error', refit = scoring, n_iter= 50)

search = grid_search.fit(X_train, y_train)
best_param = search.best_params_

# performance report
y_pred = search.predict(X_test)

result_zom_poi_1000['xgb'] = best_param
result_zom_poi_1000['mae_xgb'] = mean_absolute_error(y_pred,y_test)
result_zom_poi_1000['mse_xgb'] = mean_squared_error(y_pred,y_test)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


KeyboardInterrupt: 

## SVR 

In [6]:
c_range = np.logspace(-0, 4, 8)
gamma_range = np.logspace(-4, 0, 8)
param_grid = {'kernel' : ['poly'],'C': [0.1, 1, 10, 100, 1000],'gamma': [1, 0.1, 0.01]}


scoring = {"MAE":'neg_mean_absolute_error',"MSE":'neg_mean_squared_error','r2':'r2'}

svr = SVR()
grid_search = RandomizedSearchCV(estimator = svr, param_distributions = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring = 'neg_mean_absolute_error', refit = scoring, n_iter= 50)

search = grid_search.fit(X_train, y_train)
best_param = search.best_params_

# performance report
y_pred = search.predict(X_test)

result_zom_poi_1000['svr'] = best_param
result_zom_poi_1000['mae_svr'] = mean_absolute_error(y_pred,y_test)
result_zom_poi_1000['mse_svr'] = mean_squared_error(y_pred,y_test)



Fitting 5 folds for each of 15 candidates, totalling 75 fits


: 

In [None]:
df_result_zom_1000 = pd.DataFrame(result_zom_poi_1000)