In [5]:
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np 
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns 
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import xgboost as xgb
from datetime import date
from statsmodels.stats.weightstats import ttest_ind
from scipy.stats import skew
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import RandomizedSearchCV

# Zomato Only

In [6]:
# Load the Zomato feature table.
df_zom = pd.read_csv('data/3_jak_clean_zomato_feat.csv')

# Divide the price column by 1000 and truncate it to an integer.
df_zom['rest_price_idr'] = (df_zom['rest_price_idr'] / 1000).astype(int)

# Columns excluded from the model inputs. 'rating' is listed here only so it can
# be re-appended as the LAST column of `features_columns` below.
drop_column = ['url','index','rating','lat','long','review','new_code_res_type','new_code_fac']
target_column = 'rating'

# Model inputs followed by the target column.
features_columns = list(df_zom.drop(drop_column, axis=1).columns) + [target_column]

# Remove 0 ratings on the RAW values, before scaling.
# Filtering after MinMaxScaler is fragile: the scaler maps the smallest rating to
# 0, so a `> 0` test on scaled data would discard the lowest genuinely rated rows
# whenever the data holds no 0 ratings, and the 0-rated rows would otherwise
# define the bottom of the scaled range.
# NOTE(review): presumably a rating of 0 marks an unrated restaurant - confirm.
df_rated = df_zom.loc[df_zom[target_column] > 0, features_columns]

# Scale every column (inputs and target) to [0, 1]. The original row index is kept
# so rows can still be traced back to `df_zom`.
scaler = MinMaxScaler()
df_input_scale = pd.DataFrame(
    scaler.fit_transform(df_rated),
    columns=features_columns,
    index=df_rated.index,
)
# NOTE(review): the scaler is fit before the train/test split, so the test rows
# influence the min/max used for scaling. Fit on the training rows only if a
# strictly leak-free evaluation is required.

X_train, X_test, y_train, y_test = train_test_split(
    df_input_scale.drop(target_column, axis=1),
    df_input_scale[target_column],
    test_size=0.12,
    random_state=42,
)

# Collects one dict of best hyper-parameters per algorithm.
result_zom = []

## RF 

In [18]:
# Candidate hyper-parameters for the random forest (1 * 4 * 2 * 3 = 24 combinations).
param_grid = {
    'bootstrap': [True],
    'max_depth': [30, 50, 100, 250],
    'max_features': [2, 3],
    'n_estimators': [250, 500, 1000]
}

# Metrics recorded for every candidate in `search.cv_results_`.
scoring = {"MAE":'neg_mean_absolute_error',"MSE":'neg_mean_squared_error','r2':'r2'}

# Create the base model; seeded so repeated runs give the same forest.
rf = RandomForestRegressor(random_state=42)

# The metric dict belongs in `scoring`; `refit` must name the single metric used
# to pick (and refit) the best candidate. 'r2' matches the default score of a
# regressor, which is what the search used when no `scoring` was supplied.
# n_iter exceeds the 24 available combinations, so every combination is tried.
grid_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    scoring=scoring,
    refit='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
    n_iter=50,
    random_state=42,
)

search = grid_search.fit(X_train, y_train)

# Copy before tagging so `search.best_params_` keeps only real hyper-parameters.
best_param = dict(search.best_params_)
best_param['algo'] = 'rf'
best_param['dataset'] = 'zomato'
result_zom.append(best_param)



Fitting 5 folds for each of 24 candidates, totalling 120 fits


## XGB

In [21]:
# Candidate hyper-parameters for XGBoost (5 * 5 * 4 * 4 * 2 = 800 combinations).
# NOTE(review): max_depth and min_child_weight are tree-booster settings; confirm
# that sampling them together with booster='gblinear' is intended.
param_grid = {
    'n_estimators': [100, 500, 900, 1100, 1500],
    'max_depth':[2, 3, 5, 10, 15],
    'learning_rate':[0.05,0.1,0.15,0.20],
    'min_child_weight':[1,2,3,4],
    'booster':['gbtree','gblinear'],
}

# Metrics recorded for every candidate in `search.cv_results_`.
scoring = {"MAE":'neg_mean_absolute_error',"MSE":'neg_mean_squared_error','r2':'r2'}

# Create the base model. It is deliberately NOT named `xgb`: that name is bound
# to the xgboost module by `import xgboost as xgb` and must not be overwritten.
xgb_reg = XGBRegressor(random_state=42)

# The metric dict belongs in `scoring`; `refit` must name the single metric used
# to pick (and refit) the best candidate. 'r2' matches the default score of a
# regressor, which is what the search used when no `scoring` was supplied.
# Only 50 of the 800 combinations are sampled, so the search is seeded to make
# the sampled candidates repeatable.
grid_search = RandomizedSearchCV(
    estimator=xgb_reg,
    param_distributions=param_grid,
    scoring=scoring,
    refit='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
    n_iter=50,
    random_state=42,
)

search = grid_search.fit(X_train, y_train)

# Copy before tagging so `search.best_params_` keeps only real hyper-parameters.
best_param = dict(search.best_params_)
best_param['algo'] = 'xgb'
best_param['dataset'] = 'zomato'
result_zom.append(best_param)

## SVR 

In [None]:
# Candidate hyper-parameters for the SVR (1 * 5 * 5 = 25 combinations).
# Every value must be a list (or a distribution): a bare string such as 'poly'
# is not a valid entry and would be rejected or split into single characters.
param_grid = {
    'kernel': ['poly'],
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001]
}

# Metrics recorded for every candidate in `search.cv_results_`.
scoring = {"MAE":'neg_mean_absolute_error',"MSE":'neg_mean_squared_error','r2':'r2'}

# Create the base model.
svr = SVR()

# The metric dict belongs in `scoring`; `refit` must name the single metric used
# to pick (and refit) the best candidate. 'r2' matches the default score of a
# regressor, which is what the search would use when no `scoring` is supplied.
# n_iter exceeds the 25 available combinations, so every combination is tried.
grid_search = RandomizedSearchCV(
    estimator=svr,
    param_distributions=param_grid,
    scoring=scoring,
    refit='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
    n_iter=50,
    random_state=42,
)

search = grid_search.fit(X_train, y_train)

# Copy before tagging so `search.best_params_` keeps only real hyper-parameters.
best_param = dict(search.best_params_)
best_param['algo'] = 'svr'
best_param['dataset'] = 'zomato'
result_zom.append(best_param)