In [1]:
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np 
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns 
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import xgboost as xgb
from datetime import date
from statsmodels.stats.weightstats import ttest_ind
from scipy.stats import skew
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from scipy import stats
import utility 

# Zomato

In [4]:
# zomato only
df_zom_train = pd.read_csv('data/3_clean_zomato_feat_ori_train.csv')
df_zom_train['rest_price_idr'] = df_zom_train['rest_price_idr'].astype(int)
df_zom_train = df_zom_train[df_zom_train['rating']>0]


df_zom_test = pd.read_csv('data/3_clean_zomato_feat_ori_test.csv')
df_zom_test['rest_price_idr'] = df_zom_test['rest_price_idr'].astype(int)
df_zom_test = df_zom_test[df_zom_test['rating']>0]

# filter chain and non chain 
df_zom_train = df_zom_train[df_zom_train['is_chain']==0]
df_zom_test = df_zom_test[df_zom_test['is_chain']==0]

# df_zom = pd.read_csv('data/3_jbdk_clean_zomato_feat.csv')

# ori 
drop_column = ['url','index','rating','lat','long','review']
target_column = 'rating'

# cross validation 
rf_param = {'n_estimators':500,'max_features':3, 'max_depth':30,'bootstrap':True}
xgb_param = {'n_estimators':100, 'max_depth':5, 'min_child_weight':3, 'learning_rate':0.15, 'booster':'gbtree'}
svr_param  = {'kernel':'poly','gamma':0.1,'C':0.1}


# scaled the values 
features_columns = df_zom_train.drop(drop_column,axis=1).columns
features_columns = list(features_columns) + [target_column]

scaler_train = MinMaxScaler()
scaler_test = MinMaxScaler()

df_input_train_scale = pd.DataFrame(scaler_train.fit_transform(df_zom_train[features_columns]), columns = features_columns)
df_input_test_scale = pd.DataFrame(scaler_test.fit_transform(df_zom_test[features_columns]), columns = features_columns)

# Prediction 

result_zomato = []

# Random Forest 
regr = RandomForestRegressor(**rf_param)
model_regr, y_pred_test_regr, y_pred_train_regr, result_regr = utility.model_perform(df_input_train_scale.drop(target_column,axis=1),  df_input_train_scale[target_column],  df_input_test_scale.drop(target_column,axis=1),  df_input_test_scale[target_column],  regr, name = 'rf', verbose = 0 )
result_zomato.append(result_regr)

# xgb 
xgbr = XGBRegressor(**xgb_param)
model_xgb, y_pred_test_xgb, y_pred_train_xgb, result_xgb = utility.model_perform(df_input_train_scale.drop(target_column,axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], xgbr, name = 'xgb', verbose = 0 )
result_zomato.append(result_xgb)

# svr 
svr = SVR(**svr_param)
model_svr, y_pred_test_svr, y_pred_train_svr, result_svr = utility.model_perform(df_input_train_scale.drop(target_column, axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], svr, name = 'svr', verbose = 0 )
result_zomato.append(result_svr)

df_result_zomato = pd.DataFrame(result_zomato)
df_result_zomato['dataset'] = 'zomato'

In [5]:
df_result_zomato

Unnamed: 0,mae,mse,R2,name,feature_size,train_size,test_size,pearson,dataset
0,0.1202,0.02436,0.78358,rf,41,2554,615,0.38741,zomato
1,0.11967,0.02482,0.44525,xgb,41,2554,615,0.41593,zomato
2,0.11561,0.02298,0.225,svr,41,2554,615,0.41602,zomato


# Zomato 250 

In [7]:
# zomato only
df_zom_250_train = pd.read_csv('data/3_clean_zomato_gof_ori_250_train.csv')
df_zom_250_train['rest_price_idr'] = df_zom_250_train['rest_price_idr'].astype(int)
df_zom_250_train = df_zom_250_train[df_zom_250_train['rating']>0]


df_zom_250_test = pd.read_csv('data/3_clean_zomato_gof_ori_250_test.csv')
df_zom_250_test['rest_price_idr'] = df_zom_250_test['rest_price_idr'].astype(int)
df_zom_250_test = df_zom_250_test[df_zom_250_test['rating']>0]

# df_zom = pd.read_csv('data/3_jbdk_clean_zomato_feat.csv')

# filter chain and non chain 
df_zom_250_train = df_zom_250_train[df_zom_250_train['is_chain']==0]
df_zom_250_test = df_zom_250_test[df_zom_250_test['is_chain']==0]

# ori
drop_column = ['url','index','rating','lat','long','review','geohash','encode']
target_column = 'rating'

# cross validation 
rf_param = {'n_estimators':250,'max_features':3, 'max_depth':30,'bootstrap':True}
xgb_param = {'n_estimators':100, 'max_depth':5, 'min_child_weight':1, 'learning_rate':0.15, 'booster':'gbtree'}
svr_param  = {'kernel':'poly','gamma':0.1,'C':0.1}


# scaled the values 
features_columns = df_zom_250_train.drop(drop_column,axis=1).columns
features_columns = list(features_columns) + [target_column]

scaler_train = MinMaxScaler()
scaler_test = MinMaxScaler()

df_input_train_scale = pd.DataFrame(scaler_train.fit_transform(df_zom_250_train[features_columns]), columns = features_columns)
df_input_test_scale = pd.DataFrame(scaler_test.fit_transform(df_zom_250_test[features_columns]), columns = features_columns)

# Prediction 

result_zomato_250 = []

# Random Forest 
regr = RandomForestRegressor(**rf_param)
model_regr, y_pred_test_regr, y_pred_train_regr, result_regr = utility.model_perform(df_input_train_scale.drop(target_column,axis=1),  df_input_train_scale[target_column],  df_input_test_scale.drop(target_column,axis=1),  df_input_test_scale[target_column],  regr, name = 'rf', verbose = 0 )
result_zomato_250.append(result_regr)

# xgb 
xgbr = XGBRegressor(**xgb_param)
model_xgb, y_pred_test_xgb, y_pred_train_xgb, result_xgb = utility.model_perform(df_input_train_scale.drop(target_column,axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], xgbr, name = 'xgb', verbose = 0 )
result_zomato_250.append(result_xgb)

# svr 
svr = SVR(**svr_param)
model_svr, y_pred_test_svr, y_pred_train_svr, result_svr = utility.model_perform(df_input_train_scale.drop(target_column, axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], svr, name = 'svr', verbose = 0 )
result_zomato_250.append(result_svr)

df_result_zomato_250 = pd.DataFrame(result_zomato_250)
df_result_zomato_250['dataset'] = 'zomato_250'

In [8]:
df_result_zomato_250

Unnamed: 0,mae,mse,R2,name,feature_size,train_size,test_size,pearson,dataset
0,0.11539,0.02277,0.84499,rf,60,2554,615,0.42505,zomato_250
1,0.11911,0.02517,0.54583,xgb,60,2554,615,0.39506,zomato_250
2,0.11529,0.02292,0.24859,svr,60,2554,615,0.41901,zomato_250


# Zomato 500 

In [9]:
# zomato only
df_zom_500_train = pd.read_csv('data/3_clean_zomato_gof_ori_500_train.csv')
df_zom_500_train['rest_price_idr'] = df_zom_500_train['rest_price_idr'].astype(int)
df_zom_500_train = df_zom_500_train[df_zom_500_train['rating']>0]


df_zom_500_test = pd.read_csv('data/3_clean_zomato_gof_ori_500_test.csv')
df_zom_500_test['rest_price_idr'] = df_zom_500_test['rest_price_idr'].astype(int)
df_zom_500_test = df_zom_500_test[df_zom_500_test['rating']>0]

# df_zom = pd.read_csv('data/3_jbdk_clean_zomato_feat.csv')

# filter chain and non chain 
df_zom_500_train = df_zom_500_train[df_zom_500_train['is_chain']==0]
df_zom_500_test = df_zom_500_test[df_zom_500_test['is_chain']==0]

# ori
drop_column = ['url','index','rating','lat','long','review','geohash','encode']
target_column = 'rating'

# cross validation
rf_param = {'n_estimators':500,'max_features':3, 'max_depth':30,'bootstrap':True}
xgb_param = {'n_estimators':100, 'max_depth':5, 'min_child_weight':3, 'learning_rate':0.2, 'booster':'gbtree'}
svr_param  = {'kernel':'poly','gamma':0.1,'C':0.1}

# scaled the values 
features_columns = df_zom_500_train.drop(drop_column,axis=1).columns
features_columns = list(features_columns) + [target_column]

scaler_train = MinMaxScaler()
scaler_test = MinMaxScaler()

df_input_train_scale = pd.DataFrame(scaler_train.fit_transform(df_zom_500_train[features_columns]), columns = features_columns)
df_input_test_scale = pd.DataFrame(scaler_test.fit_transform(df_zom_500_test[features_columns]), columns = features_columns)

# Prediction 

result_zomato_500 = []

# Random Forest 
regr = RandomForestRegressor(**rf_param)
model_regr, y_pred_test_regr, y_pred_train_regr, result_regr = utility.model_perform(df_input_train_scale.drop(target_column,axis=1),  df_input_train_scale[target_column],  df_input_test_scale.drop(target_column,axis=1),  df_input_test_scale[target_column],  regr, name = 'rf', verbose = 0 )
result_zomato_500.append(result_regr)

# xgb 
xgbr = XGBRegressor(**xgb_param)
model_xgb, y_pred_test_xgb, y_pred_train_xgb, result_xgb = utility.model_perform(df_input_train_scale.drop(target_column,axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], xgbr, name = 'xgb', verbose = 0 )
result_zomato_500.append(result_xgb)

# svr 
svr = SVR(**svr_param)
model_svr, y_pred_test_svr, y_pred_train_svr, result_svr = utility.model_perform(df_input_train_scale.drop(target_column, axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], svr, name = 'svr', verbose = 0 )
result_zomato_500.append(result_svr)

df_result_zomato_500 = pd.DataFrame(result_zomato_500)
df_result_zomato_500['dataset'] = 'zomato_500'

In [10]:
df_result_zomato_500

Unnamed: 0,mae,mse,R2,name,feature_size,train_size,test_size,pearson,dataset
0,0.11432,0.02255,0.88389,rf,60,2554,615,0.43937,zomato_500
1,0.11966,0.02463,0.76286,xgb,60,2554,615,0.40629,zomato_500
2,0.11484,0.02256,0.2674,svr,60,2554,615,0.43225,zomato_500


# Zomato 1000

In [11]:
# zomato only
df_zom_1000_train = pd.read_csv('data/3_clean_zomato_gof_ori_1000_train.csv')
df_zom_1000_train['rest_price_idr'] = df_zom_1000_train['rest_price_idr'].astype(int)
df_zom_1000_train = df_zom_1000_train[df_zom_1000_train['rating']>0]


df_zom_1000_test = pd.read_csv('data/3_clean_zomato_gof_ori_1000_test.csv')
df_zom_1000_test['rest_price_idr'] = df_zom_1000_test['rest_price_idr'].astype(int)
df_zom_1000_test = df_zom_1000_test[df_zom_1000_test['rating']>0]

# df_zom = pd.read_csv('data/3_jbdk_clean_zomato_feat.csv')

# filter chain and non chain 
df_zom_1000_train = df_zom_1000_train[df_zom_1000_train['is_chain']==0]
df_zom_1000_test = df_zom_1000_test[df_zom_1000_test['is_chain']==0]

# ori
drop_column = ['url','index','rating','lat','long','review','geohash','encode']
target_column = 'rating'

# cross validation
rf_param = {'n_estimators':500,'max_features':3, 'max_depth':30,'bootstrap':True}
xgb_param = {'n_estimators':100, 'max_depth':5, 'min_child_weight':3, 'learning_rate':0.2, 'booster':'gbtree'}
svr_param  = {'kernel':'poly','gamma':0.1,'C':0.1}

# scaled the values 
features_columns = df_zom_1000_train.drop(drop_column,axis=1).columns
features_columns = list(features_columns) + [target_column]

scaler_train = MinMaxScaler()
scaler_test = MinMaxScaler()

df_input_train_scale = pd.DataFrame(scaler_train.fit_transform(df_zom_1000_train[features_columns]), columns = features_columns)
df_input_test_scale = pd.DataFrame(scaler_test.fit_transform(df_zom_1000_test[features_columns]), columns = features_columns)

# Prediction 

result_zomato_1000 = []

# Random Forest 
regr = RandomForestRegressor(**rf_param)
model_regr, y_pred_test_regr, y_pred_train_regr, result_regr = utility.model_perform(df_input_train_scale.drop(target_column,axis=1),  df_input_train_scale[target_column],  df_input_test_scale.drop(target_column,axis=1),  df_input_test_scale[target_column],  regr, name = 'rf', verbose = 0 )
result_zomato_1000.append(result_regr)

# xgb 
xgbr = XGBRegressor(**xgb_param)
model_xgb, y_pred_test_xgb, y_pred_train_xgb, result_xgb = utility.model_perform(df_input_train_scale.drop(target_column,axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], xgbr, name = 'xgb', verbose = 0 )
result_zomato_1000.append(result_xgb)

# svr 
svr = SVR(**svr_param)
model_svr, y_pred_test_svr, y_pred_train_svr, result_svr = utility.model_perform(df_input_train_scale.drop(target_column, axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], svr, name = 'svr', verbose = 0 )
result_zomato_1000.append(result_svr)

df_result_zomato_1000 = pd.DataFrame(result_zomato_1000)
df_result_zomato_1000['dataset'] = 'zomato_1000'

In [12]:
df_result_zomato_1000

Unnamed: 0,mae,mse,R2,name,feature_size,train_size,test_size,pearson,dataset
0,0.11391,0.02206,0.88299,rf,60,2554,615,0.46622,zomato_1000
1,0.11969,0.02468,0.78107,xgb,60,2554,615,0.38912,zomato_1000
2,0.11429,0.02237,0.29301,svr,60,2554,615,0.4368,zomato_1000


# Final result 

In [13]:
# cross validation 
df_final_result = pd.concat([df_result_zomato ,df_result_zomato_250,df_result_zomato_500,df_result_zomato_1000])


In [14]:
df_final_result

Unnamed: 0,mae,mse,R2,name,feature_size,train_size,test_size,pearson,dataset
0,0.1202,0.02436,0.78358,rf,41,2554,615,0.38741,zomato
1,0.11967,0.02482,0.44525,xgb,41,2554,615,0.41593,zomato
2,0.11561,0.02298,0.225,svr,41,2554,615,0.41602,zomato
0,0.11539,0.02277,0.84499,rf,60,2554,615,0.42505,zomato_250
1,0.11911,0.02517,0.54583,xgb,60,2554,615,0.39506,zomato_250
2,0.11529,0.02292,0.24859,svr,60,2554,615,0.41901,zomato_250
0,0.11432,0.02255,0.88389,rf,60,2554,615,0.43937,zomato_500
1,0.11966,0.02463,0.76286,xgb,60,2554,615,0.40629,zomato_500
2,0.11484,0.02256,0.2674,svr,60,2554,615,0.43225,zomato_500
0,0.11391,0.02206,0.88299,rf,60,2554,615,0.46622,zomato_1000


In [15]:
today = date.today()
print("Today's date:", today)

Today's date: 2022-11-19


In [28]:
df_final_result.to_excel('report/ml_perform_final_non_chain.xlsx'.format(today),index=False)