In [2]:
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np 
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns 
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import xgboost as xgb
from datetime import date
from statsmodels.stats.weightstats import ttest_ind
from scipy.stats import skew
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from scipy import stats
import utility 

# Zomato

In [3]:
# zomato only
df_zom_train = pd.read_csv('data/3_clean_zomato_feat_ori_sampling_train.csv')
df_zom_train['rest_price_idr'] = df_zom_train['rest_price_idr'].astype(int)
df_zom_train = df_zom_train[df_zom_train['rating']>0]


df_zom_test = pd.read_csv('data/3_clean_zomato_feat_ori_sampling_test.csv')
df_zom_test['rest_price_idr'] = df_zom_test['rest_price_idr'].astype(int)
df_zom_test = df_zom_test[df_zom_test['rating']>0]

# filter chain and non chain 
# df_zom_train = df_zom_train[df_zom_train['is_chain']==0]
# df_zom_test = df_zom_test[df_zom_test['is_chain']==0]

# df_zom = pd.read_csv('data/3_jbdk_clean_zomato_feat.csv')

# ori 
drop_column = ['url','index','rating','lat','long','review']
target_column = 'rating'

# best param
rf_param = {'n_estimators':500,'max_features':3, 'max_depth':30,'bootstrap':True}
xgb_param = {'n_estimators':100, 'max_depth':5, 'min_child_weight':3, 'learning_rate':0.15, 'booster':'gbtree'}
svr_param  = {'kernel':'poly','gamma':0.1,'C':0.1}


# scaled the values 
features_columns = df_zom_train.drop(drop_column,axis=1).columns
features_columns = list(features_columns) + [target_column]

scaler_train = MinMaxScaler()
scaler_test = MinMaxScaler()

df_input_train_scale = pd.DataFrame(scaler_train.fit_transform(df_zom_train[features_columns]), columns = features_columns)
df_input_test_scale = pd.DataFrame(scaler_test.fit_transform(df_zom_test[features_columns]), columns = features_columns)

# Prediction 

result_zomato = []

# Random Forest 
regr = RandomForestRegressor(**rf_param)
model_regr, y_pred_test_regr, y_pred_train_regr, result_regr = utility.model_perform(df_input_train_scale.drop(target_column,axis=1),  df_input_train_scale[target_column],  df_input_test_scale.drop(target_column,axis=1),  df_input_test_scale[target_column],  regr, name = 'rf', verbose = 0 )
result_zomato.append(result_regr)

# xgb 
xgbr = XGBRegressor(**xgb_param)
model_xgb, y_pred_test_xgb, y_pred_train_xgb, result_xgb = utility.model_perform(df_input_train_scale.drop(target_column,axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], xgbr, name = 'xgb', verbose = 0 )
result_zomato.append(result_xgb)

# svr 
svr = SVR(**svr_param)
model_svr, y_pred_test_svr, y_pred_train_svr, result_svr = utility.model_perform(df_input_train_scale.drop(target_column, axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], svr, name = 'svr', verbose = 0 )
result_zomato.append(result_svr)

df_result_zomato = pd.DataFrame(result_zomato)
df_result_zomato['dataset'] = 'zomato'

In [4]:
df_result_zomato

Unnamed: 0,mae,mse,R2,name,feature_size,train_size,test_size,pearson,dataset
0,0.11838,0.02232,0.97736,rf,42,7296,1825,0.84534,zomato
1,0.15621,0.03996,0.73595,xgb,42,7296,1825,0.65494,zomato
2,0.1604,0.03986,0.45625,svr,42,7296,1825,0.62686,zomato


# Zomato 250 

In [5]:
# zomato only
df_zom_250_train = pd.read_csv('data/3_clean_zomato_gof_ori_250_sampling_train.csv')
df_zom_250_train['rest_price_idr'] = df_zom_250_train['rest_price_idr'].astype(int)
df_zom_250_train = df_zom_250_train[df_zom_250_train['rating']>0]


df_zom_250_test = pd.read_csv('data/3_clean_zomato_gof_ori_250_sampling_test.csv')
df_zom_250_test['rest_price_idr'] = df_zom_250_test['rest_price_idr'].astype(int)
df_zom_250_test = df_zom_250_test[df_zom_250_test['rating']>0]

# df_zom = pd.read_csv('data/3_jbdk_clean_zomato_feat.csv')

# filter chain and non chain 
# df_zom_250_train = df_zom_250_train[df_zom_250_train['is_chain']==0]
# df_zom_250_test = df_zom_250_test[df_zom_250_test['is_chain']==0]

# ori
drop_column = ['url','index','rating','lat','long','review','geohash','encode']
target_column = 'rating'

# best param 
rf_param = {'n_estimators':250,'max_features':3, 'max_depth':30,'bootstrap':True}
xgb_param = {'n_estimators':100, 'max_depth':5, 'min_child_weight':1, 'learning_rate':0.15, 'booster':'gbtree'}
svr_param  = {'kernel':'poly','gamma':0.1,'C':0.1}


# scaled the values 
features_columns = df_zom_250_train.drop(drop_column,axis=1).columns
features_columns = list(features_columns) + [target_column]

scaler_train = MinMaxScaler()
scaler_test = MinMaxScaler()

df_input_train_scale = pd.DataFrame(scaler_train.fit_transform(df_zom_250_train[features_columns]), columns = features_columns)
df_input_test_scale = pd.DataFrame(scaler_test.fit_transform(df_zom_250_test[features_columns]), columns = features_columns)

# Prediction 

result_zomato_250 = []

# Random Forest 
regr = RandomForestRegressor(**rf_param)
model_regr, y_pred_test_regr, y_pred_train_regr, result_regr = utility.model_perform(df_input_train_scale.drop(target_column,axis=1),  df_input_train_scale[target_column],  df_input_test_scale.drop(target_column,axis=1),  df_input_test_scale[target_column],  regr, name = 'rf', verbose = 0 )
result_zomato_250.append(result_regr)

# xgb 
xgbr = XGBRegressor(**xgb_param)
model_xgb, y_pred_test_xgb, y_pred_train_xgb, result_xgb = utility.model_perform(df_input_train_scale.drop(target_column,axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], xgbr, name = 'xgb', verbose = 0 )
result_zomato_250.append(result_xgb)

# svr 
svr = SVR(**svr_param)
model_svr, y_pred_test_svr, y_pred_train_svr, result_svr = utility.model_perform(df_input_train_scale.drop(target_column, axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], svr, name = 'svr', verbose = 0 )
result_zomato_250.append(result_svr)

df_result_zomato_250 = pd.DataFrame(result_zomato_250)
df_result_zomato_250['dataset'] = 'zomato_250'

In [6]:
df_result_zomato_250

Unnamed: 0,mae,mse,R2,name,feature_size,train_size,test_size,pearson,dataset
0,0.10217,0.01589,0.97381,rf,61,7296,1825,0.89778,zomato_250
1,0.13984,0.03146,0.7833,xgb,61,7296,1825,0.74079,zomato_250
2,0.15479,0.03687,0.50103,svr,61,7296,1825,0.66587,zomato_250


# Zomato 500 

In [7]:
# zomato only
df_zom_500_train = pd.read_csv('data/3_clean_zomato_gof_ori_500_sampling_train.csv')
df_zom_500_train['rest_price_idr'] = df_zom_500_train['rest_price_idr'].astype(int)
df_zom_500_train = df_zom_500_train[df_zom_500_train['rating']>0]


df_zom_500_test = pd.read_csv('data/3_clean_zomato_gof_ori_500_sampling_test.csv')
df_zom_500_test['rest_price_idr'] = df_zom_500_test['rest_price_idr'].astype(int)
df_zom_500_test = df_zom_500_test[df_zom_500_test['rating']>0]

# df_zom = pd.read_csv('data/3_jbdk_clean_zomato_feat.csv')

# filter chain and non chain 
# df_zom_500_train = df_zom_500_train[df_zom_500_train['is_chain']==0]
# df_zom_500_test = df_zom_500_test[df_zom_500_test['is_chain']==0]

# ori
drop_column = ['url','index','rating','lat','long','review','geohash','encode']
target_column = 'rating'

# cross validation
rf_param = {'n_estimators':500,'max_features':3, 'max_depth':30,'bootstrap':True}
xgb_param = {'n_estimators':100, 'max_depth':5, 'min_child_weight':3, 'learning_rate':0.2, 'booster':'gbtree'}
svr_param  = {'kernel':'poly','gamma':0.1,'C':0.1}

# scaled the values 
features_columns = df_zom_500_train.drop(drop_column,axis=1).columns
features_columns = list(features_columns) + [target_column]

scaler_train = MinMaxScaler()
scaler_test = MinMaxScaler()

df_input_train_scale = pd.DataFrame(scaler_train.fit_transform(df_zom_500_train[features_columns]), columns = features_columns)
df_input_test_scale = pd.DataFrame(scaler_test.fit_transform(df_zom_500_test[features_columns]), columns = features_columns)

# Prediction 

result_zomato_500 = []

# Random Forest 
regr = RandomForestRegressor(**rf_param)
model_regr, y_pred_test_regr, y_pred_train_regr, result_regr = utility.model_perform(df_input_train_scale.drop(target_column,axis=1),  df_input_train_scale[target_column],  df_input_test_scale.drop(target_column,axis=1),  df_input_test_scale[target_column],  regr, name = 'rf', verbose = 0 )
result_zomato_500.append(result_regr)

# xgb 
xgbr = XGBRegressor(**xgb_param)
model_xgb, y_pred_test_xgb, y_pred_train_xgb, result_xgb = utility.model_perform(df_input_train_scale.drop(target_column,axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], xgbr, name = 'xgb', verbose = 0 )
result_zomato_500.append(result_xgb)

# svr 
svr = SVR(**svr_param)
model_svr, y_pred_test_svr, y_pred_train_svr, result_svr = utility.model_perform(df_input_train_scale.drop(target_column, axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], svr, name = 'svr', verbose = 0 )
result_zomato_500.append(result_svr)

df_result_zomato_500 = pd.DataFrame(result_zomato_500)
df_result_zomato_500['dataset'] = 'zomato_500'

In [8]:
df_result_zomato_500

Unnamed: 0,mae,mse,R2,name,feature_size,train_size,test_size,pearson,dataset
0,0.09487,0.0126,0.98548,rf,61,7296,1825,0.92921,zomato_500
1,0.12477,0.02377,0.86084,xgb,61,7296,1825,0.80707,zomato_500
2,0.14954,0.03433,0.5225,svr,61,7296,1825,0.69669,zomato_500


# Zomato 1000

In [9]:
# zomato only
df_zom_1000_train = pd.read_csv('data/3_clean_zomato_gof_ori_1000_sampling_train.csv')
df_zom_1000_train['rest_price_idr'] = df_zom_1000_train['rest_price_idr'].astype(int)
df_zom_1000_train = df_zom_1000_train[df_zom_1000_train['rating']>0]


df_zom_1000_test = pd.read_csv('data/3_clean_zomato_gof_ori_1000_sampling_test.csv')
df_zom_1000_test['rest_price_idr'] = df_zom_1000_test['rest_price_idr'].astype(int)
df_zom_1000_test = df_zom_1000_test[df_zom_1000_test['rating']>0]

# df_zom = pd.read_csv('data/3_jbdk_clean_zomato_feat.csv')

# filter chain and non chain 
# df_zom_1000_train = df_zom_1000_train[df_zom_1000_train['is_chain']==0]
# df_zom_1000_test = df_zom_1000_test[df_zom_1000_test['is_chain']==0]

# ori
drop_column = ['url','index','rating','lat','long','review','geohash','encode']
target_column = 'rating'

# cross validation
rf_param = {'n_estimators':500,'max_features':3, 'max_depth':30,'bootstrap':True}
xgb_param = {'n_estimators':100, 'max_depth':5, 'min_child_weight':3, 'learning_rate':0.2, 'booster':'gbtree'}
svr_param  = {'kernel':'poly','gamma':0.1,'C':0.1}

# scaled the values 
features_columns = df_zom_1000_train.drop(drop_column,axis=1).columns
features_columns = list(features_columns) + [target_column]

scaler_train = MinMaxScaler()
scaler_test = MinMaxScaler()

df_input_train_scale = pd.DataFrame(scaler_train.fit_transform(df_zom_1000_train[features_columns]), columns = features_columns)
df_input_test_scale = pd.DataFrame(scaler_test.fit_transform(df_zom_1000_test[features_columns]), columns = features_columns)

# Prediction 

result_zomato_1000 = []

# Random Forest 
regr = RandomForestRegressor(**rf_param)
model_regr, y_pred_test_regr, y_pred_train_regr, result_regr = utility.model_perform(df_input_train_scale.drop(target_column,axis=1),  df_input_train_scale[target_column],  df_input_test_scale.drop(target_column,axis=1),  df_input_test_scale[target_column],  regr, name = 'rf', verbose = 0 )
result_zomato_1000.append(result_regr)

# xgb 
xgbr = XGBRegressor(**xgb_param)
model_xgb, y_pred_test_xgb, y_pred_train_xgb, result_xgb = utility.model_perform(df_input_train_scale.drop(target_column,axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], xgbr, name = 'xgb', verbose = 0 )
result_zomato_1000.append(result_xgb)

# svr 
svr = SVR(**svr_param)
model_svr, y_pred_test_svr, y_pred_train_svr, result_svr = utility.model_perform(df_input_train_scale.drop(target_column, axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], svr, name = 'svr', verbose = 0 )
result_zomato_1000.append(result_svr)

df_result_zomato_1000 = pd.DataFrame(result_zomato_1000)
df_result_zomato_1000['dataset'] = 'zomato_1000'

In [10]:
df_zom_1000_train.shape, 

((7296, 69),)

In [11]:
df_result_zomato_1000

Unnamed: 0,mae,mse,R2,name,feature_size,train_size,test_size,pearson,dataset
0,0.12867,0.02193,0.98495,rf,61,7296,1825,0.89803,zomato_1000
1,0.147,0.03086,0.86639,xgb,61,7296,1825,0.75078,zomato_1000
2,0.15007,0.03533,0.54845,svr,61,7296,1825,0.68119,zomato_1000


# Final result 

In [12]:
# cross validation 
df_final_result = pd.concat([df_result_zomato ,df_result_zomato_250,df_result_zomato_500,df_result_zomato_1000])


In [13]:
df_final_result

Unnamed: 0,mae,mse,R2,name,feature_size,train_size,test_size,pearson,dataset
0,0.11838,0.02232,0.97736,rf,42,7296,1825,0.84534,zomato
1,0.15621,0.03996,0.73595,xgb,42,7296,1825,0.65494,zomato
2,0.1604,0.03986,0.45625,svr,42,7296,1825,0.62686,zomato
0,0.10217,0.01589,0.97381,rf,61,7296,1825,0.89778,zomato_250
1,0.13984,0.03146,0.7833,xgb,61,7296,1825,0.74079,zomato_250
2,0.15479,0.03687,0.50103,svr,61,7296,1825,0.66587,zomato_250
0,0.09487,0.0126,0.98548,rf,61,7296,1825,0.92921,zomato_500
1,0.12477,0.02377,0.86084,xgb,61,7296,1825,0.80707,zomato_500
2,0.14954,0.03433,0.5225,svr,61,7296,1825,0.69669,zomato_500
0,0.12867,0.02193,0.98495,rf,61,7296,1825,0.89803,zomato_1000


In [14]:
today = date.today()
print("Today's date:", today)

Today's date: 2022-11-23


In [28]:
df_final_result.to_excel('report/ml_perform_final_sampling.xlsx'.format(today),index=False)