In [1]:
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np 
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns 
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import xgboost as xgb
from datetime import date
from statsmodels.stats.weightstats import ttest_ind
from scipy.stats import skew
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from scipy import stats
import utility 

# Zomato

In [2]:
# zomato only
df_zom_train = pd.read_csv('data/3_clean_zomato_feat_ori_sampling_train.csv')
df_zom_train['rest_price_idr'] = df_zom_train['rest_price_idr'].astype(int)
df_zom_train = df_zom_train[df_zom_train['rating']>0]


df_zom_test = pd.read_csv('data/3_clean_zomato_feat_ori_test.csv')
df_zom_test['rest_price_idr'] = df_zom_test['rest_price_idr'].astype(int)
df_zom_test = df_zom_test[df_zom_test['rating']>0]

# filter chain and non chain 
# df_zom_train = df_zom_train[df_zom_train['is_chain']==0]
# df_zom_test = df_zom_test[df_zom_test['is_chain']==0]

# df_zom = pd.read_csv('data/3_jbdk_clean_zomato_feat.csv')

# ori 
drop_column = ['url','index','rating','lat','long','review']
target_column = 'rating'

# best param
rf_param = {'n_estimators':500,'max_features':3, 'max_depth':30,'bootstrap':True}
xgb_param = {'n_estimators':100, 'max_depth':5, 'min_child_weight':3, 'learning_rate':0.15, 'booster':'gbtree'}
svr_param  = {'kernel':'poly','gamma':0.1,'C':0.1}


# scaled the values 
features_columns = df_zom_train.drop(drop_column,axis=1).columns
features_columns = list(features_columns) + [target_column]

scaler_train = MinMaxScaler()
scaler_test = MinMaxScaler()

df_input_train_scale = pd.DataFrame(scaler_train.fit_transform(df_zom_train[features_columns]), columns = features_columns)
df_input_test_scale = pd.DataFrame(scaler_test.fit_transform(df_zom_test[features_columns]), columns = features_columns)

# Prediction 

result_zomato = []

# Random Forest 
regr = RandomForestRegressor(**rf_param)
model_regr, y_pred_test_regr, y_pred_train_regr, result_regr = utility.model_perform(df_input_train_scale.drop(target_column,axis=1),  df_input_train_scale[target_column],  df_input_test_scale.drop(target_column,axis=1),  df_input_test_scale[target_column],  regr, name = 'rf', verbose = 0 )
result_zomato.append(result_regr)

# xgb 
xgbr = XGBRegressor(**xgb_param)
model_xgb, y_pred_test_xgb, y_pred_train_xgb, result_xgb = utility.model_perform(df_input_train_scale.drop(target_column,axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], xgbr, name = 'xgb', verbose = 0 )
result_zomato.append(result_xgb)

# svr 
svr = SVR(**svr_param)
model_svr, y_pred_test_svr, y_pred_train_svr, result_svr = utility.model_perform(df_input_train_scale.drop(target_column, axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], svr, name = 'svr', verbose = 0 )
result_zomato.append(result_svr)

df_result_zomato = pd.DataFrame(result_zomato)
df_result_zomato['dataset'] = 'zomato'

# Zomato 250 

In [3]:
# zomato only
df_zom_250_train = pd.read_csv('data/3_clean_zomato_gof_ori_250_sampling_train.csv')
df_zom_250_train['rest_price_idr'] = df_zom_250_train['rest_price_idr'].astype(int)
df_zom_250_train = df_zom_250_train[df_zom_250_train['rating']>0]


df_zom_250_test = pd.read_csv('data/3_clean_zomato_gof_ori_250_test.csv')
df_zom_250_test['rest_price_idr'] = df_zom_250_test['rest_price_idr'].astype(int)
df_zom_250_test = df_zom_250_test[df_zom_250_test['rating']>0]

# df_zom = pd.read_csv('data/3_jbdk_clean_zomato_feat.csv')

# filter chain and non chain 
# df_zom_250_train = df_zom_250_train[df_zom_250_train['is_chain']==0]
# df_zom_250_test = df_zom_250_test[df_zom_250_test['is_chain']==0]

# ori
drop_column = ['url','index','rating','lat','long','review','geohash','encode']
target_column = 'rating'

# best param 
rf_param = {'n_estimators':250,'max_features':3, 'max_depth':30,'bootstrap':True}
xgb_param = {'n_estimators':100, 'max_depth':5, 'min_child_weight':1, 'learning_rate':0.15, 'booster':'gbtree'}
svr_param  = {'kernel':'poly','gamma':0.1,'C':0.1}


# scaled the values 
features_columns = df_zom_250_train.drop(drop_column,axis=1).columns
features_columns = list(features_columns) + [target_column]

scaler_train = MinMaxScaler()
scaler_test = MinMaxScaler()

df_input_train_scale = pd.DataFrame(scaler_train.fit_transform(df_zom_250_train[features_columns]), columns = features_columns)
df_input_test_scale = pd.DataFrame(scaler_test.fit_transform(df_zom_250_test[features_columns]), columns = features_columns)

# Prediction 

result_zomato_250 = []

# Random Forest 
regr = RandomForestRegressor(**rf_param)
model_regr, y_pred_test_regr, y_pred_train_regr, result_regr = utility.model_perform(df_input_train_scale.drop(target_column,axis=1),  df_input_train_scale[target_column],  df_input_test_scale.drop(target_column,axis=1),  df_input_test_scale[target_column],  regr, name = 'rf', verbose = 0 )
result_zomato_250.append(result_regr)

# xgb 
xgbr = XGBRegressor(**xgb_param)
model_xgb, y_pred_test_xgb, y_pred_train_xgb, result_xgb = utility.model_perform(df_input_train_scale.drop(target_column,axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], xgbr, name = 'xgb', verbose = 0 )
result_zomato_250.append(result_xgb)

# svr 
svr = SVR(**svr_param)
model_svr, y_pred_test_svr, y_pred_train_svr, result_svr = utility.model_perform(df_input_train_scale.drop(target_column, axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], svr, name = 'svr', verbose = 0 )
result_zomato_250.append(result_svr)

df_result_zomato_250 = pd.DataFrame(result_zomato_250)
df_result_zomato_250['dataset'] = 'zomato_250'

In [4]:
df_result_zomato_250

Unnamed: 0,mae,mse,R2,name,feature_size,train_size,test_size,pearson,dataset
0,0.12314,0.02449,0.97328,rf,60,8519,1183,0.42098,zomato_250
1,0.14134,0.0315,0.77126,xgb,60,8519,1183,0.39352,zomato_250
2,0.14021,0.03102,0.52086,svr,60,8519,1183,0.40355,zomato_250


# Zomato 500 

In [5]:
# zomato only
df_zom_500_train = pd.read_csv('data/3_clean_zomato_gof_ori_500_sampling_train.csv')
df_zom_500_train['rest_price_idr'] = df_zom_500_train['rest_price_idr'].astype(int)
df_zom_500_train = df_zom_500_train[df_zom_500_train['rating']>0]


df_zom_500_test = pd.read_csv('data/3_clean_zomato_gof_ori_500_test.csv')
df_zom_500_test['rest_price_idr'] = df_zom_500_test['rest_price_idr'].astype(int)
df_zom_500_test = df_zom_500_test[df_zom_500_test['rating']>0]

# df_zom = pd.read_csv('data/3_jbdk_clean_zomato_feat.csv')

# filter chain and non chain 
# df_zom_500_train = df_zom_500_train[df_zom_500_train['is_chain']==0]
# df_zom_500_test = df_zom_500_test[df_zom_500_test['is_chain']==0]

# ori
drop_column = ['url','index','rating','lat','long','review','geohash','encode']
target_column = 'rating'

# cross validation
rf_param = {'n_estimators':500,'max_features':3, 'max_depth':30,'bootstrap':True}
xgb_param = {'n_estimators':100, 'max_depth':5, 'min_child_weight':3, 'learning_rate':0.2, 'booster':'gbtree'}
svr_param  = {'kernel':'poly','gamma':0.1,'C':0.1}

# scaled the values 
features_columns = df_zom_500_train.drop(drop_column,axis=1).columns
features_columns = list(features_columns) + [target_column]

scaler_train = MinMaxScaler()
scaler_test = MinMaxScaler()

df_input_train_scale = pd.DataFrame(scaler_train.fit_transform(df_zom_500_train[features_columns]), columns = features_columns)
df_input_test_scale = pd.DataFrame(scaler_test.fit_transform(df_zom_500_test[features_columns]), columns = features_columns)

# Prediction 

result_zomato_500 = []

# Random Forest 
regr = RandomForestRegressor(**rf_param)
model_regr, y_pred_test_regr, y_pred_train_regr, result_regr = utility.model_perform(df_input_train_scale.drop(target_column,axis=1),  df_input_train_scale[target_column],  df_input_test_scale.drop(target_column,axis=1),  df_input_test_scale[target_column],  regr, name = 'rf', verbose = 0 )
result_zomato_500.append(result_regr)

# xgb 
xgbr = XGBRegressor(**xgb_param)
model_xgb, y_pred_test_xgb, y_pred_train_xgb, result_xgb = utility.model_perform(df_input_train_scale.drop(target_column,axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], xgbr, name = 'xgb', verbose = 0 )
result_zomato_500.append(result_xgb)

# svr 
svr = SVR(**svr_param)
model_svr, y_pred_test_svr, y_pred_train_svr, result_svr = utility.model_perform(df_input_train_scale.drop(target_column, axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], svr, name = 'svr', verbose = 0 )
result_zomato_500.append(result_svr)

df_result_zomato_500 = pd.DataFrame(result_zomato_500)
df_result_zomato_500['dataset'] = 'zomato_500'

In [6]:
df_result_zomato_500

Unnamed: 0,mae,mse,R2,name,feature_size,train_size,test_size,pearson,dataset
0,0.12113,0.02366,0.98973,rf,60,8519,1183,0.44703,zomato_500
1,0.13737,0.03073,0.87373,xgb,60,8519,1183,0.4024,zomato_500
2,0.14286,0.03289,0.54316,svr,60,8519,1183,0.39508,zomato_500


# Zomato 1000

In [7]:
# zomato only
df_zom_1000_train = pd.read_csv('data/3_clean_zomato_gof_ori_1000_sampling_train.csv')
df_zom_1000_train['rest_price_idr'] = df_zom_1000_train['rest_price_idr'].astype(int)
df_zom_1000_train = df_zom_1000_train[df_zom_1000_train['rating']>0]


df_zom_1000_test = pd.read_csv('data/3_clean_zomato_gof_ori_1000_test.csv')
df_zom_1000_test['rest_price_idr'] = df_zom_1000_test['rest_price_idr'].astype(int)
df_zom_1000_test = df_zom_1000_test[df_zom_1000_test['rating']>0]

# df_zom = pd.read_csv('data/3_jbdk_clean_zomato_feat.csv')

# filter chain and non chain 
# df_zom_1000_train = df_zom_1000_train[df_zom_1000_train['is_chain']==0]
# df_zom_1000_test = df_zom_1000_test[df_zom_1000_test['is_chain']==0]

# ori
drop_column = ['url','index','rating','lat','long','review','geohash','encode']
target_column = 'rating'

# cross validation
rf_param = {'n_estimators':500,'max_features':3, 'max_depth':30,'bootstrap':True}
xgb_param = {'n_estimators':100, 'max_depth':5, 'min_child_weight':3, 'learning_rate':0.2, 'booster':'gbtree'}
svr_param  = {'kernel':'poly','gamma':0.1,'C':0.1}

# scaled the values 
features_columns = df_zom_1000_train.drop(drop_column,axis=1).columns
features_columns = list(features_columns) + [target_column]

scaler_train = MinMaxScaler()
scaler_test = MinMaxScaler()

df_input_train_scale = pd.DataFrame(scaler_train.fit_transform(df_zom_1000_train[features_columns]), columns = features_columns)
df_input_test_scale = pd.DataFrame(scaler_test.fit_transform(df_zom_1000_test[features_columns]), columns = features_columns)

# Prediction 

result_zomato_1000 = []

# Random Forest 
regr = RandomForestRegressor(**rf_param)
model_regr, y_pred_test_regr, y_pred_train_regr, result_regr = utility.model_perform(df_input_train_scale.drop(target_column,axis=1),  df_input_train_scale[target_column],  df_input_test_scale.drop(target_column,axis=1),  df_input_test_scale[target_column],  regr, name = 'rf', verbose = 0 )
result_zomato_1000.append(result_regr)

# xgb 
xgbr = XGBRegressor(**xgb_param)
model_xgb, y_pred_test_xgb, y_pred_train_xgb, result_xgb = utility.model_perform(df_input_train_scale.drop(target_column,axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], xgbr, name = 'xgb', verbose = 0 )
result_zomato_1000.append(result_xgb)

# svr 
svr = SVR(**svr_param)
model_svr, y_pred_test_svr, y_pred_train_svr, result_svr = utility.model_perform(df_input_train_scale.drop(target_column, axis=1), df_input_train_scale[target_column], df_input_test_scale.drop(target_column,axis=1), df_input_test_scale[target_column], svr, name = 'svr', verbose = 0 )
result_zomato_1000.append(result_svr)

df_result_zomato_1000 = pd.DataFrame(result_zomato_1000)
df_result_zomato_1000['dataset'] = 'zomato_1000'

In [8]:
df_zom_1000_train.shape, 

((8519, 68),)

In [9]:
df_result_zomato_1000

Unnamed: 0,mae,mse,R2,name,feature_size,train_size,test_size,pearson,dataset
0,0.12024,0.02331,0.98976,rf,60,8519,1183,0.46285,zomato_1000
1,0.13636,0.02979,0.88752,xgb,60,8519,1183,0.39758,zomato_1000
2,0.14485,0.03333,0.55805,svr,60,8519,1183,0.40354,zomato_1000


# Final result 

In [10]:
# cross validation 
df_final_result = pd.concat([df_result_zomato ,df_result_zomato_250,df_result_zomato_500,df_result_zomato_1000])


In [11]:
df_final_result

Unnamed: 0,mae,mse,R2,name,feature_size,train_size,test_size,pearson,dataset
0,0.13202,0.02838,0.91481,rf,41,8519,1183,0.37923,zomato
1,0.14496,0.03321,0.65508,xgb,41,8519,1183,0.39273,zomato
2,0.14314,0.03198,0.45617,svr,41,8519,1183,0.39764,zomato
0,0.12314,0.02449,0.97328,rf,60,8519,1183,0.42098,zomato_250
1,0.14134,0.0315,0.77126,xgb,60,8519,1183,0.39352,zomato_250
2,0.14021,0.03102,0.52086,svr,60,8519,1183,0.40355,zomato_250
0,0.12113,0.02366,0.98973,rf,60,8519,1183,0.44703,zomato_500
1,0.13737,0.03073,0.87373,xgb,60,8519,1183,0.4024,zomato_500
2,0.14286,0.03289,0.54316,svr,60,8519,1183,0.39508,zomato_500
0,0.12024,0.02331,0.98976,rf,60,8519,1183,0.46285,zomato_1000


In [12]:
today = date.today()
print("Today's date:", today)

Today's date: 2022-11-24


In [13]:
df_final_result.to_excel('report/ml_perform_final_sampling.xlsx'.format(today),index=False)