In [16]:
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np 
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns 
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import xgboost as xgb
from datetime import date
from statsmodels.stats.weightstats import ttest_ind
from scipy.stats import skew
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit

In [48]:
def model_perform(X_train,y_train, X_test, y_test, model, name, verbose = 0 ):
    """Fit ``model`` on the training split and score its predictions.

    Parameters
    ----------
    X_train, y_train
        Training features and target, passed to ``model.fit``.
        ``X_train`` must be 2-D (its ``shape[1]`` is recorded).
    X_test, y_test
        Held-out features and target used for the reported error metrics.
    model
        Estimator exposing ``fit`` and ``predict``.
    name
        Label stored in the result under ``'name'``.
    verbose
        When equal to 1 the (unrounded) metrics are printed.

    Returns
    -------
    tuple
        ``(model, y_pred_test, y_pred_train, result)`` where ``result`` is a
        dict with keys ``'mae'``, ``'mse'``, ``'R2'`` (all measured on the
        test split), ``'R2_train'`` (measured on the training split),
        ``'name'``, ``'feature_size'``, ``'train_size'`` and ``'test_size'``.
        Metric values are rounded to 5 decimals.
    """
    # train
    model.fit(X_train, y_train)

    # predict on both splits
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)

    # Compute every metric exactly once, using sklearn's (y_true, y_pred)
    # argument order. R2 is not symmetric in its arguments, so the order matters.
    mae = mean_absolute_error(y_test, y_pred_test)
    mse = mean_squared_error(y_test, y_pred_test)
    # 'R2' is reported next to the test-set MAE/MSE, so it has to be measured
    # on the same held-out data; the in-sample value is kept separately.
    r2_test = r2_score(y_test, y_pred_test)
    r2_train = r2_score(y_train, y_pred_train)

    if verbose == 1:
        print(' MAE {} '.format(mae))
        print(' MSE {} '.format(mse))
        print(' R2 {} '.format(r2_test))
        print(' R2 train {} '.format(r2_train))

    result = {
        'mae': np.round(mae, 5),
        'mse': np.round(mse, 5),
        'R2': np.round(r2_test, 5),
        'R2_train': np.round(r2_train, 5),
        'name': name,
        'feature_size': X_train.shape[1],
        'train_size': X_train.shape[0],
        'test_size': X_test.shape[0],
    }

    return model, y_pred_test, y_pred_train, result

def show_error_pattern(y_pred, y_test):
    """Draw predicted and actual scores as two lines over the row position.

    Parameters
    ----------
    y_pred
        Predicted values; plotted with ``type == 'predict'``.
    y_test
        Actual values; passed through ``np.squeeze`` first and plotted with
        ``type == 'test'``.

    The plot is drawn with ``sns.lineplot``; nothing is returned.
    """
    def _labelled_scores(values, label):
        # One long-format frame per series: score, series label, row position.
        frame = pd.DataFrame()
        frame['score'] = values
        frame['type'] = label
        frame['idx'] = np.arange(frame.shape[0])
        return frame

    predicted = _labelled_scores(y_pred, 'predict')
    actual = _labelled_scores(np.squeeze(y_test), 'test')

    # ignore_index gives the combined frame unique row labels; without it the
    # two halves contribute overlapping labels. The x axis comes from the
    # explicit 'idx' column, so the frame index carries no information.
    df_result = pd.concat([actual, predicted], ignore_index=True)

    sns.lineplot(data=df_result, x="idx", y='score', hue="type")


def bulk_train(df_input, drop_column, target_column, dataset_name, verbose = 0):
    """Train RF, XGBoost and SVR regressors on one hold-out split of ``df_input``.

    Parameters
    ----------
    df_input
        DataFrame holding the feature columns and the target column.
    drop_column
        Column label(s) removed from ``df_input`` to obtain the features
        (passed to ``DataFrame.drop``).
    target_column
        Name of the column to predict. It may or may not also appear in
        ``drop_column``; it is never used as a feature.
    dataset_name
        Label written to the ``'dataset'`` column of the report.
    verbose
        Forwarded to ``model_perform``.

    Returns
    -------
    tuple
        ``(model_dict, df_report)``: ``model_dict`` maps ``'rf'``, ``'xgbr'``
        and ``'svr'`` to the fitted estimators; ``df_report`` has one row of
        metrics per model plus the ``'dataset'`` column.
    """
    error_report = []
    model_dict = {}

    # Features are whatever is left after dropping `drop_column`. The target is
    # filtered out explicitly so it cannot end up in the frame twice when the
    # caller did not list it in `drop_column`.
    feature_names = [
        column
        for column in df_input.drop(drop_column, axis=1).columns
        if column != target_column
    ]
    scaled_columns = feature_names + [target_column]

    # NOTE(review): the scaler is fit on every row (features and target) before
    # the train/test split, so held-out rows influence the scaling, and the
    # reported errors are expressed in min-max-scaled target units.
    scaler = MinMaxScaler()
    df_input_scale = pd.DataFrame(
        scaler.fit_transform(df_input[scaled_columns]),
        columns=scaled_columns,
    )

    # split train and test
    X_train, X_test, y_train, y_test = train_test_split(
        df_input_scale.drop(target_column, axis=1),
        df_input_scale[target_column],
        test_size=0.12,
        random_state=42,
    )

    # (name used in the report, key used in model_dict, estimator)
    candidates = [
        ('rf', 'rf', RandomForestRegressor(random_state=0)),
        ('xgb', 'xgbr', XGBRegressor(random_state=0)),
        ('svr', 'svr', SVR(kernel='poly')),
    ]

    for report_name, model_key, estimator in candidates:
        fitted_model, _, _, metrics = model_perform(
            X_train, y_train, X_test, y_test, estimator,
            name=report_name, verbose=verbose,
        )
        error_report.append(metrics)
        # Store the fitted estimator itself, not its metrics dict.
        model_dict[model_key] = fitted_model

    df_report = pd.DataFrame(error_report)
    df_report['dataset'] = dataset_name
    return model_dict, df_report


def bulk_train_k_fold(df_input, drop_column, target_column, dataset_name, verbose = 0):
    """Cross-validate RF, XGBoost and SVR regressors on ``df_input`` (5 folds).

    Parameters
    ----------
    df_input
        DataFrame holding the feature columns and the target column.
    drop_column
        Column label(s) removed from ``df_input`` to obtain the features
        (passed to ``DataFrame.drop``).
    target_column
        Name of the column to predict. It may or may not also appear in
        ``drop_column``; it is never used as a feature.
    dataset_name
        Label written to the ``'dataset'`` column of the result.
    verbose
        Accepted for signature parity with ``bulk_train``; currently unused.

    Returns
    -------
    pandas.DataFrame
        The ``cross_validation`` output of each model stacked vertically, with
        a ``'name'`` column (``'rf'``, ``'xgb'``, ``'svr'``) and a
        ``'dataset'`` column. Each model's rows keep their own 0-based index,
        so index labels repeat across models.
    """
    # Features are whatever is left after dropping `drop_column`. The target is
    # filtered out explicitly so it cannot end up in the frame twice when the
    # caller did not list it in `drop_column`.
    feature_names = [
        column
        for column in df_input.drop(drop_column, axis=1).columns
        if column != target_column
    ]
    scaled_columns = feature_names + [target_column]

    # NOTE(review): the scaler is fit on all rows before the folds are formed,
    # so each validation fold influences the scaling of its training folds.
    scaler = MinMaxScaler()
    df_input_scale = pd.DataFrame(
        scaler.fit_transform(df_input[scaled_columns]),
        columns=scaled_columns,
    )

    # Derive the design matrix and target once and share them across models.
    X = df_input_scale.drop(target_column, axis=1)
    y = df_input_scale[target_column]

    scoring = ['neg_mean_absolute_error','neg_mean_squared_error','r2']

    candidates = [
        ('rf', RandomForestRegressor(random_state=0)),
        ('xgb', XGBRegressor(random_state=0)),
        ('svr', SVR(kernel='poly')),
    ]

    per_model_frames = []
    for model_name, estimator in candidates:
        cv_scores = cross_validation(estimator, X, y, scoring=scoring, cv=5)
        df_scores = pd.DataFrame(cv_scores)
        df_scores['name'] = model_name
        per_model_frames.append(df_scores)

    # The index is deliberately not reset here: callers use reset_index() to
    # turn the per-model row number into an 'index' column.
    df_result = pd.concat(per_model_frames)
    df_result['dataset'] = dataset_name

    return df_result


def cross_validation(model, X, y, scoring, cv=5):
    """Thin wrapper around ``cross_validate`` that always requests train scores.

    Parameters
    ----------
    model
        Estimator forwarded as ``estimator``.
    X, y
        Features and target forwarded unchanged.
    scoring
        Scorer name(s) forwarded unchanged.
    cv
        Cross-validation setting forwarded unchanged (default 5).

    Returns
    -------
    Whatever ``cross_validate`` returns for these arguments with
    ``return_train_score=True``.
    """
    cv_arguments = dict(
        estimator=model,
        X=X,
        y=y,
        cv=cv,
        scoring=scoring,
        return_train_score=True,
    )
    return cross_validate(**cv_arguments)


In [None]:
# zomato only
df_zom = pd.read_csv('data/3_clean_zomato_feat.csv')
# Rescale the price column by 1/1000 and truncate it to integers.
df_zom['rest_price_idr'] = (df_zom['rest_price_idr'] / 1000).astype(int)

target_column = 'rating'
drop_column = ['url','index','rating','lat','long','review','new_code_res_type','new_code_fac']

# Earlier approach, kept for reference: repeated hold-out runs via bulk_train.
# loop = 15 
# list_report_zom = []

# for i in range(0,15):
#     model_zom, report_zom = bulk_train(df_input=df_zom, drop_column=drop_column, target_column=target_column, dataset_name='zomato_only')
#     report_zom['iteration'] = i
#     list_report_zom.append(report_zom)

# df_report_zom = pd.concat(list_report_zom).reset_index()

# cross validation
# The test error columns are multiplied by -1 so they read as positive errors;
# the column names keep their 'neg' prefix and the train columns are untouched.
# reset_index() moves the per-model row number into an 'index' column.
result_zomato = (
    bulk_train_k_fold(
        df_input=df_zom,
        drop_column=drop_column,
        target_column=target_column,
        dataset_name='zomato_only',
    )
    .assign(
        test_neg_mean_squared_error=lambda scores: scores['test_neg_mean_squared_error'] * -1,
        test_neg_mean_absolute_error=lambda scores: scores['test_neg_mean_absolute_error'] * -1,
    )
    .reset_index()
)

In [57]:
# zomato geo 250
df_zom_poi = pd.read_csv('data/3_clean_zomato_gof_250.csv')
# Rescale the price column by 1/1000 and truncate it to integers.
df_zom_poi['rest_price_idr'] = (df_zom_poi['rest_price_idr'] / 1000).astype(int)

drop_column = ['url','index','rating','lat','long','review','new_code_res_type','new_code_fac','geohash']
target_column = 'rating'

# dataset_name names this specific dataset so its rows stay distinguishable
# from the other runs when results are compared or concatenated.
# The test error columns are multiplied by -1 so they read as positive errors;
# the column names keep their 'neg' prefix and the train columns are untouched.
# reset_index() moves the per-model row number into an 'index' column.
result_zomato_250 = (
    bulk_train_k_fold(
        df_input=df_zom_poi,
        drop_column=drop_column,
        target_column=target_column,
        dataset_name='zomato_geo_250',
    )
    .assign(
        test_neg_mean_squared_error=lambda scores: scores['test_neg_mean_squared_error'] * -1,
        test_neg_mean_absolute_error=lambda scores: scores['test_neg_mean_absolute_error'] * -1,
    )
    .reset_index()
)

In [None]:
# zomato geo 500
df_zom_poi = pd.read_csv('data/3_clean_zomato_gof_500.csv')
# Rescale the price column by 1/1000 and truncate it to integers.
df_zom_poi['rest_price_idr'] = (df_zom_poi['rest_price_idr'] / 1000).astype(int)

drop_column = ['url','index','rating','lat','long','review','new_code_res_type','new_code_fac','geohash']
target_column = 'rating'

# dataset_name names this specific dataset so its rows stay distinguishable
# from the other runs when results are compared or concatenated.
# The test error columns are multiplied by -1 so they read as positive errors;
# the column names keep their 'neg' prefix and the train columns are untouched.
# reset_index() moves the per-model row number into an 'index' column.
result_zomato_500 = (
    bulk_train_k_fold(
        df_input=df_zom_poi,
        drop_column=drop_column,
        target_column=target_column,
        dataset_name='zomato_geo_500',
    )
    .assign(
        test_neg_mean_squared_error=lambda scores: scores['test_neg_mean_squared_error'] * -1,
        test_neg_mean_absolute_error=lambda scores: scores['test_neg_mean_absolute_error'] * -1,
    )
    .reset_index()
)

In [None]:
# zomato geo 1000
df_zom_poi = pd.read_csv('data/3_clean_zomato_gof_1000.csv')
# Rescale the price column by 1/1000 and truncate it to integers.
df_zom_poi['rest_price_idr'] = (df_zom_poi['rest_price_idr'] / 1000).astype(int)

drop_column = ['url','index','rating','lat','long','review','new_code_res_type','new_code_fac','geohash']
target_column = 'rating'

# dataset_name names this specific dataset so its rows stay distinguishable
# from the other runs when results are compared or concatenated.
# The test error columns are multiplied by -1 so they read as positive errors;
# the column names keep their 'neg' prefix and the train columns are untouched.
# reset_index() moves the per-model row number into an 'index' column.
result_zomato_1000 = (
    bulk_train_k_fold(
        df_input=df_zom_poi,
        drop_column=drop_column,
        target_column=target_column,
        dataset_name='zomato_geo_1000',
    )
    .assign(
        test_neg_mean_squared_error=lambda scores: scores['test_neg_mean_squared_error'] * -1,
        test_neg_mean_absolute_error=lambda scores: scores['test_neg_mean_absolute_error'] * -1,
    )
    .reset_index()
)

In [56]:
# Display the cross-validation results computed for the "zomato geo 1000" dataset.
result_zomato_1000

Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_r2,train_r2,name
0,3.623448,0.028923,0.087345,-0.028033,0.011323,-0.001279,-0.072543,0.870441,rf
1,3.49374,0.025906,0.085549,-0.028158,0.010685,-0.001295,-0.025502,0.867003,rf
2,3.587095,0.027925,0.08259,-0.028205,0.010602,-0.001312,-0.06623,0.869666,rf
3,3.526174,0.028927,0.07913,-0.028064,0.009938,-0.001286,-0.101219,0.874136,rf
4,3.51571,0.027934,0.079599,-0.028087,0.010692,-0.001257,-0.212836,0.877452,rf
0,0.516587,0.006984,0.095923,-0.032591,0.014095,-0.001904,-0.335052,0.807122,xgb
1,0.685917,0.005984,0.082145,-0.03137,0.010389,-0.001734,0.002871,0.821982,xgb
2,0.652992,0.005984,0.08348,-0.032059,0.010876,-0.001838,-0.09383,0.817456,xgb
3,0.646959,0.010004,0.081505,-0.031139,0.01081,-0.001739,-0.197849,0.829835,xgb
4,0.626956,0.005984,0.080147,-0.032584,0.011014,-0.00189,-0.249328,0.815679,xgb
