In [1]:
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np 
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns 
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import xgboost as xgb
from datetime import date
from statsmodels.stats.weightstats import ttest_ind
from scipy.stats import skew
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit

In [2]:
def model_perform(X_train,y_train, X_test, y_test, model, name, verbose = 0 ):
    """Fit ``model`` on the training split and score it on the held-out split.

    Parameters
    ----------
    X_train, y_train
        Training features and target, passed to ``model.fit``.
    X_test, y_test
        Held-out features and target used for the reported MAE, MSE and R2.
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    name
        Label stored in the result dict under ``'name'``.
    verbose
        When equal to 1 the metrics are printed; any other value is silent.

    Returns
    -------
    tuple
        ``(model, y_pred_test, y_pred_train, result)``. ``result`` is a dict
        with ``'mae'``, ``'mse'`` and ``'R2'`` (all measured on the test
        split, rounded to 5 decimals), ``'name'``, ``'feature_size'``,
        ``'train_size'``, ``'test_size'`` and ``'R2_train'`` (the in-sample
        R2, kept for reference).
    """
    model.fit(X_train, y_train)

    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)

    # sklearn metrics take (y_true, y_pred); R2 is not symmetric in its
    # arguments, so the order matters there.
    mae = mean_absolute_error(y_test, y_pred_test)
    mse = mean_squared_error(y_test, y_pred_test)
    # R2 is reported on the held-out split, like MAE and MSE; the in-sample
    # value is kept separately because it only shows how well the model fits
    # data it has already seen.
    r2_test = r2_score(y_test, y_pred_test)
    r2_train = r2_score(y_train, y_pred_train)

    if verbose == 1:
        print(' MAE {} '.format(mae))
        print(' MSE {} '.format(mse))
        print(' R2 {} '.format(r2_test))
        print(' R2 train {} '.format(r2_train))

    result = {
        'mae': np.round(mae, 5),
        'mse': np.round(mse, 5),
        'R2': np.round(r2_test, 5),
        'name': name,
        'feature_size': X_train.shape[1],
        'train_size': X_train.shape[0],
        'test_size': X_test.shape[0],
        'R2_train': np.round(r2_train, 5),
    }

    return model, y_pred_test, y_pred_train, result

def show_error_pattern(y_pred, y_test):
    """Draw predicted and actual scores as two lines over the sample position.

    Both inputs are turned into frames with the columns ``score``, ``type``
    (``'predict'`` or ``'test'``) and ``idx`` (0..n-1), stacked with the
    actual values first, and passed to ``sns.lineplot`` with one hue per type.
    ``y_test`` goes through ``np.squeeze`` before it is stored.
    """
    def _labelled(values, label):
        # one frame per series: the values, a constant tag, a positional index
        frame = pd.DataFrame()
        frame['score'] = values
        frame['type'] = label
        frame['idx'] = np.arange(frame.shape[0])
        return frame

    predicted = _labelled(y_pred, 'predict')
    actual = _labelled(np.squeeze(y_test), 'test')

    stacked = pd.concat([actual, predicted])
    sns.lineplot(data=stacked, x="idx", y='score', hue="type")


def bulk_train(df_input, drop_column, target_column, dataset_name, verbose = 0):
    """Train a random forest, an XGBoost regressor and a polynomial SVR on one hold-out split.

    Parameters
    ----------
    df_input
        Frame holding the feature columns and the target column.
    drop_column
        Column labels removed to obtain the feature set.
    target_column
        Name of the target column; it is appended after the features, so it
        may also be listed in ``drop_column``.
    dataset_name
        Label written to the ``'dataset'`` column of the report.
    verbose
        Forwarded to ``model_perform``.

    Returns
    -------
    tuple
        ``(model_dict, df_report)``. ``model_dict`` maps ``'rf'``, ``'xgbr'``
        and ``'svr'`` to the fitted estimators; ``df_report`` has one row of
        metrics per model plus the ``'dataset'`` column.
    """
    feature_names = list(df_input.drop(drop_column, axis=1).columns)
    scaled_columns = feature_names + [target_column]

    # NOTE(review): the scaler is fitted on every row before the split, so the
    # test rows take part in the scaling, and the target is scaled as well,
    # which means the reported errors are in scaled units.
    scaler = MinMaxScaler()
    df_input_scale = pd.DataFrame(
        scaler.fit_transform(df_input[scaled_columns]),
        columns=scaled_columns,
    )

    X_train, X_test, y_train, y_test = train_test_split(
        df_input_scale.drop(target_column, axis=1),
        df_input_scale[target_column],
        test_size=0.12,
        random_state=42,
    )

    # (key in model_dict, name in the report, estimator)
    estimators = [
        ('rf', 'rf', RandomForestRegressor(random_state=0)),
        ('xgbr', 'xgb', XGBRegressor(random_state=0)),
        ('svr', 'svr', SVR(kernel='poly')),
    ]

    error_report = []
    model_dict = {}
    for dict_key, report_name, estimator in estimators:
        fitted, _, _, metrics = model_perform(
            X_train, y_train, X_test, y_test, estimator,
            name=report_name, verbose=verbose,
        )
        error_report.append(metrics)
        # store the fitted estimator itself, not its metrics dict
        model_dict[dict_key] = fitted

    df_report = pd.DataFrame(error_report)
    df_report['dataset'] = dataset_name
    return  model_dict, df_report


def bulk_train_k_fold(df_input, drop_column, target_column, dataset_name, verbose = 0):
    """Cross-validate a random forest, an XGBoost regressor and a polynomial SVR.

    The feature columns (``df_input`` without ``drop_column``) and the target
    are min-max scaled together, then each model is scored with 5-fold
    ``cross_validation`` on negative MAE, negative MSE and R2.

    Returns one frame with a row per model and fold, tagged with the model
    ``'name'`` (``'rf'``, ``'xgb'``, ``'svr'``) and the ``'dataset'`` label.
    ``verbose`` is accepted but not used.
    """
    # features first, target last; the target is scaled with the features
    kept_columns = list(df_input.drop(drop_column, axis=1).columns) + [target_column]
    scaled = pd.DataFrame(
        MinMaxScaler().fit_transform(df_input[kept_columns]),
        columns=kept_columns,
    )
    features = scaled.drop(target_column, axis=1)
    target = scaled[target_column]

    scoring = ['neg_mean_absolute_error','neg_mean_squared_error','r2']

    candidates = (
        ('rf', RandomForestRegressor(random_state=0)),
        ('xgb', XGBRegressor(random_state=0)),
        ('svr', SVR(kernel='poly')),
    )

    per_model = []
    for label, estimator in candidates:
        fold_scores = pd.DataFrame(
            cross_validation(estimator, features, target, scoring=scoring, cv=5)
        )
        fold_scores['name'] = label
        per_model.append(fold_scores)

    df_result = pd.concat(per_model)
    df_result['dataset'] = dataset_name

    return df_result


def cross_validation(model, X, y, scoring, cv=5):
    """Return the ``cross_validate`` result for ``model`` on ``X`` / ``y``.

    ``scoring`` and ``cv`` are forwarded unchanged, and train scores are
    always requested alongside the test scores.
    """
    options = dict(
        estimator=model,
        X=X,
        y=y,
        cv=cv,
        scoring=scoring,
        return_train_score=True,
    )
    return cross_validate(**options)


In [3]:
# Baseline dataset: Zomato features only
df_zom = pd.read_csv('data/3_clean_zomato_feat.csv')
# divide the price by 1000, then truncate it to an integer
df_zom['rest_price_idr'] = df_zom['rest_price_idr'].div(1000)
df_zom['rest_price_idr'] = df_zom['rest_price_idr'].astype(int)

# columns left out of the feature set (the target is passed separately)
drop_column = [
    'url', 'index', 'rating', 'lat', 'long', 'review',
    'new_code_res_type', 'new_code_fac',
]
target_column = 'rating'

# Disabled alternative: repeated hold-out runs through bulk_train
# loop = 15 
# list_report_zom = []

# for i in range(0,15):
#     model_zom, report_zom = bulk_train(df_input=df_zom, drop_column=drop_column, target_column=target_column, dataset_name='zomato_only')
#     report_zom['iteration'] = i
#     list_report_zom.append(report_zom)

# df_report_zom = pd.concat(list_report_zom).reset_index()

# Cross validation of every model on this dataset
result_zomato = bulk_train_k_fold(
    df_input=df_zom,
    drop_column=drop_column,
    target_column=target_column,
    dataset_name='zomato_only',
)
# the neg_* scores are negated errors; flip the sign so they read as MSE / MAE
result_zomato['test_neg_mean_squared_error'] = -result_zomato['test_neg_mean_squared_error']
result_zomato['test_neg_mean_absolute_error'] = -result_zomato['test_neg_mean_absolute_error']
# bookkeeping: number of feature columns and number of rows in the dataset
result_zomato['feature_size'] = df_zom.drop(columns=drop_column).shape[1]
result_zomato['record'] = df_zom.drop(columns=drop_column).shape[0]
result_zomato = result_zomato.reset_index()

In [4]:
# Zomato + POI features, file 3_clean_zomato_gof_250.csv
df_zom_poi = pd.read_csv('data/3_clean_zomato_gof_250.csv')
# divide the price by 1000, then truncate it to an integer
df_zom_poi['rest_price_idr'] = df_zom_poi['rest_price_idr'].div(1000)
df_zom_poi['rest_price_idr'] = df_zom_poi['rest_price_idr'].astype(int)

# columns left out of the feature set (the target is passed separately)
drop_column = [
    'url', 'index', 'rating', 'lat', 'long', 'review',
    'new_code_res_type', 'new_code_fac', 'geohash', 'encode',
]
target_column = 'rating'

result_zomato_250 = bulk_train_k_fold(
    df_input=df_zom_poi,
    drop_column=drop_column,
    target_column=target_column,
    dataset_name='zomato_poi_250',
)
# the neg_* scores are negated errors; flip the sign so they read as MSE / MAE
result_zomato_250['test_neg_mean_squared_error'] = -result_zomato_250['test_neg_mean_squared_error']
result_zomato_250['test_neg_mean_absolute_error'] = -result_zomato_250['test_neg_mean_absolute_error']
# bookkeeping: number of feature columns and number of rows in the dataset
result_zomato_250['feature_size'] = df_zom_poi.drop(columns=drop_column).shape[1]
result_zomato_250['record'] = df_zom_poi.drop(columns=drop_column).shape[0]

result_zomato_250 = result_zomato_250.reset_index()

In [5]:
# Zomato + POI features, file 3_clean_zomato_gof_500.csv
df_zom_poi = pd.read_csv('data/3_clean_zomato_gof_500.csv')
# divide the price by 1000, then truncate it to an integer
df_zom_poi['rest_price_idr'] = df_zom_poi['rest_price_idr'].div(1000)
df_zom_poi['rest_price_idr'] = df_zom_poi['rest_price_idr'].astype(int)

# columns left out of the feature set (the target is passed separately)
drop_column = [
    'url', 'index', 'rating', 'lat', 'long', 'review',
    'new_code_res_type', 'new_code_fac', 'geohash', 'encode',
]
target_column = 'rating'

result_zomato_500 = bulk_train_k_fold(
    df_input=df_zom_poi,
    drop_column=drop_column,
    target_column=target_column,
    dataset_name='zomato_poi_500',
)
# the neg_* scores are negated errors; flip the sign so they read as MSE / MAE
result_zomato_500['test_neg_mean_squared_error'] = -result_zomato_500['test_neg_mean_squared_error']
result_zomato_500['test_neg_mean_absolute_error'] = -result_zomato_500['test_neg_mean_absolute_error']
# bookkeeping: number of feature columns and number of rows in the dataset
result_zomato_500['feature_size'] = df_zom_poi.drop(columns=drop_column).shape[1]
result_zomato_500['record'] = df_zom_poi.drop(columns=drop_column).shape[0]

result_zomato_500 = result_zomato_500.reset_index()

In [6]:
# Zomato + POI features, file 3_clean_zomato_gof_1000.csv
df_zom_poi = pd.read_csv('data/3_clean_zomato_gof_1000.csv')
# divide the price by 1000, then truncate it to an integer
df_zom_poi['rest_price_idr'] = df_zom_poi['rest_price_idr'].div(1000)
df_zom_poi['rest_price_idr'] = df_zom_poi['rest_price_idr'].astype(int)

# columns left out of the feature set (the target is passed separately)
drop_column = [
    'url', 'index', 'rating', 'lat', 'long', 'review',
    'new_code_res_type', 'new_code_fac', 'geohash', 'encode',
]
target_column = 'rating'

result_zomato_1000 = bulk_train_k_fold(
    df_input=df_zom_poi,
    drop_column=drop_column,
    target_column=target_column,
    dataset_name='zomato_poi_1000',
)
# the neg_* scores are negated errors; flip the sign so they read as MSE / MAE
result_zomato_1000['test_neg_mean_squared_error'] = -result_zomato_1000['test_neg_mean_squared_error']
result_zomato_1000['test_neg_mean_absolute_error'] = -result_zomato_1000['test_neg_mean_absolute_error']
# bookkeeping: number of feature columns and number of rows in the dataset
result_zomato_1000['feature_size'] = df_zom_poi.drop(columns=drop_column).shape[1]
result_zomato_1000['record'] = df_zom_poi.drop(columns=drop_column).shape[0]


result_zomato_1000 = result_zomato_1000.reset_index()

In [7]:
# Stack the per-dataset cross-validation results into one table
df_total_result = pd.concat(
    [
        result_zomato,
        result_zomato_250,
        result_zomato_500,
        result_zomato_1000,
    ]
)

In [8]:
# Date stamp that is interpolated into the exported report file names
today = date.today()
print("Today's date:", today)

Today's date: 2022-11-13


In [9]:
# Export the per-fold results, without the frame index, to a dated workbook
df_total_result.to_excel(
    'data/ml_perform_crossval_raw_{}.xlsx'.format(today),
    index=False,
)

In [10]:
# Columns that remain in df_zom_poi once drop_column is removed
df_zom_poi.drop(columns=drop_column).columns

Index(['rest_price_idr', 'is_chain', 'rank_res_type', 'rank_fac',
       'd_1000_airport', 'd_1000_atm', 'd_1000_bank', 'd_1000_cafe',
       'd_1000_convenience_store', 'd_1000_gas_station', 'd_1000_hospital',
       'd_1000_lodging', 'd_1000_meal_takeaway', 'd_1000_mosque',
       'd_1000_park', 'd_1000_restaurant', 'd_1000_school', 'd_1000_store',
       'd_1000_supermarket', 'd_1000_train_station', 'en_1000', 'n_compt_1000',
       'avg_n_rest'],
      dtype='object')

In [11]:
# Mean, standard deviation and median of MAE / MSE across folds,
# per model, dataset and feature count
df_total_result_agg = (
    df_total_result
    .groupby(['name', 'dataset', 'feature_size'])
    .agg({
        'test_neg_mean_absolute_error': ['mean', 'std', 'median'],
        'test_neg_mean_squared_error': ['mean', 'std', 'median'],
    })
    .reset_index()
)
# replace the two-level column labels with flat names
df_total_result_agg.columns = [
    'name', 'dataset', 'feature_size',
    'avg_mae', 'std_mae', 'med_mae',
    'avg_mse', 'std_mse', 'med_mse',
]

df_total_result_agg.to_excel(
    'data/ml_perform_crossval_agg_{}.xlsx'.format(today),
    index=False,
)

In [12]:
# Display the aggregated cross-validation metrics
df_total_result_agg

Unnamed: 0,name,dataset,feature_size,avg_mae,std_mae,med_mae,avg_mse,std_mse,med_mse
0,rf,zomato_only,4,0.073786,0.003052,0.07216,0.008971,0.000431,0.008763
1,rf,zomato_poi_1000,23,0.077429,0.003732,0.076864,0.009407,0.000422,0.009433
2,rf,zomato_poi_250,23,0.075238,0.003326,0.073841,0.00904,0.000344,0.008836
3,rf,zomato_poi_500,23,0.076213,0.003478,0.074792,0.009181,0.0004,0.009137
4,svr,zomato_only,4,0.078523,0.005558,0.077548,0.009605,0.000889,0.00926
5,svr,zomato_poi_1000,23,0.085512,0.006136,0.084419,0.011676,0.001223,0.011634
6,svr,zomato_poi_250,23,0.085708,0.009581,0.08339,0.012844,0.004293,0.011415
7,svr,zomato_poi_500,23,0.082151,0.005144,0.081023,0.010643,0.000979,0.010373
8,xgb,zomato_only,4,0.073361,0.003671,0.071636,0.00882,0.000484,0.008611
9,xgb,zomato_poi_1000,23,0.083155,0.004426,0.08177,0.011068,0.001,0.010756
