# Data Importing

In [2]:
import numpy as np
import pandas as pd
import category_encoders as ce
import miceforest as mf
import optuna
import lightgbm as lgb
import xgboost as xgb

from utils import null_checker, evaluate_model
from sklearn.model_selection import train_test_split, KFold
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

  import pandas.util.testing as tm


In [3]:
df = pd.read_csv('after_prep.csv')
df.head()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,Price,Brand,Series,Type,Mileage (kmpl),Engine (CC),Power (bhp)
0,Mumbai,2010,72000,CNG,Manual,First,5.0,1.75,Maruti,Wagon,R,26.6,998.0,58.16
1,Pune,2015,41000,Diesel,Manual,First,5.0,12.5,Hyundai,Creta,1.6,19.67,1582.0,126.2
2,Chennai,2011,46000,Petrol,Manual,First,5.0,4.5,Honda,Jazz,V,18.2,1199.0,88.7
3,Chennai,2012,87000,Diesel,Manual,First,7.0,6.0,Maruti,Ertiga,VDI,20.77,1248.0,88.76
4,Coimbatore,2013,40670,Diesel,Automatic,Second,5.0,17.74,Audi,A4,New,15.2,1968.0,140.8


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Location           6019 non-null   object 
 1   Year               6019 non-null   int64  
 2   Kilometers_Driven  6019 non-null   int64  
 3   Fuel_Type          6019 non-null   object 
 4   Transmission       6019 non-null   object 
 5   Owner_Type         6019 non-null   object 
 6   Seats              5976 non-null   float64
 7   Price              6019 non-null   float64
 8   Brand              6019 non-null   object 
 9   Series             6019 non-null   object 
 10  Type               6019 non-null   object 
 11  Mileage (kmpl)     5951 non-null   float64
 12  Engine (CC)        5983 non-null   float64
 13  Power (bhp)        5876 non-null   float64
dtypes: float64(5), int64(2), object(7)
memory usage: 658.5+ KB


# Preprocessing

In [5]:
# Delete outlier
df = df[~(df.Kilometers_Driven > 1e6)]
df.shape

(6018, 14)

## Feature enginering

In [6]:
# Grouping category less than 10 to "Other"
for col in ["Brand", "Series", "Type"]:
    counts = df[col].value_counts()
    other = counts[counts < 10].index
    df[col] = df[col].replace(other, "Other")

In [7]:
# Make categorical feature interactions
from itertools import combinations
from sklearn.preprocessing import LabelEncoder

cat_cols = ['Location', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Brand']

for col in combinations(cat_cols, 2):
    new_col = col[0]+'_'+col[1]
    df[new_col] = df[col[0]] + "_" + df[col[1]]
    
    counts = df[new_col].value_counts()
    other = counts[counts < 10].index
    df[new_col] = df[new_col].replace(other, "Other")

    encoder = LabelEncoder()
    df[new_col] = encoder.fit_transform(df[new_col])

## Train test split

In [8]:
# melakukan train test split di awal untuk mencegah data leakage
features = df.drop(columns=['Price'])
target = df['Price']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=0)

## Encoding

In [9]:
# One hot encoding
col_to_encode = ['Location', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Brand']
oh_encoder = ce.OneHotEncoder(cols=col_to_encode,
                              use_cat_names=True)
oh_encoder.fit(X_train)

# Encoding train set
X_train = oh_encoder.transform(X_train)
# Encoding test set
X_test = oh_encoder.transform(X_test)


is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead



In [10]:
# Target Encoding
col_to_encode = ['Series', 'Type']
encoder = ce.TargetEncoder(cols=col_to_encode)
encoder.fit(X_train, y_train)

# Encoding train set
X_train = encoder.transform(X_train)
# Encoding test set
X_test = encoder.transform(X_test)


is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead



## Missing Value Imputation

In [11]:
# memprediksi nilai missing value dengan MICE
imputer = mf.KernelDataSet(
  X_train,
  save_all_iterations=True,
  random_state=1991,
  mean_match_candidates=5
)
imputer.mice(10)

In [12]:
# Train set imputation
X_train = imputer.complete_data()

In [13]:
# Test set imputation
new_data = imputer.impute_new_data(X_test)
X_test = new_data.complete_data()

# Modeling

## Hyperparameter Tuning

### XGBoost

#### Study 1

In [14]:
def objective(trial):

    dtrain = xgb.DMatrix(X_train, label=y_train)

    params = {
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'learning_rate': 0.1,
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 6),
        'gamma': trial.suggest_loguniform("gamma", 1e-8, 1.0),
        'subsample':trial.suggest_uniform('subsample', 0.1, 1),
        'colsample_bytree':trial.suggest_uniform('colsample_bytree', 0.1, 1),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    }

    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-rmse")

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    history = xgb.cv(params, dtrain, num_boost_round=2000, 
                     early_stopping_rounds=100,
                     callbacks=[pruning_callback],
                     metrics='rmse', 
                     folds=cv)

    mean_score = history["test-rmse-mean"].values[-1]
    return mean_score

pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study = optuna.create_study(pruner=pruner, direction='minimize')
study.optimize(objective, n_trials=1000)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2020-10-30 17:09:35,031][0m A new study created in memory with name: no-name-3e686345-4b1d-4998-ae80-b96a694347f3[0m
[32m[I 2020-10-30 17:09:44,048][0m Trial 0 finished with value: 3.1786083999999994 and parameters: {'max_depth': 7, 'min_child_weight': 2, 'gamma': 1.3288870652937703e-05, 'subsample': 0.7663069456975505, 'colsample_bytree': 0.5299191465021912, 'lambda': 1.5629090426329995e-06, 'alpha': 0.0009656455564270013, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 3.1786083999999994.[0m
[32m[I 2020-10-30 17:09:59,132][0m Trial 1 finished with value: 3.1673299999999998 and parameters: {'max_depth': 2, 'min_child_weight': 5, 'gamma': 0.0005549259398863449, 'subsample': 0.6912526592528124, 'colsample_bytree': 0.9894484334417263, 'lambda': 1.6541493428841292e-06, 'alpha': 0.00023444101714180054, 'grow_policy': 'depthwise'}. Best is trial 1 with value: 3.1673299999999998.[0m
[32m[I 2020-10-30 17:10:19,903][0m Trial 2 finished with value: 2.9929478 and param

Number of finished trials: 1000
Best trial:
  Value: 2.9929478
  Params: 
    max_depth: 3
    min_child_weight: 2
    gamma: 0.0018966046359492025
    subsample: 0.7386512488422983
    colsample_bytree: 0.1839562426229215
    lambda: 1.0397210733045859e-08
    alpha: 0.9956720864281428
    grow_policy: lossguide


In [15]:
# Get best params then add to param_1
xgb_study_1_params = study.best_params
xgb_param_1 = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'learning_rate': 0.1,
}
xgb_param_1.update(xgb_study_1_params)
xgb_param_1

{'alpha': 0.9956720864281428,
 'colsample_bytree': 0.1839562426229215,
 'gamma': 0.0018966046359492025,
 'grow_policy': 'lossguide',
 'lambda': 1.0397210733045859e-08,
 'learning_rate': 0.1,
 'max_depth': 3,
 'min_child_weight': 2,
 'objective': 'reg:squarederror',
 'subsample': 0.7386512488422983,
 'tree_method': 'hist'}

In [16]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

cv = KFold(
    n_splits=5, 
    shuffle=True, 
    random_state=0
)
history = xgb.cv(
    xgb_param_1, dtrain, 
    num_boost_round=2000, 
    early_stopping_rounds=100,
    metrics='rmse',
    folds=cv
)
xgb_n_estimators_1 = history.shape[0]
xgb_n_estimators_1

1064

#### Study 2

In [17]:
def objective(trial):

    dtrain = xgb.DMatrix(X_train, label=y_train)

    params = {
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'learning_rate': 0.01,
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 6),
        'gamma': trial.suggest_loguniform("gamma", 1e-8, 1.0),
        'subsample':trial.suggest_uniform('subsample', 0.1, 1),
        'colsample_bytree':trial.suggest_uniform('colsample_bytree', 0.1, 1),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    }

    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-rmse")

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    history = xgb.cv(params, dtrain, num_boost_round=2000, 
                     early_stopping_rounds=100,
                     callbacks=[pruning_callback],
                     metrics='rmse', 
                     folds=cv)

    mean_score = history["test-rmse-mean"].values[-1]
    return mean_score

pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study = optuna.create_study(pruner=pruner, direction='minimize')
study.optimize(objective, n_trials=1000)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2020-10-30 17:32:46,060][0m A new study created in memory with name: no-name-37473542-485d-4eec-bd00-8ed58b4e782e[0m
[32m[I 2020-10-30 17:33:18,654][0m Trial 0 finished with value: 3.08781 and parameters: {'max_depth': 8, 'min_child_weight': 5, 'gamma': 0.0032705796530657014, 'subsample': 0.637464574199073, 'colsample_bytree': 0.4028818445000665, 'lambda': 0.006647962174245158, 'alpha': 1.7277044477753599e-07, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 3.08781.[0m
[32m[I 2020-10-30 17:33:51,021][0m Trial 1 finished with value: 3.4508052 and parameters: {'max_depth': 2, 'min_child_weight': 5, 'gamma': 0.0010617824178609288, 'subsample': 0.1569542182043594, 'colsample_bytree': 0.5596747317193114, 'lambda': 0.27883976916185654, 'alpha': 5.442405893385483e-07, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 3.08781.[0m
[32m[I 2020-10-30 17:34:33,757][0m Trial 2 finished with value: 3.4726942000000003 and parameters: {'max_depth': 6, 'min_child_weigh

Number of finished trials: 1000
Best trial:
  Value: 2.9156902000000002
  Params: 
    max_depth: 8
    min_child_weight: 1
    gamma: 6.998705246126152e-05
    subsample: 0.38728506152978953
    colsample_bytree: 0.8512246674082545
    lambda: 1.460791953682575e-06
    alpha: 0.00046889330174842644
    grow_policy: lossguide


In [18]:
# Get best params then add to param_2
xgb_study_2_params = study.best_params
xgb_param_2 = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'learning_rate': 0.01,
}
xgb_param_2.update(xgb_study_2_params)
xgb_param_2

{'alpha': 0.00046889330174842644,
 'colsample_bytree': 0.8512246674082545,
 'gamma': 6.998705246126152e-05,
 'grow_policy': 'lossguide',
 'lambda': 1.460791953682575e-06,
 'learning_rate': 0.01,
 'max_depth': 8,
 'min_child_weight': 1,
 'objective': 'reg:squarederror',
 'subsample': 0.38728506152978953,
 'tree_method': 'hist'}

In [19]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

cv = KFold(
    n_splits=5, 
    shuffle=True, 
    random_state=0
)
history = xgb.cv(
    xgb_param_2, dtrain, 
    num_boost_round=2000, 
    early_stopping_rounds=100,
    metrics='rmse',
    folds=cv
)
xgb_n_estimators_2 = history.shape[0]
xgb_n_estimators_2

1757

#### Evaluation

In [20]:
xgb_study_1 = XGBRegressor(**xgb_param_1, n_estimators=xgb_n_estimators_1)
xgb_study_2 = XGBRegressor(**xgb_param_2, n_estimators=xgb_n_estimators_2)

xgb_models = {
    f'XGBRegressor ({xgb_n_estimators_1}) {xgb_param_1}': xgb_study_1,
    f'XGBRegressor ({xgb_n_estimators_2}) {xgb_param_2}': xgb_study_2
}
evaluate_model(xgb_models, X_train, X_test, y_train, y_test)

Unnamed: 0_level_0,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"XGBRegressor (1757) {'objective': 'reg:squarederror', 'tree_method': 'hist', 'learning_rate': 0.01, 'max_depth': 8, 'min_child_weight': 1, 'gamma': 6.998705246126152e-05, 'subsample': 0.38728506152978953, 'colsample_bytree': 0.8512246674082545, 'lambda': 1.460791953682575e-06, 'alpha': 0.00046889330174842644, 'grow_policy': 'lossguide'}",10.3943,0.9961,0.9259,0.9087,0.696,2.9985,3.4499
"XGBRegressor (1064) {'objective': 'reg:squarederror', 'tree_method': 'hist', 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 2, 'gamma': 0.0018966046359492025, 'subsample': 0.7386512488422983, 'colsample_bytree': 0.1839562426229215, 'lambda': 1.0397210733045859e-08, 'alpha': 0.9956720864281428, 'grow_policy': 'lossguide'}",2.2611,0.9854,0.9224,0.9129,1.3406,3.0743,3.3692


#### Study 3

In [21]:
def objective(trial):

    dtrain = xgb.DMatrix(X_train, label=y_train)

    params = xgb_param_2.copy()
    params["learning_rate"] = trial.suggest_uniform('learning_rate', 0.001, 0.01)

    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-rmse")

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    history = xgb.cv(params, dtrain, num_boost_round=2000, 
                     early_stopping_rounds=100,
                     callbacks=[pruning_callback],
                     metrics='rmse', 
                     folds=cv)

    mean_score = history["test-rmse-mean"].values[-1]
    return mean_score

pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study = optuna.create_study(pruner=pruner, direction='minimize')
study.optimize(objective, n_trials=1000)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2020-10-30 19:24:15,609][0m A new study created in memory with name: no-name-ff299c9d-927a-4a16-bc0a-2280e92fcfb3[0m
[32m[I 2020-10-30 19:25:46,968][0m Trial 0 finished with value: 2.93288 and parameters: {'learning_rate': 0.009415371870382688}. Best is trial 0 with value: 2.93288.[0m
[32m[I 2020-10-30 19:27:27,206][0m Trial 1 finished with value: 2.9891972 and parameters: {'learning_rate': 0.0032895443044702617}. Best is trial 0 with value: 2.93288.[0m
[32m[I 2020-10-30 19:28:55,248][0m Trial 2 finished with value: 2.9428548 and parameters: {'learning_rate': 0.006281330377640523}. Best is trial 0 with value: 2.93288.[0m
[32m[I 2020-10-30 19:30:23,784][0m Trial 3 finished with value: 2.9349357999999994 and parameters: {'learning_rate': 0.007269182688900468}. Best is trial 0 with value: 2.93288.[0m
[32m[I 2020-10-30 19:31:58,722][0m Trial 4 finished with value: 2.9796752000000004 and parameters: {'learning_rate': 0.0036114558182526327}. Best is trial 0 with value

Number of finished trials: 1000
Best trial:
  Value: 2.9105894
  Params: 
    learning_rate: 0.00999794174209357


In [22]:
# Get best params then add to param_3
xgb_param_3 = xgb_param_2.copy()
xgb_param_3["learning_rate"] = study.best_params["learning_rate"]
xgb_param_3

{'alpha': 0.00046889330174842644,
 'colsample_bytree': 0.8512246674082545,
 'gamma': 6.998705246126152e-05,
 'grow_policy': 'lossguide',
 'lambda': 1.460791953682575e-06,
 'learning_rate': 0.00999794174209357,
 'max_depth': 8,
 'min_child_weight': 1,
 'objective': 'reg:squarederror',
 'subsample': 0.38728506152978953,
 'tree_method': 'hist'}

In [23]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

cv = KFold(
    n_splits=5, 
    shuffle=True, 
    random_state=0
)
history = xgb.cv(
    xgb_param_3, dtrain, 
    num_boost_round=2000, 
    early_stopping_rounds=100,
    metrics='rmse',
    folds=cv
)
xgb_n_estimators_3 = history.shape[0]
xgb_n_estimators_3

2000

#### Evaluation

In [24]:
xgb_study_1 = XGBRegressor(**xgb_param_1, n_estimators=xgb_n_estimators_1)
xgb_study_2 = XGBRegressor(**xgb_param_2, n_estimators=xgb_n_estimators_2)
xgb_study_3 = XGBRegressor(**xgb_param_3, n_estimators=xgb_n_estimators_3)

xgb_models = {
    f'XGBRegressor ({xgb_n_estimators_1}) {xgb_param_1}': xgb_study_1,
    f'XGBRegressor ({xgb_n_estimators_2}) {xgb_param_2}': xgb_study_2,
    f'XGBRegressor ({xgb_n_estimators_3}) {xgb_param_3}': xgb_study_3
}
xgb_result = evaluate_model(xgb_models, X_train, X_test, y_train, y_test)
xgb_result

Unnamed: 0_level_0,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"XGBRegressor (1757) {'objective': 'reg:squarederror', 'tree_method': 'hist', 'learning_rate': 0.01, 'max_depth': 8, 'min_child_weight': 1, 'gamma': 6.998705246126152e-05, 'subsample': 0.38728506152978953, 'colsample_bytree': 0.8512246674082545, 'lambda': 1.460791953682575e-06, 'alpha': 0.00046889330174842644, 'grow_policy': 'lossguide'}",10.0355,0.9961,0.9259,0.9087,0.696,2.9985,3.4499
"XGBRegressor (2000) {'objective': 'reg:squarederror', 'tree_method': 'hist', 'learning_rate': 0.00999794174209357, 'max_depth': 8, 'min_child_weight': 1, 'gamma': 6.998705246126152e-05, 'subsample': 0.38728506152978953, 'colsample_bytree': 0.8512246674082545, 'lambda': 1.460791953682575e-06, 'alpha': 0.00046889330174842644, 'grow_policy': 'lossguide'}",11.6454,0.9968,0.9258,0.9085,0.6248,2.9999,3.4523
"XGBRegressor (1064) {'objective': 'reg:squarederror', 'tree_method': 'hist', 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 2, 'gamma': 0.0018966046359492025, 'subsample': 0.7386512488422983, 'colsample_bytree': 0.1839562426229215, 'lambda': 1.0397210733045859e-08, 'alpha': 0.9956720864281428, 'grow_policy': 'lossguide'}",2.2883,0.9854,0.9224,0.9129,1.3406,3.0743,3.3692


In [25]:
xgb_result.to_csv("tuning_imputed_all (XGB).csv", index=False)

### LightGBM

#### Study 1

In [26]:
def objective(trial):

    dtrain = lgb.Dataset(X_train, label=y_train)

    params = {
        "objective": "regression",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "learning_rate": 0.1,
        "max_depth": trial.suggest_int("max_depth", 1, 30),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 1.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.1, 1),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "rmse")

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    scores = lgb.cv(params, dtrain, 
                    num_boost_round=2000, 
                    early_stopping_rounds=100,
                    callbacks=[pruning_callback],
                    metrics='rmse', 
                    folds=cv)

    mean_score = scores['rmse-mean'][-1]
    return mean_score

pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study = optuna.create_study(pruner=pruner, direction='minimize')
study.optimize(objective, n_trials=1000)


print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2020-10-30 22:16:56,627][0m A new study created in memory with name: no-name-d52e1721-80de-458f-a80c-5d612ea1d208[0m
[32m[I 2020-10-30 22:17:07,169][0m Trial 0 finished with value: 3.2030961936014464 and parameters: {'max_depth': 28, 'num_leaves': 130, 'lambda_l1': 5.66659175082421e-07, 'lambda_l2': 6.169459341265024e-08, 'bagging_freq': 6, 'bagging_fraction': 0.9750292093319033, 'feature_fraction': 0.6658987086029302, 'min_child_samples': 100}. Best is trial 0 with value: 3.2030961936014464.[0m
[32m[I 2020-10-30 22:17:10,527][0m Trial 1 finished with value: 3.5100840360346255 and parameters: {'max_depth': 8, 'num_leaves': 255, 'lambda_l1': 0.032955676776748453, 'lambda_l2': 0.001292597758368385, 'bagging_freq': 2, 'bagging_fraction': 0.23663816208067615, 'feature_fraction': 0.19910826285542021, 'min_child_samples': 27}. Best is trial 0 with value: 3.2030961936014464.[0m
[32m[I 2020-10-30 22:17:16,363][0m Trial 2 finished with value: 3.2569586882844535 and parameters:

Number of finished trials: 1000
Best trial:
  Value: 2.941547594918401
  Params: 
    max_depth: 25
    num_leaves: 138
    lambda_l1: 0.006559936899315682
    lambda_l2: 0.14549900758382694
    bagging_freq: 2
    bagging_fraction: 0.6442024734908964
    feature_fraction: 0.7764307387211526
    min_child_samples: 14


In [27]:
# Get best params then add to param_1
lgb_study_1_params = study.best_params
lgb_param_1 = {
    "objective": "regression",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.1,
}
lgb_param_1.update(lgb_study_1_params)
lgb_param_1

{'bagging_fraction': 0.6442024734908964,
 'bagging_freq': 2,
 'boosting_type': 'gbdt',
 'feature_fraction': 0.7764307387211526,
 'lambda_l1': 0.006559936899315682,
 'lambda_l2': 0.14549900758382694,
 'learning_rate': 0.1,
 'max_depth': 25,
 'min_child_samples': 14,
 'num_leaves': 138,
 'objective': 'regression',
 'verbosity': -1}

In [28]:
dtrain = lgb.Dataset(X_train, label=y_train)

cv = KFold(
    n_splits=5, 
    shuffle=True, 
    random_state=0
)
history = lgb.cv(
    lgb_param_1, dtrain, 
    num_boost_round=2000,
    early_stopping_rounds=100,
    metrics='rmse', 
    folds=cv
)
lgb_n_estimators_1 = pd.DataFrame(history).shape[0]
lgb_n_estimators_1

331

#### Study 2

In [29]:
def objective(trial):

    dtrain = lgb.Dataset(X_train, label=y_train)

    params = {
        "objective": "regression",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "learning_rate": 0.01,
        "max_depth": trial.suggest_int("max_depth", 1, 30),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 1.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.1, 1),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "rmse")

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    scores = lgb.cv(params, dtrain, 
                    num_boost_round=2000, 
                    early_stopping_rounds=100,
                    callbacks=[pruning_callback],
                    metrics='rmse', 
                    folds=cv)

    mean_score = scores['rmse-mean'][-1]
    return mean_score

pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study = optuna.create_study(pruner=pruner, direction='minimize')
study.optimize(objective, n_trials=1000)


print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2020-10-30 22:34:06,339][0m A new study created in memory with name: no-name-cf1291d8-551c-4d42-81cf-59dc2c1bc81e[0m
[32m[I 2020-10-30 22:34:33,850][0m Trial 0 finished with value: 3.2730986411368037 and parameters: {'max_depth': 29, 'num_leaves': 185, 'lambda_l1': 0.00017283079591633524, 'lambda_l2': 2.7960033979147485e-06, 'bagging_freq': 2, 'bagging_fraction': 0.9342909379691601, 'feature_fraction': 0.7642244169313462, 'min_child_samples': 52}. Best is trial 0 with value: 3.2730986411368037.[0m
[32m[I 2020-10-30 22:34:49,241][0m Trial 1 finished with value: 3.4631492062528317 and parameters: {'max_depth': 21, 'num_leaves': 18, 'lambda_l1': 4.835520129719836e-06, 'lambda_l2': 3.384657431060082e-07, 'bagging_freq': 2, 'bagging_fraction': 0.8170374629794336, 'feature_fraction': 0.8162670013746193, 'min_child_samples': 89}. Best is trial 0 with value: 3.2730986411368037.[0m
[32m[I 2020-10-30 22:34:58,788][0m Trial 2 finished with value: 3.8703961508144276 and parameter

Number of finished trials: 1000
Best trial:
  Value: 2.960004873775475
  Params: 
    max_depth: 27
    num_leaves: 109
    lambda_l1: 0.15722860084023477
    lambda_l2: 1.2167505820029876e-07
    bagging_freq: 6
    bagging_fraction: 0.5529946548733212
    feature_fraction: 0.9662099089679588
    min_child_samples: 5


In [30]:
# Get best params then add to param_2
lgb_study_2_params = study.best_params
lgb_param_2 = {
    "objective": "regression",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.01,
}
lgb_param_2.update(lgb_study_2_params)
lgb_param_2

{'bagging_fraction': 0.5529946548733212,
 'bagging_freq': 6,
 'boosting_type': 'gbdt',
 'feature_fraction': 0.9662099089679588,
 'lambda_l1': 0.15722860084023477,
 'lambda_l2': 1.2167505820029876e-07,
 'learning_rate': 0.01,
 'max_depth': 27,
 'min_child_samples': 5,
 'num_leaves': 109,
 'objective': 'regression',
 'verbosity': -1}

In [31]:
lgb_param_2['learning_rate'] = 0.01
lgb_param_2

{'bagging_fraction': 0.5529946548733212,
 'bagging_freq': 6,
 'boosting_type': 'gbdt',
 'feature_fraction': 0.9662099089679588,
 'lambda_l1': 0.15722860084023477,
 'lambda_l2': 1.2167505820029876e-07,
 'learning_rate': 0.01,
 'max_depth': 27,
 'min_child_samples': 5,
 'num_leaves': 109,
 'objective': 'regression',
 'verbosity': -1}

In [32]:
dtrain = lgb.Dataset(X_train, label=y_train)

cv = KFold(
    n_splits=5, 
    shuffle=True, 
    random_state=0
)
history = lgb.cv(
    lgb_param_2, dtrain, 
    num_boost_round=2000,
    early_stopping_rounds=100,
    metrics='rmse', 
    folds=cv
)
lgb_n_estimators_2 = pd.DataFrame(history).shape[0]
lgb_n_estimators_2

942

#### Evaluation

In [33]:
lgb_study_1 = LGBMRegressor(**lgb_param_1, n_estimators=lgb_n_estimators_1)
lgb_study_2 = LGBMRegressor(**lgb_param_2, n_estimators=lgb_n_estimators_2)

lgb_models = {
    f'LGBMRegressor ({lgb_n_estimators_1}) {lgb_param_1}': lgb_study_1,
    f'LGBMRegressor ({lgb_n_estimators_2}) {lgb_param_2}': lgb_study_2
}
evaluate_model(lgb_models, X_train, X_test, y_train, y_test)

Unnamed: 0_level_0,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"LGBMRegressor (942) {'objective': 'regression', 'verbosity': -1, 'boosting_type': 'gbdt', 'learning_rate': 0.01, 'max_depth': 27, 'num_leaves': 109, 'lambda_l1': 0.15722860084023477, 'lambda_l2': 1.2167505820029876e-07, 'bagging_freq': 6, 'bagging_fraction': 0.5529946548733212, 'feature_fraction': 0.9662099089679588, 'min_child_samples': 5}",5.2425,0.9909,0.9255,0.9084,1.0593,3.0069,3.4556
"LGBMRegressor (331) {'objective': 'regression', 'verbosity': -1, 'boosting_type': 'gbdt', 'learning_rate': 0.1, 'max_depth': 25, 'num_leaves': 138, 'lambda_l1': 0.006559936899315682, 'lambda_l2': 0.14549900758382694, 'bagging_freq': 2, 'bagging_fraction': 0.6442024734908964, 'feature_fraction': 0.7764307387211526, 'min_child_samples': 14}",1.4207,0.9924,0.9232,0.9116,0.9643,3.0562,3.3936


#### Study 3

In [34]:
def objective(trial):

    dtrain = lgb.Dataset(X_train, label=y_train)

    params = lgb_param_2.copy()
    params["learning_rate"] = trial.suggest_uniform('learning_rate', 0.001, 0.01)

    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "rmse")

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    scores = lgb.cv(params, dtrain, 
                    num_boost_round=2000, 
                    early_stopping_rounds=100,
                    callbacks=[pruning_callback],
                    metrics='rmse', 
                    folds=cv)

    mean_score = scores['rmse-mean'][-1]
    return mean_score

pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study = optuna.create_study(pruner=pruner, direction='minimize')
study.optimize(objective, n_trials=1000)


print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2020-10-31 00:03:47,110][0m A new study created in memory with name: no-name-bd794023-027f-4255-8c77-31bb25a3c7ee[0m
[32m[I 2020-10-31 00:04:38,706][0m Trial 0 finished with value: 3.381029092572443 and parameters: {'learning_rate': 0.0011827100464062417}. Best is trial 0 with value: 3.381029092572443.[0m
[32m[I 2020-10-31 00:05:12,876][0m Trial 1 finished with value: 2.9846476792084573 and parameters: {'learning_rate': 0.006498955099076817}. Best is trial 1 with value: 2.9846476792084573.[0m
[32m[I 2020-10-31 00:05:53,762][0m Trial 2 finished with value: 3.006555613794556 and parameters: {'learning_rate': 0.004577655686405948}. Best is trial 1 with value: 2.9846476792084573.[0m
[32m[I 2020-10-31 00:06:34,178][0m Trial 3 finished with value: 3.0235573568568297 and parameters: {'learning_rate': 0.005070395032273253}. Best is trial 1 with value: 2.9846476792084573.[0m
[32m[I 2020-10-31 00:07:08,207][0m Trial 4 finished with value: 2.98133460057051 and parameters: 

Number of finished trials: 1000
Best trial:
  Value: 2.9471219782655895
  Params: 
    learning_rate: 0.009987606676865423


In [35]:
# Get best params then add to param_3
lgb_param_3 = lgb_param_2.copy()
lgb_param_3["learning_rate"] = study.best_params["learning_rate"]
lgb_param_3

{'bagging_fraction': 0.5529946548733212,
 'bagging_freq': 6,
 'boosting_type': 'gbdt',
 'feature_fraction': 0.9662099089679588,
 'lambda_l1': 0.15722860084023477,
 'lambda_l2': 1.2167505820029876e-07,
 'learning_rate': 0.009987606676865423,
 'max_depth': 27,
 'min_child_samples': 5,
 'num_leaves': 109,
 'objective': 'regression',
 'verbosity': -1}

In [36]:
dtrain = lgb.Dataset(X_train, label=y_train)

cv = KFold(
    n_splits=5, 
    shuffle=True, 
    random_state=0
)
history = lgb.cv(
    lgb_param_3, dtrain, 
    num_boost_round=2000,
    early_stopping_rounds=100,
    metrics='rmse', 
    folds=cv
)
lgb_n_estimators_3 = pd.DataFrame(history).shape[0]
lgb_n_estimators_3

940

#### Evaluation

In [37]:
lgb_study_1 = LGBMRegressor(**lgb_param_1, n_estimators=lgb_n_estimators_1)
lgb_study_2 = LGBMRegressor(**lgb_param_2, n_estimators=lgb_n_estimators_2)
lgb_study_3 = LGBMRegressor(**lgb_param_3, n_estimators=lgb_n_estimators_3)

lgb_models = {
    f'LGBMRegressor ({lgb_n_estimators_1}) {lgb_param_1}': lgb_study_1,
    f'LGBMRegressor ({lgb_n_estimators_2}) {lgb_param_2}': lgb_study_2,
    f'LGBMRegressor ({lgb_n_estimators_3}) {lgb_param_3}': lgb_study_3
}
lgb_result = evaluate_model(lgb_models, X_train, X_test, y_train, y_test)
lgb_result

Unnamed: 0_level_0,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"LGBMRegressor (940) {'objective': 'regression', 'verbosity': -1, 'boosting_type': 'gbdt', 'learning_rate': 0.009987606676865423, 'max_depth': 27, 'num_leaves': 109, 'lambda_l1': 0.15722860084023477, 'lambda_l2': 1.2167505820029876e-07, 'bagging_freq': 6, 'bagging_fraction': 0.5529946548733212, 'feature_fraction': 0.9662099089679588, 'min_child_samples': 5}",5.3027,0.9907,0.9258,0.9095,1.0706,3.0011,3.4347
"LGBMRegressor (942) {'objective': 'regression', 'verbosity': -1, 'boosting_type': 'gbdt', 'learning_rate': 0.01, 'max_depth': 27, 'num_leaves': 109, 'lambda_l1': 0.15722860084023477, 'lambda_l2': 1.2167505820029876e-07, 'bagging_freq': 6, 'bagging_fraction': 0.5529946548733212, 'feature_fraction': 0.9662099089679588, 'min_child_samples': 5}",5.3318,0.9909,0.9255,0.9084,1.0593,3.0069,3.4556
"LGBMRegressor (331) {'objective': 'regression', 'verbosity': -1, 'boosting_type': 'gbdt', 'learning_rate': 0.1, 'max_depth': 25, 'num_leaves': 138, 'lambda_l1': 0.006559936899315682, 'lambda_l2': 0.14549900758382694, 'bagging_freq': 2, 'bagging_fraction': 0.6442024734908964, 'feature_fraction': 0.7764307387211526, 'min_child_samples': 14}",1.4528,0.9924,0.9232,0.9116,0.9643,3.0562,3.3936


In [38]:
lgb_result.to_csv("tuning_imputed_all (LGB).csv", index=False)

## Combine Result

In [39]:
combined_result = pd.concat([xgb_result, lgb_result], axis=0)
combined_result.sort_values(by='CV RMSE', inplace=True)
combined_result.to_csv("tuning_imputed_all (XGB+LGB).csv", index=True)