# Data Importing

In [None]:
from itertools import combinations

import numpy as np
import pandas as pd
import category_encoders as ce
import miceforest as mf
import optuna
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from utils import null_checker, evaluate_model

In [None]:
# Load the preprocessed dataset and preview its first five rows
df = pd.read_csv(filepath_or_buffer='../data/processed/after_prep.csv')
df.head(5)

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,Price,Brand,Series,Type,Mileage (kmpl),Engine (CC),Power (bhp)
0,Mumbai,2010,72000,CNG,Manual,First,5.0,1.75,Maruti,Wagon,R,26.6,998.0,58.16
1,Pune,2015,41000,Diesel,Manual,First,5.0,12.5,Hyundai,Creta,1.6,19.67,1582.0,126.2
2,Chennai,2011,46000,Petrol,Manual,First,5.0,4.5,Honda,Jazz,V,18.2,1199.0,88.7
3,Chennai,2012,87000,Diesel,Manual,First,7.0,6.0,Maruti,Ertiga,VDI,20.77,1248.0,88.76
4,Coimbatore,2013,40670,Diesel,Automatic,Second,5.0,17.74,Audi,A4,New,15.2,1968.0,140.8


In [None]:
# Show column dtypes and non-null counts to see which columns have missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Location           6019 non-null   object 
 1   Year               6019 non-null   int64  
 2   Kilometers_Driven  6019 non-null   int64  
 3   Fuel_Type          6019 non-null   object 
 4   Transmission       6019 non-null   object 
 5   Owner_Type         6019 non-null   object 
 6   Seats              5976 non-null   float64
 7   Price              6019 non-null   float64
 8   Brand              6019 non-null   object 
 9   Series             6019 non-null   object 
 10  Type               6019 non-null   object 
 11  Mileage (kmpl)     5951 non-null   float64
 12  Engine (CC)        5983 non-null   float64
 13  Power (bhp)        5876 non-null   float64
dtypes: float64(5), int64(2), object(7)
memory usage: 658.5+ KB


# Preprocessing

In [None]:
# Remove outliers: keep every row whose Kilometers_Driven is not above 1,000,000
df = df.loc[~df["Kilometers_Driven"].gt(1e6)]
df.shape

(6018, 14)

In [None]:
# Discard every row that has at least one missing value, then verify with the
# project helper (its displayed table lists null counts per column)
df = df.dropna(axis=0, how="any")
null_checker(df)

Unnamed: 0,null (sum),null (%)
Location,0,0.0
Year,0,0.0
Kilometers_Driven,0,0.0
Fuel_Type,0,0.0
Transmission,0,0.0
Owner_Type,0,0.0
Seats,0,0.0
Price,0,0.0
Brand,0,0.0
Series,0,0.0


## Feature engineering

In [None]:
# Collapse every level that occurs fewer than 10 times into a single "Other" level.
# NOTE(review): the frequencies are computed on the full frame, i.e. before the
# train/test split performed further down.
for col in ("Brand", "Series", "Type"):
    level_counts = df[col].value_counts()
    rare_levels = level_counts.index[level_counts < 10]
    df[col] = df[col].where(~df[col].isin(rare_levels), "Other")

In [None]:
# Make categorical feature interactions: one new integer-coded column for every
# unordered pair of the base categorical columns.
# (`combinations` and `LabelEncoder` come from the notebook's top import cell.)
cat_cols = ['Location', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Brand']

for left, right in combinations(cat_cols, 2):
    new_col = left + '_' + right

    # Build the combined level on a temporary Series so the frame is written
    # only once per pair instead of three times.
    pair_values = df[left] + "_" + df[right]

    # Pairs seen fewer than 10 times are merged into a single "Other" level.
    counts = pair_values.value_counts()
    other = counts[counts < 10].index
    pair_values = pair_values.where(~pair_values.isin(other), "Other")

    # Replace the string levels with integer codes; a fresh encoder is fitted
    # per column, so codes are not comparable across interaction columns.
    encoder = LabelEncoder()
    df[new_col] = encoder.fit_transform(pair_values)

## Train test split

In [None]:
# Perform the train/test split up front to prevent data leakage
target = df['Price']
features = df.drop(columns=['Price'])
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=0,
)

## Encoding

In [None]:
# One-hot encode the low-cardinality categorical columns.
# The encoder learns its categories from the training split only and is then
# applied unchanged to both splits.
col_to_encode = ['Location', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Brand']
oh_encoder = ce.OneHotEncoder(use_cat_names=True, cols=col_to_encode)
oh_encoder.fit(X_train)

X_train, X_test = oh_encoder.transform(X_train), oh_encoder.transform(X_test)


is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead



In [None]:
# Target-encode the remaining categorical columns.
# The encoder is fitted on the training features and training target only,
# then applied to both splits.
col_to_encode = ['Series', 'Type']
encoder = ce.TargetEncoder(cols=col_to_encode)
encoder.fit(X_train, y_train)

X_train, X_test = encoder.transform(X_train), encoder.transform(X_test)


is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead



# Modeling

## Hyperparameter Tuning

### XGBoost

#### Study 1

In [None]:
def objective(trial):
    """Return the 5-fold cross-validated RMSE of XGBoost for one Optuna trial.

    The learning rate is fixed at 0.1; tree depth, child weight, gamma,
    row/column subsampling, L1/L2 regularisation and the grow policy are
    sampled from ``trial``. Intermediate "test-rmse" values are reported to
    Optuna through the pruning callback so weak trials can be stopped early.
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)

    params = {
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'learning_rate': 0.1,
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 6),
        # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform
        'gamma': trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    }

    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-rmse")

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    history = xgb.cv(params, dtrain, num_boost_round=2000,
                     early_stopping_rounds=100,
                     callbacks=[pruning_callback],
                     metrics='rmse',
                     folds=cv)

    # Score of the last recorded boosting round.
    # NOTE(review): presumably xgb.cv trims the history at the early-stopping
    # point, making this the best round - confirm against the xgboost docs.
    return history["test-rmse-mean"].iloc[-1]


pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
# Seed the sampler so a re-run proposes the same sequence of trials.
sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(sampler=sampler, pruner=pruner, direction='minimize')
study.optimize(objective, n_trials=1000)

print(f"Number of finished trials: {len(study.trials)}")

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[32m[I 2020-10-30 07:21:51,399][0m A new study created in memory with name: no-name-ec5f0d08-98a3-4e0b-a555-e30e688be9a0[0m
[32m[I 2020-10-30 07:22:07,425][0m Trial 0 finished with value: 4.558777600000001 and parameters: {'max_depth': 1, 'min_child_weight': 1, 'gamma': 7.010463012214196e-07, 'subsample': 0.938496388763982, 'colsample_bytree': 0.8911451412529441, 'lambda': 4.908751293152921e-06, 'alpha': 8.318387482870321e-07, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 4.558777600000001.[0m
[32m[I 2020-10-30 07:22:15,711][0m Trial 1 finished with value: 3.416033 and parameters: {'max_depth': 8, 'min_child_weight': 3, 'gamma': 7.452675553704548e-06, 'subsample': 0.19169423850878003, 'colsample_bytree': 0.13607431136394976, 'lambda': 0.0001295608400399946, 'alpha': 0.0006100941957659861, 'grow_policy': 'depthwise'}. Best is trial 1 with value: 3.416033.[0m
[32m[I 2020-10-30 07:22:33,061][0m Trial 2 finished with value: 3.2390412 and parameters: {'max_depth': 5, '

Number of finished trials: 1000
Best trial:
  Value: 2.9204086
  Params: 
    max_depth: 10
    min_child_weight: 6
    gamma: 3.020622071249327e-06
    subsample: 0.4352692942911126
    colsample_bytree: 0.8728193404268683
    lambda: 2.839454531904628e-07
    alpha: 3.0286129349372527e-07
    grow_policy: lossguide


In [None]:
# Combine the fixed settings of XGBoost study 1 with the values tuned by Optuna.
# `study` must still refer to the study-1 object here; later cells reuse the name.
xgb_study_1_params = study.best_params
xgb_param_1 = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'learning_rate': 0.1,
    **xgb_study_1_params,
}
xgb_param_1

{'alpha': 3.0286129349372527e-07,
 'colsample_bytree': 0.8728193404268683,
 'gamma': 3.020622071249327e-06,
 'grow_policy': 'lossguide',
 'lambda': 2.839454531904628e-07,
 'learning_rate': 0.1,
 'max_depth': 10,
 'min_child_weight': 6,
 'objective': 'reg:squarederror',
 'subsample': 0.4352692942911126,
 'tree_method': 'hist'}

In [None]:
# Re-run the 5-fold CV with the study-1 parameters; the number of rows in the
# returned history is used as the number of boosting rounds for the final model.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)  # not referenced again in the visible cells

cv = KFold(n_splits=5, shuffle=True, random_state=0)
history = xgb.cv(
    xgb_param_1,
    dtrain,
    folds=cv,
    metrics='rmse',
    num_boost_round=2000,
    early_stopping_rounds=100,
)
xgb_n_estimators_1 = len(history)
xgb_n_estimators_1

131

#### Study 2

In [None]:
def objective(trial):
    """Return the 5-fold cross-validated RMSE of XGBoost for one Optuna trial.

    Same search space as the first XGBoost study, but with the learning rate
    fixed at 0.01. Intermediate "test-rmse" values are reported to Optuna
    through the pruning callback so weak trials can be stopped early.
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)

    params = {
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'learning_rate': 0.01,
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 6),
        # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform
        'gamma': trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    }

    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-rmse")

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    history = xgb.cv(params, dtrain, num_boost_round=2000,
                     early_stopping_rounds=100,
                     callbacks=[pruning_callback],
                     metrics='rmse',
                     folds=cv)

    # Score of the last recorded boosting round.
    # NOTE(review): presumably xgb.cv trims the history at the early-stopping
    # point, making this the best round - confirm against the xgboost docs.
    return history["test-rmse-mean"].iloc[-1]


pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
# Seed the sampler so a re-run proposes the same sequence of trials.
sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(sampler=sampler, pruner=pruner, direction='minimize')
study.optimize(objective, n_trials=1000)

print(f"Number of finished trials: {len(study.trials)}")

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[32m[I 2020-10-30 07:45:09,256][0m A new study created in memory with name: no-name-04829d01-4a52-4b97-b44e-a35cf362b065[0m
[32m[I 2020-10-30 07:45:43,269][0m Trial 0 finished with value: 3.4086556000000003 and parameters: {'max_depth': 2, 'min_child_weight': 3, 'gamma': 2.187807901976827e-06, 'subsample': 0.6891184875092443, 'colsample_bytree': 0.5774780891844628, 'lambda': 0.05503263692762871, 'alpha': 0.00846677125318954, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 3.4086556000000003.[0m
[32m[I 2020-10-30 07:46:23,034][0m Trial 1 finished with value: 3.3120082 and parameters: {'max_depth': 5, 'min_child_weight': 1, 'gamma': 1.3810789375977578e-08, 'subsample': 0.41482586714703407, 'colsample_bytree': 0.15141180091494574, 'lambda': 1.184419886857891e-08, 'alpha': 1.5324609738112442e-05, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 3.3120082.[0m
[32m[I 2020-10-30 07:47:45,333][0m Trial 2 finished with value: 3.3139703999999996 and parameters: {'max_

Number of finished trials: 1000
Best trial:
  Value: 2.9371882
  Params: 
    max_depth: 9
    min_child_weight: 3
    gamma: 0.00032783802740044254
    subsample: 0.26379475087937865
    colsample_bytree: 0.6914870316667033
    lambda: 0.0008495959295682414
    alpha: 0.009495064792943987
    grow_policy: depthwise


In [None]:
# Combine the fixed settings of XGBoost study 2 with the values tuned by Optuna.
# `study` must still refer to the study-2 object here; later cells reuse the name.
xgb_study_2_params = study.best_params
xgb_param_2 = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'learning_rate': 0.01,
    **xgb_study_2_params,
}
xgb_param_2

{'alpha': 0.009495064792943987,
 'colsample_bytree': 0.6914870316667033,
 'gamma': 0.00032783802740044254,
 'grow_policy': 'depthwise',
 'lambda': 0.0008495959295682414,
 'learning_rate': 0.01,
 'max_depth': 9,
 'min_child_weight': 3,
 'objective': 'reg:squarederror',
 'subsample': 0.26379475087937865,
 'tree_method': 'hist'}

In [None]:
# Re-run the 5-fold CV with the study-2 parameters; the number of rows in the
# returned history is used as the number of boosting rounds for the final model.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)  # not referenced again in the visible cells

cv = KFold(n_splits=5, shuffle=True, random_state=0)
history = xgb.cv(
    xgb_param_2,
    dtrain,
    folds=cv,
    metrics='rmse',
    num_boost_round=2000,
    early_stopping_rounds=100,
)
xgb_n_estimators_2 = len(history)
xgb_n_estimators_2

1354

#### Evaluation

In [77]:
# Build one regressor per XGBoost study and compare them with the project's
# evaluation helper. Each label embeds the round count and the parameter dict.
xgb_study_1 = XGBRegressor(n_estimators=xgb_n_estimators_1, **xgb_param_1)
xgb_study_2 = XGBRegressor(n_estimators=xgb_n_estimators_2, **xgb_param_2)

xgb_models = dict([
    (f'XGBRegressor ({xgb_n_estimators_1}) {xgb_param_1}', xgb_study_1),
    (f'XGBRegressor ({xgb_n_estimators_2}) {xgb_param_2}', xgb_study_2),
])
evaluate_model(xgb_models, X_train, X_test, y_train, y_test)

Unnamed: 0_level_0,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"XGBRegressor (1354) {'alpha': 0.009495064792943987, 'colsample_bytree': 0.6914870316667033, 'gamma': 0.00032783802740044254, 'grow_policy': 'depthwise', 'lambda': 0.0008495959295682414, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 3, 'objective': 'reg:squarederror', 'subsample': 0.26379475087937865, 'tree_method': 'hist'}",5.2534,0.9827,0.9252,0.936,1.4419,2.9362,2.9712
"XGBRegressor (131) {'alpha': 3.0286129349372527e-07, 'colsample_bytree': 0.8728193404268683, 'gamma': 3.020622071249327e-06, 'grow_policy': 'lossguide', 'lambda': 2.839454531904628e-07, 'learning_rate': 0.1, 'max_depth': 10, 'min_child_weight': 6, 'objective': 'reg:squarederror', 'subsample': 0.4352692942911126, 'tree_method': 'hist'}",0.6248,0.9805,0.9171,0.9261,1.5295,3.1138,3.1928


#### Study 3

In [42]:
def objective(trial):
    """Return the 5-fold cross-validated RMSE of XGBoost for one Optuna trial.

    All parameters are taken from ``xgb_param_2``; only the learning rate is
    searched, uniformly in [0.001, 0.01]. Intermediate "test-rmse" values are
    reported to Optuna through the pruning callback.
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)

    # Work on a copy so the study-2 parameter dict itself is never modified.
    params = xgb_param_2.copy()
    # suggest_float replaces the deprecated suggest_uniform
    params["learning_rate"] = trial.suggest_float('learning_rate', 0.001, 0.01)

    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-rmse")

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    history = xgb.cv(params, dtrain, num_boost_round=2000,
                     early_stopping_rounds=100,
                     callbacks=[pruning_callback],
                     metrics='rmse',
                     folds=cv)

    # Score of the last recorded boosting round.
    # NOTE(review): presumably xgb.cv trims the history at the early-stopping
    # point, making this the best round - confirm against the xgboost docs.
    return history["test-rmse-mean"].iloc[-1]


pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
# Seed the sampler so a re-run proposes the same sequence of trials.
sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(sampler=sampler, pruner=pruner, direction='minimize')
study.optimize(objective, n_trials=1000)

print(f"Number of finished trials: {len(study.trials)}")

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[32m[I 2020-10-30 10:16:53,665][0m A new study created in memory with name: no-name-a060a3ab-6d68-4fde-a219-63c422b88196[0m
[32m[I 2020-10-30 10:18:27,628][0m Trial 0 finished with value: 3.5541846 and parameters: {'learning_rate': 0.0011799544267268}. Best is trial 0 with value: 3.5541846.[0m
[32m[I 2020-10-30 10:19:20,953][0m Trial 1 finished with value: 2.9286193999999997 and parameters: {'learning_rate': 0.006921087295537538}. Best is trial 1 with value: 2.9286193999999997.[0m
[32m[I 2020-10-30 10:20:14,343][0m Trial 2 finished with value: 2.9301014000000003 and parameters: {'learning_rate': 0.00689364712447553}. Best is trial 1 with value: 2.9286193999999997.[0m
[32m[I 2020-10-30 10:20:59,595][0m Trial 3 finished with value: 2.940917 and parameters: {'learning_rate': 0.00893057166299717}. Best is trial 1 with value: 2.9286193999999997.[0m
[32m[I 2020-10-30 10:22:32,215][0m Trial 4 finished with value: 3.4513572000000003 and parameters: {'learning_rate': 0.00125583

Number of finished trials: 1000
Best trial:
  Value: 2.9203242
  Params: 
    learning_rate: 0.009501678089908304


In [43]:
# Study-3 parameters: identical to study 2 except for the tuned learning rate.
# `study` must still refer to the study-3 object here.
xgb_param_3 = {**xgb_param_2, "learning_rate": study.best_params["learning_rate"]}
xgb_param_3

{'alpha': 0.009495064792943987,
 'colsample_bytree': 0.6914870316667033,
 'gamma': 0.00032783802740044254,
 'grow_policy': 'depthwise',
 'lambda': 0.0008495959295682414,
 'learning_rate': 0.009501678089908304,
 'max_depth': 9,
 'min_child_weight': 3,
 'objective': 'reg:squarederror',
 'subsample': 0.26379475087937865,
 'tree_method': 'hist'}

In [44]:
# Re-run the 5-fold CV with the study-3 parameters; the number of rows in the
# returned history is used as the number of boosting rounds for the final model.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)  # not referenced again in the visible cells

cv = KFold(n_splits=5, shuffle=True, random_state=0)
history = xgb.cv(
    xgb_param_3,
    dtrain,
    folds=cv,
    metrics='rmse',
    num_boost_round=2000,
    early_stopping_rounds=100,
)
xgb_n_estimators_3 = len(history)
xgb_n_estimators_3

1279

#### Evaluation

In [79]:
# Build one regressor per XGBoost study (1-3) and compare them with the
# project's evaluation helper; the resulting table is kept for export.
xgb_study_1 = XGBRegressor(n_estimators=xgb_n_estimators_1, **xgb_param_1)
xgb_study_2 = XGBRegressor(n_estimators=xgb_n_estimators_2, **xgb_param_2)
xgb_study_3 = XGBRegressor(n_estimators=xgb_n_estimators_3, **xgb_param_3)

xgb_models = dict([
    (f'XGBRegressor ({xgb_n_estimators_1}) {xgb_param_1}', xgb_study_1),
    (f'XGBRegressor ({xgb_n_estimators_2}) {xgb_param_2}', xgb_study_2),
    (f'XGBRegressor ({xgb_n_estimators_3}) {xgb_param_3}', xgb_study_3),
])
xgb_result = evaluate_model(xgb_models, X_train, X_test, y_train, y_test)
xgb_result

Unnamed: 0_level_0,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"XGBRegressor (1354) {'alpha': 0.009495064792943987, 'colsample_bytree': 0.6914870316667033, 'gamma': 0.00032783802740044254, 'grow_policy': 'depthwise', 'lambda': 0.0008495959295682414, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 3, 'objective': 'reg:squarederror', 'subsample': 0.26379475087937865, 'tree_method': 'hist'}",5.2654,0.9827,0.9252,0.936,1.4419,2.9362,2.9712
"XGBRegressor (1279) {'alpha': 0.009495064792943987, 'colsample_bytree': 0.6914870316667033, 'gamma': 0.00032783802740044254, 'grow_policy': 'depthwise', 'lambda': 0.0008495959295682414, 'learning_rate': 0.009501678089908304, 'max_depth': 9, 'min_child_weight': 3, 'objective': 'reg:squarederror', 'subsample': 0.26379475087937865, 'tree_method': 'hist'}",5.0459,0.9807,0.9248,0.9348,1.5216,2.9431,2.998
"XGBRegressor (131) {'alpha': 3.0286129349372527e-07, 'colsample_bytree': 0.8728193404268683, 'gamma': 3.020622071249327e-06, 'grow_policy': 'lossguide', 'lambda': 2.839454531904628e-07, 'learning_rate': 0.1, 'max_depth': 10, 'min_child_weight': 6, 'objective': 'reg:squarederror', 'subsample': 0.4352692942911126, 'tree_method': 'hist'}",0.6253,0.9805,0.9171,0.9261,1.5295,3.1138,3.1928


In [80]:
# Persist the XGBoost comparison table. The model description appears to live in
# the frame's index (the displayed table has a "Model" index header), so the
# index is written too; otherwise the CSV rows cannot be matched to a model.
xgb_result.to_csv("../data/processed/tuning_dropna_all (XGB).csv", index=True)

### LightGBM

#### Study 1

In [47]:
def objective(trial):
    """Return the 5-fold cross-validated RMSE of LightGBM for one Optuna trial.

    The learning rate is fixed at 0.1; tree size, L1/L2 regularisation,
    bagging, feature sampling and the minimum leaf size are sampled from
    ``trial``. Intermediate "rmse" values are reported to Optuna through the
    pruning callback so weak trials can be stopped early.
    """
    # A fresh Dataset is built for every trial, as in the original cell.
    dtrain = lgb.Dataset(X_train, label=y_train)

    params = {
        "objective": "regression",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "learning_rate": 0.1,
        "max_depth": trial.suggest_int("max_depth", 1, 30),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 1.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 1.0, log=True),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "rmse")

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    scores = lgb.cv(params, dtrain,
                    num_boost_round=2000,
                    early_stopping_rounds=100,
                    callbacks=[pruning_callback],
                    metrics='rmse',
                    folds=cv)

    # Mean RMSE across folds at the last recorded boosting round.
    return scores['rmse-mean'][-1]


pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
# Seed the sampler so a re-run proposes the same sequence of trials.
sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(sampler=sampler, pruner=pruner, direction='minimize')
study.optimize(objective, n_trials=1000)


print(f"Number of finished trials: {len(study.trials)}")

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[32m[I 2020-10-30 13:59:27,486][0m A new study created in memory with name: no-name-797efc47-e560-4644-aaad-7ebca5da1f49[0m
[32m[I 2020-10-30 13:59:32,298][0m Trial 0 finished with value: 3.679383895665654 and parameters: {'max_depth': 11, 'num_leaves': 176, 'lambda_l1': 0.0026744591059189595, 'lambda_l2': 2.9924672123697797e-05, 'bagging_freq': 4, 'bagging_fraction': 0.45335814329162616, 'feature_fraction': 0.11237327417273804, 'min_child_samples': 85}. Best is trial 0 with value: 3.679383895665654.[0m
[32m[I 2020-10-30 13:59:34,747][0m Trial 1 finished with value: 3.0794446595194502 and parameters: {'max_depth': 6, 'num_leaves': 148, 'lambda_l1': 0.013363349024751527, 'lambda_l2': 0.002227013726178783, 'bagging_freq': 7, 'bagging_fraction': 0.8352074826684289, 'feature_fraction': 0.36644806852262324, 'min_child_samples': 21}. Best is trial 1 with value: 3.0794446595194502.[0m
[32m[I 2020-10-30 13:59:41,130][0m Trial 2 finished with value: 3.3193009479708273 and parameters:

Number of finished trials: 1000
Best trial:
  Value: 2.9422405250435935
  Params: 
    max_depth: 17
    num_leaves: 123
    lambda_l1: 9.210064365252262e-06
    lambda_l2: 2.6170647026962826e-06
    bagging_freq: 3
    bagging_fraction: 0.6562019033557935
    feature_fraction: 0.7187318113607815
    min_child_samples: 8


In [48]:
# Get best params then add to param_1
lgb_study_1_params = study.best_params
lgb_param_1 = {
    "objective": "regression",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.1,
}
lgb_param_1.update(lgb_study_1_params)
lgb_param_1

{'bagging_fraction': 0.6562019033557935,
 'bagging_freq': 3,
 'boosting_type': 'gbdt',
 'feature_fraction': 0.7187318113607815,
 'lambda_l1': 9.210064365252262e-06,
 'lambda_l2': 2.6170647026962826e-06,
 'learning_rate': 0.1,
 'max_depth': 17,
 'min_child_samples': 8,
 'num_leaves': 123,
 'objective': 'regression',
 'verbosity': -1}

In [49]:
# Re-run the 5-fold CV with the study-1 parameters; the length of the returned
# history is used as the number of boosting rounds for the final model.
dtrain = lgb.Dataset(X_train, label=y_train)

cv = KFold(n_splits=5, shuffle=True, random_state=0)
history = lgb.cv(
    lgb_param_1,
    dtrain,
    folds=cv,
    metrics='rmse',
    num_boost_round=2000,
    early_stopping_rounds=100,
)
lgb_n_estimators_1 = len(pd.DataFrame(history))
lgb_n_estimators_1

178

#### Study 2

In [50]:
def objective(trial):
    """Return the 5-fold cross-validated RMSE of LightGBM for one Optuna trial.

    Same search space as the first LightGBM study, but with the learning rate
    fixed at 0.01. Intermediate "rmse" values are reported to Optuna through
    the pruning callback so weak trials can be stopped early.
    """
    # A fresh Dataset is built for every trial, as in the original cell.
    dtrain = lgb.Dataset(X_train, label=y_train)

    params = {
        "objective": "regression",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "learning_rate": 0.01,
        "max_depth": trial.suggest_int("max_depth", 1, 30),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 1.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 1.0, log=True),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "rmse")

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    scores = lgb.cv(params, dtrain,
                    num_boost_round=2000,
                    early_stopping_rounds=100,
                    callbacks=[pruning_callback],
                    metrics='rmse',
                    folds=cv)

    # Mean RMSE across folds at the last recorded boosting round.
    return scores['rmse-mean'][-1]


pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
# Seed the sampler so a re-run proposes the same sequence of trials.
sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(sampler=sampler, pruner=pruner, direction='minimize')
study.optimize(objective, n_trials=1000)


print(f"Number of finished trials: {len(study.trials)}")

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[32m[I 2020-10-30 14:15:13,321][0m A new study created in memory with name: no-name-d5d094ad-8c83-4abd-88b2-e3a4c971dc4a[0m
[32m[I 2020-10-30 14:15:20,052][0m Trial 0 finished with value: 3.2539513734573324 and parameters: {'max_depth': 5, 'num_leaves': 161, 'lambda_l1': 5.0590463810270455e-06, 'lambda_l2': 0.2223705014043537, 'bagging_freq': 6, 'bagging_fraction': 0.5453320146064133, 'feature_fraction': 0.4570134822211589, 'min_child_samples': 36}. Best is trial 0 with value: 3.2539513734573324.[0m
[32m[I 2020-10-30 14:15:33,154][0m Trial 1 finished with value: 3.6135066134640526 and parameters: {'max_depth': 24, 'num_leaves': 61, 'lambda_l1': 1.239540478218001e-08, 'lambda_l2': 0.05802178110089162, 'bagging_freq': 1, 'bagging_fraction': 0.342684990266268, 'feature_fraction': 0.9900487551687552, 'min_child_samples': 55}. Best is trial 0 with value: 3.2539513734573324.[0m
[32m[I 2020-10-30 14:16:03,600][0m Trial 2 finished with value: 3.048495672793108 and parameters: {'max_

Number of finished trials: 1000
Best trial:
  Value: 2.869579897845337
  Params: 
    max_depth: 25
    num_leaves: 109
    lambda_l1: 8.242344877787786e-06
    lambda_l2: 1.5661398846534485e-05
    bagging_freq: 2
    bagging_fraction: 0.4191531736891667
    feature_fraction: 0.931818501072818
    min_child_samples: 5


In [51]:
# Get best params then add to param_2
lgb_study_2_params = study.best_params
lgb_param_2 = {
    "objective": "regression",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.01,
}
lgb_param_2.update(lgb_study_2_params)
lgb_param_2

{'bagging_fraction': 0.4191531736891667,
 'bagging_freq': 2,
 'boosting_type': 'gbdt',
 'feature_fraction': 0.931818501072818,
 'lambda_l1': 8.242344877787786e-06,
 'lambda_l2': 1.5661398846534485e-05,
 'learning_rate': 0.1,
 'max_depth': 25,
 'min_child_samples': 5,
 'num_leaves': 109,
 'objective': 'regression',
 'verbosity': -1}

In [58]:
# Force the learning rate that study 2 was run with (0.01). The recorded output
# of the preceding cell still shows 0.1, so this cell guarantees the intended
# value regardless of which version of that cell was executed.
lgb_param_2.update({'learning_rate': 0.01})
lgb_param_2

{'bagging_fraction': 0.4191531736891667,
 'bagging_freq': 2,
 'boosting_type': 'gbdt',
 'feature_fraction': 0.931818501072818,
 'lambda_l1': 8.242344877787786e-06,
 'lambda_l2': 1.5661398846534485e-05,
 'learning_rate': 0.01,
 'max_depth': 25,
 'min_child_samples': 5,
 'num_leaves': 109,
 'objective': 'regression',
 'verbosity': -1}

In [59]:
# Re-run the 5-fold CV with the study-2 parameters; the length of the returned
# history is used as the number of boosting rounds for the final model.
dtrain = lgb.Dataset(X_train, label=y_train)

cv = KFold(n_splits=5, shuffle=True, random_state=0)
history = lgb.cv(
    lgb_param_2,
    dtrain,
    folds=cv,
    metrics='rmse',
    num_boost_round=2000,
    early_stopping_rounds=100,
)
lgb_n_estimators_2 = len(pd.DataFrame(history))
lgb_n_estimators_2

1244

#### Evaluation

In [63]:
# Build one regressor per LightGBM study and compare them with the project's
# evaluation helper. Each label embeds the round count and the parameter dict.
lgb_study_1 = LGBMRegressor(n_estimators=lgb_n_estimators_1, **lgb_param_1)
lgb_study_2 = LGBMRegressor(n_estimators=lgb_n_estimators_2, **lgb_param_2)

lgb_models = dict([
    (f'LGBMRegressor ({lgb_n_estimators_1}) {lgb_param_1}', lgb_study_1),
    (f'LGBMRegressor ({lgb_n_estimators_2}) {lgb_param_2}', lgb_study_2),
])
evaluate_model(lgb_models, X_train, X_test, y_train, y_test)

Unnamed: 0_level_0,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"LGBMRegressor (1244) {'objective': 'regression', 'verbosity': -1, 'boosting_type': 'gbdt', 'learning_rate': 0.01, 'max_depth': 25, 'num_leaves': 109, 'lambda_l1': 8.242344877787786e-06, 'lambda_l2': 1.5661398846534485e-05, 'bagging_freq': 2, 'bagging_fraction': 0.4191531736891667, 'feature_fraction': 0.931818501072818, 'min_child_samples': 5}",5.8926,0.9886,0.9281,0.9371,1.1696,2.8803,2.9456
"LGBMRegressor (178) {'objective': 'regression', 'verbosity': -1, 'boosting_type': 'gbdt', 'learning_rate': 0.1, 'max_depth': 17, 'num_leaves': 123, 'lambda_l1': 9.210064365252262e-06, 'lambda_l2': 2.6170647026962826e-06, 'bagging_freq': 3, 'bagging_fraction': 0.6562019033557935, 'feature_fraction': 0.7187318113607815, 'min_child_samples': 8}",0.603,0.993,0.9214,0.932,0.9174,3.0128,3.0624


#### Study 3

In [64]:
def objective(trial):
    """Return the 5-fold cross-validated RMSE of LightGBM for one Optuna trial.

    All parameters are taken from ``lgb_param_2``; only the learning rate is
    searched, uniformly in [0.001, 0.01]. Intermediate "rmse" values are
    reported to Optuna through the pruning callback.
    """
    # A fresh Dataset is built for every trial, as in the original cell.
    dtrain = lgb.Dataset(X_train, label=y_train)

    # Work on a copy so the study-2 parameter dict itself is never modified.
    params = lgb_param_2.copy()
    # suggest_float replaces the deprecated suggest_uniform
    params["learning_rate"] = trial.suggest_float('learning_rate', 0.001, 0.01)

    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "rmse")

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    scores = lgb.cv(params, dtrain,
                    num_boost_round=2000,
                    early_stopping_rounds=100,
                    callbacks=[pruning_callback],
                    metrics='rmse',
                    folds=cv)

    # Mean RMSE across folds at the last recorded boosting round.
    return scores['rmse-mean'][-1]


pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
# Seed the sampler so a re-run proposes the same sequence of trials.
sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(sampler=sampler, pruner=pruner, direction='minimize')
study.optimize(objective, n_trials=1000)


print(f"Number of finished trials: {len(study.trials)}")

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[32m[I 2020-10-30 15:34:20,492][0m A new study created in memory with name: no-name-a0efc7d9-981b-4594-8a0d-4f5cbf838cde[0m
[32m[I 2020-10-30 15:35:06,952][0m Trial 0 finished with value: 2.967848337079857 and parameters: {'learning_rate': 0.0037296312680058703}. Best is trial 0 with value: 2.967848337079857.[0m
[32m[I 2020-10-30 15:35:53,422][0m Trial 1 finished with value: 2.931532635109323 and parameters: {'learning_rate': 0.004510041717428174}. Best is trial 1 with value: 2.931532635109323.[0m
[32m[I 2020-10-30 15:36:30,435][0m Trial 2 finished with value: 2.8947271343521965 and parameters: {'learning_rate': 0.007769151296202674}. Best is trial 2 with value: 2.8947271343521965.[0m
[32m[I 2020-10-30 15:37:09,106][0m Trial 3 finished with value: 2.9155988658285663 and parameters: {'learning_rate': 0.007266313362992954}. Best is trial 2 with value: 2.8947271343521965.[0m
[32m[I 2020-10-30 15:37:50,209][0m Trial 4 finished with value: 2.9121726965438266 and parameters:

Number of finished trials: 1000
Best trial:
  Value: 2.8669981832342186
  Params: 
    learning_rate: 0.009998145518967253


In [65]:
# Get best params then add to param_3
lgb_param_3 = lgb_param_2.copy()
lgb_param_3["learning_rate"] = study.best_params["learning_rate"]
lgb_param_3

{'bagging_fraction': 0.4191531736891667,
 'bagging_freq': 2,
 'boosting_type': 'gbdt',
 'feature_fraction': 0.931818501072818,
 'lambda_l1': 8.242344877787786e-06,
 'lambda_l2': 1.5661398846534485e-05,
 'learning_rate': 0.009998145518967253,
 'max_depth': 25,
 'min_child_samples': 5,
 'num_leaves': 109,
 'objective': 'regression',
 'verbosity': -1}

In [66]:
# Re-run the 5-fold CV with the study-3 parameters; the length of the returned
# history is used as the number of boosting rounds for the final model.
dtrain = lgb.Dataset(X_train, label=y_train)

cv = KFold(n_splits=5, shuffle=True, random_state=0)
history = lgb.cv(
    lgb_param_3,
    dtrain,
    folds=cv,
    metrics='rmse',
    num_boost_round=2000,
    early_stopping_rounds=100,
)
lgb_n_estimators_3 = len(pd.DataFrame(history))
lgb_n_estimators_3

1244

#### Evaluation

In [69]:
# Build one regressor per LightGBM study (1-3) and compare them with the
# project's evaluation helper; the resulting table is kept for export.
lgb_study_1 = LGBMRegressor(n_estimators=lgb_n_estimators_1, **lgb_param_1)
lgb_study_2 = LGBMRegressor(n_estimators=lgb_n_estimators_2, **lgb_param_2)
lgb_study_3 = LGBMRegressor(n_estimators=lgb_n_estimators_3, **lgb_param_3)

lgb_models = dict([
    (f'LGBMRegressor ({lgb_n_estimators_1}) {lgb_param_1}', lgb_study_1),
    (f'LGBMRegressor ({lgb_n_estimators_2}) {lgb_param_2}', lgb_study_2),
    (f'LGBMRegressor ({lgb_n_estimators_3}) {lgb_param_3}', lgb_study_3),
])
lgb_result = evaluate_model(lgb_models, X_train, X_test, y_train, y_test)
lgb_result

Unnamed: 0_level_0,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"LGBMRegressor (1244) {'objective': 'regression', 'verbosity': -1, 'boosting_type': 'gbdt', 'learning_rate': 0.009998145518967253, 'max_depth': 25, 'num_leaves': 109, 'lambda_l1': 8.242344877787786e-06, 'lambda_l2': 1.5661398846534485e-05, 'bagging_freq': 2, 'bagging_fraction': 0.4191531736891667, 'feature_fraction': 0.931818501072818, 'min_child_samples': 5}",5.9132,0.9887,0.9288,0.9373,1.1671,2.8661,2.9412
"LGBMRegressor (1244) {'objective': 'regression', 'verbosity': -1, 'boosting_type': 'gbdt', 'learning_rate': 0.01, 'max_depth': 25, 'num_leaves': 109, 'lambda_l1': 8.242344877787786e-06, 'lambda_l2': 1.5661398846534485e-05, 'bagging_freq': 2, 'bagging_fraction': 0.4191531736891667, 'feature_fraction': 0.931818501072818, 'min_child_samples': 5}",5.9058,0.9886,0.9281,0.9371,1.1696,2.8803,2.9456
"LGBMRegressor (178) {'objective': 'regression', 'verbosity': -1, 'boosting_type': 'gbdt', 'learning_rate': 0.1, 'max_depth': 17, 'num_leaves': 123, 'lambda_l1': 9.210064365252262e-06, 'lambda_l2': 2.6170647026962826e-06, 'bagging_freq': 3, 'bagging_fraction': 0.6562019033557935, 'feature_fraction': 0.7187318113607815, 'min_child_samples': 8}",0.61,0.993,0.9214,0.932,0.9174,3.0128,3.0624


In [70]:
# Persist the LightGBM comparison table. The model description appears to live in
# the frame's index (the displayed table has a "Model" index header), so the
# index is written too; otherwise the CSV rows cannot be matched to a model.
lgb_result.to_csv("../data/processed/tuning_dropna_all (LGB).csv", index=True)

## Combine Result

In [82]:
# Stack the XGBoost and LightGBM tables, rank all models by cross-validated
# RMSE (lowest first) and save the ranking together with its index.
combined_result = (
    pd.concat([xgb_result, lgb_result], axis=0)
    .sort_values(by='CV RMSE')
)
combined_result.to_csv("../data/processed/tuning_dropna_all (XGB+LGB).csv", index=True)