# Data Importing

In [3]:
import pandas as pd
import numpy as np
import category_encoders as ce
import miceforest as mf
import optuna
import lightgbm as lgb
import xgboost as xgb

from utils import *
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn import metrics
from sklearn.impute import SimpleImputer

In [4]:
# Load the processed dataset from CSV (path is relative to the notebook's directory).
df = pd.read_csv('../data/processed/after_prep.csv')
# Preview the first rows to confirm the columns were read as expected.
df.head()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,Price,Brand,Series,Type,Mileage (kmpl),Engine (CC),Power (bhp)
0,Mumbai,2010,72000,CNG,Manual,First,5.0,1.75,Maruti,Wagon,R,26.6,998.0,58.16
1,Pune,2015,41000,Diesel,Manual,First,5.0,12.5,Hyundai,Creta,1.6,19.67,1582.0,126.2
2,Chennai,2011,46000,Petrol,Manual,First,5.0,4.5,Honda,Jazz,V,18.2,1199.0,88.7
3,Chennai,2012,87000,Diesel,Manual,First,7.0,6.0,Maruti,Ertiga,VDI,20.77,1248.0,88.76
4,Coimbatore,2013,40670,Diesel,Automatic,Second,5.0,17.74,Audi,A4,New,15.2,1968.0,140.8


In [5]:
# Show dtypes and non-null counts per column to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Location           6019 non-null   object 
 1   Year               6019 non-null   int64  
 2   Kilometers_Driven  6019 non-null   int64  
 3   Fuel_Type          6019 non-null   object 
 4   Transmission       6019 non-null   object 
 5   Owner_Type         6019 non-null   object 
 6   Seats              5976 non-null   float64
 7   Price              6019 non-null   float64
 8   Brand              6019 non-null   object 
 9   Series             6019 non-null   object 
 10  Type               6019 non-null   object 
 11  Mileage (kmpl)     5951 non-null   float64
 12  Engine (CC)        5983 non-null   float64
 13  Power (bhp)        5876 non-null   float64
dtypes: float64(5), int64(2), object(7)
memory usage: 658.5+ KB


# Preprocessing

In [6]:
# Remove outliers: discard rows whose Kilometers_Driven exceeds one million.
is_extreme_km = df["Kilometers_Driven"] > 1e6
df = df[~is_extreme_km]
df.shape

(6018, 14)

In [7]:
# Discard every row that contains at least one missing value, then report
# the remaining null counts with the project helper.
df = df.dropna(axis=0, how="any")
null_checker(df)

Unnamed: 0,null (sum),null (%)
Location,0,0.0
Year,0,0.0
Kilometers_Driven,0,0.0
Fuel_Type,0,0.0
Transmission,0,0.0
Owner_Type,0,0.0
Seats,0,0.0
Price,0,0.0
Brand,0,0.0
Series,0,0.0


## Feature engineering

In [8]:
# Collapse rare levels: any Brand/Series/Type value that occurs fewer than
# 10 times is relabelled as "Other".
for col in ("Brand", "Series", "Type"):
    level_counts = df[col].value_counts()
    is_rare = level_counts < 10
    rare_levels = level_counts.loc[is_rare].index
    df[col] = df[col].replace(rare_levels, "Other")

In [9]:
# Build pairwise interaction features from the categorical columns.
from itertools import combinations
from sklearn.preprocessing import LabelEncoder

cat_cols = ['Location', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Brand']

for first, second in combinations(cat_cols, 2):
    # The new column is named "<first>_<second>" and holds the two values
    # joined with an underscore.
    pair_col = '_'.join((first, second))
    df[pair_col] = df[first] + "_" + df[second]

    # Combinations seen fewer than 10 times are merged into "Other".
    pair_counts = df[pair_col].value_counts()
    rare_pairs = pair_counts.loc[pair_counts < 10].index
    df[pair_col] = df[pair_col].replace(rare_pairs, "Other")

df.head()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,Price,Brand,Series,...,Location_Fuel_Type,Location_Transmission,Location_Owner_Type,Location_Brand,Fuel_Type_Transmission,Fuel_Type_Owner_Type,Fuel_Type_Brand,Transmission_Owner_Type,Transmission_Brand,Owner_Type_Brand
0,Mumbai,2010,72000,CNG,Manual,First,5.0,1.75,Maruti,Wagon,...,Mumbai_CNG,Mumbai_Manual,Mumbai_First,Mumbai_Maruti,CNG_Manual,CNG_First,CNG_Maruti,Manual_First,Manual_Maruti,First_Maruti
1,Pune,2015,41000,Diesel,Manual,First,5.0,12.5,Hyundai,Creta,...,Pune_Diesel,Pune_Manual,Pune_First,Pune_Hyundai,Diesel_Manual,Diesel_First,Diesel_Hyundai,Manual_First,Manual_Hyundai,First_Hyundai
2,Chennai,2011,46000,Petrol,Manual,First,5.0,4.5,Honda,Jazz,...,Chennai_Petrol,Chennai_Manual,Chennai_First,Chennai_Honda,Petrol_Manual,Petrol_First,Petrol_Honda,Manual_First,Manual_Honda,First_Honda
3,Chennai,2012,87000,Diesel,Manual,First,7.0,6.0,Maruti,Ertiga,...,Chennai_Diesel,Chennai_Manual,Chennai_First,Chennai_Maruti,Diesel_Manual,Diesel_First,Diesel_Maruti,Manual_First,Manual_Maruti,First_Maruti
4,Coimbatore,2013,40670,Diesel,Automatic,Second,5.0,17.74,Audi,A4,...,Coimbatore_Diesel,Coimbatore_Automatic,Coimbatore_Second,Coimbatore_Audi,Diesel_Automatic,Diesel_Second,Diesel_Audi,Automatic_Second,Automatic_Audi,Second_Audi


## Train test split

In [10]:
# Split into train and test sets up front so that no information from the
# test set leaks into the later encoding/imputation steps.
train_data, test_data = train_test_split(df, test_size=0.25, random_state=0)

## Encoding

In [11]:
# Integer codes for the two categorical columns that are encoded by a fixed mapping.
mapping_owner = {
    'First': 1,
    'Second': 2,
    'Third': 3,
    'Fourth & Above': 4
}
mapping_trans = {
    'Manual': 0,
    'Automatic': 1,
}


def _encode_ordinals(frame):
    """Return a copy of `frame` with Owner_Type and Transmission mapped to integers.

    `assign` builds a new DataFrame instead of writing into the object returned
    by train_test_split, which avoids pandas' SettingWithCopyWarning.

    Raises:
        ValueError: if any value is not covered by the mappings. `.map` would
            otherwise turn it into NaN silently; this also happens when the
            cell is re-run on columns that are already encoded.
    """
    encoded = frame.assign(
        Owner_Type=frame["Owner_Type"].map(mapping_owner),
        Transmission=frame["Transmission"].map(mapping_trans),
    )
    unmapped = encoded[["Owner_Type", "Transmission"]].isna().sum()
    if unmapped.any():
        raise ValueError(
            "Unmapped Owner_Type/Transmission values (or columns already encoded): "
            + str(unmapped.to_dict())
        )
    return encoded


# Encoding train set
train_data = _encode_ordinals(train_data)
# Encoding test set
test_data = _encode_ordinals(test_data)

In [12]:
import kfold_target_encoder as enc

# Every column that is still of object dtype gets target-encoded against Price.
col_to_encode = list(train_data.select_dtypes("object").columns)

# Train set: one 5-fold encoder per column; each call returns the frame that
# is fed to the next encoder.
for col in col_to_encode:
    targetc = enc.KFoldTargetEncoderTrain(col, "Price", n_fold=5)
    train_data = targetc.fit_transform(train_data)

# Test set: the encoder is given the already-encoded train frame together with
# the raw column name and its "<col>_Enc" counterpart.
# NOTE(review): presumably it derives per-category values from train_data;
# confirm in kfold_target_encoder.
for col in col_to_encode:
    test_targetc = enc.KFoldTargetEncoderTest(train_data, col, col + "_Enc")
    test_data = test_targetc.fit_transform(test_data)

# The raw categorical columns are no longer needed once encoded.
train_data = train_data.drop(columns=col_to_encode)
test_data = test_data.drop(columns=col_to_encode)

Correlation between the new feature, Location_Enc and, Price is 0.22953890285650633.
Correlation between the new feature, Fuel_Type_Enc and, Price is 0.31902625145603397.
Correlation between the new feature, Brand_Enc and, Price is 0.7558118852201624.
Correlation between the new feature, Series_Enc and, Price is 0.7679810838662673.
Correlation between the new feature, Type_Enc and, Price is 0.6936821252171319.
Correlation between the new feature, Location_Fuel_Type_Enc and, Price is 0.4040042711000571.
Correlation between the new feature, Location_Transmission_Enc and, Price is 0.6047463551016796.
Correlation between the new feature, Location_Owner_Type_Enc and, Price is 0.23746243798549335.
Correlation between the new feature, Location_Brand_Enc and, Price is 0.6984005568634631.
Correlation between the new feature, Fuel_Type_Transmission_Enc and, Price is 0.6637391383193235.
Correlation between the new feature, Fuel_Type_Owner_Type_Enc and, Price is 0.3302114220837294.
Correlation bet

In [13]:
# Check dtypes and non-null counts of the encoded test set.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1462 entries, 4872 to 2309
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Year                         1462 non-null   int64  
 1   Kilometers_Driven            1462 non-null   int64  
 2   Transmission                 1462 non-null   int64  
 3   Owner_Type                   1462 non-null   int64  
 4   Seats                        1462 non-null   float64
 5   Price                        1462 non-null   float64
 6   Mileage (kmpl)               1462 non-null   float64
 7   Engine (CC)                  1462 non-null   float64
 8   Power (bhp)                  1462 non-null   float64
 9   Location_Enc                 1462 non-null   float64
 10  Fuel_Type_Enc                1462 non-null   float64
 11  Brand_Enc                    1462 non-null   float64
 12  Series_Enc                   1462 non-null   float64
 13  Type_Enc       

# Modeling

## Functions

In [14]:
def get_cv_score(models, X_train, y_train):
    """Compute mean cross-validated R2, RMSE and MAE for each model.

    Each model is scored with a shuffled 5-fold split (random_state=0) and the
    per-fold scores are averaged.

    Parameters
    ----------
    models : dict
        Maps a display label to an estimator accepted by `cross_validate`.
    X_train, y_train
        Features and target passed to `cross_validate`.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Model', with columns 'CV R2', 'CV RMSE', 'CV MAE'
        (errors reported as positive numbers).
    """
    # Explicit mapping from cross_validate's result keys to report columns, so
    # the labels can never drift out of sync with the metrics they describe.
    metric_columns = {
        'test_r2': 'CV R2',
        'test_neg_root_mean_squared_error': 'CV RMSE',
        'test_neg_mean_absolute_error': 'CV MAE',
    }
    scoring = [key[len('test_'):] for key in metric_columns]
    report_columns = list(metric_columns.values())

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    fold_scores = []
    for label, model in models.items():
        cv_results = cross_validate(model, X_train, y_train, cv=cv,
                                    scoring=scoring)
        per_fold = pd.DataFrame(cv_results)[list(metric_columns)]
        fold_scores.append(per_fold.assign(Model=label))

    if not fold_scores:
        # Nothing to score: return an empty, correctly shaped summary.
        return pd.DataFrame(columns=report_columns,
                            index=pd.Index([], name='Model'))

    summary = (
        pd.concat(fold_scores)
        .groupby('Model')
        .mean()
        .rename(columns=metric_columns)
    )
    # scikit-learn reports these errors negated ("higher is better"); flip the sign.
    summary[['CV RMSE', 'CV MAE']] = -summary[['CV RMSE', 'CV MAE']]

    return summary

In [15]:
def evaluate_model(models, X_train, X_test, y_train, y_test):
    """Fit every model and tabulate train, cross-validation and test metrics.

    Each estimator in `models` (label -> estimator) is fitted in place on the
    training data, scored on the train and test sets (R2, RMSE, MAE), and the
    result is joined with the cross-validation scores from `get_cv_score`.

    Returns a DataFrame sorted by 'CV RMSE' (ascending) and rounded to four
    decimals, with the model label in the last column.
    """
    train_keys = ('Train R2', 'Train RMSE', 'Train MAE')
    test_keys = ('Test R2', 'Test RMSE', 'Test MAE')

    def _r2_rmse_mae(y_true, y_pred):
        # Same three metrics are reported for both the train and test sets.
        return (
            metrics.r2_score(y_true, y_pred),
            np.sqrt(metrics.mean_squared_error(y_true, y_pred)),
            metrics.mean_absolute_error(y_true, y_pred),
        )

    rows = []
    for label, model in models.items():
        model.fit(X_train, y_train)

        row = {'Model': label}
        row.update(zip(train_keys, _r2_rmse_mae(y_train, model.predict(X_train))))
        row.update(zip(test_keys, _r2_rmse_mae(y_test, model.predict(X_test))))
        rows.append(row)

    fitted_scores = pd.DataFrame(
        rows, columns=['Model', *train_keys, *test_keys]
    ).set_index('Model')

    # Attach the cross-validation scores by model label.
    report = fitted_scores.join(get_cv_score(models, X_train, y_train)).reset_index()

    column_order = ['Train R2', 'CV R2', 'Test R2',
                    'Train RMSE', 'CV RMSE', 'Test RMSE',
                    'Train MAE', 'CV MAE', 'Test MAE', 'Model']
    return report[column_order].sort_values(by='CV RMSE').round(4)

In [16]:
# Separate the target (Price) from the predictors for both partitions.
X_train, y_train = train_data.drop(columns="Price"), train_data["Price"]
X_test, y_test = test_data.drop(columns="Price"), test_data["Price"]

In [17]:
# Sanity check: features and target must have matching row counts per partition.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((4383, 23), (4383,), (1462, 23), (1462,))

## Hyperparameter Tuning

### XGBoost

#### Study 1

In [19]:
def objective(trial):
    """Optuna objective: 5-fold CV RMSE of XGBoost at learning_rate=0.1.

    Samples tree and regularisation hyperparameters from `trial`, runs
    `xgb.cv` with early stopping, and returns the last `test-rmse-mean` value
    of the returned history.
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)

    param = {
        'objective': 'reg:squarederror',
        # Use the CPU 'hist' method, matching param_1 and the later studies;
        # 'gpu_hist' needs a GPU build and would tune a different algorithm
        # than the one used to refit the final model.
        'tree_method': 'hist',
        'learning_rate': 0.1,
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 6),
        'gamma': trial.suggest_loguniform("gamma", 1e-8, 1.0),
        'subsample':trial.suggest_uniform('subsample', 0.1, 1),
        'colsample_bytree':trial.suggest_uniform('colsample_bytree', 0.1, 1),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    }

    # Lets the study's pruner stop unpromising trials based on the CV test RMSE.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-rmse")

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    history = xgb.cv(param, dtrain, num_boost_round=2000,
                     early_stopping_rounds=100,
                     callbacks=[pruning_callback],
                     metrics='rmse',
                     folds=cv)

    mean_score = history["test-rmse-mean"].values[-1]
    return mean_score

pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study = optuna.create_study(pruner=pruner, direction='minimize')
study.optimize(objective, n_trials=1000)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2020-10-30 12:51:06,413][0m A new study created in memory with name: no-name-5f01eb69-b3d4-48f1-9367-26b78782afc5[0m
[32m[I 2020-10-30 12:51:15,077][0m Trial 0 finished with value: 4.6973112 and parameters: {'max_depth': 1, 'min_child_weight': 1, 'gamma': 0.13741488645576821, 'subsample': 0.4269568096317711, 'colsample_bytree': 0.836250440384237, 'lambda': 1.201350549061885e-05, 'alpha': 0.005055729577608604, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 4.6973112.[0m
[32m[I 2020-10-30 12:52:19,401][0m Trial 1 finished with value: 3.7865275999999994 and parameters: {'max_depth': 6, 'min_child_weight': 3, 'gamma': 0.12769774023948166, 'subsample': 0.7605083478923543, 'colsample_bytree': 0.10177924609534657, 'lambda': 4.582839066555021e-05, 'alpha': 3.275501370610254e-07, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 3.7865275999999994.[0m


KeyboardInterrupt: 

In [None]:
# Combine the fixed booster settings of study 1 with its tuned values.
study_1_params = study.best_params
param_1 = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'learning_rate': 0.1,
    **study_1_params,
}
param_1

{'alpha': 0.7590127908977213,
 'colsample_bytree': 0.9607197516247233,
 'gamma': 2.3958951034113745e-07,
 'grow_policy': 'depthwise',
 'lambda': 0.00013762368579039482,
 'learning_rate': 0.1,
 'max_depth': 10,
 'min_child_weight': 4,
 'objective': 'reg:squarederror',
 'subsample': 0.6186395456914845,
 'tree_method': 'hist'}

In [None]:
# Re-run the early-stopped CV with param_1; the number of rows in the returned
# history is used as n_estimators for the final model.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

cv = KFold(n_splits=5, shuffle=True, random_state=0)
history = xgb.cv(param_1, dtrain, num_boost_round=2000,
                 early_stopping_rounds=100, metrics='rmse', folds=cv)
n_estimators_1 = len(history)
n_estimators_1

73

#### Study 2

In [None]:
def objective(trial):
    """Optuna objective: 5-fold CV RMSE of XGBoost at learning_rate=0.01."""
    search_space = {
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'learning_rate': 0.01,
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 6),
        'gamma': trial.suggest_loguniform("gamma", 1e-8, 1.0),
        'subsample': trial.suggest_uniform('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.1, 1),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
    }

    cv_history = xgb.cv(
        search_space,
        xgb.DMatrix(X_train, label=y_train),
        num_boost_round=2000,
        early_stopping_rounds=100,
        # Reports the CV test RMSE to the study's pruner.
        callbacks=[optuna.integration.XGBoostPruningCallback(trial, "test-rmse")],
        metrics='rmse',
        folds=KFold(n_splits=5, shuffle=True, random_state=0),
    )

    # Last row of the returned history is the score reported for this trial.
    return cv_history["test-rmse-mean"].values[-1]


pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study = optuna.create_study(pruner=pruner, direction='minimize')
study.optimize(objective, n_trials=1000)

# Summary of the search.
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for name, setting in trial.params.items():
    print("    {}: {}".format(name, setting))

[32m[I 2020-10-29 17:27:34,387][0m A new study created in memory with name: no-name-9de25694-6aae-460e-bd6e-c9a1e6053a48[0m
[32m[I 2020-10-29 17:27:53,580][0m Trial 0 finished with value: 3.169108 and parameters: {'max_depth': 3, 'min_child_weight': 4, 'gamma': 2.8175426413045556e-05, 'subsample': 0.347474169111479, 'colsample_bytree': 0.9837370310385078, 'lambda': 0.0007521936991226943, 'alpha': 0.07774597566476385, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 3.169108.[0m
[32m[I 2020-10-29 17:28:24,362][0m Trial 1 finished with value: 4.2587564 and parameters: {'max_depth': 1, 'min_child_weight': 2, 'gamma': 0.00012409400570614071, 'subsample': 0.5307739269217518, 'colsample_bytree': 0.8489245015220577, 'lambda': 0.1347898363403124, 'alpha': 1.25707339853515e-07, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 3.169108.[0m
[32m[I 2020-10-29 17:28:48,100][0m Trial 2 finished with value: 2.9726986 and parameters: {'max_depth': 5, 'min_child_weight': 4, '

Number of finished trials: 1000
Best trial:
  Value: 2.8765868000000006
  Params: 
    max_depth: 10
    min_child_weight: 3
    gamma: 1.1273311837112433e-07
    subsample: 0.48167905375660824
    colsample_bytree: 0.9626653908113837
    lambda: 3.4544441383016645e-08
    alpha: 4.241571364142641e-08
    grow_policy: depthwise


In [None]:
# Combine the fixed booster settings of study 2 with its tuned values.
study_2_params = study.best_params
param_2 = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'learning_rate': 0.01,
    **study_2_params,
}
param_2

{'alpha': 4.241571364142641e-08,
 'colsample_bytree': 0.9626653908113837,
 'gamma': 1.1273311837112433e-07,
 'grow_policy': 'depthwise',
 'lambda': 3.4544441383016645e-08,
 'learning_rate': 0.01,
 'max_depth': 10,
 'min_child_weight': 3,
 'objective': 'reg:squarederror',
 'subsample': 0.48167905375660824,
 'tree_method': 'hist'}

In [None]:
# Re-run the early-stopped CV with param_2; the number of rows in the returned
# history is used as n_estimators for the final model.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

cv = KFold(n_splits=5, shuffle=True, random_state=0)
history = xgb.cv(param_2, dtrain, num_boost_round=2000,
                 early_stopping_rounds=100, metrics='rmse', folds=cv)
n_estimators_2 = len(history)
n_estimators_2

737

#### Evaluation

In [None]:
# Refit the two tuned configurations and compare train / CV / test metrics.
xgb_study_1 = XGBRegressor(n_estimators=n_estimators_1, **param_1)
xgb_study_2 = XGBRegressor(n_estimators=n_estimators_2, **param_2)

# Labels carry the number of boosting rounds so the rows can be told apart.
models = dict([
    (f'XGBRegressor ({n_estimators_1})', xgb_study_1),
    (f'XGBRegressor ({n_estimators_2})', xgb_study_2),
])
evaluate_model(models, X_train, X_test, y_train, y_test)

Unnamed: 0,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE,Train MAE,CV MAE,Test MAE,Model
1,0.9872,0.9272,0.9143,1.2411,2.9078,3.4386,0.5346,1.1063,1.2659,XGBRegressor (737)
0,0.9883,0.9215,0.9129,1.1862,3.0241,3.4659,0.5356,1.1452,1.3158,XGBRegressor (73)


#### Study 3

In [None]:
def objective(trial):
    """Optuna objective: tune only the learning rate on top of param_2.

    Returns the last `test-rmse-mean` value of the early-stopped 5-fold
    `xgb.cv` history.
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)

    # Work on a copy: assigning into param_2 itself would overwrite its
    # learning rate on every trial and silently change the study-2 model
    # (and param_3) built from it afterwards.
    param = dict(param_2)
    param["learning_rate"] = trial.suggest_uniform('learning_rate', 0.001, 0.01)

    # Reports the CV test RMSE to the study's pruner.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-rmse")

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    history = xgb.cv(param, dtrain, num_boost_round=2000,
                     early_stopping_rounds=100,
                     callbacks=[pruning_callback],
                     metrics='rmse',
                     folds=cv)

    mean_score = history["test-rmse-mean"].values[-1]
    return mean_score

pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study = optuna.create_study(pruner=pruner, direction='minimize')
study.optimize(objective, n_trials=1000)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2020-10-29 20:34:16,981][0m A new study created in memory with name: no-name-f670fbf7-3cd4-4d89-b24c-df8a96bd10cb[0m
[32m[I 2020-10-29 20:35:21,142][0m Trial 0 finished with value: 2.8993066 and parameters: {'learning_rate': 0.006717476196364012}. Best is trial 0 with value: 2.8993066.[0m
[32m[I 2020-10-29 20:37:04,821][0m Trial 1 finished with value: 2.8944406000000003 and parameters: {'learning_rate': 0.004528649551108013}. Best is trial 1 with value: 2.8944406000000003.[0m
[32m[I 2020-10-29 20:39:02,243][0m Trial 2 finished with value: 2.9068328 and parameters: {'learning_rate': 0.0034886000818777083}. Best is trial 1 with value: 2.8944406000000003.[0m
[32m[I 2020-10-29 20:41:25,756][0m Trial 3 finished with value: 2.9089868 and parameters: {'learning_rate': 0.0024890236114943794}. Best is trial 1 with value: 2.8944406000000003.[0m
[32m[I 2020-10-29 20:43:33,843][0m Trial 4 finished with value: 2.8996508 and parameters: {'learning_rate': 0.0030767161347567415

In [None]:
# param_3 = param_2 with the learning rate replaced by the one found in study 3.
param_3 = {**param_2, "learning_rate": study.best_params["learning_rate"]}
param_3

In [None]:
# Re-run the early-stopped CV with param_3; the number of rows in the returned
# history is used as n_estimators for the final model.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

cv = KFold(n_splits=5, shuffle=True, random_state=0)
history = xgb.cv(param_3, dtrain, num_boost_round=2000,
                 early_stopping_rounds=100, metrics='rmse', folds=cv)
n_estimators_3 = len(history)
n_estimators_3

#### Evaluation

In [None]:
# Refit all three tuned XGBoost configurations and compare their metrics.
xgb_study_1 = XGBRegressor(n_estimators=n_estimators_1, **param_1)
xgb_study_2 = XGBRegressor(n_estimators=n_estimators_2, **param_2)
xgb_study_3 = XGBRegressor(n_estimators=n_estimators_3, **param_3)

# Each label embeds the round count and full parameter dict so the exported
# table is self-describing.
models = dict([
    (f'XGBRegressor ({n_estimators_1}) {param_1}', xgb_study_1),
    (f'XGBRegressor ({n_estimators_2}) {param_2}', xgb_study_2),
    (f'XGBRegressor ({n_estimators_3}) {param_3}', xgb_study_3),
])
result = evaluate_model(models, X_train, X_test, y_train, y_test)
result

In [None]:
# Persist the XGBoost comparison table (written to the current working directory).
result.to_csv("tuning_dropna_all (XGB).csv", index=False)

### LightGBM

#### Study 1

In [None]:
def objective(trial):
    """Optuna objective: 5-fold CV RMSE of LightGBM at learning_rate=0.1."""
    search_space = {
        "objective": "regression",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "learning_rate": 0.1,
        "max_depth": trial.suggest_int("max_depth", 1, 30),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 1.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.1, 1),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    fold_scores = lgb.cv(
        search_space,
        lgb.Dataset(X_train, label=y_train),
        num_boost_round=2000,
        early_stopping_rounds=100,
        # Reports the CV RMSE to the study's pruner.
        callbacks=[optuna.integration.LightGBMPruningCallback(trial, "rmse")],
        metrics='rmse',
        folds=KFold(n_splits=5, shuffle=True, random_state=0),
    )

    # Last entry of the returned RMSE series is the score for this trial.
    return fold_scores['rmse-mean'][-1]


pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study = optuna.create_study(pruner=pruner, direction='minimize')
study.optimize(objective, n_trials=1000)

# Summary of the search.
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for name, setting in trial.params.items():
    print("    {}: {}".format(name, setting))

In [None]:
# Combine the fixed LightGBM settings of study 1 with its tuned values.
study_1_params = study.best_params
param_1 = {
    "objective": "regression",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.1,
    **study_1_params,
}
param_1

In [None]:
# Re-run the early-stopped CV with param_1; the length of the returned history
# is used as n_estimators for the final model.
dtrain = lgb.Dataset(X_train, label=y_train)

cv = KFold(n_splits=5, shuffle=True, random_state=0)
history = lgb.cv(param_1, dtrain, num_boost_round=2000,
                 early_stopping_rounds=100, metrics='rmse', folds=cv)
n_estimators_1 = len(pd.DataFrame(history))
n_estimators_1

#### Study 2

In [None]:
def objective(trial):
    """Optuna objective: 5-fold CV RMSE of LightGBM at learning_rate=0.01."""
    search_space = {
        "objective": "regression",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "learning_rate": 0.01,
        "max_depth": trial.suggest_int("max_depth", 1, 30),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 1.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.1, 1),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    fold_scores = lgb.cv(
        search_space,
        lgb.Dataset(X_train, label=y_train),
        num_boost_round=2000,
        early_stopping_rounds=100,
        # Reports the CV RMSE to the study's pruner.
        callbacks=[optuna.integration.LightGBMPruningCallback(trial, "rmse")],
        metrics='rmse',
        folds=KFold(n_splits=5, shuffle=True, random_state=0),
    )

    # Last entry of the returned RMSE series is the score for this trial.
    return fold_scores['rmse-mean'][-1]


pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study = optuna.create_study(pruner=pruner, direction='minimize')
study.optimize(objective, n_trials=1000)

# Summary of the search.
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for name, setting in trial.params.items():
    print("    {}: {}".format(name, setting))

In [None]:
# Get best params then add to param_2.
# The learning rate must match the 0.01 used while tuning study 2; reusing
# study 1's 0.1 here would refit the tuned parameters at a rate they were
# never evaluated with.
study_2_params = study.best_params
param_2 = {
    "objective": "regression",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.01,
}
param_2.update(study_2_params)
param_2

In [None]:
# Re-run the early-stopped CV with param_2; the length of the returned history
# is used as n_estimators for the final model.
dtrain = lgb.Dataset(X_train, label=y_train)

cv = KFold(n_splits=5, shuffle=True, random_state=0)
history = lgb.cv(param_2, dtrain, num_boost_round=2000,
                 early_stopping_rounds=100, metrics='rmse', folds=cv)
n_estimators_2 = len(pd.DataFrame(history))
n_estimators_2

#### Evaluation

In [None]:
# Refit the two tuned LightGBM configurations and compare their metrics.
lgb_study_1 = LGBMRegressor(n_estimators=n_estimators_1, **param_1)
lgb_study_2 = LGBMRegressor(n_estimators=n_estimators_2, **param_2)

# Each label embeds the round count and full parameter dict.
models = dict([
    (f'LGBMRegressor ({n_estimators_1}) {param_1}', lgb_study_1),
    (f'LGBMRegressor ({n_estimators_2}) {param_2}', lgb_study_2),
])
result = evaluate_model(models, X_train, X_test, y_train, y_test)
result

#### Study 3

In [None]:
def objective(trial):
    """Optuna objective: tune only the learning rate on top of param_2.

    Returns the last `rmse-mean` value of the early-stopped 5-fold `lgb.cv`
    result.
    """
    dtrain = lgb.Dataset(X_train, label=y_train)

    # Work on a copy: assigning into param_2 itself would overwrite its
    # learning rate on every trial and silently change the study-2 model
    # (and param_3) built from it afterwards.
    param = dict(param_2)
    param["learning_rate"] = trial.suggest_uniform('learning_rate', 0.001, 0.01)

    # Reports the CV RMSE to the study's pruner.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "rmse")

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    # Pass the per-trial `param` dict; the previous `params` name is not
    # defined in this function or at module level.
    scores = lgb.cv(param, dtrain,
                    num_boost_round=2000,
                    early_stopping_rounds=100,
                    callbacks=[pruning_callback],
                    metrics='rmse',
                    folds=cv)

    mean_score = scores['rmse-mean'][-1]
    return mean_score

pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study = optuna.create_study(pruner=pruner, direction='minimize')
study.optimize(objective, n_trials=1000)


print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
# param_3 = param_2 with the learning rate replaced by the one found in study 3.
param_3 = {**param_2, "learning_rate": study.best_params["learning_rate"]}
param_3

In [None]:
# Re-run the early-stopped CV with param_3; the length of the returned history
# is used as n_estimators for the final model.
dtrain = lgb.Dataset(X_train, label=y_train)

cv = KFold(n_splits=5, shuffle=True, random_state=0)
history = lgb.cv(param_3, dtrain, num_boost_round=2000,
                 early_stopping_rounds=100, metrics='rmse', folds=cv)
n_estimators_3 = len(pd.DataFrame(history))
n_estimators_3

#### Evaluation

In [None]:
# Refit all three tuned LightGBM configurations and compare their metrics.
lgb_study_1 = LGBMRegressor(n_estimators=n_estimators_1, **param_1)
lgb_study_2 = LGBMRegressor(n_estimators=n_estimators_2, **param_2)
lgb_study_3 = LGBMRegressor(n_estimators=n_estimators_3, **param_3)

# Each label embeds the round count and full parameter dict so the exported
# table is self-describing.
models = dict([
    (f'LGBMRegressor ({n_estimators_1}) {param_1}', lgb_study_1),
    (f'LGBMRegressor ({n_estimators_2}) {param_2}', lgb_study_2),
    (f'LGBMRegressor ({n_estimators_3}) {param_3}', lgb_study_3),
])
result = evaluate_model(models, X_train, X_test, y_train, y_test)
result

In [None]:
# Persist the LightGBM comparison table (written to the current working directory).
result.to_csv("tuning_dropna_all (LGB).csv", index=False)