In [1]:
import time
from tqdm import tqdm
import shap
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor
from xgboost import plot_importance
from catboost import CatBoostClassifier, CatBoostRegressor
import optuna

warnings.filterwarnings("ignore")

# <h1><center> DATA READING & MERGING </center></h1>

In [2]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
price = pd.read_csv("data/price.csv")

In [3]:
train

Unnamed: 0,Customer,Area,City Code,Billing Date,Product,Plant,Litres
0,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,T102,0.0
1,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,T155,0.0
2,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,T102,0.0
3,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,T155,0.0
4,10048261,Karadeniz Bölgesi,883,2014-01-02,Motorin,T155,16685.0
...,...,...,...,...,...,...,...
1379255,12663344,Doğu Anadolu Bölgesi,218,2018-12-22,Motorin,T317,1504.0
1379256,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Benzin,T317,11111.0
1379257,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Motorin,T317,22882.0
1379258,12664221,Marmara Bölgesi,941,2018-12-29,Benzin,T102,6528.0


In [4]:
test

Unnamed: 0,index,Customer,Area,City Code,Product,Week Start Date
0,0,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-07
1,1,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-14
2,2,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-21
3,3,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-28
4,4,10048261,Karadeniz Bölgesi,883,Motorin,2019-02-04
...,...,...,...,...,...,...
108467,108467,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-02
108468,108468,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-09
108469,108469,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-16
108470,108470,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-23


In [5]:
price

Unnamed: 0,City Code,Validfrom,Product,CRP
0,116,2016-07-27,Benzin,4.46
1,116,2016-07-28,Benzin,4.46
2,116,2016-07-29,Benzin,4.46
3,116,2016-07-30,Benzin,4.40
4,116,2016-07-31,Benzin,4.40
...,...,...,...,...
311739,990,2019-12-27,Motorin,6.74
311740,990,2019-12-28,Motorin,6.74
311741,990,2019-12-29,Motorin,6.74
311742,990,2019-12-30,Motorin,6.74


In [6]:
train.drop(columns=["Plant"], inplace = True)
train.rename(columns={"City Code":"City_Code", "Billing Date":"Date"}, inplace = True)
train

Unnamed: 0,Customer,Area,City_Code,Date,Product,Litres
0,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,0.0
1,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,0.0
2,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,0.0
3,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,0.0
4,10048261,Karadeniz Bölgesi,883,2014-01-02,Motorin,16685.0
...,...,...,...,...,...,...
1379255,12663344,Doğu Anadolu Bölgesi,218,2018-12-22,Motorin,1504.0
1379256,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Benzin,11111.0
1379257,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Motorin,22882.0
1379258,12664221,Marmara Bölgesi,941,2018-12-29,Benzin,6528.0


In [7]:
price.rename(columns={"City Code":"City_Code", "Validfrom":"Date"}, inplace = True)
price

Unnamed: 0,City_Code,Date,Product,CRP
0,116,2016-07-27,Benzin,4.46
1,116,2016-07-28,Benzin,4.46
2,116,2016-07-29,Benzin,4.46
3,116,2016-07-30,Benzin,4.40
4,116,2016-07-31,Benzin,4.40
...,...,...,...,...
311739,990,2019-12-27,Motorin,6.74
311740,990,2019-12-28,Motorin,6.74
311741,990,2019-12-29,Motorin,6.74
311742,990,2019-12-30,Motorin,6.74


In [8]:
train_price = pd.merge(train, price, on = ["City_Code", "Date", "Product"])
train_price.drop_duplicates(inplace = True)
train_price

Unnamed: 0,Customer,Area,City_Code,Date,Product,Litres,CRP
0,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,0.0,4.98
2,10049900,Karadeniz Bölgesi,883,2014-01-01,Benzin,0.0,4.98
3,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,0.0,4.58
5,10048264,Karadeniz Bölgesi,883,2014-01-01,Motorin,0.0,4.58
6,10049900,Karadeniz Bölgesi,883,2014-01-01,Motorin,0.0,4.58
...,...,...,...,...,...,...,...
1359584,12663274,Doğu Anadolu Bölgesi,518,2018-12-31,Motorin,17102.0,5.84
1359585,12663344,Doğu Anadolu Bölgesi,218,2018-12-12,Benzin,6546.0,6.28
1359586,12663344,Doğu Anadolu Bölgesi,218,2018-12-12,Motorin,26785.0,6.11
1359587,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Benzin,11111.0,6.01


In [9]:
test.drop(columns=["index"], inplace = True)
test.rename(columns={"City Code":"City_Code", "Week Start Date":"Date"}, inplace = True)
test

Unnamed: 0,Customer,Area,City_Code,Product,Date
0,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-07
1,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-14
2,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-21
3,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-28
4,10048261,Karadeniz Bölgesi,883,Motorin,2019-02-04
...,...,...,...,...,...
108467,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-02
108468,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-09
108469,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-16
108470,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-23


In [10]:
test_price = pd.merge(test, price, on = ["City_Code", "Date", "Product"], how = "left")
test_price

Unnamed: 0,Customer,Area,City_Code,Product,Date,CRP
0,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-07,5.76
1,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-14,5.94
2,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-21,6.14
3,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-28,6.14
4,10048261,Karadeniz Bölgesi,883,Motorin,2019-02-04,6.14
...,...,...,...,...,...,...
108467,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-02,7.14
108468,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-09,7.14
108469,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-16,7.14
108470,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-23,7.14


# <h1><center> DATA PREPROCESSING </center></h1>

In [11]:
X_train_val = train_price.drop(["Litres"], axis = 1)
X_train_val

Unnamed: 0,Customer,Area,City_Code,Date,Product,CRP
0,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,4.98
2,10049900,Karadeniz Bölgesi,883,2014-01-01,Benzin,4.98
3,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,4.58
5,10048264,Karadeniz Bölgesi,883,2014-01-01,Motorin,4.58
6,10049900,Karadeniz Bölgesi,883,2014-01-01,Motorin,4.58
...,...,...,...,...,...,...
1359584,12663274,Doğu Anadolu Bölgesi,518,2018-12-31,Motorin,5.84
1359585,12663344,Doğu Anadolu Bölgesi,218,2018-12-12,Benzin,6.28
1359586,12663344,Doğu Anadolu Bölgesi,218,2018-12-12,Motorin,6.11
1359587,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Benzin,6.01


In [12]:
y_train_val = train_price["Litres"]
y_train_val

0              0.0
2              0.0
3              0.0
5              0.0
6              0.0
            ...   
1359584    17102.0
1359585     6546.0
1359586    26785.0
1359587    11111.0
1359588    22882.0
Name: Litres, Length: 1355061, dtype: float64

In [13]:
X_test = test_price.copy()
X_test

Unnamed: 0,Customer,Area,City_Code,Product,Date,CRP
0,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-07,5.76
1,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-14,5.94
2,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-21,6.14
3,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-28,6.14
4,10048261,Karadeniz Bölgesi,883,Motorin,2019-02-04,6.14
...,...,...,...,...,...,...
108467,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-02,7.14
108468,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-09,7.14
108469,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-16,7.14
108470,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-23,7.14


In [14]:
cols = list(X_test.columns)
cols = cols[:3] + ["Date", "Product"] + cols[5:]
X_test = X_test[cols]
X_test

Unnamed: 0,Customer,Area,City_Code,Date,Product,CRP
0,10048261,Karadeniz Bölgesi,883,2019-01-07,Motorin,5.76
1,10048261,Karadeniz Bölgesi,883,2019-01-14,Motorin,5.94
2,10048261,Karadeniz Bölgesi,883,2019-01-21,Motorin,6.14
3,10048261,Karadeniz Bölgesi,883,2019-01-28,Motorin,6.14
4,10048261,Karadeniz Bölgesi,883,2019-02-04,Motorin,6.14
...,...,...,...,...,...,...
108467,12739737,Doğu Anadolu Bölgesi,194,2019-12-02,Benzin,7.14
108468,12739737,Doğu Anadolu Bölgesi,194,2019-12-09,Benzin,7.14
108469,12739737,Doğu Anadolu Bölgesi,194,2019-12-16,Benzin,7.14
108470,12739737,Doğu Anadolu Bölgesi,194,2019-12-23,Benzin,7.14


In [15]:
lbl = LabelEncoder() 
X_train_val['Area'] = lbl.fit_transform(X_train_val['Area'].astype(str))
X_train_val['Date'] = lbl.fit_transform(X_train_val['Date'].astype(str)) 
X_train_val['Product'] = lbl.fit_transform(X_train_val['Product'].astype(str)) 
X_train_val

Unnamed: 0,Customer,Area,City_Code,Date,Product,CRP
0,10048261,4,883,0,0,4.98
2,10049900,4,883,0,0,4.98
3,10048261,4,883,0,1,4.58
5,10048264,4,883,0,1,4.58
6,10049900,4,883,0,1,4.58
...,...,...,...,...,...,...
1359584,12663274,1,518,1777,1,5.84
1359585,12663344,1,218,1758,0,6.28
1359586,12663344,1,218,1758,1,6.11
1359587,12663344,1,218,1774,0,6.01


In [16]:
lbl = LabelEncoder() 
X_test['Area'] = lbl.fit_transform(X_test['Area'].astype(str))
X_test['Date'] = lbl.fit_transform(X_test['Date'].astype(str)) 
X_test['Product'] = lbl.fit_transform(X_test['Product'].astype(str)) 
X_test

Unnamed: 0,Customer,Area,City_Code,Date,Product,CRP
0,10048261,4,883,0,1,5.76
1,10048261,4,883,1,1,5.94
2,10048261,4,883,2,1,6.14
3,10048261,4,883,3,1,6.14
4,10048261,4,883,4,1,6.14
...,...,...,...,...,...,...
108467,12739737,1,194,47,0,7.14
108468,12739737,1,194,48,0,7.14
108469,12739737,1,194,49,0,7.14
108470,12739737,1,194,50,0,7.14


In [17]:
fold_number = 10 if X_train_val.shape[0] < 100_000 else 5 if  X_train_val.shape[0] < 1_000_000 else 3
initial_split = 0.3
ho_split = 0.5

# <h1><center> MODEL COMPARISON</center></h1>

# LGBM

In [18]:
rmsle_scores = []

for _ in tqdm(range(fold_number), desc="Training..."):
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=initial_split)
    lgbmr_base = LGBMRegressor()
    lgbmr_base.fit(X_train, y_train)
    y_pred = lgbmr_base.predict(X_val)
    y_pred_norm = minmax_scale(y_pred, feature_range=(0,1))
    y_val_norm = minmax_scale(y_val, feature_range=(0,1))
    rmsle_score = mean_squared_log_error(y_pred_norm, y_val_norm, squared=False)
    rmsle_scores.append(rmsle_score)
    
print("RMSLE Score: {}".format(np.mean(rmsle_scores)))

Training...: 100%|███████████████████████████████████████████████████████████████████████| 3/3 [00:11<00:00,  3.84s/it]

RMSLE Score: 0.23329783802030923





# XGBOOST

In [19]:
rmsle_scores = []

for _ in tqdm(range(fold_number), desc="Training..."):
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=initial_split)
    xgbr_base = XGBRegressor()
    xgbr_base.fit(X_train, y_train)
    y_pred = xgbr_base.predict(X_val)
    y_pred_norm = minmax_scale(y_pred, feature_range=(0,1))
    y_val_norm = minmax_scale(y_val, feature_range=(0,1))
    rmsle_score = mean_squared_log_error(y_pred_norm, y_val_norm, squared=False)
    rmsle_scores.append(rmsle_score)
    
print("RMSLE Score: {}".format(np.mean(rmsle_scores)))

Training...: 100%|███████████████████████████████████████████████████████████████████████| 3/3 [01:49<00:00, 36.53s/it]

RMSLE Score: 0.2211671503114243





# CATBOOST

In [20]:
rmsle_scores = []

for _ in tqdm(range(fold_number), desc="Training..."):
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=initial_split)
    cbr_base = CatBoostRegressor(verbose=0)
    cbr_base.fit(X_train, y_train)
    y_pred = cbr_base.predict(X_val)
    y_pred_norm = minmax_scale(y_pred, feature_range=(0,1))
    y_val_norm = minmax_scale(y_val, feature_range=(0,1))
    rmsle_score = mean_squared_log_error(y_pred_norm, y_val_norm, squared=False)
    rmsle_scores.append(rmsle_score)
    
print("RMSLE Score: {}".format(np.mean(rmsle_scores)))

Training...: 100%|██████████████████████████████████████████████████████████████████████| 3/3 [06:23<00:00, 127.78s/it]

RMSLE Score: 0.22396463665652352





# <h1><center> HYPERPARAMETER TUNING </center></h1>

# LGBM

In [26]:
def objective(trial):
    params = {
        #'metric': 'mlogloss', 
        #'n_estimators': trial.suggest_int('num_leaves', 10, 1000),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02, 0.1]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        "scale_pos_weight": trial.suggest_int('scale_pos_weight', 1, 10),
    }    
    rmsle_scores = []
    
    for _ in range(fold_number):
        X_train, X_inner_val, y_train, y_inner_val = train_test_split(X_train_inner_val, y_train_inner_val, test_size=initial_split)
        lgbmr = LGBMRegressor(**params)
        lgbmr.fit(X_train, y_train)
        y_inner_pred = lgbmr.predict(X_inner_val)
        y_inner_pred_norm = minmax_scale(y_inner_pred, feature_range=(0,1))
        y_inner_val_norm = minmax_scale(y_inner_val, feature_range=(0,1))
        rmsle_score = mean_squared_log_error(y_inner_pred_norm, y_inner_val_norm, squared=False)
        rmsle_scores.append(rmsle_score)
    return np.mean(rmsle_scores)

X_train_inner_val, X_outer_val, y_train_inner_val, y_outer_val = train_test_split(X_train_val, y_train_val, test_size=ho_split)
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

[32m[I 2022-12-24 12:46:43,606][0m A new study created in memory with name: no-name-2743a8ce-6861-4b45-805e-c57a4745793e[0m
[32m[I 2022-12-24 12:47:11,234][0m Trial 0 finished with value: 0.30288466310338996 and parameters: {'reg_alpha': 0.3133970921180145, 'reg_lambda': 0.02166794004467384, 'colsample_bytree': 0.3, 'subsample': 0.6, 'learning_rate': 0.014, 'max_depth': 100, 'num_leaves': 379, 'min_child_samples': 58, 'scale_pos_weight': 3}. Best is trial 0 with value: 0.30288466310338996.[0m
[32m[I 2022-12-24 12:47:25,135][0m Trial 1 finished with value: 0.20583521150242232 and parameters: {'reg_alpha': 0.05998940865758211, 'reg_lambda': 7.901718593122019, 'colsample_bytree': 0.8, 'subsample': 0.7, 'learning_rate': 0.01, 'max_depth': 20, 'num_leaves': 137, 'min_child_samples': 126, 'scale_pos_weight': 5}. Best is trial 1 with value: 0.20583521150242232.[0m
[32m[I 2022-12-24 12:48:05,115][0m Trial 2 finished with value: 0.15753500771304854 and parameters: {'reg_alpha': 0.240

[32m[I 2022-12-24 12:59:02,187][0m Trial 22 finished with value: 0.16003865985696886 and parameters: {'reg_alpha': 0.01484311516717204, 'reg_lambda': 0.013030040772600464, 'colsample_bytree': 1.0, 'subsample': 0.8, 'learning_rate': 0.017, 'max_depth': 100, 'num_leaves': 789, 'min_child_samples': 14, 'scale_pos_weight': 9}. Best is trial 16 with value: 0.1275114905390621.[0m
[32m[I 2022-12-24 12:59:29,555][0m Trial 23 finished with value: 0.16852325668851523 and parameters: {'reg_alpha': 0.24523774664618112, 'reg_lambda': 0.006476984715990487, 'colsample_bytree': 1.0, 'subsample': 0.4, 'learning_rate': 0.006, 'max_depth': 100, 'num_leaves': 404, 'min_child_samples': 31, 'scale_pos_weight': 8}. Best is trial 16 with value: 0.1275114905390621.[0m
[32m[I 2022-12-24 12:59:47,497][0m Trial 24 finished with value: 0.3021729033333001 and parameters: {'reg_alpha': 3.590361315496289, 'reg_lambda': 0.02526651929779915, 'colsample_bytree': 0.3, 'subsample': 0.4, 'learning_rate': 0.006, 'ma

[32m[I 2022-12-24 13:10:49,754][0m Trial 44 finished with value: 0.15727756670334722 and parameters: {'reg_alpha': 0.008959159095855172, 'reg_lambda': 0.0010111815654282263, 'colsample_bytree': 1.0, 'subsample': 0.8, 'learning_rate': 0.017, 'max_depth': 100, 'num_leaves': 855, 'min_child_samples': 20, 'scale_pos_weight': 10}. Best is trial 16 with value: 0.1275114905390621.[0m
[32m[I 2022-12-24 13:11:46,232][0m Trial 45 finished with value: 0.14741129271898248 and parameters: {'reg_alpha': 0.008149509214468198, 'reg_lambda': 0.00112555093878746, 'colsample_bytree': 1.0, 'subsample': 0.8, 'learning_rate': 0.017, 'max_depth': 100, 'num_leaves': 867, 'min_child_samples': 21, 'scale_pos_weight': 10}. Best is trial 16 with value: 0.1275114905390621.[0m
[32m[I 2022-12-24 13:12:34,286][0m Trial 46 finished with value: 0.19257291820149738 and parameters: {'reg_alpha': 0.0020310753536184724, 'reg_lambda': 0.0018967023511484522, 'colsample_bytree': 0.9, 'subsample': 0.8, 'learning_rate':

In [27]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
best_trial = study.best_trial

print("  Best Value: {}".format(best_trial.value))

print("  Best Params: ")
for key, value in best_trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 50
Best trial:
  Best Value: 0.0996289201651316
  Best Params: 
    reg_alpha: 0.007175842912809304
    reg_lambda: 0.004778131710498134
    colsample_bytree: 0.7
    subsample: 0.8
    learning_rate: 0.017
    max_depth: 100
    num_leaves: 848
    min_child_samples: 1
    scale_pos_weight: 9


In [28]:
outer_rmsle_scores = []

for _ in tqdm(range(fold_number), desc="Outer Validation Evaluation..."):
    X_train, X_val, y_train, y_val = train_test_split(X_outer_val, y_outer_val, test_size=initial_split)
    lgbmr = LGBMRegressor(**best_trial.params)
    lgbmr.fit(X_train, y_train)
    y_pred = lgbmr.predict(X_val)
    y_pred_norm = minmax_scale(y_pred, feature_range=(0,1))
    y_val_norm = minmax_scale(y_val, feature_range=(0,1))
    rmsle_score = mean_squared_log_error(y_pred_norm, y_val_norm, squared=False)
    outer_rmsle_scores.append(rmsle_score)
print("RMSLE Score for Outer Validation: {}".format(np.mean(outer_rmsle_scores)))

Outer Validation Evaluation...: 100%|████████████████████████████████████████████████████| 3/3 [00:48<00:00, 16.16s/it]

RMSLE Score for Outer Validation: 0.09861719515532913





# XGBOOST

In [23]:
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
        "scale_pos_weight": trial.suggest_int('scale_pos_weight', 1, 10),
        'eval_metric': 'mlogloss',
        'use_label_encoder': False
    }
    rmsle_scores = []
    
    for _ in range(fold_number):
        X_train, X_inner_val, y_train, y_inner_val = train_test_split(X_train_inner_val, y_train_inner_val, test_size=initial_split)
        xgbr = XGBRegressor(**params)
        xgbr.fit(X_train, y_train)
        y_inner_pred = xgbr.predict(X_inner_val)
        y_inner_pred_norm = minmax_scale(y_inner_pred, feature_range=(0,1))
        y_inner_val_norm = minmax_scale(y_inner_val, feature_range=(0,1))
        rmsle_score = mean_squared_log_error(y_inner_pred_norm, y_inner_val_norm, squared=False)
        rmsle_scores.append(rmsle_score)
    return np.mean(rmsle_scores)

X_train_inner_val, X_outer_val, y_train_inner_val, y_outer_val = train_test_split(X_train_val, y_train_val, test_size=ho_split)
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

[32m[I 2022-12-24 12:20:45,024][0m A new study created in memory with name: no-name-64c1bad0-0e18-4a7c-96fc-d15a6e02bff4[0m
[32m[I 2022-12-24 12:22:06,594][0m Trial 0 finished with value: 0.3413538695809167 and parameters: {'max_depth': 6, 'learning_rate': 0.005665774538065229, 'n_estimators': 363, 'min_child_weight': 3, 'gamma': 0.0023550414026541964, 'subsample': 0.04129245753196764, 'colsample_bytree': 0.013109501109084196, 'reg_alpha': 2.899099908957447e-06, 'reg_lambda': 0.0002645661858789095, 'scale_pos_weight': 4}. Best is trial 0 with value: 0.3413538695809167.[0m
[32m[I 2022-12-24 12:22:35,662][0m Trial 1 finished with value: 0.31555829470487556 and parameters: {'max_depth': 9, 'learning_rate': 0.8070883713892307, 'n_estimators': 99, 'min_child_weight': 1, 'gamma': 0.0003365534791176606, 'subsample': 0.14910012170784398, 'colsample_bytree': 0.027489693320526587, 'reg_alpha': 0.012427185254735566, 'reg_lambda': 6.3206084037772275e-06, 'scale_pos_weight': 4}. Best is tri

In [24]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
best_trial = study.best_trial

print("  Best Value: {}".format(best_trial.value))

print("  Best Params: ")
for key, value in best_trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 10
Best trial:
  Best Value: 0.22842532424790027
  Best Params: 
    max_depth: 8
    learning_rate: 0.037346125301359175
    n_estimators: 665
    min_child_weight: 8
    gamma: 1.5848237714840588e-06
    subsample: 0.10290926573296401
    colsample_bytree: 0.3451004261851029
    reg_alpha: 4.465859998850306e-06
    reg_lambda: 7.21319357337837e-06
    scale_pos_weight: 2


In [25]:
outer_rmsle_scores = []

for _ in tqdm(range(fold_number), desc="Outer Validation Evaluation..."):
    X_train, X_val, y_train, y_val = train_test_split(X_outer_val, y_outer_val, test_size=initial_split)
    xgbr = XGBRegressor(**best_trial.params)
    xgbr.fit(X_train, y_train)
    y_pred = xgbr.predict(X_val)
    y_pred_norm = minmax_scale(y_pred, feature_range=(0,1))
    y_val_norm = minmax_scale(y_val, feature_range=(0,1))
    rmsle_score = mean_squared_log_error(y_pred_norm, y_val_norm, squared=False)
    outer_rmsle_scores.append(rmsle_score)
print("RMSLE Score for Outer Validation: {}".format(np.mean(outer_rmsle_scores)))

Outer Validation Evaluation...: 100%|████████████████████████████████████████████████████| 3/3 [04:08<00:00, 82.75s/it]

RMSLE Score for Outer Validation: 0.2581772165784035





# CATBOOST

In [31]:
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 5000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1.0),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 100),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 20.0),
        'random_strength': trial.suggest_float('random_strength', 1.0, 2.0),
        'depth': trial.suggest_int('depth', 1, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300),
#         "scale_pos_weight": trial.suggest_int('scale_pos_weight', 1, 10),
        "verbose" : 0
    }    
    rmsle_scores = []
    
    for _ in range(fold_number):
        X_train, X_inner_val, y_train, y_inner_val = train_test_split(X_train_inner_val, y_train_inner_val, test_size=initial_split)
        cbr = CatBoostRegressor(**params)
        cbr.fit(X_train, y_train)
        y_inner_pred = cbr.predict(X_inner_val)
        y_inner_pred_norm = minmax_scale(y_inner_pred, feature_range=(0,1))
        y_inner_val_norm = minmax_scale(y_inner_val, feature_range=(0,1))
        rmsle_score = mean_squared_log_error(y_inner_pred_norm, y_inner_val_norm, squared=False)
        rmsle_scores.append(rmsle_score)
    return np.mean(rmsle_scores)

X_train_inner_val, X_outer_val, y_train_inner_val, y_outer_val = train_test_split(X_train_val, y_train_val, test_size=ho_split)
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

[32m[I 2022-12-24 11:35:57,973][0m A new study created in memory with name: no-name-09d60328-cd8a-40f9-b3dd-0ad4915f930b[0m
[32m[I 2022-12-24 11:38:11,570][0m Trial 0 finished with value: 0.22806295786237873 and parameters: {'iterations': 3837, 'learning_rate': 0.4101692851732992, 'l2_leaf_reg': 20.717496235866186, 'bagging_temperature': 11.949999236733683, 'random_strength': 1.1908492277202054, 'depth': 10, 'min_data_in_leaf': 31, 'scale_pos_weight': 5}. Best is trial 0 with value: 0.22806295786237873.[0m
[32m[I 2022-12-24 11:40:28,527][0m Trial 1 finished with value: 0.23817611906410655 and parameters: {'iterations': 527, 'learning_rate': 0.18671372881539336, 'l2_leaf_reg': 5.249693178763945, 'bagging_temperature': 0.22241244737167215, 'random_strength': 1.0214016124671135, 'depth': 2, 'min_data_in_leaf': 79, 'scale_pos_weight': 5}. Best is trial 0 with value: 0.22806295786237873.[0m
[32m[I 2022-12-24 11:42:50,354][0m Trial 2 finished with value: 0.2373050362300051 and par

In [32]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
best_trial = study.best_trial

print("  Best Value: {}".format(best_trial.value))

print("  Best Params: ")
for key, value in best_trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 10
Best trial:
  Best Value: 0.21887332233007187
  Best Params: 
    iterations: 3822
    learning_rate: 0.0027230688720926957
    l2_leaf_reg: 3.6312800974593946
    bagging_temperature: 11.194434962574801
    random_strength: 1.4358561813862258
    depth: 5
    min_data_in_leaf: 258
    scale_pos_weight: 5


In [34]:
outer_rmsle_scores = []

for _ in tqdm(range(fold_number), desc="Outer Validation Evaluation..."):
    X_train, X_val, y_train, y_val = train_test_split(X_outer_val, y_outer_val, test_size=initial_split)
    cbr = CatBoostRegressor(**best_trial.params)
    cbr.fit(X_train, y_train)
    y_pred = cbr.predict(X_val)
    y_pred_norm = minmax_scale(y_pred, feature_range=(0,1))
    y_val_norm = minmax_scale(y_val, feature_range=(0,1))
    rmsle_score = mean_squared_log_error(y_pred_norm, y_val_norm, squared=False)
    outer_rmsle_scores.append(rmsle_score)
print("RMSLE Score for Outer Validation: {}".format(np.mean(outer_rmsle_scores)))

Outer Validation Evaluation...:   0%|                                                            | 0/3 [00:00<?, ?it/s]


TypeError: __init__() got an unexpected keyword argument 'scale_pos_weight'

# <h1><center>TRAIN THE FINAL MODEL & PREDICT</center></h1>

In [29]:
lgbmr_final = LGBMRegressor(**best_trial.params)
lgbmr_final.fit(X_train_val, y_train_val)
y_test_pred = lgbmr_final.predict(X_test)

In [None]:
xgbr_final = XGBRegressor(**best_trial.params)
xgbr_final.fit(X_train_val, y_train_val)
y_test_pred = xgbr_final.predict(X_test)

In [None]:
cbr_final = CatBoostRegressor(**best_trial.params)
cbr_final.fit(X_train_val, y_train_val)
y_test_pred = cbr_final.predict(X_test)

# <h1><center>FEATURE IMPORTANCE </center></h1>

In [None]:
#plt.rcParams["figure.figsize"] = (14, 24)
plot_importance(xgbr_final)
plt.show()

In [None]:
#plt.rcParams["figure.figsize"] = (14, 24)
plot_importance(cbr_final)
plt.show()

In [None]:
explainerlgbmr = shap.TreeExplainer(lgbmr_final)
shap_values_train = explainerlgbmr.shap_values(X_train_val)

In [None]:
explainerxgbr = shap.TreeExplainer(xgbr_final)
shap_values_train = explainerxgbr.shap_values(X_train_val)

In [None]:
explainercbr = shap.TreeExplainer(cbr_final)
shap_values_train = explainercbr.shap_values(X_train_val)

In [None]:
shap.summary_plot(shap_values_train, X_train_val)

# <h1><center> CREATE SUBMISSION FILE </center></h1>

In [30]:
submission = pd.DataFrame({"index":test.index})
submission['Litres'] = y_test_pred
submission.response = submission.Litres.astype(float)
submission

Unnamed: 0,index,Litres
0,0,7924.850634
1,1,11896.385527
2,2,11952.415426
3,3,11952.415426
4,4,11952.415426
...,...,...
108467,108467,7232.859749
108468,108468,7232.859749
108469,108469,7232.859749
108470,108470,7232.859749


In [31]:
submission.to_csv(f"Submission-{time.strftime('%d - %H-%M-%S')}.csv", index=False)