In [32]:
import time
import shap
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor
from xgboost import plot_importance
from catboost import CatBoostClassifier, CatBoostRegressor
import optuna

warnings.filterwarnings("ignore")

# <h1 align="center"> DATA READING & MERGING </h1>

In [2]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
price = pd.read_csv("data/price.csv")

In [3]:
train

Unnamed: 0,Customer,Area,City Code,Billing Date,Product,Plant,Litres
0,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,T102,0.0
1,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,T155,0.0
2,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,T102,0.0
3,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,T155,0.0
4,10048261,Karadeniz Bölgesi,883,2014-01-02,Motorin,T155,16685.0
...,...,...,...,...,...,...,...
1379255,12663344,Doğu Anadolu Bölgesi,218,2018-12-22,Motorin,T317,1504.0
1379256,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Benzin,T317,11111.0
1379257,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Motorin,T317,22882.0
1379258,12664221,Marmara Bölgesi,941,2018-12-29,Benzin,T102,6528.0


In [4]:
test

Unnamed: 0,index,Customer,Area,City Code,Product,Week Start Date
0,0,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-07
1,1,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-14
2,2,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-21
3,3,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-28
4,4,10048261,Karadeniz Bölgesi,883,Motorin,2019-02-04
...,...,...,...,...,...,...
108467,108467,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-02
108468,108468,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-09
108469,108469,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-16
108470,108470,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-23


In [5]:
price

Unnamed: 0,City Code,Validfrom,Product,CRP
0,116,2016-07-27,Benzin,4.46
1,116,2016-07-28,Benzin,4.46
2,116,2016-07-29,Benzin,4.46
3,116,2016-07-30,Benzin,4.40
4,116,2016-07-31,Benzin,4.40
...,...,...,...,...
311739,990,2019-12-27,Motorin,6.74
311740,990,2019-12-28,Motorin,6.74
311741,990,2019-12-29,Motorin,6.74
311742,990,2019-12-30,Motorin,6.74


In [6]:
price.rename(columns={"Product":"Product-Price"}, inplace = True)
price

Unnamed: 0,City Code,Validfrom,Product-Price,CRP
0,116,2016-07-27,Benzin,4.46
1,116,2016-07-28,Benzin,4.46
2,116,2016-07-29,Benzin,4.46
3,116,2016-07-30,Benzin,4.40
4,116,2016-07-31,Benzin,4.40
...,...,...,...,...
311739,990,2019-12-27,Motorin,6.74
311740,990,2019-12-28,Motorin,6.74
311741,990,2019-12-29,Motorin,6.74
311742,990,2019-12-30,Motorin,6.74


In [7]:
price_grpd_agg = price.groupby("City Code").agg({"Validfrom": ["count", "nunique"], "Product-Price": ["count", "nunique"],
                               "CRP": ["min", "median", "mean", "max", "sum", "std"]})
price_grpd_agg

Unnamed: 0_level_0,Validfrom,Validfrom,Product-Price,Product-Price,CRP,CRP,CRP,CRP,CRP,CRP
Unnamed: 0_level_1,count,nunique,count,nunique,min,median,mean,max,sum,std
City Code,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
116,2504,1252,2504,2,3.70,5.80,5.779724,7.33,14472.43,0.906490
125,4324,2162,4324,2,3.25,4.95,5.096658,7.15,22037.95,0.994862
126,4324,2162,4324,2,3.22,4.93,5.071732,7.09,21930.17,0.987659
150,4306,2153,4306,2,3.27,4.96,5.122436,7.23,22057.21,1.014794
172,4326,2163,4326,2,3.41,5.03,5.234619,7.31,22644.96,1.008456
...,...,...,...,...,...,...,...,...,...,...
957,4326,2163,4326,2,3.32,4.98,5.159452,7.29,22319.79,1.015930
966,4326,2163,4326,2,3.26,4.96,5.118116,7.23,22140.97,1.015432
976,4326,2163,4326,2,3.29,4.95,5.119979,7.20,22149.03,0.998805
987,4326,2163,4326,2,3.28,4.97,5.140985,7.27,22239.90,1.016212


In [8]:
columns = []
for col in price_grpd_agg.columns:
    columns.append("-".join(col))
columns

['Validfrom-count',
 'Validfrom-nunique',
 'Product-Price-count',
 'Product-Price-nunique',
 'CRP-min',
 'CRP-median',
 'CRP-mean',
 'CRP-max',
 'CRP-sum',
 'CRP-std']

In [9]:
price_grpd_agg.columns = columns
price_grpd_agg

Unnamed: 0_level_0,Validfrom-count,Validfrom-nunique,Product-Price-count,Product-Price-nunique,CRP-min,CRP-median,CRP-mean,CRP-max,CRP-sum,CRP-std
City Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
116,2504,1252,2504,2,3.70,5.80,5.779724,7.33,14472.43,0.906490
125,4324,2162,4324,2,3.25,4.95,5.096658,7.15,22037.95,0.994862
126,4324,2162,4324,2,3.22,4.93,5.071732,7.09,21930.17,0.987659
150,4306,2153,4306,2,3.27,4.96,5.122436,7.23,22057.21,1.014794
172,4326,2163,4326,2,3.41,5.03,5.234619,7.31,22644.96,1.008456
...,...,...,...,...,...,...,...,...,...,...
957,4326,2163,4326,2,3.32,4.98,5.159452,7.29,22319.79,1.015930
966,4326,2163,4326,2,3.26,4.96,5.118116,7.23,22140.97,1.015432
976,4326,2163,4326,2,3.29,4.95,5.119979,7.20,22149.03,0.998805
987,4326,2163,4326,2,3.28,4.97,5.140985,7.27,22239.90,1.016212


In [10]:
price_grpd_agg.reset_index(inplace = True)
price_grpd_agg

Unnamed: 0,City Code,Validfrom-count,Validfrom-nunique,Product-Price-count,Product-Price-nunique,CRP-min,CRP-median,CRP-mean,CRP-max,CRP-sum,CRP-std
0,116,2504,1252,2504,2,3.70,5.80,5.779724,7.33,14472.43,0.906490
1,125,4324,2162,4324,2,3.25,4.95,5.096658,7.15,22037.95,0.994862
2,126,4324,2162,4324,2,3.22,4.93,5.071732,7.09,21930.17,0.987659
3,150,4306,2153,4306,2,3.27,4.96,5.122436,7.23,22057.21,1.014794
4,172,4326,2163,4326,2,3.41,5.03,5.234619,7.31,22644.96,1.008456
...,...,...,...,...,...,...,...,...,...,...,...
69,957,4326,2163,4326,2,3.32,4.98,5.159452,7.29,22319.79,1.015930
70,966,4326,2163,4326,2,3.26,4.96,5.118116,7.23,22140.97,1.015432
71,976,4326,2163,4326,2,3.29,4.95,5.119979,7.20,22149.03,0.998805
72,987,4326,2163,4326,2,3.28,4.97,5.140985,7.27,22239.90,1.016212


In [11]:
train_price = pd.merge(train, price_grpd_agg, how = "left")
train_price

Unnamed: 0,Customer,Area,City Code,Billing Date,Product,Plant,Litres,Validfrom-count,Validfrom-nunique,Product-Price-count,Product-Price-nunique,CRP-min,CRP-median,CRP-mean,CRP-max,CRP-sum,CRP-std
0,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,T102,0.0,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
1,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,T155,0.0,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
2,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,T102,0.0,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
3,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,T155,0.0,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
4,10048261,Karadeniz Bölgesi,883,2014-01-02,Motorin,T155,16685.0,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1379255,12663344,Doğu Anadolu Bölgesi,218,2018-12-22,Motorin,T317,1504.0,4326.0,2163.0,4326.0,2.0,3.33,5.01,5.181574,7.32,22415.49,1.020376
1379256,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Benzin,T317,11111.0,4326.0,2163.0,4326.0,2.0,3.33,5.01,5.181574,7.32,22415.49,1.020376
1379257,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Motorin,T317,22882.0,4326.0,2163.0,4326.0,2.0,3.33,5.01,5.181574,7.32,22415.49,1.020376
1379258,12664221,Marmara Bölgesi,941,2018-12-29,Benzin,T102,6528.0,4326.0,2163.0,4326.0,2.0,3.25,4.96,5.100127,7.16,22063.15,0.997419


In [12]:
test_price = pd.merge(test, price_grpd_agg, how = "left")
test_price

Unnamed: 0,index,Customer,Area,City Code,Product,Week Start Date,Validfrom-count,Validfrom-nunique,Product-Price-count,Product-Price-nunique,CRP-min,CRP-median,CRP-mean,CRP-max,CRP-sum,CRP-std
0,0,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-07,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
1,1,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-14,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
2,2,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-21,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
3,3,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-28,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
4,4,10048261,Karadeniz Bölgesi,883,Motorin,2019-02-04,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108467,108467,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-02,4324.0,2162.0,4324.0,2.0,3.46,5.07,5.276242,7.38,22814.47,1.009622
108468,108468,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-09,4324.0,2162.0,4324.0,2.0,3.46,5.07,5.276242,7.38,22814.47,1.009622
108469,108469,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-16,4324.0,2162.0,4324.0,2.0,3.46,5.07,5.276242,7.38,22814.47,1.009622
108470,108470,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-23,4324.0,2162.0,4324.0,2.0,3.46,5.07,5.276242,7.38,22814.47,1.009622


# <h1 align="center"> DATA PREPROCESSING </h1>

In [13]:
X_train_val = train_price.drop(["Plant", "Litres"], axis = 1)
X_train_val

Unnamed: 0,Customer,Area,City Code,Billing Date,Product,Validfrom-count,Validfrom-nunique,Product-Price-count,Product-Price-nunique,CRP-min,CRP-median,CRP-mean,CRP-max,CRP-sum,CRP-std
0,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
1,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
2,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
3,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
4,10048261,Karadeniz Bölgesi,883,2014-01-02,Motorin,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1379255,12663344,Doğu Anadolu Bölgesi,218,2018-12-22,Motorin,4326.0,2163.0,4326.0,2.0,3.33,5.01,5.181574,7.32,22415.49,1.020376
1379256,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Benzin,4326.0,2163.0,4326.0,2.0,3.33,5.01,5.181574,7.32,22415.49,1.020376
1379257,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Motorin,4326.0,2163.0,4326.0,2.0,3.33,5.01,5.181574,7.32,22415.49,1.020376
1379258,12664221,Marmara Bölgesi,941,2018-12-29,Benzin,4326.0,2163.0,4326.0,2.0,3.25,4.96,5.100127,7.16,22063.15,0.997419


In [14]:
y_train_val = train["Litres"]
y_train_val

0              0.0
1              0.0
2              0.0
3              0.0
4          16685.0
            ...   
1379255     1504.0
1379256    11111.0
1379257    22882.0
1379258     6528.0
1379259    26314.0
Name: Litres, Length: 1379260, dtype: float64

In [15]:
X_test = test_price.drop(["index"], axis = 1)
X_test

Unnamed: 0,Customer,Area,City Code,Product,Week Start Date,Validfrom-count,Validfrom-nunique,Product-Price-count,Product-Price-nunique,CRP-min,CRP-median,CRP-mean,CRP-max,CRP-sum,CRP-std
0,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-07,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
1,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-14,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
2,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-21,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
3,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-28,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
4,10048261,Karadeniz Bölgesi,883,Motorin,2019-02-04,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108467,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-02,4324.0,2162.0,4324.0,2.0,3.46,5.07,5.276242,7.38,22814.47,1.009622
108468,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-09,4324.0,2162.0,4324.0,2.0,3.46,5.07,5.276242,7.38,22814.47,1.009622
108469,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-16,4324.0,2162.0,4324.0,2.0,3.46,5.07,5.276242,7.38,22814.47,1.009622
108470,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-23,4324.0,2162.0,4324.0,2.0,3.46,5.07,5.276242,7.38,22814.47,1.009622


In [16]:
cols = list(X_test.columns)
cols = cols[:3] + ["Week Start Date", "Product"] + cols[5:]
X_test = X_test[cols]
X_test

Unnamed: 0,Customer,Area,City Code,Week Start Date,Product,Validfrom-count,Validfrom-nunique,Product-Price-count,Product-Price-nunique,CRP-min,CRP-median,CRP-mean,CRP-max,CRP-sum,CRP-std
0,10048261,Karadeniz Bölgesi,883,2019-01-07,Motorin,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
1,10048261,Karadeniz Bölgesi,883,2019-01-14,Motorin,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
2,10048261,Karadeniz Bölgesi,883,2019-01-21,Motorin,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
3,10048261,Karadeniz Bölgesi,883,2019-01-28,Motorin,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
4,10048261,Karadeniz Bölgesi,883,2019-02-04,Motorin,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108467,12739737,Doğu Anadolu Bölgesi,194,2019-12-02,Benzin,4324.0,2162.0,4324.0,2.0,3.46,5.07,5.276242,7.38,22814.47,1.009622
108468,12739737,Doğu Anadolu Bölgesi,194,2019-12-09,Benzin,4324.0,2162.0,4324.0,2.0,3.46,5.07,5.276242,7.38,22814.47,1.009622
108469,12739737,Doğu Anadolu Bölgesi,194,2019-12-16,Benzin,4324.0,2162.0,4324.0,2.0,3.46,5.07,5.276242,7.38,22814.47,1.009622
108470,12739737,Doğu Anadolu Bölgesi,194,2019-12-23,Benzin,4324.0,2162.0,4324.0,2.0,3.46,5.07,5.276242,7.38,22814.47,1.009622


In [18]:
lbl = LabelEncoder() 
X_train_val['Area'] = lbl.fit_transform(X_train_val['Area'].astype(str))
X_train_val['Billing Date'] = lbl.fit_transform(X_train_val['Billing Date'].astype(str)) 
#X_train_val['Plant'] = lbl.fit_transform(X_train_val['Plant'].astype(str)) 
X_train_val['Product'] = lbl.fit_transform(X_train_val['Product'].astype(str)) 
X_train_val

Unnamed: 0,Customer,Area,City Code,Billing Date,Product,Validfrom-count,Validfrom-nunique,Product-Price-count,Product-Price-nunique,CRP-min,CRP-median,CRP-mean,CRP-max,CRP-sum,CRP-std
0,10048261,4,883,0,0,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
1,10048261,4,883,0,0,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
2,10048261,4,883,0,1,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
3,10048261,4,883,0,1,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
4,10048261,4,883,1,1,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1379255,12663344,1,218,1796,1,4326.0,2163.0,4326.0,2.0,3.33,5.01,5.181574,7.32,22415.49,1.020376
1379256,12663344,1,218,1802,0,4326.0,2163.0,4326.0,2.0,3.33,5.01,5.181574,7.32,22415.49,1.020376
1379257,12663344,1,218,1802,1,4326.0,2163.0,4326.0,2.0,3.33,5.01,5.181574,7.32,22415.49,1.020376
1379258,12664221,5,941,1803,0,4326.0,2163.0,4326.0,2.0,3.25,4.96,5.100127,7.16,22063.15,0.997419


In [19]:
lbl = LabelEncoder() 
X_test['Area'] = lbl.fit_transform(X_test['Area'].astype(str))
X_test['Week Start Date'] = lbl.fit_transform(X_test['Week Start Date'].astype(str)) 
X_test['Product'] = lbl.fit_transform(X_test['Product'].astype(str)) 
X_test

Unnamed: 0,Customer,Area,City Code,Week Start Date,Product,Validfrom-count,Validfrom-nunique,Product-Price-count,Product-Price-nunique,CRP-min,CRP-median,CRP-mean,CRP-max,CRP-sum,CRP-std
0,10048261,4,883,0,1,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
1,10048261,4,883,1,1,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
2,10048261,4,883,2,1,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
3,10048261,4,883,3,1,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
4,10048261,4,883,4,1,4326.0,2163.0,4326.0,2.0,3.28,4.98,5.145950,7.25,22261.38,1.011349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108467,12739737,1,194,47,0,4324.0,2162.0,4324.0,2.0,3.46,5.07,5.276242,7.38,22814.47,1.009622
108468,12739737,1,194,48,0,4324.0,2162.0,4324.0,2.0,3.46,5.07,5.276242,7.38,22814.47,1.009622
108469,12739737,1,194,49,0,4324.0,2162.0,4324.0,2.0,3.46,5.07,5.276242,7.38,22814.47,1.009622
108470,12739737,1,194,50,0,4324.0,2162.0,4324.0,2.0,3.46,5.07,5.276242,7.38,22814.47,1.009622


# <h1 align="center"> MODEL COMPARISON </h1>

# LGBM

In [28]:
start = time.time()
fold_number = 10

rmsle_scores = []
X_train_inner_val, X_outer_val, y_train_inner_val, y_outer_val = train_test_split(X_train_val, y_train_val, test_size=0.2)
for _ in range(fold_number):
    X_train, X_inner_val, y_train, y_inner_val = train_test_split(X_train_inner_val, y_train_inner_val, test_size=0.2)
    lgbmr_base = LGBMRegressor()
    lgbmr_base.fit(X_train, y_train)
    y_inner_pred = lgbmr_base.predict(X_inner_val)
    y_inner_pred_norm = minmax_scale(y_inner_pred, feature_range=(0,1))
    y_inner_val_norm = minmax_scale(y_inner_val, feature_range=(0,1))
    rmsle_score = mean_squared_log_error(y_inner_pred_norm, y_inner_val_norm, squared=False)
    rmsle_scores.append(rmsle_score)
print("RMSLE Score for Inner Validation: {}".format(np.mean(rmsle_scores)))
lgbmr_base = LGBMRegressor()
lgbmr_base.fit(X_train_inner_val, y_train_inner_val)
y_outer_pred = lgbmr_base.predict(X_outer_val)
y_outer_pred_norm = minmax_scale(y_outer_pred, feature_range=(0,1))
y_outer_val_norm = minmax_scale(y_outer_val, feature_range=(0,1))
rmsle_score = mean_squared_log_error(y_outer_pred_norm, y_outer_val_norm, squared=False)
print("\nRMSLE Score for Outer Validation: {}".format(rmsle_score))

print("\nElapsed time: {}".format(time.time()-start))

RMSLE Score for Inner Validation: 0.24040686768124667

RMSLE Score for Outer Validation: 0.30036746026786876

Elapsed time: 85.43339610099792


# XGBOOST

In [29]:
start = time.time()
fold_number = 10

rmsle_scores = []
X_train_inner_val, X_outer_val, y_train_inner_val, y_outer_val = train_test_split(X_train_val, y_train_val, test_size=0.2)
for _ in range(fold_number):
    X_train, X_inner_val, y_train, y_inner_val = train_test_split(X_train_inner_val, y_train_inner_val, test_size=0.2)
    xgbr_base = XGBRegressor()
    xgbr_base.fit(X_train, y_train)
    y_inner_pred = xgbr_base.predict(X_inner_val)
    y_inner_pred_norm = minmax_scale(y_inner_pred, feature_range=(0,1))
    y_inner_val_norm = minmax_scale(y_inner_val, feature_range=(0,1))
    rmsle_score = mean_squared_log_error(y_inner_pred_norm, y_inner_val_norm, squared=False)
    rmsle_scores.append(rmsle_score)
print("RMSLE Score for Inner Validation: {}".format(np.mean(rmsle_scores)))

xgbr_base = XGBRegressor()
xgbr_base.fit(X_train_inner_val, y_train_inner_val)
y_outer_pred = xgbr_base.predict(X_outer_val)
y_outer_pred_norm = minmax_scale(y_outer_pred, feature_range=(0,1))
y_outer_val_norm = minmax_scale(y_outer_val, feature_range=(0,1))
rmsle_score = mean_squared_log_error(y_outer_pred_norm, y_outer_val_norm, squared=False)
print("\nRMSLE Score for Outer Validation: {}".format(rmsle_score))

print("\nElapsed time: {}".format(time.time()-start))

RMSLE Score for Inner Validation: 0.2194962693499236

RMSLE Score for Outer Validation: 0.2668754352185889

Elapsed time: 778.3089118003845


# CATBOOST

In [34]:
start = time.time()
fold_number = 10

rmsle_scores = []
X_train_inner_val, X_outer_val, y_train_inner_val, y_outer_val = train_test_split(X_train_val, y_train_val, test_size=0.2)
for _ in range(fold_number):
    X_train, X_inner_val, y_train, y_inner_val = train_test_split(X_train_inner_val, y_train_inner_val, test_size=0.2)
    cbr_base = CatBoostRegressor(verbose=0) 
    cbr_base.fit(X_train, y_train)
    y_inner_pred = cbr_base.predict(X_inner_val)
    y_inner_pred_norm = minmax_scale(y_inner_pred, feature_range=(0,1))
    y_inner_val_norm = minmax_scale(y_inner_val, feature_range=(0,1))
    rmsle_score = mean_squared_log_error(y_inner_pred_norm, y_inner_val_norm, squared=False)
    rmsle_scores.append(rmsle_score)
print("RMSLE Score for Inner Validation: {}".format(np.mean(rmsle_scores)))

cbr_base = CatBoostRegressor(verbose=0)
cbr_base.fit(X_train_inner_val, y_train_inner_val)
y_outer_pred = cbr_base.predict(X_outer_val)
y_outer_pred_norm = minmax_scale(y_outer_pred, feature_range=(0,1))
y_outer_val_norm = minmax_scale(y_outer_val, feature_range=(0,1))
rmsle_score = mean_squared_log_error(y_outer_pred_norm, y_outer_val_norm, squared=False)
print("\nRMSLE Score for Outer Validation: {}".format(rmsle_score))

print("\nElapsed time: {}".format(time.time()-start))

RMSLE Score for Inner Validation: 0.21898189783301372

RMSLE Score for Outer Validation: 0.2149571843543665

Elapsed time: 1119.44433259964


# <h1 align="center"> HYPERPARAMETER TUNING </h1>

# LGBM

In [45]:
fold_number = 5

def objective(trial):
    params = {
        #'metric': 'mlogloss', 
        'n_estimators': trial.suggest_int('num_leaves', 10, 10000),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02, 0.1]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        "scale_pos_weight": trial.suggest_int('scale_pos_weight', 1, 10),
    }
        
    rmsle_scores = []
    X_train_inner_val, X_outer_val, y_train_inner_val, y_outer_val = train_test_split(X_train_val, y_train_val, test_size=0.2)
    X_outer_splits.append((X_train_inner_val, X_outer_val))
    y_outer_splits.append((y_train_inner_val, y_outer_val))
    for _ in range(fold_number):
        X_train, X_inner_val, y_train, y_inner_val = train_test_split(X_train_inner_val, y_train_inner_val, test_size=0.2)
        lgbmr = LGBMRegressor()
        lgbmr.fit(X_train, y_train)
        y_inner_pred = lgbmr.predict(X_inner_val)
        y_inner_pred_norm = minmax_scale(y_inner_pred, feature_range=(0,1))
        y_inner_val_norm = minmax_scale(y_inner_val, feature_range=(0,1))
        rmsle_score = mean_squared_log_error(y_inner_pred_norm, y_inner_val_norm, squared=False)
        rmsle_scores.append(rmsle_score)

    return np.mean(rmsle_scores)

X_outer_splits = []
y_outer_splits = []
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

[32m[I 2022-12-23 22:30:45,759][0m A new study created in memory with name: no-name-4889dcf2-21b8-4708-b307-88c3626fd2f0[0m
[32m[I 2022-12-23 22:31:07,447][0m Trial 0 finished with value: 0.25543206740753066 and parameters: {'num_leaves': 703, 'reg_alpha': 0.006325219481707933, 'reg_lambda': 0.010091924907406031, 'colsample_bytree': 1.0, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 10, 'min_child_samples': 153, 'scale_pos_weight': 5}. Best is trial 0 with value: 0.25543206740753066.[0m
[32m[I 2022-12-23 22:31:29,895][0m Trial 1 finished with value: 0.25162093052884976 and parameters: {'num_leaves': 8835, 'reg_alpha': 0.038046873320573604, 'reg_lambda': 0.02546457282438107, 'colsample_bytree': 0.9, 'subsample': 0.7, 'learning_rate': 0.014, 'max_depth': 20, 'min_child_samples': 215, 'scale_pos_weight': 9}. Best is trial 1 with value: 0.25162093052884976.[0m
[32m[I 2022-12-23 22:31:52,949][0m Trial 2 finished with value: 0.24730572682954027 and parameters: {'num_leaves

[32m[I 2022-12-23 22:39:31,885][0m Trial 22 finished with value: 0.24574756823352195 and parameters: {'num_leaves': 4257, 'reg_alpha': 0.32134948732854657, 'reg_lambda': 0.040701885716605296, 'colsample_bytree': 0.6, 'subsample': 0.7, 'learning_rate': 0.017, 'max_depth': 20, 'min_child_samples': 280, 'scale_pos_weight': 8}. Best is trial 17 with value: 0.22747921468550905.[0m
[32m[I 2022-12-23 22:39:53,932][0m Trial 23 finished with value: 0.2442848566777988 and parameters: {'num_leaves': 5563, 'reg_alpha': 0.010440539504547626, 'reg_lambda': 0.004539861970237538, 'colsample_bytree': 0.7, 'subsample': 0.8, 'learning_rate': 0.008, 'max_depth': 20, 'min_child_samples': 229, 'scale_pos_weight': 9}. Best is trial 17 with value: 0.22747921468550905.[0m
[32m[I 2022-12-23 22:40:15,993][0m Trial 24 finished with value: 0.238229984161669 and parameters: {'num_leaves': 3126, 'reg_alpha': 0.09175202787687621, 'reg_lambda': 0.13046074635276553, 'colsample_bytree': 0.5, 'subsample': 0.7, 'l

[32m[I 2022-12-23 22:47:48,623][0m Trial 44 finished with value: 0.26433137090576914 and parameters: {'num_leaves': 1764, 'reg_alpha': 0.977449723536174, 'reg_lambda': 1.8754491962757411, 'colsample_bytree': 0.5, 'subsample': 0.7, 'learning_rate': 0.02, 'max_depth': 20, 'min_child_samples': 260, 'scale_pos_weight': 8}. Best is trial 17 with value: 0.22747921468550905.[0m
[32m[I 2022-12-23 22:48:10,624][0m Trial 45 finished with value: 0.24762688594150495 and parameters: {'num_leaves': 33, 'reg_alpha': 4.39865250485799, 'reg_lambda': 5.976950141169635, 'colsample_bytree': 0.7, 'subsample': 0.8, 'learning_rate': 0.017, 'max_depth': 20, 'min_child_samples': 210, 'scale_pos_weight': 9}. Best is trial 17 with value: 0.22747921468550905.[0m
[32m[I 2022-12-23 22:48:33,251][0m Trial 46 finished with value: 0.2486452025855968 and parameters: {'num_leaves': 3975, 'reg_alpha': 0.0010229223407016324, 'reg_lambda': 0.8710376531601984, 'colsample_bytree': 0.4, 'subsample': 0.4, 'learning_rat

In [46]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
best_trial = study.best_trial

print("  Best Value: {}".format(best_trial.value))

print("  Best Params: ")
for key, value in best_trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 50
Best trial:
  Best Value: 0.22747921468550905
  Best Params: 
    num_leaves: 4834
    reg_alpha: 0.08201084069814263
    reg_lambda: 0.005808788361912547
    colsample_bytree: 0.6
    subsample: 0.8
    learning_rate: 0.017
    max_depth: 20
    min_child_samples: 267
    scale_pos_weight: 10


In [None]:
outer_rmsle_scores = []

for index, X_outer_split in enumerate(X_outer_splits):
    lgbmr = LGBMRegressor(**best_trial.params)
    lgbmr.fit(X_outer_split[0], y_outer_splits[index][0])
    y_outer_pred = lgbmr.predict(X_outer_split[1])
    y_outer_pred_norm = minmax_scale(y_outer_pred, feature_range=(0,1))
    y_outer_val_norm = minmax_scale(y_outer_splits[index][1], feature_range=(0,1))
    rmsle_score = mean_squared_log_error(y_outer_pred_norm, y_outer_val_norm, squared=False)
    outer_rmsle_scores.append(rmsle_score)
print("RMSLE Score for Outer Validation: {}".format(np.mean(outer_rmsle_scores)))

# XGBOOST

In [None]:
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
        "scale_pos_weight": trial.suggest_int('scale_pos_weight', 3, 10),
        'eval_metric': 'mlogloss',
        'use_label_encoder': False
    }
        
    rmsle_scores = []

    for _ in range(5):
        X_train, X_inner_val, y_train, y_inner_val = train_test_split(X_train_inner_val, y_train_inner_val, test_size=0.2)
        xgbr = XGBRegressor()
        xgbr.fit(X_train, y_train)
        y_inner_pred = xgbr.predict(X_inner_val)
        y_inner_pred_norm = minmax_scale(y_inner_pred, feature_range=(0,1))
        y_inner_val_norm = minmax_scale(y_inner_val, feature_range=(0,1))
        rmsle_score = mean_squared_log_error(y_inner_pred_norm, y_inner_val_norm, squared=False)
        rmsle_scores.append(rmsle_score)

    return np.mean(rmsle_scores)

X_train_inner_val, X_outer_val, y_train_inner_val, y_outer_val = train_test_split(X_train_val, y_train_val, test_size=0.2)
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=25)

In [None]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
best_trial = study.best_trial

print("  Best Value: {}".format(best_trial.value))

print("  Best Params: ")
for key, value in best_trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
xgbr = XGBRegressor(**best_trial.params)
xgbr.fit(X_train_inner_val, y_train_inner_val)
y_outer_pred = xgbr.predict(X_outer_val)
y_outer_pred_norm = minmax_scale(y_outer_pred, feature_range=(0,1))
y_outer_val_norm = minmax_scale(y_outer_val, feature_range=(0,1))
rmsle_score = mean_squared_log_error(y_outer_pred_norm, y_outer_val_norm, squared=False)
print("RMSLE Score for Outer Validation: {}".format(rmsle_score))

# CATBOOST

In [None]:
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 500, 5000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 100),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 20.0),
        'random_strength': trial.suggest_float('random_strength', 1.0, 2.0),
        'depth': trial.suggest_int('depth', 1, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300),
         "scale_pos_weight": trial.suggest_int('scale_pos_weight', 3, 10),
        "verbose" : 0
    }
        
    rmsle_scores = []

    for _ in range(5):
        X_train, X_inner_val, y_train, y_inner_val = train_test_split(X_train_inner_val, y_train_inner_val, test_size=0.2)
        cbr = CatBoostRegressor()
        cbr.fit(X_train, y_train)
        y_inner_pred = cbr.predict(X_inner_val)
        y_inner_pred_norm = minmax_scale(y_inner_pred, feature_range=(0,1))
        y_inner_val_norm = minmax_scale(y_inner_val, feature_range=(0,1))
        rmsle_score = mean_squared_log_error(y_inner_pred_norm, y_inner_val_norm, squared=False)
        rmsle_scores.append(rmsle_score)

    return np.mean(rmsle_scores)

X_train_inner_val, X_outer_val, y_train_inner_val, y_outer_val = train_test_split(X_train_val, y_train_val, test_size=0.2)
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=25)

In [None]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
best_trial = study.best_trial

print("  Best Value: {}".format(best_trial.value))

print("  Best Params: ")
for key, value in best_trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
cbr = CatBoostRegressor(**best_trial.params)
cbr.fit(X_train_inner_val, y_train_inner_val)
y_outer_pred = cbr.predict(X_outer_val)
y_outer_pred_norm = minmax_scale(y_outer_pred, feature_range=(0,1))
y_outer_val_norm = minmax_scale(y_outer_val, feature_range=(0,1))
rmsle_score = mean_squared_log_error(y_outer_pred_norm, y_outer_val_norm, squared=False)
print("RMSLE Score for Outer Validation: {}".format(rmsle_score))

# <h1 align="center"> TRAIN THE FINAL MODEL & PREDICT </h1>

In [None]:
lgbmr_final = LGBMRegressor(**trial.params)
lgbmr_final.fit(X_train_val, y_train_val)
y_test_pred = lgbmr_final.predict(X_test)

In [None]:
xgbr_final = XGBRegressor(**trial.params)
xgbr_final.fit(X_train_val, y_train_val)
y_test_pred = xgbr_final.predict(X_test)

In [None]:
cbr_final = CatBoostRegressor(**trial.params)
cbr_final.fit(X_train_val, y_train_val)
y_test_pred = cbr_final.predict(X_test)

# <h1 align="center"> FEATURE IMPORTANCE </h1>

In [None]:
#plt.rcParams["figure.figsize"] = (14, 24)
plot_importance(xgbr_final)
plt.show()

In [None]:
#plt.rcParams["figure.figsize"] = (14, 24)
plot_importance(cbr_final)
plt.show()

In [None]:
explainerlgbmr = shap.TreeExplainer(lgbmr_final)
shap_values_train = explainerlgbmr.shap_values(X_train_val)

In [None]:
explainerxgbr = shap.TreeExplainer(xgbr_final)
shap_values_train = explainerxgbr.shap_values(X_train_val)

In [None]:
explainercbr = shap.TreeExplainer(cbr_final)
shap_values_train = explainercbr.shap_values(X_train_val)

In [None]:
shap.summary_plot(shap_values_train, X_train_val)

# <h1 align="center"> CREATE SUBMISSION FILE </h1>

In [None]:
submission = pd.DataFrame({"index":test["index"]})
submission['Litres'] = y_test_pred
submission.response = submission.Litres.astype(float)
submission

In [None]:
submission.to_csv(f"Submission-{time.strftime('%d - %H-%M-%S')}.csv", index=False)