In [1]:
import time
from tqdm import tqdm
import shap
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor
from xgboost import plot_importance
from catboost import CatBoostClassifier, CatBoostRegressor
import optuna
pd.set_option('display.max_rows', 1000)

warnings.filterwarnings("ignore")

# <h1><center> DATA READING & MERGING </center></h1>

In [2]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
price = pd.read_csv("data/price.csv")

In [3]:
train

Unnamed: 0,Customer,Area,City Code,Billing Date,Product,Plant,Litres
0,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,T102,0.0
1,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,T155,0.0
2,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,T102,0.0
3,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,T155,0.0
4,10048261,Karadeniz Bölgesi,883,2014-01-02,Motorin,T155,16685.0
...,...,...,...,...,...,...,...
1379255,12663344,Doğu Anadolu Bölgesi,218,2018-12-22,Motorin,T317,1504.0
1379256,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Benzin,T317,11111.0
1379257,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Motorin,T317,22882.0
1379258,12664221,Marmara Bölgesi,941,2018-12-29,Benzin,T102,6528.0


In [4]:
test

Unnamed: 0,index,Customer,Area,City Code,Product,Week Start Date
0,0,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-07
1,1,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-14
2,2,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-21
3,3,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-28
4,4,10048261,Karadeniz Bölgesi,883,Motorin,2019-02-04
...,...,...,...,...,...,...
108467,108467,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-02
108468,108468,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-09
108469,108469,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-16
108470,108470,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-23


In [5]:
price

Unnamed: 0,City Code,Validfrom,Product,CRP
0,116,2016-07-27,Benzin,4.46
1,116,2016-07-28,Benzin,4.46
2,116,2016-07-29,Benzin,4.46
3,116,2016-07-30,Benzin,4.40
4,116,2016-07-31,Benzin,4.40
...,...,...,...,...
311739,990,2019-12-27,Motorin,6.74
311740,990,2019-12-28,Motorin,6.74
311741,990,2019-12-29,Motorin,6.74
311742,990,2019-12-30,Motorin,6.74


In [6]:
train.rename(columns={"City Code":"City_Code", "Billing Date":"Date"}, inplace = True)
train['Date'] = pd.to_datetime(train['Date'])
train

Unnamed: 0,Customer,Area,City_Code,Date,Product,Plant,Litres
0,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,T102,0.0
1,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,T155,0.0
2,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,T102,0.0
3,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,T155,0.0
4,10048261,Karadeniz Bölgesi,883,2014-01-02,Motorin,T155,16685.0
...,...,...,...,...,...,...,...
1379255,12663344,Doğu Anadolu Bölgesi,218,2018-12-22,Motorin,T317,1504.0
1379256,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Benzin,T317,11111.0
1379257,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Motorin,T317,22882.0
1379258,12664221,Marmara Bölgesi,941,2018-12-29,Benzin,T102,6528.0


In [7]:
train_grpd_by_plant = train.groupby(["Customer", "Area", "City_Code", "Date", "Product"]).agg({"Litres":[sum]})
train_grpd_by_plant.reset_index(inplace = True)
train_grpd_by_plant

Unnamed: 0_level_0,Customer,Area,City_Code,Date,Product,Litres
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,sum
0,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,0.0
1,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,0.0
2,10048261,Karadeniz Bölgesi,883,2014-01-02,Motorin,16685.0
3,10048261,Karadeniz Bölgesi,883,2014-01-03,Benzin,5040.0
4,10048261,Karadeniz Bölgesi,883,2014-01-03,Motorin,12027.0
...,...,...,...,...,...,...
1359080,12663344,Doğu Anadolu Bölgesi,218,2018-12-22,Motorin,1504.0
1359081,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Benzin,11111.0
1359082,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Motorin,22882.0
1359083,12664221,Marmara Bölgesi,941,2018-12-29,Benzin,6528.0


In [8]:
columns = []
for col in train_grpd_by_plant.columns[:-1]:
    columns.append("".join(col))
columns.append("_".join(train_grpd_by_plant.columns[-1]))
train_grpd_by_plant.columns = columns
train_grpd_by_plant

Unnamed: 0,Customer,Area,City_Code,Date,Product,Litres_sum
0,10048261,Karadeniz Bölgesi,883,2014-01-01,Benzin,0.0
1,10048261,Karadeniz Bölgesi,883,2014-01-01,Motorin,0.0
2,10048261,Karadeniz Bölgesi,883,2014-01-02,Motorin,16685.0
3,10048261,Karadeniz Bölgesi,883,2014-01-03,Benzin,5040.0
4,10048261,Karadeniz Bölgesi,883,2014-01-03,Motorin,12027.0
...,...,...,...,...,...,...
1359080,12663344,Doğu Anadolu Bölgesi,218,2018-12-22,Motorin,1504.0
1359081,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Benzin,11111.0
1359082,12663344,Doğu Anadolu Bölgesi,218,2018-12-28,Motorin,22882.0
1359083,12664221,Marmara Bölgesi,941,2018-12-29,Benzin,6528.0


In [9]:
price.rename(columns={"City Code":"City_Code", "Validfrom":"Date"}, inplace = True)
price['Date'] = pd.to_datetime(price['Date'])
price

Unnamed: 0,City_Code,Date,Product,CRP
0,116,2016-07-27,Benzin,4.46
1,116,2016-07-28,Benzin,4.46
2,116,2016-07-29,Benzin,4.46
3,116,2016-07-30,Benzin,4.40
4,116,2016-07-31,Benzin,4.40
...,...,...,...,...
311739,990,2019-12-27,Motorin,6.74
311740,990,2019-12-28,Motorin,6.74
311741,990,2019-12-29,Motorin,6.74
311742,990,2019-12-30,Motorin,6.74


In [10]:
price_grpd = price.groupby([pd.Grouper(key='Date', freq='W'), 'City_Code', "Product",]).agg({"CRP":["mean"]})
price_grpd.reset_index(inplace = True)
price_grpd

Unnamed: 0_level_0,Date,City_Code,Product,CRP
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean
0,2014-01-05,125,Benzin,5.038
1,2014-01-05,125,Motorin,4.524
2,2014-01-05,126,Benzin,5.038
3,2014-01-05,126,Motorin,4.514
4,2014-01-05,150,Benzin,5.075
...,...,...,...,...
45127,2020-01-05,976,Motorin,6.610
45128,2020-01-05,987,Benzin,7.200
45129,2020-01-05,987,Motorin,6.660
45130,2020-01-05,990,Benzin,7.220


In [11]:
columns = []
for col in price_grpd.columns[:-1]:
    columns.append("".join(col))
columns.append("_".join(price_grpd.columns[-1]))
price_grpd.columns = columns
price_grpd

Unnamed: 0,Date,City_Code,Product,CRP_mean
0,2014-01-05,125,Benzin,5.038
1,2014-01-05,125,Motorin,4.524
2,2014-01-05,126,Benzin,5.038
3,2014-01-05,126,Motorin,4.514
4,2014-01-05,150,Benzin,5.075
...,...,...,...,...
45127,2020-01-05,976,Motorin,6.610
45128,2020-01-05,987,Benzin,7.200
45129,2020-01-05,987,Motorin,6.660
45130,2020-01-05,990,Benzin,7.220


In [12]:
train_grpd_by_date = train_grpd_by_plant.groupby([pd.Grouper(key='Date', freq='W'), "Customer", "Area",
                            'City_Code', "Product",]).agg({"Litres_sum":["sum"]})
train_grpd_by_date.reset_index(inplace = True)
train_grpd_by_date

Unnamed: 0_level_0,Date,Customer,Area,City_Code,Product,Litres_sum
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,sum
0,2014-01-05,10048261,Karadeniz Bölgesi,883,Benzin,5040.0
1,2014-01-05,10048261,Karadeniz Bölgesi,883,Motorin,28712.0
2,2014-01-05,10048263,Karadeniz Bölgesi,966,Motorin,34082.0
3,2014-01-05,10048264,Karadeniz Bölgesi,883,Motorin,0.0
4,2014-01-05,10048265,Karadeniz Bölgesi,589,Benzin,0.0
...,...,...,...,...,...,...
483963,2019-01-06,12654774,Marmara Bölgesi,835,Motorin,30871.0
483964,2019-01-06,12655800,Ege Bölgesi,574,Benzin,3520.0
483965,2019-01-06,12655800,Ege Bölgesi,574,Motorin,20143.0
483966,2019-01-06,12663274,Doğu Anadolu Bölgesi,518,Benzin,13636.0


In [13]:
columns = []
for col in train_grpd_by_date.columns[:-1]:
    columns.append("".join(col))
columns.append("_".join(train_grpd_by_date.columns[-1]))
train_grpd_by_date.columns = columns
train_grpd_by_date

Unnamed: 0,Date,Customer,Area,City_Code,Product,Litres_sum_sum
0,2014-01-05,10048261,Karadeniz Bölgesi,883,Benzin,5040.0
1,2014-01-05,10048261,Karadeniz Bölgesi,883,Motorin,28712.0
2,2014-01-05,10048263,Karadeniz Bölgesi,966,Motorin,34082.0
3,2014-01-05,10048264,Karadeniz Bölgesi,883,Motorin,0.0
4,2014-01-05,10048265,Karadeniz Bölgesi,589,Benzin,0.0
...,...,...,...,...,...,...
483963,2019-01-06,12654774,Marmara Bölgesi,835,Motorin,30871.0
483964,2019-01-06,12655800,Ege Bölgesi,574,Benzin,3520.0
483965,2019-01-06,12655800,Ege Bölgesi,574,Motorin,20143.0
483966,2019-01-06,12663274,Doğu Anadolu Bölgesi,518,Benzin,13636.0


In [14]:
train_price = pd.merge(train_grpd_by_date, price_grpd, on = ["City_Code", "Date", "Product"], how = "left")
train_price.drop_duplicates(inplace = True)
train_price

Unnamed: 0,Date,Customer,Area,City_Code,Product,Litres_sum_sum,CRP_mean
0,2014-01-05,10048261,Karadeniz Bölgesi,883,Benzin,5040.0,5.076
1,2014-01-05,10048261,Karadeniz Bölgesi,883,Motorin,28712.0,4.552
2,2014-01-05,10048263,Karadeniz Bölgesi,966,Motorin,34082.0,4.518
3,2014-01-05,10048264,Karadeniz Bölgesi,883,Motorin,0.0,4.552
4,2014-01-05,10048265,Karadeniz Bölgesi,589,Benzin,0.0,5.048
...,...,...,...,...,...,...,...
483963,2019-01-06,12654774,Marmara Bölgesi,835,Motorin,30871.0,5.670
483964,2019-01-06,12655800,Ege Bölgesi,574,Benzin,3520.0,5.920
483965,2019-01-06,12655800,Ege Bölgesi,574,Motorin,20143.0,5.710
483966,2019-01-06,12663274,Doğu Anadolu Bölgesi,518,Benzin,13636.0,6.040


In [15]:
test.drop(columns=["index"], inplace = True)
test.rename(columns={"City Code":"City_Code", "Week Start Date":"Date"}, inplace = True)
test["Date"] = pd.to_datetime(test['Date'])
test

Unnamed: 0,Customer,Area,City_Code,Product,Date
0,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-07
1,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-14
2,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-21
3,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-28
4,10048261,Karadeniz Bölgesi,883,Motorin,2019-02-04
...,...,...,...,...,...
108467,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-02
108468,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-09
108469,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-16
108470,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-23


In [16]:
price_grpd

Unnamed: 0,Date,City_Code,Product,CRP_mean
0,2014-01-05,125,Benzin,5.038
1,2014-01-05,125,Motorin,4.524
2,2014-01-05,126,Benzin,5.038
3,2014-01-05,126,Motorin,4.514
4,2014-01-05,150,Benzin,5.075
...,...,...,...,...
45127,2020-01-05,976,Motorin,6.610
45128,2020-01-05,987,Benzin,7.200
45129,2020-01-05,987,Motorin,6.660
45130,2020-01-05,990,Benzin,7.220


In [17]:
price_grpd = price.set_index(price.Date)["2018-12-08":].groupby([pd.Grouper(key='Date', freq='W'), 'City_Code', "Product",]).agg({"CRP":["mean"]})
price_grpd.reset_index(inplace = True)
price_grpd["Date"] = price_grpd["Date"] + pd.DateOffset(1)
price_grpd

Unnamed: 0_level_0,Date,City_Code,Product,CRP
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean
0,2018-12-10,116,Benzin,6.30
1,2018-12-10,116,Motorin,6.10
2,2018-12-10,125,Benzin,6.19
3,2018-12-10,125,Motorin,5.95
4,2018-12-10,126,Benzin,6.14
...,...,...,...,...
8429,2020-01-06,976,Motorin,6.61
8430,2020-01-06,987,Benzin,7.20
8431,2020-01-06,987,Motorin,6.66
8432,2020-01-06,990,Benzin,7.22


In [18]:
columns = []
for col in price_grpd.columns[:-1]:
    columns.append("".join(col))
columns.append("_".join(price_grpd.columns[-1]))
price_grpd.columns = columns
price_grpd

Unnamed: 0,Date,City_Code,Product,CRP_mean
0,2018-12-10,116,Benzin,6.30
1,2018-12-10,116,Motorin,6.10
2,2018-12-10,125,Benzin,6.19
3,2018-12-10,125,Motorin,5.95
4,2018-12-10,126,Benzin,6.14
...,...,...,...,...
8429,2020-01-06,976,Motorin,6.61
8430,2020-01-06,987,Benzin,7.20
8431,2020-01-06,987,Motorin,6.66
8432,2020-01-06,990,Benzin,7.22


In [19]:
test_price = pd.merge(test, price_grpd, on = ["City_Code", "Date", "Product"], how = "left")
test_price

Unnamed: 0,Customer,Area,City_Code,Product,Date,CRP_mean
0,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-07,5.760000
1,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-14,5.888571
2,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-21,6.111429
3,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-28,6.140000
4,10048261,Karadeniz Bölgesi,883,Motorin,2019-02-04,6.140000
...,...,...,...,...,...,...
108467,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-02,7.134286
108468,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-09,7.140000
108469,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-16,7.140000
108470,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-23,7.140000


In [20]:
train_price = train_price[["Customer", "Area", "City_Code", "Product", "Date", "CRP_mean", "Litres_sum_sum"]]
train_price.rename(columns={"Litres_sum_sum":"Litres"}, inplace = True)
train_price

Unnamed: 0,Customer,Area,City_Code,Product,Date,CRP_mean,Litres
0,10048261,Karadeniz Bölgesi,883,Benzin,2014-01-05,5.076,5040.0
1,10048261,Karadeniz Bölgesi,883,Motorin,2014-01-05,4.552,28712.0
2,10048263,Karadeniz Bölgesi,966,Motorin,2014-01-05,4.518,34082.0
3,10048264,Karadeniz Bölgesi,883,Motorin,2014-01-05,4.552,0.0
4,10048265,Karadeniz Bölgesi,589,Benzin,2014-01-05,5.048,0.0
...,...,...,...,...,...,...,...
483963,12654774,Marmara Bölgesi,835,Motorin,2019-01-06,5.670,30871.0
483964,12655800,Ege Bölgesi,574,Benzin,2019-01-06,5.920,3520.0
483965,12655800,Ege Bölgesi,574,Motorin,2019-01-06,5.710,20143.0
483966,12663274,Doğu Anadolu Bölgesi,518,Benzin,2019-01-06,6.040,13636.0


# <h1><center> DATA PREPROCESSING </center></h1>

In [21]:
X_train_val = train_price.drop(["Litres"], axis = 1)
X_train_val

Unnamed: 0,Customer,Area,City_Code,Product,Date,CRP_mean
0,10048261,Karadeniz Bölgesi,883,Benzin,2014-01-05,5.076
1,10048261,Karadeniz Bölgesi,883,Motorin,2014-01-05,4.552
2,10048263,Karadeniz Bölgesi,966,Motorin,2014-01-05,4.518
3,10048264,Karadeniz Bölgesi,883,Motorin,2014-01-05,4.552
4,10048265,Karadeniz Bölgesi,589,Benzin,2014-01-05,5.048
...,...,...,...,...,...,...
483963,12654774,Marmara Bölgesi,835,Motorin,2019-01-06,5.670
483964,12655800,Ege Bölgesi,574,Benzin,2019-01-06,5.920
483965,12655800,Ege Bölgesi,574,Motorin,2019-01-06,5.710
483966,12663274,Doğu Anadolu Bölgesi,518,Benzin,2019-01-06,6.040


In [22]:
y_train_val = train_price["Litres"]
y_train_val

0          5040.0
1         28712.0
2         34082.0
3             0.0
4             0.0
           ...   
483963    30871.0
483964     3520.0
483965    20143.0
483966    13636.0
483967    17102.0
Name: Litres, Length: 483968, dtype: float64

In [23]:
X_test = test_price.copy()
X_test

Unnamed: 0,Customer,Area,City_Code,Product,Date,CRP_mean
0,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-07,5.760000
1,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-14,5.888571
2,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-21,6.111429
3,10048261,Karadeniz Bölgesi,883,Motorin,2019-01-28,6.140000
4,10048261,Karadeniz Bölgesi,883,Motorin,2019-02-04,6.140000
...,...,...,...,...,...,...
108467,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-02,7.134286
108468,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-09,7.140000
108469,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-16,7.140000
108470,12739737,Doğu Anadolu Bölgesi,194,Benzin,2019-12-23,7.140000


In [24]:
lbl = LabelEncoder() 
X_train_val['Area'] = lbl.fit_transform(X_train_val['Area'].astype(str))
X_train_val['Date'] = lbl.fit_transform(X_train_val['Date'].astype(str)) 
X_train_val['Product'] = lbl.fit_transform(X_train_val['Product'].astype(str)) 
X_train_val

Unnamed: 0,Customer,Area,City_Code,Product,Date,CRP_mean
0,10048261,4,883,0,0,5.076
1,10048261,4,883,1,0,4.552
2,10048263,4,966,1,0,4.518
3,10048264,4,883,1,0,4.552
4,10048265,4,589,0,0,5.048
...,...,...,...,...,...,...
483963,12654774,5,835,1,261,5.670
483964,12655800,2,574,0,261,5.920
483965,12655800,2,574,1,261,5.710
483966,12663274,1,518,0,261,6.040


In [25]:
lbl = LabelEncoder() 
X_test['Area'] = lbl.fit_transform(X_test['Area'].astype(str))
X_test['Date'] = lbl.fit_transform(X_test['Date'].astype(str)) 
X_test['Product'] = lbl.fit_transform(X_test['Product'].astype(str)) 
X_test

Unnamed: 0,Customer,Area,City_Code,Product,Date,CRP_mean
0,10048261,4,883,1,0,5.760000
1,10048261,4,883,1,1,5.888571
2,10048261,4,883,1,2,6.111429
3,10048261,4,883,1,3,6.140000
4,10048261,4,883,1,4,6.140000
...,...,...,...,...,...,...
108467,12739737,1,194,0,47,7.134286
108468,12739737,1,194,0,48,7.140000
108469,12739737,1,194,0,49,7.140000
108470,12739737,1,194,0,50,7.140000


In [26]:
fold_number = 10 if X_train_val.shape[0] < 100_000 else 5 if  X_train_val.shape[0] < 1_000_000 else 3
initial_split = 0.3
ho_split = 0.5

# <h1><center> MODEL COMPARISON</center></h1>

# LGBM

In [27]:
rmsle_scores = []

for _ in tqdm(range(fold_number), desc="Training..."):
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=initial_split)
    lgbmr_base = LGBMRegressor()
    lgbmr_base.fit(X_train, y_train)
    y_pred = lgbmr_base.predict(X_val)
    y_pred_norm = minmax_scale(y_pred, feature_range=(0,1))
    y_val_norm = minmax_scale(y_val, feature_range=(0,1))
    rmsle_score = mean_squared_log_error(y_pred_norm, y_val_norm, squared=False)
    rmsle_scores.append(rmsle_score)
    
print("RMSLE Score: {}".format(np.mean(rmsle_scores)))

Training...: 100%|███████████████████████████████████████████████████████████████████████| 5/5 [00:11<00:00,  2.23s/it]

RMSLE Score: 0.08671460697010056





# XGBOOST

In [28]:
rmsle_scores = []

for _ in tqdm(range(fold_number), desc="Training..."):
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=initial_split)
    xgbr_base = XGBRegressor()
    xgbr_base.fit(X_train, y_train)
    y_pred = xgbr_base.predict(X_val)
    y_pred_norm = minmax_scale(y_pred, feature_range=(0,1))
    y_val_norm = minmax_scale(y_val, feature_range=(0,1))
    rmsle_score = mean_squared_log_error(y_pred_norm, y_val_norm, squared=False)
    rmsle_scores.append(rmsle_score)
    
print("RMSLE Score: {}".format(np.mean(rmsle_scores)))

Training...: 100%|███████████████████████████████████████████████████████████████████████| 5/5 [01:18<00:00, 15.63s/it]

RMSLE Score: 0.10835090745997074





# CATBOOST

In [None]:
rmsle_scores = []

for _ in tqdm(range(fold_number), desc="Training..."):
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=initial_split)
    cbr_base = CatBoostRegressor(verbose=0)
    cbr_base.fit(X_train, y_train)
    y_pred = cbr_base.predict(X_val)
    y_pred_norm = minmax_scale(y_pred, feature_range=(0,1))
    y_val_norm = minmax_scale(y_val, feature_range=(0,1))
    rmsle_score = mean_squared_log_error(y_pred_norm, y_val_norm, squared=False)
    rmsle_scores.append(rmsle_score)
    
print("RMSLE Score: {}".format(np.mean(rmsle_scores)))

# <h1><center> HYPERPARAMETER TUNING </center></h1>

# LGBM

In [32]:
def objective(trial):
    params = {
        #'metric': 'mlogloss', 
        #'n_estimators': trial.suggest_int('num_leaves', 10, 1000),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02, 0.1]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        "scale_pos_weight": trial.suggest_int('scale_pos_weight', 1, 10),
    }    
    rmsle_scores = []
    
    for _ in range(fold_number):
        X_train, X_inner_val, y_train, y_inner_val = train_test_split(X_train_inner_val, y_train_inner_val, test_size=initial_split)
        lgbmr = LGBMRegressor(**params)
        lgbmr.fit(X_train, y_train)
        y_inner_pred = lgbmr.predict(X_inner_val)
        y_inner_pred_norm = minmax_scale(y_inner_pred, feature_range=(0,1))
        y_inner_val_norm = minmax_scale(y_inner_val, feature_range=(0,1))
        rmsle_score = mean_squared_log_error(y_inner_pred_norm, y_inner_val_norm, squared=False)
        rmsle_scores.append(rmsle_score)
    return np.mean(rmsle_scores)

X_train_inner_val, X_outer_val, y_train_inner_val, y_outer_val = train_test_split(X_train_val, y_train_val, test_size=ho_split)
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

[32m[I 2022-12-24 16:43:05,171][0m A new study created in memory with name: no-name-116acce2-7468-4a98-9872-703c4eece9c8[0m
[32m[I 2022-12-24 16:44:07,057][0m Trial 0 finished with value: 0.08397998264420688 and parameters: {'reg_alpha': 0.16907582283990105, 'reg_lambda': 2.203139565812919, 'colsample_bytree': 0.8, 'subsample': 0.5, 'learning_rate': 0.02, 'max_depth': 100, 'num_leaves': 887, 'min_child_samples': 122, 'scale_pos_weight': 7}. Best is trial 0 with value: 0.08397998264420688.[0m
[32m[I 2022-12-24 16:44:14,856][0m Trial 1 finished with value: 0.06963464561134376 and parameters: {'reg_alpha': 0.001337068163195801, 'reg_lambda': 0.6252417027054391, 'colsample_bytree': 1.0, 'subsample': 0.4, 'learning_rate': 0.008, 'max_depth': 20, 'num_leaves': 46, 'min_child_samples': 118, 'scale_pos_weight': 2}. Best is trial 1 with value: 0.06963464561134376.[0m
[32m[I 2022-12-24 16:44:31,608][0m Trial 2 finished with value: 0.17233945573352952 and parameters: {'reg_alpha': 0.00

[32m[I 2022-12-24 16:55:39,912][0m Trial 22 finished with value: 0.051863695232362915 and parameters: {'reg_alpha': 0.01288534966597166, 'reg_lambda': 0.10119299979625455, 'colsample_bytree': 0.9, 'subsample': 0.6, 'learning_rate': 0.017, 'max_depth': 100, 'num_leaves': 439, 'min_child_samples': 8, 'scale_pos_weight': 8}. Best is trial 22 with value: 0.051863695232362915.[0m
[32m[I 2022-12-24 16:56:33,583][0m Trial 23 finished with value: 0.06793500337238727 and parameters: {'reg_alpha': 0.01686281188485969, 'reg_lambda': 0.07500298890912377, 'colsample_bytree': 0.9, 'subsample': 0.6, 'learning_rate': 0.017, 'max_depth': 100, 'num_leaves': 615, 'min_child_samples': 38, 'scale_pos_weight': 8}. Best is trial 22 with value: 0.051863695232362915.[0m
[32m[I 2022-12-24 16:57:40,598][0m Trial 24 finished with value: 0.08157458181298986 and parameters: {'reg_alpha': 0.06029350255787092, 'reg_lambda': 0.24055402369755088, 'colsample_bytree': 0.9, 'subsample': 0.6, 'learning_rate': 0.017

[32m[I 2022-12-24 17:09:55,814][0m Trial 44 finished with value: 0.06733195873974568 and parameters: {'reg_alpha': 0.003150913032448301, 'reg_lambda': 1.1812672037300471, 'colsample_bytree': 0.8, 'subsample': 0.6, 'learning_rate': 0.01, 'max_depth': 100, 'num_leaves': 450, 'min_child_samples': 17, 'scale_pos_weight': 3}. Best is trial 22 with value: 0.051863695232362915.[0m
[32m[I 2022-12-24 17:10:26,054][0m Trial 45 finished with value: 0.12414028534505128 and parameters: {'reg_alpha': 0.0018661354039240425, 'reg_lambda': 0.0560594442809667, 'colsample_bytree': 0.5, 'subsample': 0.7, 'learning_rate': 0.01, 'max_depth': 20, 'num_leaves': 288, 'min_child_samples': 40, 'scale_pos_weight': 2}. Best is trial 22 with value: 0.051863695232362915.[0m
[32m[I 2022-12-24 17:11:09,751][0m Trial 46 finished with value: 0.07770315533487239 and parameters: {'reg_alpha': 0.006498415030090289, 'reg_lambda': 0.001065249543154694, 'colsample_bytree': 0.9, 'subsample': 0.6, 'learning_rate': 0.01,

In [33]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
best_trial = study.best_trial

print("  Best Value: {}".format(best_trial.value))

print("  Best Params: ")
for key, value in best_trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 50
Best trial:
  Best Value: 0.051863695232362915
  Best Params: 
    reg_alpha: 0.01288534966597166
    reg_lambda: 0.10119299979625455
    colsample_bytree: 0.9
    subsample: 0.6
    learning_rate: 0.017
    max_depth: 100
    num_leaves: 439
    min_child_samples: 8
    scale_pos_weight: 8


In [34]:
outer_rmsle_scores = []

for _ in tqdm(range(fold_number), desc="Outer Validation Evaluation..."):
    X_train, X_val, y_train, y_val = train_test_split(X_outer_val, y_outer_val, test_size=initial_split)
    lgbmr = LGBMRegressor(**best_trial.params)
    lgbmr.fit(X_train, y_train)
    y_pred = lgbmr.predict(X_val)
    y_pred_norm = minmax_scale(y_pred, feature_range=(0,1))
    y_val_norm = minmax_scale(y_val, feature_range=(0,1))
    rmsle_score = mean_squared_log_error(y_pred_norm, y_val_norm, squared=False)
    outer_rmsle_scores.append(rmsle_score)
print("RMSLE Score for Outer Validation: {}".format(np.mean(outer_rmsle_scores)))

Outer Validation Evaluation...: 100%|████████████████████████████████████████████████████| 5/5 [00:38<00:00,  7.64s/it]

RMSLE Score for Outer Validation: 0.05680350185138374





# XGBOOST

In [None]:
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
        "scale_pos_weight": trial.suggest_int('scale_pos_weight', 1, 10),
        'eval_metric': 'mlogloss',
        'use_label_encoder': False
    }
    rmsle_scores = []
    
    for _ in range(fold_number):
        X_train, X_inner_val, y_train, y_inner_val = train_test_split(X_train_inner_val, y_train_inner_val, test_size=initial_split)
        xgbr = XGBRegressor(**params)
        xgbr.fit(X_train, y_train)
        y_inner_pred = xgbr.predict(X_inner_val)
        y_inner_pred_norm = minmax_scale(y_inner_pred, feature_range=(0,1))
        y_inner_val_norm = minmax_scale(y_inner_val, feature_range=(0,1))
        rmsle_score = mean_squared_log_error(y_inner_pred_norm, y_inner_val_norm, squared=False)
        rmsle_scores.append(rmsle_score)
    return np.mean(rmsle_scores)

X_train_inner_val, X_outer_val, y_train_inner_val, y_outer_val = train_test_split(X_train_val, y_train_val, test_size=ho_split)
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

In [None]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
best_trial = study.best_trial

print("  Best Value: {}".format(best_trial.value))

print("  Best Params: ")
for key, value in best_trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
outer_rmsle_scores = []

for _ in tqdm(range(fold_number), desc="Outer Validation Evaluation..."):
    X_train, X_val, y_train, y_val = train_test_split(X_outer_val, y_outer_val, test_size=initial_split)
    xgbr = XGBRegressor(**best_trial.params)
    xgbr.fit(X_train, y_train)
    y_pred = xgbr.predict(X_val)
    y_pred_norm = minmax_scale(y_pred, feature_range=(0,1))
    y_val_norm = minmax_scale(y_val, feature_range=(0,1))
    rmsle_score = mean_squared_log_error(y_pred_norm, y_val_norm, squared=False)
    outer_rmsle_scores.append(rmsle_score)
print("RMSLE Score for Outer Validation: {}".format(np.mean(outer_rmsle_scores)))

# CATBOOST

In [None]:
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 5000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1.0),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 100),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 20.0),
        'random_strength': trial.suggest_float('random_strength', 1.0, 2.0),
        'depth': trial.suggest_int('depth', 1, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300),
#         "scale_pos_weight": trial.suggest_int('scale_pos_weight', 1, 10),
        "verbose" : 0
    }    
    rmsle_scores = []
    
    for _ in range(fold_number):
        X_train, X_inner_val, y_train, y_inner_val = train_test_split(X_train_inner_val, y_train_inner_val, test_size=initial_split)
        cbr = CatBoostRegressor(**params)
        cbr.fit(X_train, y_train)
        y_inner_pred = cbr.predict(X_inner_val)
        y_inner_pred_norm = minmax_scale(y_inner_pred, feature_range=(0,1))
        y_inner_val_norm = minmax_scale(y_inner_val, feature_range=(0,1))
        rmsle_score = mean_squared_log_error(y_inner_pred_norm, y_inner_val_norm, squared=False)
        rmsle_scores.append(rmsle_score)
    return np.mean(rmsle_scores)

X_train_inner_val, X_outer_val, y_train_inner_val, y_outer_val = train_test_split(X_train_val, y_train_val, test_size=ho_split)
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

In [None]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
best_trial = study.best_trial

print("  Best Value: {}".format(best_trial.value))

print("  Best Params: ")
for key, value in best_trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
outer_rmsle_scores = []

for _ in tqdm(range(fold_number), desc="Outer Validation Evaluation..."):
    X_train, X_val, y_train, y_val = train_test_split(X_outer_val, y_outer_val, test_size=initial_split)
    cbr = CatBoostRegressor(**best_trial.params)
    cbr.fit(X_train, y_train)
    y_pred = cbr.predict(X_val)
    y_pred_norm = minmax_scale(y_pred, feature_range=(0,1))
    y_val_norm = minmax_scale(y_val, feature_range=(0,1))
    rmsle_score = mean_squared_log_error(y_pred_norm, y_val_norm, squared=False)
    outer_rmsle_scores.append(rmsle_score)
print("RMSLE Score for Outer Validation: {}".format(np.mean(outer_rmsle_scores)))

# <h1><center>TRAIN THE FINAL MODEL & PREDICT</center></h1>

In [35]:
lgbmr_final = LGBMRegressor(**best_trial.params)
lgbmr_final.fit(X_train_val, y_train_val)
y_test_pred = lgbmr_final.predict(X_test)

In [None]:
xgbr_final = XGBRegressor(**best_trial.params)
xgbr_final.fit(X_train_val, y_train_val)
y_test_pred = xgbr_final.predict(X_test)

In [None]:
cbr_final = CatBoostRegressor(**best_trial.params)
cbr_final.fit(X_train_val, y_train_val)
y_test_pred = cbr_final.predict(X_test)

# <h1><center>FEATURE IMPORTANCE </center></h1>

In [None]:
#plt.rcParams["figure.figsize"] = (14, 24)
plot_importance(xgbr_final)
plt.show()

In [None]:
#plt.rcParams["figure.figsize"] = (14, 24)
plot_importance(cbr_final)
plt.show()

In [None]:
explainerlgbmr = shap.TreeExplainer(lgbmr_final)
shap_values_train = explainerlgbmr.shap_values(X_train_val)

In [None]:
explainerxgbr = shap.TreeExplainer(xgbr_final)
shap_values_train = explainerxgbr.shap_values(X_train_val)

In [None]:
explainercbr = shap.TreeExplainer(cbr_final)
shap_values_train = explainercbr.shap_values(X_train_val)

In [None]:
shap.summary_plot(shap_values_train, X_train_val)

# <h1><center> CREATE SUBMISSION FILE </center></h1>

In [36]:
submission = pd.DataFrame({"index":X_test.index})
submission['Litres'] = y_test_pred
submission.response = submission.Litres.astype(float)
submission

Unnamed: 0,index,Litres
0,0,19758.019279
1,1,29739.496740
2,2,29739.496740
3,3,29739.496740
4,4,29739.496740
...,...,...
108467,108467,14128.517002
108468,108468,14128.517002
108469,108469,14154.919224
108470,108470,14154.919224


In [37]:
submission.to_csv(f"Submission-{time.strftime('%d - %H-%M-%S')}.csv", index=False)