In [1]:
import pandas as pd
# catboosting
from catboost import CatBoostRegressor
# metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, make_scorer
# kfold
from sklearn.model_selection import KFold
# gridsearch
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
# matplotlib
import matplotlib.pyplot as plt
# datetime
import datetime as dt
# numpy
import numpy as np
import sys
sys.path.insert(1, '../utils/')
from data_visualization import residuals_hist, residuals_scatter
from preprocessing import preprocess_cosumo 
import pickle



In [2]:
# load train data
df = pd.read_csv('../data/consumo_material_clean_with_category.csv', parse_dates=['FECHAPEDIDO'])

# preprocess data
df = preprocess_cosumo(df)

# split train and test and filter by date
df_train = df[df['FECHAPEDIDO']<dt.datetime(year=2023,month=1,day=1)]
df_train = df_train[df_train['FECHAPEDIDO'] >= dt.datetime(year=2020,month=1,day=1)]
df_test = df[df['FECHAPEDIDO']>=dt.datetime(year=2023,month=1,day=1)]

In [None]:
df.head()

In [4]:
# drop columns
df_train.drop(columns=['FECHAPEDIDO'], inplace=True)
df_test.drop(columns=['FECHAPEDIDO'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.drop(columns=['FECHAPEDIDO'], inplace=True)


In [5]:
# kfolds
X = df_train.drop(['STACKS_COMPRATS'], axis = 1)
y = df_train['STACKS_COMPRATS']
folds = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# define the parameter grid for hyperparameter tuning
param_grid = {
    'iterations': [100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'depth': [3, 5, 7],
    'l2_leaf_reg': [0.1, 0.2, 0.5],
    'border_count': [5, 10, 20],
    'bagging_temperature': [0.5, 0.8, 1.0],
    'grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide']
}

In [6]:
X.head()

Unnamed: 0,UNIDADESCONSUMOCONTENIDAS,PRECIO,CODIGO_B40558,CODIGO_B41691,CODIGO_C26183,CODIGO_C56207,CODIGO_E64488,CODIGO_E64543,CODIGO_E64544,CODIGO_E64663,...,CODIGO_F43331,CODIGO_F43580,CODIGO_F43581,CODIGO_F43585,CODIGO_F44200,CODIGO_F46843,CODIGO_F46846,CODIGO_F50071,TIPOCOMPRA_Compra menor,TIPOCOMPRA_Concurso
16,5,12.1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
17,5,12.1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
18,5,26.399925,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
19,300,27.588,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
20,300,49.302,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
# define the model
model = CatBoostRegressor()

# define the KFold cross-validation object
folds = KFold(n_splits=10, shuffle=True, random_state=42)

# define the scoring metric
scoring = {'mse': make_scorer(mean_squared_error), 
           'mae': make_scorer(mean_absolute_error), 
           'r2': make_scorer(r2_score)}

# define the grid search object
grid_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=folds, n_iter=30, n_jobs=10, verbose=1, scoring=scoring, refit='mse')

# fit the grid search object to the data
grid_search.fit(X, y)

# print the best hyperparameters and the corresponding score
print("Best hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)


Fitting 10 folds for each of 30 candidates, totalling 300 fits
0:	learn: 34.2802038	total: 142ms	remaining: 14.1s
1:	learn: 34.1575567	total: 145ms	remaining: 7.11s
2:	learn: 34.0348176	total: 147ms	remaining: 4.75s
3:	learn: 33.9095404	total: 149ms	remaining: 3.57s
4:	learn: 33.7831053	total: 151ms	remaining: 2.86s
5:	learn: 33.6587176	total: 153ms	remaining: 2.39s
6:	learn: 33.5420672	total: 155ms	remaining: 2.06s
7:	learn: 33.4323523	total: 158ms	remaining: 1.81s
8:	learn: 33.3180018	total: 160ms	remaining: 1.62s
9:	learn: 33.2120447	total: 162ms	remaining: 1.45s
10:	learn: 33.0968290	total: 163ms	remaining: 1.32s
11:	learn: 32.9891134	total: 165ms	remaining: 1.21s
12:	learn: 32.8860913	total: 167ms	remaining: 1.12s
13:	learn: 32.7773829	total: 169ms	remaining: 1.04s
14:	learn: 32.6732045	total: 171ms	remaining: 969ms
15:	learn: 32.5659158	total: 173ms	remaining: 908ms
16:	learn: 32.4605659	total: 175ms	remaining: 853ms
17:	learn: 32.3666537	total: 176ms	remaining: 802ms
18:	learn: 

In [None]:
# metrics for TRAIN
X_test = df_test.drop(['STACKS_COMPRATS'], axis = 1)
y_test = df_test['STACKS_COMPRATS'].reset_index(drop=True)
preds = grid_search.best_estimator_.predict(X)

mse = mean_squared_error(y, preds)
print('mse: ', mse)
mae = mean_absolute_error(y, preds)
print('mae: ', mae)
r2 = r2_score(y, preds)
print('r2: ', r2)


mse:  320.1317882190941
mae:  9.555236082604926
r2:  0.7521401763508648


In [None]:
# metrics for TEST
X_test = df_test.drop(['STACKS_COMPRATS'], axis = 1)
y_test = df_test['STACKS_COMPRATS'].reset_index(drop=True)
preds = grid_search.best_estimator_.predict(X_test)

mse = mean_squared_error(y_test, preds)
print('mse: ', mse)
mae = mean_absolute_error(y_test, preds)
print('mae: ', mae)
r2 = r2_score(y_test, preds)
print('r2: ', r2)


mse:  1727.6042524827499
mae:  16.00992279998005
r2:  0.30626630511023945
