# 4. Treinamento do Modelo:

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from datetime import datetime, timedelta


In [67]:
df = pd.read_parquet('data/vendas.parquet')

In [28]:
df = df.sort_values(by='dtvenda').reset_index()

In [68]:
df = df.groupby(['vlvendido', 'day_of_week', 'month', 'day_of_year', 'year'])['qtd'].sum().reset_index()

In [69]:
df.shape

(8968, 6)

## 4.1 - Método de Treinamento: Divisão Treino/Teste ou Validação Cruzada

In [70]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error

In [71]:
windows_size = 30 # ref. day  -- 30/60
train_size = None # ref. day
val_size = 0 # ref. day
test_size = 360 # ref. day

flag_stationary = False

features = ['vlvendido', 'day_of_week', 'month', 'day_of_year', 'year', 'qtd']

if train_size is None:
    train_size = len(df) - val_size - test_size

print(f'windows_size = {windows_size}')
print(f'train_size = {train_size}')
print(f'val_size = {val_size}')
print(f'test_size = {test_size}')
print(f'features = {features}')


windows_size = 30
train_size = 8608
val_size = 0
test_size = 360
features = ['vlvendido', 'day_of_week', 'month', 'day_of_year', 'year', 'qtd']


In [72]:
def create_dataset(df, windows_size):
    
    X = []
    y = []
    
    for i in range(len(df) - windows_size):
        
        pos_target = i + windows_size
        target = df.iloc[pos_target]['qtd']
        
        sample = []
        for f in features:
            if f == 'qtd':
                sample += list(df.iloc[i:pos_target][f].values)
            else:
                sample += [df.iloc[pos_target][f]]
        
        X.append(sample)
        y.append(target)
    
    return np.array(X), np.array(y)

In [73]:
X, y = create_dataset(df, windows_size=windows_size)

X.shape, y.shape

((8938, 35), (8938,))

In [74]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [75]:
X_scaled

array([[-1.70036653,  1.02747012,  0.63342565, ..., -0.12379816,
         0.09734602, -0.56613595],
       [-1.70036653,  1.02747012,  0.91210052, ...,  0.09737073,
        -0.56616088, -0.56613595],
       [-1.70036653,  1.02747012,  0.91210052, ..., -0.56613595,
        -0.56616088, -0.56613595],
       ...,
       [ 7.66896929,  0.39407968, -0.20259893, ..., -0.34496706,
         0.53968395, -0.12379816],
       [ 8.29609183,  0.39407968, -1.59597324, ...,  0.53970852,
        -0.12382295,  0.09737073],
       [11.42128724,  0.39407968, -0.48127379, ..., -0.12379816,
         0.09734602, -0.12379816]])

In [76]:
X_test, y_test = X_scaled[-test_size:, :], y[-test_size:]
X_train, y_train = X_scaled[-test_size-val_size-train_size:-test_size-val_size, :], y[-test_size-val_size-train_size:-test_size-val_size]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((8578, 35), (8578,), (360, 35), (360,))

In [77]:
y

array([1., 1., 2., ..., 4., 3., 3.])

## 4.2 - Pelo menos dois Modelos: Linear, Árvore, Ensemble, KNN, etc.

Os modelos utilizados serão:
- LinearRegression
- DecisionTreeRegressor
- AdaBoostRegressor
- RandomForestRegressor

In [78]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor

In [79]:
def get_model_linear_regression():
    return LinearRegression()

def get_model_decision_tree(max_depth=5, max_leaf_nodes=5):
    return DecisionTreeRegressor(max_depth=max_depth, max_leaf_nodes=max_leaf_nodes, random_state=42)

def get_model_adaboost(n_estimators=50, learning_rate=0.5, loss='linear'):
    return AdaBoostRegressor(n_estimators=n_estimators, learning_rate=learning_rate, loss=loss, random_state=42)

def get_model_random_forest(n_estimators=50,max_depth=5):
    return RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)

In [80]:
def predict(sample, model):
    
    predictions = []
    
    test_info = X_test[:, windows_size:]
    
    sample_to_predict = [sample.copy()]

    for i in range(test_size):

        sample_to_predict = sample_to_predict[0]

        if i > 0:
            sales = list(sample_to_predict[1:windows_size]) + [y]
            others = list(test_info[i])
            
            sample_to_predict = sales + others

        sample_to_predict = np.array([sample_to_predict])
        y = model.predict(sample_to_predict)[0]
        
        predictions.append(y)
    
    return predictions

In [81]:
def result_summary(model_name, hp, y_pred, y_test):
    mae = mean_absolute_error(y_pred, y_test)
    mse = mean_squared_error(y_pred, y_test)
    rmse = root_mean_squared_error(y_pred, y_test)
    return {"model_name": model_name, "hyper_parameters": hp,
            "mae": mae, "mse": mse, "rmse": rmse}


In [82]:
model = get_model_linear_regression()
model.fit(X_train, y_train)

y_pred = predict(X_test[0,:], model)

In [84]:
lr_result = result_summary("linear_regression", {}, y_pred, y_test)

## 4.3 - Hiperparâmetros

In [86]:
decision_tree_params = [
    {"max_depth": 5, "max_leaf_nodes": 5},
    {"max_depth": 3, "max_leaf_nodes": 10},
    {"max_depth": 10, "max_leaf_nodes": 5},
    {"max_depth": 10, "max_leaf_nodes": 10},
    {"max_depth": 10, "max_leaf_nodes": 3}
]

adaboost_params = [
    {"n_estimators": 50, "learning_rate": 0.5, "loss": "linear"},
    {"n_estimators": 100, "learning_rate": 0.5, "loss": "linear"},
    {"n_estimators": 50, "learning_rate": 0.1, "loss": "linear"},
    {"n_estimators": 100, "learning_rate": 0.1, "loss": "linear"}
]

random_forest_params = [
    {"n_estimators": 50, "max_depth": 5},
    {"n_estimators": 100, "max_depth": 5},
    {"n_estimators": 50, "max_depth": 10},
    {"n_estimators": 100, "max_depth": 10}
]

## 4.4 - Treinamentos

In [87]:
resultados = [lr_result]


### 4.4.1 - Arvore de Decisão

In [88]:
for param in decision_tree_params:
    model = get_model_decision_tree(**param)
    model.fit(X_train, y_train)

    y_pred = predict(X_test[0,:], model)
    rs = result_summary("decision_tree", param, y_pred, y_test)
    resultados.append(rs)

### 4.4.2 - Adaboost

In [90]:
for param in adaboost_params:
    model = get_model_adaboost(**param)
    model.fit(X_train, y_train)

    y_pred = predict(X_test[0,:], model)
    rs = result_summary("adaboost", param, y_pred, y_test)
    resultados.append(rs)

### 4.4.3 - Random Forest

In [92]:
for param in random_forest_params:
    model = get_model_random_forest(**param)
    model.fit(X_train, y_train)

    y_pred = predict(X_test[0,:], model)
    rs = result_summary("random_forest", param, y_pred, y_test)
    resultados.append(rs)

In [93]:
resultados

[{'model_name': 'linear_regression',
  'hyper_parameters': {},
  'mae': np.float64(3.9440762869143624e+16),
  'mse': np.float64(3.7198806897977575e+34),
  'rmse': np.float64(1.928699222221484e+17)},
 {'model_name': 'decision_tree',
  'hyper_parameters': {'max_depth': 5, 'max_leaf_nodes': 5},
  'mae': np.float64(1.1779409602836854),
  'mse': np.float64(3.0308586851366126),
  'rmse': np.float64(1.740936151941424)},
 {'model_name': 'decision_tree',
  'hyper_parameters': {'max_depth': 3, 'max_leaf_nodes': 10},
  'mae': np.float64(22.169067483890483),
  'mse': np.float64(534.8431988122967),
  'rmse': np.float64(23.126677210794824)},
 {'model_name': 'decision_tree',
  'hyper_parameters': {'max_depth': 10, 'max_leaf_nodes': 5},
  'mae': np.float64(1.1779409602836854),
  'mse': np.float64(3.0308586851366126),
  'rmse': np.float64(1.740936151941424)},
 {'model_name': 'decision_tree',
  'hyper_parameters': {'max_depth': 10, 'max_leaf_nodes': 10},
  'mae': np.float64(1.1994690295522168),
  'mse':