# 4. Treinamento do Modelo:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from datetime import datetime, timedelta


In [2]:
df = pd.read_parquet('data/vendas.parquet')

In [3]:
df = df.sort_values(by='dtvenda').reset_index()

In [4]:
df = df.groupby(['vlvendido', 'day_of_week', 'month', 'day_of_year', 'year'])['qtd'].sum().reset_index()

In [5]:
df.shape

(8968, 6)

## 4.1 - Método de Treinamento: Divisão Treino/Teste ou Validação Cruzada

In [6]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error

In [7]:
windows_size = 30 # ref. day  -- 30/60
train_size = None # ref. day
val_size = 0 # ref. day
test_size = 360 # ref. day

flag_stationary = False

features = ['vlvendido', 'day_of_week', 'month', 'day_of_year', 'year', 'qtd']

if train_size is None:
    train_size = len(df) - val_size - test_size

print(f'windows_size = {windows_size}')
print(f'train_size = {train_size}')
print(f'val_size = {val_size}')
print(f'test_size = {test_size}')
print(f'features = {features}')


windows_size = 30
train_size = 8608
val_size = 0
test_size = 360
features = ['vlvendido', 'day_of_week', 'month', 'day_of_year', 'year', 'qtd']


In [8]:
def create_dataset(df, windows_size):
    
    X = []
    y = []
    
    for i in range(len(df) - windows_size):
        
        pos_target = i + windows_size
        target = df.iloc[pos_target]['qtd']
        
        sample = []
        for f in features:
            if f == 'qtd':
                sample += list(df.iloc[i:pos_target][f].values)
            else:
                sample += [df.iloc[pos_target][f]]
        
        X.append(sample)
        y.append(target)
    
    return np.array(X), np.array(y)

In [9]:
X, y = create_dataset(df, windows_size=windows_size)

X.shape, y.shape

((8938, 35), (8938,))

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [11]:
X_scaled

array([[-1.70036653,  1.02747012,  0.63342565, ..., -0.12379816,
         0.09734602, -0.56613595],
       [-1.70036653,  1.02747012,  0.91210052, ...,  0.09737073,
        -0.56616088, -0.56613595],
       [-1.70036653,  1.02747012,  0.91210052, ..., -0.56613595,
        -0.56616088, -0.56613595],
       ...,
       [ 7.66896929,  0.39407968, -0.20259893, ..., -0.34496706,
         0.53968395, -0.12379816],
       [ 8.29609183,  0.39407968, -1.59597324, ...,  0.53970852,
        -0.12382295,  0.09737073],
       [11.42128724,  0.39407968, -0.48127379, ..., -0.12379816,
         0.09734602, -0.12379816]])

In [12]:
X_test, y_test = X_scaled[-test_size:, :], y[-test_size:]
X_train, y_train = X_scaled[-test_size-val_size-train_size:-test_size-val_size, :], y[-test_size-val_size-train_size:-test_size-val_size]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((8578, 35), (8578,), (360, 35), (360,))

In [77]:
y

array([1., 1., 2., ..., 4., 3., 3.])

## 4.2 - Pelo menos dois Modelos: Linear, Árvore, Ensemble, KNN, etc.

Os modelos utilizados serão:
- LinearRegression
- DecisionTreeRegressor
- AdaBoostRegressor
- RandomForestRegressor

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor

In [14]:
def get_model_linear_regression():
    return LinearRegression()

def get_model_decision_tree(max_depth=5, max_leaf_nodes=5):
    return DecisionTreeRegressor(max_depth=max_depth, max_leaf_nodes=max_leaf_nodes, random_state=42)

def get_model_adaboost(n_estimators=50, learning_rate=0.5, loss='linear'):
    return AdaBoostRegressor(n_estimators=n_estimators, learning_rate=learning_rate, loss=loss, random_state=42)

def get_model_random_forest(n_estimators=50,max_depth=5):
    return RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)

In [15]:
def predict(sample, model):
    
    predictions = []
    
    test_info = X_test[:, windows_size:]
    
    sample_to_predict = [sample.copy()]

    for i in range(test_size):

        sample_to_predict = sample_to_predict[0]

        if i > 0:
            sales = list(sample_to_predict[1:windows_size]) + [y]
            others = list(test_info[i])
            
            sample_to_predict = sales + others

        sample_to_predict = np.array([sample_to_predict])
        y = model.predict(sample_to_predict)[0]
        
        predictions.append(y)
    
    return predictions

In [16]:
def result_summary(model_name, hp, y_pred, y_test):
    mae = mean_absolute_error(y_pred, y_test)
    mse = mean_squared_error(y_pred, y_test)
    rmse = root_mean_squared_error(y_pred, y_test)
    return {"model_name": model_name, "hyper_parameters": hp,
            "mae": mae, "mse": mse, "rmse": rmse}


In [17]:
model = get_model_linear_regression()
model.fit(X_train, y_train)

y_pred = predict(X_test[0,:], model)

In [18]:
lr_result = result_summary("linear_regression", {}, y_pred, y_test)

## 4.3 - Hiperparâmetros

In [19]:
decision_tree_params = [
    {"max_depth": 5, "max_leaf_nodes": 5},
    {"max_depth": 3, "max_leaf_nodes": 10},
    {"max_depth": 10, "max_leaf_nodes": 5},
    {"max_depth": 10, "max_leaf_nodes": 10},
    {"max_depth": 10, "max_leaf_nodes": 3}
]

adaboost_params = [
    {"n_estimators": 50, "learning_rate": 0.5, "loss": "linear"},
    {"n_estimators": 100, "learning_rate": 0.5, "loss": "linear"},
    {"n_estimators": 50, "learning_rate": 0.1, "loss": "linear"},
    {"n_estimators": 100, "learning_rate": 0.1, "loss": "linear"}
]

random_forest_params = [
    {"n_estimators": 50, "max_depth": 5},
    {"n_estimators": 100, "max_depth": 5},
    {"n_estimators": 50, "max_depth": 10},
    {"n_estimators": 100, "max_depth": 10}
]

## 4.4 - Treinamentos

In [20]:
resultados = [lr_result]


### 4.4.1 - Arvore de Decisão

In [21]:
for param in decision_tree_params:
    model = get_model_decision_tree(**param)
    model.fit(X_train, y_train)

    y_pred = predict(X_test[0,:], model)
    rs = result_summary("decision_tree", param, y_pred, y_test)
    resultados.append(rs)

### 4.4.2 - Adaboost

In [22]:
for param in adaboost_params:
    model = get_model_adaboost(**param)
    model.fit(X_train, y_train)

    y_pred = predict(X_test[0,:], model)
    rs = result_summary("adaboost", param, y_pred, y_test)
    resultados.append(rs)

### 4.4.3 - Random Forest

In [23]:
for param in random_forest_params:
    model = get_model_random_forest(**param)
    model.fit(X_train, y_train)

    y_pred = predict(X_test[0,:], model)
    rs = result_summary("random_forest", param, y_pred, y_test)
    resultados.append(rs)

# 5 - Avaliação

## 5.1 - Métricas

Por se tratar de um problema de regressão, iremos utilizar as métricas:

- **MAE** (Mean Absoluto Error): Erro Médio Absoluto
- **MSE** (Mean Squared Error): Erro Quadrático Médio
- **RMSE** (Root Mean Squared Error): Raiz do Erro Quadrático Médio

In [31]:
df_resultados = pd.DataFrame(resultados)
df_resultados

Unnamed: 0,model_name,hyper_parameters,mae,mse,rmse
0,linear_regression,{},3.944076e+16,3.719881e+34,1.928699e+17
1,decision_tree,"{'max_depth': 5, 'max_leaf_nodes': 5}",1.177941,3.030859,1.740936
2,decision_tree,"{'max_depth': 3, 'max_leaf_nodes': 10}",22.16907,534.8432,23.12668
3,decision_tree,"{'max_depth': 10, 'max_leaf_nodes': 5}",1.177941,3.030859,1.740936
4,decision_tree,"{'max_depth': 10, 'max_leaf_nodes': 10}",1.199469,3.204668,1.790159
5,decision_tree,"{'max_depth': 10, 'max_leaf_nodes': 3}",1.178351,3.031957,1.741251
6,adaboost,"{'n_estimators': 50, 'learning_rate': 0.5, 'lo...",37.01527,1445.046,38.01376
7,adaboost,"{'n_estimators': 100, 'learning_rate': 0.5, 'l...",37.01527,1445.046,38.01376
8,adaboost,"{'n_estimators': 50, 'learning_rate': 0.1, 'lo...",24.62166,657.4507,25.6408
9,adaboost,"{'n_estimators': 100, 'learning_rate': 0.1, 'l...",21.86644,507.469,22.52707


In [32]:
df_resultados.to_csv('data/resultados.csv')

## 5.2 - Comparação dos Modelos