# Ibovespa forecasting using neural networks

## Machine Learning Engineer Nanodegree - Capstone Proposal

### Import Python packages

In [131]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from torch.nn import L1Loss
from torch.optim import Adam

from ibovespa.utils import load_config
from ibovespa.data_collection import collect_data
from ibovespa.data_preparation import prepare_data
from ibovespa.feature_engineering import engineer_features
from ibovespa.model_training import torch_data

### Load Configurations

In [145]:
# Load the project configuration; later cells index it by section name
# ("data_collection", "data_preparation", "feature_engineering", "model_training").
config = load_config()

### Data Collection

In [3]:
# Read the collection settings from the loaded config, then collect the raw dataset.
collection_cfg = config["data_collection"]
period, stocks = collection_cfg["period"], collection_cfg["stocks"]

raw_data = collect_data(stocks=stocks, data_size=period)

In [38]:
# Preview the last rows of the collected data.
# NOTE(review): the saved output already contains a `group` column and this cell's
# execution count (38) is later than the data-preparation cell's (5) — presumably
# prepare_data modified raw_data in place; confirm, then restart and run top to bottom.
raw_data.tail()

Unnamed: 0,date,^BVSP,ITUB4,BBDC4,VALE3,PETR4,PETR3,ABEV3,BBAS3,B3SA3,ITSA4,group
360,2021-01-21,118443.0,29.91,25.48,93.360001,27.549999,28.09,15.61,34.18,58.099998,11.04,test
361,2021-01-22,117172.0,29.15,24.99,92.629997,27.02,27.700001,15.03,33.419998,58.459999,10.82,test
362,2021-01-26,116464.0,28.25,24.41,91.75,27.0,27.6,15.4,32.790001,59.299999,10.62,test
363,2021-01-27,115954.0,28.67,24.940001,90.540001,27.93,28.42,15.53,33.77,59.369999,10.65,test
364,2021-01-28,117587.726562,28.940001,25.370001,89.730003,28.34,28.860001,15.53,34.310001,59.529999,10.74,test


### Data Preparation

In [5]:
# Look up the test/validation split sizes once, then prepare the collected data.
split_sizes = config["data_preparation"]["split_size"]
test_split, valid_split = split_sizes["test"], split_sizes["validation"]

clean_data = prepare_data(raw_data, split=test_split, split_valid=valid_split)

In [39]:
# Preview the last rows of the prepared data (the saved output shows a `group` label column).
clean_data.tail()

Unnamed: 0,date,^BVSP,ITUB4,BBDC4,VALE3,PETR4,PETR3,ABEV3,BBAS3,B3SA3,ITSA4,group
360,2021-01-21,118443.0,29.91,25.48,93.360001,27.549999,28.09,15.61,34.18,58.099998,11.04,test
361,2021-01-22,117172.0,29.15,24.99,92.629997,27.02,27.700001,15.03,33.419998,58.459999,10.82,test
362,2021-01-26,116464.0,28.25,24.41,91.75,27.0,27.6,15.4,32.790001,59.299999,10.62,test
363,2021-01-27,115954.0,28.67,24.940001,90.540001,27.93,28.42,15.53,33.77,59.369999,10.65,test
364,2021-01-28,117587.726562,28.940001,25.370001,89.730003,28.34,28.860001,15.53,34.310001,59.529999,10.74,test


### Feature Engineering

In [136]:
# Run feature engineering on the prepared data with the configured window;
# the result is unpacked into the feature table and a scaler.
window = config["feature_engineering"]["window"]

engineered = engineer_features(clean_data, window, "train", "^BVSP", model=None)
feature_table, scaler = engineered

### Model Training

In [146]:
# Read the training settings (target name and input variables) from the config.
training_cfg = config["model_training"]
target, variables = training_cfg["target"], training_cfg["variables"]

In [148]:
# Build a loader plus x/y tensors for each split of the feature table.
# The keyword arguments shared by all three calls are collected once.
# NOTE(review): the `target` value read from the config is not used here; the
# column name is hard-coded as "target" — confirm which one is intended.
shared_kwargs = dict(target="target", variables=variables, group_var="group", batch=50)

train_loader, train_x_tensor, train_y_tensor = torch_data(
    feature_table, group="train", **shared_kwargs
)

valid_loader, valid_x_tensor, valid_y_tensor = torch_data(
    feature_table, group="valid", **shared_kwargs
)

test_loader, test_x_tensor, test_y_tensor = torch_data(
    feature_table, group="test", **shared_kwargs
)

### Training step

In [150]:
from torch import nn
import torch

In [151]:
class Model(nn.Module):
    """Two-branch regressor combining a dense layer and an LSTM.

    ``forward`` reads two feature rows per sample: ``input[:, 0, :]`` goes through
    a linear layer followed by dropout, ``input[:, 1, :]`` goes through an LSTM.
    The two ``hidden_layer``-wide results are concatenated and mapped to a single
    output value by a final linear layer.

    The LSTM state is kept on the instance in ``hidden_cell`` and carried from one
    call to the next; assign a fresh pair of zero tensors to ``hidden_cell`` to
    reset it.

    Args:
        input_layer: Number of features in each of the two rows of a sample.
        hidden_layer: Width of the dense branch and of the LSTM hidden state.
        dropout: Dropout probability applied to the dense branch.
    """

    def __init__(self, input_layer, hidden_layer=50, dropout=0.25):
        super().__init__()
        self.hidden_layer = hidden_layer
        self.dropout = nn.Dropout(dropout)
        # Layers are created in the same order as before (fc1, fc2, lstm) so a
        # seeded initialisation yields the same weights.
        self.fc1 = nn.Linear(input_layer, hidden_layer)
        self.fc2 = nn.Linear(hidden_layer + hidden_layer, 1)

        # (h_0, c_0) for a single-layer LSTM processing one sequence at a time.
        self.hidden_cell = (torch.zeros(1, 1, self.hidden_layer),
                            torch.zeros(1, 1, self.hidden_layer))

        self.lstm = nn.LSTM(input_layer, hidden_layer)

    def forward(self, input):
        """Return one prediction per row of ``input``.

        Args:
            input: Tensor indexed as ``input[:, 0, :]`` (dense branch) and
                ``input[:, 1, :]`` (LSTM branch), each ``input_layer`` wide.

        Returns:
            Tensor with one column and as many rows as ``input``.
        """
        dense_in = input[:, 0, :]
        lstm_in = input[:, 1, :]

        dense_out = self.dropout(self.fc1(dense_in))

        # hidden_cell is a plain attribute, so Module.to()/cuda() does not move it;
        # align it with the incoming data before use.
        state = tuple(
            s.to(device=lstm_in.device, dtype=lstm_in.dtype) for s in self.hidden_cell
        )

        # nn.LSTM defaults to (seq, batch, feature): the rows of the batch are fed
        # as consecutive steps of a single sequence.
        lstm_out, new_state = self.lstm(lstm_in.reshape(len(lstm_in), 1, -1), state)

        # Keep the state values but drop their autograd history. Otherwise the next
        # backward() would try to traverse the already-freed graph of this call.
        self.hidden_cell = tuple(s.detach() for s in new_state)

        combined = torch.cat((dense_out, lstm_out[:, 0, :]), 1)
        return self.fc2(combined)

In [154]:
# Model definition
# Seed PyTorch's RNG before the layers are created so the initial weights are
# reproducible; the seed passed to train() later cannot affect weights that
# already exist.
torch.manual_seed(42)
model = Model(input_layer=window, hidden_layer=50, dropout=0.25)
criterion = L1Loss()  # mean absolute error
optimizer = Adam(model.parameters(), lr=0.01)

In [156]:
from ibovespa.model_training import train

In [159]:
# Fit the model for 10 epochs on the training loader; the validation loader is
# passed alongside (the saved log shows a train and a valid value per epoch).
train(
    model,
    train_loader,
    valid_loader,
    criterion,
    optimizer,
    epochs=10,
    seed=42,
)

13:43:09, epoch: 0, train: 2.327, valid: 0.11
13:43:09, epoch: 1, train: 0.933, valid: 0.09
13:43:10, epoch: 2, train: 0.748, valid: 0.038
13:43:11, epoch: 3, train: 0.72, valid: 0.108
13:43:11, epoch: 4, train: 0.623, valid: 0.058
13:43:13, epoch: 5, train: 0.57, valid: 0.078
13:43:14, epoch: 6, train: 0.611, valid: 0.053
13:43:15, epoch: 7, train: 0.532, valid: 0.059
13:43:17, epoch: 8, train: 0.571, valid: 0.028
13:43:19, epoch: 9, train: 0.495, valid: 0.045


### Evaluation

In [None]:
# Model performance on Training dataset
# NOTE(review): model_prediction and calculate_metrics are not imported in any cell
# shown in this notebook, and this cell has no execution count — add the import
# before running.
train_true, train_pred = model_prediction(model, train_x_tensor, train_y_tensor)
calculate_metrics(train_true, train_pred)

In [None]:
# Model performance on Validation dataset
# NOTE(review): model_prediction and calculate_metrics are not imported in any cell
# shown in this notebook — add the import before running.
valid_true, valid_pred = model_prediction(model, valid_x_tensor, valid_y_tensor)
calculate_metrics(valid_true, valid_pred) 

In [None]:
# Model performance on Test dataset
# NOTE(review): model_prediction and calculate_metrics are not imported in any cell
# shown in this notebook — add the import before running.
test_true, test_pred = model_prediction(model, test_x_tensor, test_y_tensor)
calculate_metrics(test_true, test_pred)

### Benchmark Model

In [None]:
# Set and evaluate the benchmark model, built from the test and validation targets.
# NOTE(review): benchmark_model and calculate_metrics are not imported in any cell
# shown in this notebook — add the import before running.
bmk_true, bmk_pred = benchmark_model(test_y_tensor, valid_y_tensor)
calculate_metrics(bmk_true, bmk_pred)

### Graphical Evaluation

#### Test dataset and benchmark model

In [None]:
# Evaluate the test dataset graphically, alongside the benchmark model's values.
# NOTE(review): graphical_evaluation is not imported in any cell shown in this
# notebook; this cell also needs test_true/test_pred and bmk_true/bmk_pred from
# the evaluation and benchmark cells.
graphical_evaluation(test_true, test_pred, bmk_true, bmk_pred)

#### Train and validation datasets

In [None]:
# Evaluate the train and validation datasets graphically.
# NOTE(review): graphical_evaluation is not imported in any cell shown in this
# notebook; this cell also needs the train_* and valid_* results from the
# evaluation cells.
graphical_evaluation(train_true, train_pred, valid_true, valid_pred)