# Ibovespa forecasting using neural networks

## Machine Learning Engineer Nanodegree - Capstone Proposal

### Import Python packages

In [17]:
import os
import pandas as pd

from torch.nn import L1Loss
from torch.optim import Adam

from ibov.utils import load_config
from ibov.model import train, torch_data, model_fc1h, model_fc2h, model_lstm
from ibov.feature import create_lags, consolidate_features, create_delta_sign, label_train_test, Normalize
from ibov.request import get_history
from ibov.metrics import calculate_metrics, model_prediction, benchmark_model, graphical_evaluation

### Loading Configs

In [18]:
# Load config dict
# The next cell reads every setting out of this object through nested .get() lookups.
config = load_config()

In [19]:
# Fetch each config section once so the individual settings read as plain lookups.
# NOTE(review): a missing section makes .get() return None and the lookups below
# then fail with AttributeError - confirm the config always defines these sections.
feature_cfg = config.get("feature")
data_cfg = config.get("data")
model_cfg = config.get("model")

# Feature Engineering Configs
window = feature_cfg.get("window")
variables = feature_cfg.get("variables")
split_cfg = feature_cfg.get("split")
test_split = split_cfg.get("test")
valid_split = split_cfg.get("valid")

# Data Configs
data_dir = data_cfg.get("dir")
ibov_ticker = config.get("ibov").get("ticker")
filename = data_cfg.get("file")
data_size = data_cfg.get("size")
# Normalise the flag before comparing: a bare `== 'True'` test would turn a real
# boolean True, or the string "true", into False without any warning.
ascending = str(data_cfg.get("ascending")).strip().lower() == "true"

# Model configurations
dropout = model_cfg.get("dropout")
hidden_layer = model_cfg.get("hidden_layer")
lr = model_cfg.get("lr")
seed = model_cfg.get("seed")
epochs = model_cfg.get("epochs")

### Data Preparation

In [20]:
# Location of the cached price history on disk
data_path = os.path.join(data_dir, filename)

# Invoke yahoo finance api only when no cached copy exists, so the cell runs on a
# fresh checkout while re-runs keep reading the file already on disk.
if not os.path.exists(data_path):
    if data_dir:
        os.makedirs(data_dir, exist_ok=True)
    ibovespa_download = get_history(ticker=ibov_ticker, data_size=data_size, ascending=ascending)

    # Save data on disk
    ibovespa_download.to_csv(data_path, index=False)

# Read from disk (also right after a download, so downstream cells always see the
# CSV-parsed version of the data)
ibovespa = pd.read_csv(data_path)

# Label datapoint as train or test dataset
ibovespa = label_train_test(ibovespa, split=test_split, split_valid=valid_split, ascending=ascending)

### Feature Engineering

In [21]:
# Fit the scaler on the rows labelled "train" only, then apply it to every row.
# NOTE(review): "close" is overwritten in place, so re-running this cell without
# reloading `ibovespa` scales values that were already scaled.
scaler = Normalize()
is_train = ibovespa["group"] == "train"
scaler.fit(ibovespa.loc[is_train, ["close"]])
ibovespa[["close"]] = scaler.transform(ibovespa[["close"]])

In [25]:
# Create lag variables
# Uses the "close" column (already transformed by `scaler` above) with the configured window.
# NOTE(review): the layout of the returned frame is defined in ibov.feature.create_lags - confirm there.
ibov_lags_df = create_lags(ibovespa, window=window, var="close", index="date")

In [26]:
# Create sign variables
# Derived from the lag frame built in the previous cell, keyed by "date".
# NOTE(review): presumably encodes the sign of the change between lags - confirm in ibov.feature.create_delta_sign.
ibov_delta_sign_df = create_delta_sign(ibov_lags_df, var="lags", index="date", window=window)

In [27]:
# Consolidate raw data with features
# Combines the labelled price frame with the lag and sign frames, using "date" as the key argument.
# NOTE(review): the "target" column consumed by torch_data below presumably originates from one of these frames - confirm.
master_table = consolidate_features(ibovespa, "date", ibov_lags_df, ibov_delta_sign_df)

In [28]:
# Load data to torch standard
# Keyword arguments shared by the three splits; only `group` changes per call.
loader_args = dict(target="target", variables=variables, group_var="group", batch=50)

train_loader, train_x_tensor, train_y_tensor = torch_data(master_table, group="train", **loader_args)
valid_loader, valid_x_tensor, valid_y_tensor = torch_data(master_table, group="valid", **loader_args)
test_loader, test_x_tensor, test_y_tensor = torch_data(master_table, group="test", **loader_args)

### Training step

In [37]:
# Sanity check: map the test targets back through the scaler and display them.
# NOTE(review): this prints the whole array and carries a later execution count than
# the cells around it - looks like an exploratory leftover; consider slicing or removing.
scaler.denormalize(test_y_tensor.detach().numpy())

array([[107379.],
       [109786.],
       [110133.],
       [110227.],
       [110575.],
       [108888.],
       [111335.],
       [111814.],
       [112919.],
       [113682.],
       [113625.],
       [113571.],
       [112722.],
       [114992.],
       [115323.],
       [114975.],
       [116146.],
       [117947.],
       [118157.],
       [117679.],
       [116016.],
       [116348.],
       [117857.],
       [119051.],
       [119475.],
       [119306.],
       [118558.],
       [119223.],
       [119851.]], dtype=float32)

In [29]:
from torch import nn
import torch

In [30]:
class model_fc1h(nn.Module):
    """Fully connected regressor with one hidden layer and a single output.

    The forward pass is ``fc1 -> activation -> dropout -> fc2``. With the
    default ``activation=None`` the activation step is the identity, so the
    two linear layers compose into one affine map (dropout aside). Pass a
    module such as ``nn.ReLU()`` to make the hidden layer non-linear.

    NOTE(review): defining this class in the notebook replaces the
    ``model_fc1h`` name imported from ``ibov.model`` in the import cell.

    Args:
        input_layer: Number of input features (``in_features`` of ``fc1``).
        hidden_layer: Width of the hidden layer.
        dropout: Probability handed to ``nn.Dropout``.
        activation: Optional module or callable applied to the hidden layer
            output. ``None`` keeps the purely linear behaviour.
    """

    def __init__(self, input_layer, hidden_layer=50, dropout=0.25, activation=None):
        super().__init__()

        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(input_layer, hidden_layer)
        self.fc2 = nn.Linear(hidden_layer, 1)
        # nn.Identity has no parameters, so the state_dict keys stay fc1.* / fc2.*
        self.activation = nn.Identity() if activation is None else activation

    def forward(self, input):
        """Run the network on a batch and return one prediction per row.

        Args:
            input: Tensor whose last dimension has ``input_layer`` entries;
                expected to be 2-D, i.e. ``(batch, input_layer)``.

        Returns:
            Column 0 of the ``fc2`` output, i.e. a 1-D tensor of length
            ``batch`` for a 2-D input.
        """
        hidden = self.activation(self.fc1(input))
        hidden = self.dropout(hidden)
        output = self.fc2(hidden)

        # fc2 has a single output unit; drop that trailing axis
        return output[:, 0]

In [31]:
# Model definition
model = model_fc1h(input_layer=window, hidden_layer=hidden_layer, dropout=dropout)
criterion = L1Loss()
optimizer = Adam(model.parameters(), lr=lr)

In [32]:
# Model training
# Fit `model` on the training loader; the log printed below reports a train and a
# valid loss value per epoch (L1Loss is the criterion passed in).
train(model, train_loader, valid_loader, criterion, optimizer, epochs=epochs, seed=seed)

13:36:32, epoch: 0, train: 2.317, valid: 0.228
13:36:32, epoch: 1, train: 1.368, valid: 0.047
13:36:32, epoch: 2, train: 0.985, valid: 0.094
13:36:32, epoch: 3, train: 0.948, valid: 0.096
13:36:32, epoch: 4, train: 0.913, valid: 0.055
13:36:32, epoch: 5, train: 0.821, valid: 0.038
13:36:32, epoch: 6, train: 0.802, valid: 0.038
13:36:32, epoch: 7, train: 0.786, valid: 0.036
13:36:32, epoch: 8, train: 0.799, valid: 0.038
13:36:32, epoch: 9, train: 0.755, valid: 0.037
13:36:32, epoch: 10, train: 0.828, valid: 0.039
13:36:32, epoch: 11, train: 0.731, valid: 0.04
13:36:32, epoch: 12, train: 0.701, valid: 0.035
13:36:32, epoch: 13, train: 0.696, valid: 0.035
13:36:32, epoch: 14, train: 0.736, valid: 0.041
13:36:32, epoch: 15, train: 0.686, valid: 0.04
13:36:32, epoch: 16, train: 0.702, valid: 0.036
13:36:32, epoch: 17, train: 0.692, valid: 0.045
13:36:32, epoch: 18, train: 0.665, valid: 0.049
13:36:32, epoch: 19, train: 0.678, valid: 0.037
13:36:32, epoch: 20, train: 0.651, valid: 0.033
13:3

In [34]:
# NOTE(review): sets an ad-hoc attribute on the model; no other visible cell uses it
# apart from the echo in the next cell - looks like scratch work, consider removing.
model.maximo = 1

In [35]:
# Echo the ad-hoc attribute assigned in the previous cell (scratch check).
model.maximo

1

### Evaluation

In [None]:
# Compute the training predictions in this cell so the shape checks also work on a
# freshly restarted kernel, where these names have not been assigned yet.
train_true, train_pred = model_prediction(model, train_x_tensor, train_y_tensor)
train_true.shape

In [None]:
# Shape check on the training predictions.
# NOTE(review): `train_pred` must already exist when this cell runs; it is produced
# by model_prediction(model, train_x_tensor, train_y_tensor).
train_pred.shape

In [None]:
# Model performance on Training dataset
# model_prediction returns the (true, predicted) pair that calculate_metrics consumes.
train_true, train_pred = model_prediction(model, train_x_tensor, train_y_tensor)
calculate_metrics(train_true, train_pred)

In [None]:
# Model performance on Validation dataset
# Same procedure as the training cell, applied to the validation tensors.
valid_true, valid_pred = model_prediction(model, valid_x_tensor, valid_y_tensor)
calculate_metrics(valid_true, valid_pred) 

In [None]:
# Model performance on Test dataset
# test_true / test_pred are reused by the graphical evaluation cell at the end.
test_true, test_pred = model_prediction(model, test_x_tensor, test_y_tensor)
calculate_metrics(test_true, test_pred)

### Benchmark Model

In [None]:
# Set and evaluate the benchmark model
# NOTE(review): presumably a naive baseline built from the test and validation
# targets - confirm the exact rule in ibov.metrics.benchmark_model.
bmk_true, bmk_pred = benchmark_model(test_y_tensor, valid_y_tensor)
calculate_metrics(bmk_true, bmk_pred)

### Graphical Evaluation

In [None]:
# Evaluate test dataset graphically
# Passes the model's test-set pair together with the benchmark pair computed above.
graphical_evaluation(test_true, test_pred, bmk_true, bmk_pred)