# Ibovespa forecasting using neural networks

## Machine Learning Engineer Nanodegree - Capstone Proposal

### Import python packages

In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.utils.data
from torch import optim
from torch import nn
from torch.autograd import Variable
from ibov.utils import load_config
from ibov.feature import create_lags, consolidate_features
from ibov.model import Ibovespa, train
from ibov.request import get_history, label_train_test

### Loading Configs

In [2]:
# Load config dict
config = load_config()

In [3]:
# Feature Engineering Configs
split = config.get("feature").get("split")
window = config.get("feature").get("window")

# Data Configs
data_dir = config.get("data").get("dir")
ibov_ticker = config.get("ibov").get("ticker")
history_file = config.get("data").get("history")

### Data Preparation

In [4]:
# Invoke yahoo finance api
ibovespa = get_history(ticker=ibov_ticker)

# Save data on disk
ibovespa.to_csv(os.path.join(data_dir, history_file), index=False)

# Label datapoint as train or test dataset
ibovespa = label_train_test(ibovespa, split=split, split_valid=0.2)

### Feature Engineering

In [5]:
# Create lag variables
ibov_lags_df = create_lags(ibovespa, window=window, var="close", index="date")

In [6]:
# Consolidate raw data with features
master_table = consolidate_features(ibovespa, "date", ibov_lags_df)

### Definição do modelo

In [15]:
def torch_data(df, target, variables, group_var, batch, group):
    
    data  = df[df[group_var] == group].reset_index()
    
    x_tensor = torch.Tensor(data[variables])
    y_tensor = torch.Tensor(data[target])
    
    dataset = torch.utils.data.TensorDataset(x_tensor,y_tensor)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch)
    
    return loader, x_tensor, y_tensor   

In [17]:
train_loader, train_x_tensor, train_y_tensor = \
    torch_data(master_table, target="target", variables="lags", group_var="group", batch=10, group="train")

valid_loader, valid_x_tensor, valid_y_tensor = \
    torch_data(master_table, target="target", variables="lags", group_var="group", batch=10, group="valid")

test_loader, test_x_tensor, test_y_tensor = \
    torch_data(master_table, target="target", variables="lags", group_var="group", batch=10, group="test")

In [18]:
model = Ibovespa(input_layer=window)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [19]:
train(model, train_loader, valid_loader, criterion, optimizer, epochs=10)

253244388.734375
242366546.390625
227100552.25
208438436.828125
191173837.4375
174416356.046875
158213161.078125
145366052.515625
133479587.46875
122718445.875


In [12]:
import seaborn as sns

In [105]:
def evaluate_results(x_tensor, y_tensor):
    # Prediction error
    pred = np.array(np.hstack(model(x_tensor).detach().numpy()).tolist())
    true = np.array(np.hstack(y_tensor).tolist())
    mae = np.mean(np.abs(pred - true))
    # Predict delta accuracy
    delta_true = np.array([1 if np.sign(true[idx]-true[idx+1]) >=0 else 0 for idx in range(true.shape[0]-1)])
    delta_pred = np.array([1 if np.sign(pred[idx]-pred[idx+1]) >=0 else 0 for idx in range(pred.shape[0]-1)])
    tp = sum(1 for val in (delta_true + delta_pred) if val==2)
    fp = sum(1 for val in (delta_true - delta_pred) if val==-1)
    fn = sum(1 for val in (delta_true - delta_pred) if val==1)
    precision = tp/(tp+fp)
    recal = tp/(tp+fn)
    f1 = 2*(precision*recal)/(precision+recal)
    
    return mae, f1

In [106]:
evaluate_results(train_x_tensor, train_y_tensor)

(467.84016883828406, 0.5604274134119381)

In [107]:
evaluate_results(valid_x_tensor, valid_y_tensor)

(776.3718249848055, 0.5038402457757296)

In [108]:
evaluate_results(test_x_tensor, test_y_tensor)

(1339.5038093065693, 0.4919786096256685)

### Benchmark Model

In [109]:
true = np.array(np.hstack(y_tensor).tolist())
pred = np.array(list(np.array(np.hstack(y_tensor).tolist())[1:]) + list(np.array(valid_y_tensor[0])))

In [110]:
mae = np.mean(np.abs(pred - true))
# Predict delta accuracy
delta_true = np.array([1 if np.sign(true[idx]-true[idx+1]) >=0 else 0 for idx in range(true.shape[0]-1)])
delta_pred = np.array([1 if np.sign(pred[idx]-pred[idx+1]) >=0 else 0 for idx in range(pred.shape[0]-1)])
tp = sum(1 for val in (delta_true + delta_pred) if val==2)
fp = sum(1 for val in (delta_true - delta_pred) if val==-1)
fn = sum(1 for val in (delta_true - delta_pred) if val==1)
precision = tp/(tp+fp)
recal = tp/(tp+fn)
f1 = 2*(precision*recal)/(precision+recal)

In [112]:
mae, f1

(391.6117385868906, 0.5516329998112138)

### Graphical Evaluation

In [None]:
from matplotlib import pyplot

In [None]:
a4_dims = (16.5, 11.7)
fig, ax = pyplot.subplots(figsize=a4_dims)
sns.lineplot(ax=ax, data=a, dashes=False, markers=True)

### Sagemaker Training

In [None]:
import boto3
import sagemaker

sagemaker_session = sagemaker.Session(boto_session=boto3.session.Session(region_name="sa-east-1"))

bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/ibovespa'

try:
    role = sagemaker.get_execution_role()
except Exception as err:
    role = "arn:aws:iam::977053370764:role/service-role/AmazonSageMaker-ExecutionRole-20201202T141643"

In [None]:
data_root_dir

In [None]:
input_data = sagemaker_session.upload_data(path=data_root_dir, bucket=bucket, key_prefix=prefix)
input_data

In [None]:
from sagemaker.pytorch import PyTorch

estimator = PyTorch(entry_point="train.py",
                    source_dir="ibov",
                    py_version="py3",
                    role=role,
                    framework_version='0.4.0',
                    train_instance_count=1,
                    train_instance_type='ml.m5.large')

In [None]:
estimator.fit({'train': input_data})

In [None]:
predictor = estimator.deploy(initial_instance_count=1,instance_type="ml.m4.xlarge")

In [None]:
tensor_x

In [None]:
predictor.predict(tensor_x)