# Ibovespa forecasting using neural networks

## Machine Learning Engineer Nanodegree - Capstone Proposal

### Import python packages

In [1]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from torch.nn import L1Loss
from torch.optim import Adam

from ibov.utils import load_config
from ibov.model import train, torch_data, model_fc1h, model_fc2h, model_lstm, Model
from ibov.feature import create_lags, consolidate_features, create_delta_sign, label_train_test, Normalize
from ibov.request import get_history
from ibov.metrics import calculate_metrics, model_prediction, benchmark_model, graphical_evaluation

ModuleNotFoundError: No module named 'ibov'

### Loading Configs

In [None]:
# Load config dict
# `load_config` is imported from `ibov.utils`. The cells below read the result
# with nested `.get(...)` calls and `[...]` lookups, so it is presumably a nested
# dict -- confirm against `ibov.utils.load_config`.
config = load_config()

In [None]:
# Feature Engineering Configs
feature_cfg = config.get("feature")
split_cfg = feature_cfg.get("split")
window = feature_cfg.get("window")
variables = feature_cfg.get("variables")
test_split = split_cfg.get("test")
valid_split = split_cfg.get("valid")

# Data Configs
data_cfg = config.get("data")
data_dir = data_cfg.get("dir")
ibov_ticker = config.get("ibov").get("ticker")
filename = data_cfg.get("file")
data_size = data_cfg.get("size")
# The flag is compared as text so that a real boolean `True` and the strings
# "True"/"true" all enable ascending order; anything else (including a missing
# key, which yields None) disables it.
ascending = str(data_cfg.get("ascending")).strip().lower() == "true"

# Model configurations
model_cfg = config.get("model")
dropout = model_cfg.get("dropout")
hidden_layer = model_cfg.get("hidden_layer")
lr = model_cfg.get("lr")
seed = model_cfg.get("seed")
epochs = model_cfg.get("epochs")

### Data Collection

In [None]:
import pandas as pd
from yahooquery import Ticker

def get_history(ticker, data_size, ascending):
    """Download the price history of `ticker` and keep its latest closes.

    The full history (`period="max"`) is requested through `yahooquery.Ticker`,
    sorted by `date` in the requested direction and cut down to the `data_size`
    most recent rows: the tail when sorted ascending, the head otherwise.

    Args:
        ticker: symbol (or symbols) forwarded to `Ticker(symbols=...)`.
        data_size: number of rows to keep.
        ascending: sort direction for the `date` ordering.

    Returns:
        A DataFrame restricted to the `date` and `close` columns.
    """
    client = Ticker(symbols=ticker)
    ordered = client.history(period="max").sort_values(by="date", ascending=ascending)

    # The newest rows sit at the end of an ascending sort and at the start of a
    # descending one.
    recent = ordered.tail(data_size) if ascending else ordered.head(data_size)

    # reset_index(drop=False) turns the index levels into regular columns so
    # that `date` can be selected below.
    return recent.reset_index(drop=False)[["date", "close"]]

def collect_data(stocks, data_size, ascending):
    """Download closing prices for several tickers and join them on `date`.

    Each ticker is fetched once with `get_history`; its `close` column is
    renamed to the ticker with the last three characters removed, and the
    frames are inner-joined on `date`, so only dates present for every ticker
    are kept.

    Args:
        stocks: non-empty sequence of ticker symbols.
        data_size: number of rows requested per ticker.
        ascending: sort direction forwarded to `get_history`.

    Returns:
        A DataFrame with a `date` column followed by one column per ticker,
        in the order given by `stocks`.

    Raises:
        IndexError: if `stocks` is empty.
    """
    if len(stocks) == 0:
        raise IndexError("stocks must contain at least one ticker")

    stocks_df = None
    for stock in stocks:
        # Non-inplace rename: get_history returns a column selection, and
        # renaming that in place can trigger pandas' chained-assignment warning.
        stock_df = get_history(stock, data_size, ascending).rename(
            columns={"close": stock[:-3]}
        )
        if stocks_df is None:
            # The first ticker seeds the table, so it is not downloaded twice
            # just to obtain its dates.
            stocks_df = stock_df
        else:
            stocks_df = stocks_df.merge(stock_df, how="inner", on="date")

    return stocks_df

In [None]:
# Use the values parsed in the config cell: `ascending` there is a real boolean,
# whereas the raw config entry is compared against the string 'True' and would
# be truthy even when it says 'False'.
collect_data(stocks=config["ibov"]["stocks"],
             data_size=data_size,
             ascending=ascending)

#### Capture new data

In [None]:
# Target collection: the index history provides the `date` and `close` columns
ibovespa = get_history(ticker=ibov_ticker, data_size=data_size, ascending=ascending)

# Stock Prices: one extra column per configured stock, inner-joined on `date`
for stock in config["ibov"]["stocks"]:
    stock_close = get_history(ticker=stock, data_size=data_size, ascending=ascending)
    # The column is named after the ticker without its last three characters
    stock_close = stock_close.rename(columns={"close": stock[:-3]})
    ibovespa = ibovespa.merge(stock_close, how="inner", on="date")

# Label datapoint as train or test dataset
ibovespa = label_train_test(
    ibovespa,
    split=test_split,
    split_valid=valid_split,
    ascending=ascending,
)

# Save data on disk
output_path = os.path.join(data_dir, filename)
ibovespa.to_csv(output_path, index=False)

#### Read Existing data

In [None]:
# Read from disk
# Loads the CSV written by the "Capture new data" cell (same `data_dir` and
# `filename`), so the notebook can be re-run without downloading prices again.
ibovespa = pd.read_csv(os.path.join(data_dir, filename))

### Data Exploration

It is important to explore the data and draw insights only from the train dataset. Otherwise, we would introduce data leakage even before any model training.

In [None]:
# Keep the training rows only. The index is reset because the frames that are
# concatenated column-wise below (weekday dummies, percentage changes) carry a
# fresh 0..n-1 index; without the reset the rows would be misaligned whenever
# the training rows do not already sit at positions 0..n-1.
explore_data = ibovespa[ibovespa["group"] == "train"].reset_index(drop=True)

# Calendar Variables
weekday = pd.DatetimeIndex(explore_data['date']).weekday
calendar_variables = pd.get_dummies(weekday, prefix="weekday")
explore_data = pd.concat([explore_data, calendar_variables], axis=1)
explore_data["weekday"] = weekday

# Row-over-row percentage change of the index close and of every stock
numeric_columns = ['close', 'ITUB4', 'BBDC4', 'VALE3', 'PETR4', 'PETR3', 'ABEV3', 'BBAS3', 'B3SA3', 'ITSA4']
stocks_diff = explore_data[numeric_columns].pct_change().reset_index(drop=True)
stocks_diff.columns = ["diff_" + column for column in stocks_diff.columns]

# The first row has no previous row to compare with, so it is dropped
complete_explore_data = pd.concat([explore_data, stocks_diff], axis=1).iloc[1:].reset_index(drop=True)
complete_explore_data.head()

#### Weekday Boxplots

In [None]:
# Distribution of the daily close change for each weekday: a horizontal boxplot
# with the individual observations drawn on top as a swarm.
fig, ax = plt.subplots(figsize=(13.7, 5.5))
sns.boxplot(data=complete_explore_data, x="diff_close", y="weekday", orient="h", ax=ax)
sns.swarmplot(data=complete_explore_data, x="diff_close", y="weekday", orient="h", color=".25", ax=ax)
# Dashed reference line at zero change, spanning the full height of the axes
ax.axvline(x=0, ymin=0, ymax=1, ls="--", color="gray")

In [None]:
# Figure-level seaborn distribution plot of `diff_close` (x) against `weekday` (y)
sns.displot(y="weekday", x="diff_close", data=complete_explore_data)

#### Correlations

In [None]:
# Pair each day's close change with the stock changes of the previous row:
# the stock columns drop their last row, the close column drops its first row,
# and both are re-indexed from zero so they line up side by side.
stock_diff_columns = ['diff_ITUB4', 'diff_BBDC4', 'diff_VALE3', 'diff_PETR4', 'diff_PETR3',
                      'diff_ABEV3', 'diff_BBAS3', 'diff_B3SA3', 'diff_ITSA4']

last_day_diff = complete_explore_data[stock_diff_columns].iloc[:-1].reset_index(drop=True)

today_diff_close = complete_explore_data[["diff_close"]].iloc[1:].reset_index(drop=True)

diff_evaluation = pd.concat([today_diff_close, last_day_diff], axis=1)

In [None]:
# Spearman (rank) correlation between the close change and the previous-row stock changes
diff_evaluation.corr(method="spearman")

In [None]:
# Pairwise scatter plots with a fitted regression line: the line is drawn in
# red and the points are made mostly transparent so dense regions stand out.
sns.pairplot(
    diff_evaluation,
    kind='reg',
    plot_kws={
        'line_kws': {'color': 'red'},
        'scatter_kws': {'alpha': 0.1},
    },
)

### Feature Engineering

In [None]:
def feature_engineer(dados, config, mode, model=None):
    """Normalize the target and build the lag / delta-sign feature table.

    The caller's frame is left untouched: the normalization is applied to a
    copy, so calling the function twice on the same frame does not normalize
    `close` twice.

    Args:
        dados: DataFrame with at least the `date`, `close` and `group` columns.
        config: configuration mapping; `config["feature"]["window"]` is used as
            the lag window.
        mode: "train" fits the scaler on the rows whose `group` is "train";
            "predict" restores the scaler from `model.maximo` / `model.minimo`.
        model: object exposing `maximo` and `minimo`; required when `mode` is
            "predict".

    Returns:
        A `(master_table, scaler)` tuple: the consolidated feature table and
        the `Normalize` instance that was applied to `close`.

    Raises:
        ValueError: if `mode` is neither "train" nor "predict", or if `mode`
            is "predict" and no `model` is given.
    """
    # Validate the arguments before doing any work so a bad call fails fast.
    if mode not in ("train", "predict"):
        raise ValueError("mode does not exist")
    if mode == "predict" and model is None:
        raise ValueError("mode 'predict' requires a model exposing maximo and minimo")

    # Work on a copy so the normalization below does not leak into the caller.
    dados = dados.copy()

    # Target Normalization
    scaler = Normalize()
    if mode == "train":
        # Fit on the training rows only so other groups do not influence the scaler.
        scaler.fit(dados[dados["group"] == "train"][["close"]])
    else:
        scaler.load_configs(maximo=model.maximo, minimo=model.minimo)

    dados[["close"]] = scaler.transform(dados[["close"]])

    # Feature Engineering
    window = config["feature"]["window"]

    ibov_lags_df = create_lags(dados,
                               window=window,
                               var="close",
                               index="date")

    ibov_delta_sign_df = create_delta_sign(ibov_lags_df,
                                           var="lags",
                                           index="date",
                                           window=window)

    master_table = consolidate_features(dados, "date", ibov_lags_df, ibov_delta_sign_df)

    return master_table, scaler

In [None]:
# Fit the scaler on the training rows only, then apply it to the whole `close`
# column. NOTE(review): the column is overwritten in place, so re-running this
# cell without reloading `ibovespa` would transform already-transformed values.
scaler = Normalize()
train_rows = ibovespa["group"] == "train"
scaler.fit(ibovespa.loc[train_rows, ["close"]])
ibovespa[["close"]] = scaler.transform(ibovespa[["close"]])

In [None]:
# Create lag variables
# `create_lags` (from `ibov.feature`) is applied to the normalized `close`
# column with the configured `window`, keyed by `date`. The delta-sign cell
# reads a `lags` column from the result.
ibov_lags_df = create_lags(ibovespa, window=window, var="close", index="date")

In [None]:
# Create sign variables
# `create_delta_sign` (from `ibov.feature`) is computed from the `lags` column
# of `ibov_lags_df`, using the same `window` and the `date` key.
ibov_delta_sign_df = create_delta_sign(ibov_lags_df, var="lags", index="date", window=window)

In [None]:
# Create weekdays dummies
# Nothing in this notebook adds `weekday_*` columns to `ibovespa` (they are only
# built on the exploration table), so selecting them directly would fail. Each
# indicator is taken from `ibovespa` when it already exists and derived from
# `date` otherwise. pandas numbers weekdays from Monday=0, so `weekday_1` ..
# `weekday_4` mark Tuesday .. Friday and Monday is the implicit reference.
weekday_columns = ['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4']
weekday_number = pd.DatetimeIndex(ibovespa['date']).weekday
weekdays_df = ibovespa[['date']].copy()
for day, column in enumerate(weekday_columns, start=1):
    if column in ibovespa.columns:
        weekdays_df[column] = ibovespa[column]
    else:
        weekdays_df[column] = (weekday_number == day).astype(int)

In [None]:
# Create last day stock price diff variables
# Each date (from the third row on) is paired with the percentage change
# computed one row earlier, i.e. the change that was already known the day before.
# NOTE(review): `close` was replaced by its normalized values in an earlier cell,
# so its percentage change is taken on the normalized scale -- confirm this is intended.
numeric_columns = ['close', 'ITUB4', 'BBDC4', 'VALE3', 'PETR4', 'PETR3', 'ABEV3', 'BBAS3', 'B3SA3', 'ITSA4']
stocks_diff = ibovespa[numeric_columns].pct_change().reset_index(drop=True)
diff_dates = ibovespa[["date"]].iloc[2:].reset_index(drop=True)
# Drop the first row (no previous value) and the last row (no following date)
last_diff_df = stocks_diff.iloc[1:-1].reset_index(drop=True)
last_day_stock_diff = pd.concat([diff_dates, last_diff_df], axis=1)

In [None]:
# Display the previous-day percentage changes for a visual check
last_day_stock_diff

In [None]:
# Lagged previous-day close change: `create_lags` is run over the `close` column
# of `last_day_stock_diff` with a window of 2; only `date` and `lags` are kept,
# and `lags` is renamed to `close`. The result is one of the feature frames
# handed to `consolidate_features` in the consolidation cell.
ement = create_lags(last_day_stock_diff, window=2, var="close", index="date")[["date", "lags"]].rename(columns={"lags": "close"})

In [None]:
# Consolidate raw data with features
# The base frame carries only `date` and `group`; the lag, delta-sign,
# lagged-change and weekday frames are combined with it through
# `consolidate_features` (from `ibov.feature`) using `date` as the key argument.
master_table = consolidate_features(ibovespa[["date", "group"]], "date", 
                                    ibov_lags_df, ibov_delta_sign_df, ement, weekdays_df)

In [None]:
def torch_data(data, target, variables, group_var, batch, group):
    """Convert feature and target columns into tensors and a DataLoader.

    Args:
        data: DataFrame holding the feature columns, the target column and,
            when `group` is given, the `group_var` column.
        target: name of the target column.
        variables: list of feature column names.
        group_var: name of the column holding the dataset label.
        batch: batch size of the returned loader.
        group: label to keep (rows where `data[group_var] == group`); pass
            None to use every row.

    Returns:
        A `(loader, x_tensor, y_tensor)` tuple. The loader iterates over the
        rows in their original order (no shuffling) in batches of `batch`.
    """
    # Imported here so the function works even if the cell defining it runs
    # before the notebook-level `import torch`.
    import torch

    if group is not None:
        # drop=True: the old index is not needed and would otherwise be
        # inserted as an extra `index` column.
        data = data[data[group_var] == group].reset_index(drop=True)

    # `.values.tolist()` turns cells that hold lists into nested Python lists,
    # so list-valued feature columns produce a tensor with one more dimension.
    x_tensor = torch.Tensor(data[variables].values.tolist())
    # Convert the target through a plain list as well: building the tensor
    # straight from a Series relies on integer-label lookup, which fails when
    # the index does not start at zero (e.g. a sliced frame with group=None).
    y_tensor = torch.Tensor(data[target].values.tolist())

    dataset = torch.utils.data.TensorDataset(x_tensor, y_tensor)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch)

    return loader, x_tensor, y_tensor

In [None]:
# Load data to torch standard: one loader / feature tensor / target tensor
# triple per dataset label, all with a batch size of 50.
train_loader, train_x_tensor, train_y_tensor = torch_data(
    master_table, target="target", variables=variables, group_var="group", batch=50, group="train"
)

valid_loader, valid_x_tensor, valid_y_tensor = torch_data(
    master_table, target="target", variables=variables, group_var="group", batch=50, group="valid"
)

test_loader, test_x_tensor, test_y_tensor = torch_data(
    master_table, target="target", variables=variables, group_var="group", batch=50, group="test"
)

### Training step

In [None]:
from torch import nn
import torch

In [None]:
class Model(nn.Module):
    """Two-branch regressor: a linear branch and an LSTM branch, concatenated.

    `forward` expects a 3-D tensor and reads two slices along axis 1:
    `input[:, 0, :]` goes through `fc1` + dropout, and `input[:, 1, :]` goes
    through the LSTM. Both slices must have `input_layer` features. The two
    `hidden_layer`-sized outputs are concatenated and mapped to one value by
    `fc2`.

    The LSTM is fed the batch as a single sequence (sequence length = number of
    rows, batch size = 1), and its final state is kept in `self.hidden_cell`
    and reused on the next call.

    Args:
        input_layer: number of features in each slice.
        hidden_layer: width of the linear branch and of the LSTM state.
        dropout: dropout probability applied after `fc1`.
    """

    def __init__(self, input_layer, hidden_layer=50, dropout=0.25):

        super().__init__()
        self.hidden_layer = hidden_layer
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(input_layer, hidden_layer)
        self.fc2 = nn.Linear(hidden_layer + hidden_layer, 1)

        # Initial (h, c) state, shaped (num_layers=1, batch=1, hidden_layer)
        self.hidden_cell = (torch.zeros(1, 1, self.hidden_layer),
                            torch.zeros(1, 1, self.hidden_layer))

        self.lstm = nn.LSTM(input_layer, hidden_layer)

    def forward(self, input):
        """Return one prediction per row of `input` (shape `(rows, 1)`)."""

        x = input[:, 0, :]
        z = input[:, 1, :]
        x = self.fc1(x)
        x = self.dropout(x)

        # Keep the stored state on the same device as the data (a no-op when
        # they already match).
        state = tuple(s.to(z.device) for s in self.hidden_cell)
        lstm_out, new_state = self.lstm(z.view(len(z), 1, -1), state)
        # Detach the carried state: it keeps its values for the next call but
        # no longer references this call's autograd graph, so a later
        # backward() does not try to walk through an already-freed graph.
        self.hidden_cell = tuple(s.detach() for s in new_state)

        ds = torch.cat((x, lstm_out[:, 0, :]), 1)
        output = self.fc2(ds)

        return output

In [None]:
# Model definition
# The network input width is the lag `window`; the loss is the mean absolute
# error (L1Loss) and the optimizer is Adam with the configured learning rate.
model = Model(input_layer=window, hidden_layer=hidden_layer, dropout=dropout)
criterion = L1Loss()
optimizer = Adam(model.parameters(), lr=lr)

In [None]:
# Model training
# `train` is imported from `ibov.model`; it receives the train and validation
# loaders together with the configured number of epochs and the seed.
train(model, train_loader, valid_loader, criterion, optimizer, epochs=epochs, seed=seed)

### Evaluation

In [None]:
# Model performance on Training dataset
# `model_prediction` and `calculate_metrics` are imported from `ibov.metrics`;
# the true/predicted pair is also used by the graphical evaluation cells.
train_true, train_pred = model_prediction(model, train_x_tensor, train_y_tensor)
calculate_metrics(train_true, train_pred)

In [None]:
# Model performance on Validation dataset
# Same procedure as for the training dataset, using the validation tensors.
valid_true, valid_pred = model_prediction(model, valid_x_tensor, valid_y_tensor)
calculate_metrics(valid_true, valid_pred)

In [None]:
# Model performance on Test dataset
# Same procedure as for the training dataset, using the held-out test tensors.
test_true, test_pred = model_prediction(model, test_x_tensor, test_y_tensor)
calculate_metrics(test_true, test_pred)

### Benchmark Model

In [None]:
# Set and Evaluate Benchmark Model
# `benchmark_model` (from `ibov.metrics`) receives the test targets and the
# validation targets; how it builds its prediction is not visible here --
# see `ibov.metrics.benchmark_model`.
bmk_true, bmk_pred = benchmark_model(test_y_tensor, valid_y_tensor)
calculate_metrics(bmk_true, bmk_pred)

### Graphical Evaluation

#### Test dataset and benchmark model

In [None]:
# Evaluate test dataset graphically
# Plots the model's test predictions alongside the benchmark's true/predicted pair.
graphical_evaluation(test_true, test_pred, bmk_true, bmk_pred)

#### Train and validation datasets

In [None]:
# Evaluate train and validation dataset graphically
# Same plotting helper, fed with the training pair and the validation pair.
graphical_evaluation(train_true, train_pred, valid_true, valid_pred)