In [126]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pairs = [['IEMG', 'EEM'], ['ARKK', 'ARKW'], ['TLT', 'SPTL'], ['SHY', 'VGSH'], ['SOXX', 'ITA']]

downloadable_tickers = [ticker for pair in pairs for ticker in pair]

training_data = yf.download(downloadable_tickers, start = '2015-01-01', end = '2020-01-01')['Close']
testing_data = yf.download(downloadable_tickers, start = '2020-01-02', end = '2020-12-31')['Close']

  training_data = yf.download(downloadable_tickers, start = '2015-01-01', end = '2020-01-01')['Close']
[*********************100%***********************]  10 of 10 completed
  testing_data = yf.download(downloadable_tickers, start = '2020-01-02', end = '2020-12-31')['Close']
[*********************100%***********************]  10 of 10 completed


In [127]:
def zscore_calc_test(series, mean, std):
    return (series - mean) / std

def zscore_calc_train(series):
    return (series - series.mean()) / series.std(), series.mean(), series.std()

training_spreads = {}
testing_spreads = {}

for etf1, etf2 in pairs:
    train_z1, mean_z1, std_z1 = zscore_calc_train(training_data[etf1])
    train_z2, mean_z2, std_z2 = zscore_calc_train(training_data[etf2])

    test_z1 = zscore_calc_test(testing_data[etf1], mean_z1, std_z1)
    test_z2 = zscore_calc_test(testing_data[etf2], mean_z2, std_z2)

    training_spread, testing_spread = train_z1 - train_z2, test_z1 - test_z2
    training_spreads[f'{etf1}_{etf2}'], testing_spreads[f'{etf1}_{etf2}'] = training_spread, testing_spread

training_spreads_df = pd.DataFrame(training_spreads)
testing_spreads_df = pd.DataFrame(testing_spreads)

In [128]:
def predict_mean_reversion_label(spread, window = 5, threshold = 0.1):
    labels = []
    mean = spread.mean()

    for i in range(len(spread)):
        future_spread = spread[i + 1 : i + 1 + window]
        if len(future_spread) < window:
            labels.append(np.nan)
        elif any(abs(j - mean) < threshold for j in future_spread):
            labels.append(1)
        else:
            labels.append(0)

    return pd.Series(labels, index = spread.index)

In [129]:
def create_sequences(data, window_size):
    X, y  = [], []

    for i in range(len(data) - window_size):
        X.append(data[i : i + window_size])
        y.append(data[i + window_size])

    X_1 = np.array(X)
    y_1 = np.array(y)

    return torch.tensor(X_1, dtype = torch.float32).unsqueeze(-1), torch.tensor(y_1, dtype = torch.float32).unsqueeze(-1)

In [130]:
class ClassificationLSTM(nn.Module):
    def __init__(self, input_size=1, hidden_size=50, num_layers=1):
        super(ClassificationLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        out, _ = self.lstm(x)
        out = out[:, -1, :]
        out = self.linear(out)
        out = self.sigmoid(out)
        return out.squeeze()

In [131]:
def train_model(model, train_loader, criterion, optimizer, epochs = 50, device = 'cpu'):
    model.train()
    
    for epoch in range(epochs):
        total_loss = 0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss / len(train_loader):.4f}")

In [132]:
def evaluate_model(model, test_loader, device = 'cpu'):
    model.eval()
    all_preds = []
    all_true = []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb)
            all_preds.append(preds.cpu())
            all_true.append(yb.cpu())
    y_pred = torch.cat(all_preds).numpy()
    y_true = torch.cat(all_true).numpy()
    y_pred_label = (y_pred > 0.5).astype(int)
    accuracy = accuracy_score(y_true, y_pred_label)
    print("Test Accuracy:", accuracy)
    return y_true, y_pred

In [133]:
def run_pair(pair_name, train_spread, test_spread, window_size=30, threshold=0.2, epochs=20, device='cpu'):
    train_labels = predict_mean_reversion_label(train_spread).dropna()
    test_labels = predict_mean_reversion_label(test_spread).dropna()
    
    train_labels = predict_mean_reversion_label(train_spread)
    train_spread_aligned = train_spread[train_labels.notna()]
    train_spread_aligned = train_labels[train_labels.notna()]
    
    test_labels = predict_mean_reversion_label(test_spread)
    test_spread_aligned = test_spread[test_labels.notna()]
    test_spread_aligned = test_labels[test_labels.notna()]
    
    X_train, y_train = create_sequences(train_spread_aligned.values, window_size)
    y_train = y_train[:len(X_train)]
    X_test, y_test = create_sequences(test_spread_aligned.values, window_size)
    y_test = y_test[:len(X_test)]

    y_train = y_train.float()
    y_test = y_test.float()
    
    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle = False)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle = False)
    
    model = ClassificationLSTM().to(device)
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    print(f"\nTraining model for pair: {pair_name}")
    train_model(model, train_loader, criterion, optimizer, epochs=epochs, device=device)
    
    print(f"\nEvaluating model for pair: {pair_name}")
    y_true, y_pred = evaluate_model(model, test_loader, device=device)
    
    return model, y_true, y_pred

In [134]:
results = {}


for pair_name in training_spreads_df.columns:
    model, y_true, y_pred = run_pair(
        pair_name,
        training_spreads_df[pair_name].dropna(),
        testing_spreads_df[pair_name].dropna(),
        window_size=30,
        threshold=0.2,
        epochs=50,
        device='cpu'
    )
    results[pair_name] = {'model': model, 'y_true': y_true, 'y_pred': y_pred}

print(results)


Training model for pair: IEMG_EEM
Epoch 1/50 - Loss: 0.6061
Epoch 2/50 - Loss: 0.2152
Epoch 3/50 - Loss: 0.0243
Epoch 4/50 - Loss: 0.0095
Epoch 5/50 - Loss: 0.0056
Epoch 6/50 - Loss: 0.0039
Epoch 7/50 - Loss: 0.0030
Epoch 8/50 - Loss: 0.0025
Epoch 9/50 - Loss: 0.0021
Epoch 10/50 - Loss: 0.0019
Epoch 11/50 - Loss: 0.0016
Epoch 12/50 - Loss: 0.0014
Epoch 13/50 - Loss: 0.0013
Epoch 14/50 - Loss: 0.0011
Epoch 15/50 - Loss: 0.0010
Epoch 16/50 - Loss: 0.0009
Epoch 17/50 - Loss: 0.0009
Epoch 18/50 - Loss: 0.0008
Epoch 19/50 - Loss: 0.0007
Epoch 20/50 - Loss: 0.0007
Epoch 21/50 - Loss: 0.0006
Epoch 22/50 - Loss: 0.0006
Epoch 23/50 - Loss: 0.0006
Epoch 24/50 - Loss: 0.0005
Epoch 25/50 - Loss: 0.0005
Epoch 26/50 - Loss: 0.0005
Epoch 27/50 - Loss: 0.0004
Epoch 28/50 - Loss: 0.0004
Epoch 29/50 - Loss: 0.0004
Epoch 30/50 - Loss: 0.0004
Epoch 31/50 - Loss: 0.0004
Epoch 32/50 - Loss: 0.0003
Epoch 33/50 - Loss: 0.0003
Epoch 34/50 - Loss: 0.0003
Epoch 35/50 - Loss: 0.0003
Epoch 36/50 - Loss: 0.0003
Ep