In [1]:
# !git clone https://github.com/cervs257/transformers
# %cd transformers/few_shot_learning

Cloning into 'transformers'...
remote: Enumerating objects: 74, done.[K
remote: Counting objects: 100% (74/74), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 74 (delta 29), reused 61 (delta 18), pack-reused 0[K
Receiving objects: 100% (74/74), 8.98 MiB | 26.14 MiB/s, done.
Resolving deltas: 100% (29/29), done.
/content/transformers/few_shot_learning


In [358]:
from scratch_transformer import MultiHeadAttentionBlock
from data import create_weights, get_reg_data, get_nonlinear_data
import numpy as np

feature_size = 10
output_size = 1
M = 10
N = 1000
lr = 1e-3

# linear attention params override
la_params = create_weights(feature_size, output_size, N, lr)

# get the data
eval_data = get_reg_data(no_tasks=M, feature_size=feature_size, no_examples=N)


# Create a MultiHeadAttentionBlock
mha = MultiHeadAttentionBlock(
    d_model=feature_size + 1, heads=1, dropout=0.0, softmax_att=False
)  # (batch_size, seq_len, d_model)

In [35]:
import torch


# Now we will override the weights of the model to implement those that perform GD in the forward pass
def override_weights(model, new_params, w_name):
    w_name = "Transformer_gd/multi_head_attention/" + w_name
    w_numpy = new_params[w_name]["w"]
    w_tensor = torch.tensor(w_numpy, dtype=model.weight.dtype)
    model.weight.data = w_tensor


# Override the weights of the model
override_weights(mha.w_q, la_params, "query")
override_weights(mha.w_k, la_params, "key")
override_weights(mha.w_v, la_params, "value")
override_weights(mha.w_o, la_params, "linear")

In [156]:
def compute_loss(preds, targets):
    """Compute the MSE loss."""
    return 0.5 * np.sum((targets - preds) ** 2) / targets.shape[0]

In [37]:
e_eval = torch.tensor(eval_data[0]).float()

# Forward pass
out = mha(e_eval, e_eval, e_eval)

# Compare the output to the targets
eval_targets = eval_data[1][:, -1]
eval_preds = out[:, -1, -1] * (-1.0)

In [38]:
loss = compute_loss(eval_preds.detach().numpy(), eval_targets)
print(f"Loss for M: {M}, N: {N} is {loss:.3f}.")

Loss for M: 10, N: 1000 is 0.484.


In [1]:
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR


def train(
    model,
    optimizer,
    criterion,
    eval_data=None,
    training_steps=1000,
    linear_data=False,
    model_type="attn",
    mask=None,
    stocks_train=None,
):
    """
    param model_type: str, "attn" or "transformer"
    """
    assert model_type in [
        "attn",
        "transformer",
    ], "model_type must be 'attn' or 'transformer'"
    if stocks_train is not None:
        assert eval_data is not None, "No stock evaluation data provided."
    eval_losses = []
    lowest_loss = 1e9

    # Move the model to device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print(f"Training on {device}.")

    # If using stock data, we're predicting 5 outomes
    if stocks_train is not None:
        no_outcomes = 5
    else:
        no_outcomes = 1

    # Get the evaluation data if it is not provided
    if eval_data is None:
        if linear_data:
            eval_data = get_reg_data(
                no_tasks=M, feature_size=feature_size, no_examples=N
            )
        else:
            eval_data = get_nonlinear_data(
                no_tasks=M, feature_size=feature_size, no_examples=N
            )
    assert eval_data is not None, "No evaluation data provided."
    e_eval = torch.tensor(eval_data[0]).float().to(device)
    eval_targets = (
        torch.tensor(eval_data[1][:, -no_outcomes:]).float().to(device)
    )  # change for stocks

    # Define lr scheduler
    scheduler = StepLR(optimizer, step_size=1000, gamma=0.5)

    for step in tqdm(range(training_steps + 1)):
        # Generate train data
        if stocks_train is not None:
            train_data = stocks_train
        elif linear_data:
            train_data = get_reg_data(
                no_tasks=M, feature_size=feature_size, no_examples=N
            )
        else:
            train_data = get_nonlinear_data(
                no_tasks=M, feature_size=feature_size, no_examples=N
            )
        e_train = torch.tensor(train_data[0]).float().to(device)
        targets = (
            torch.tensor(train_data[1][:, -no_outcomes:]).float().to(device)
        )  # change for stocks

        # Forward pass
        optimizer.zero_grad()
        if model_type == "attn":
            out = model(e_train, e_train, e_train, mask)
        else:
            out = model(e_train, mask)
        preds = out[:, -1, -no_outcomes:] * (-1.0)  # change for stocks
        loss = criterion(preds, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Evaluate
        if step % 100 == 0:
            model.eval()
            with torch.no_grad():
                if model_type == "attn":
                    ev_preds = model(e_eval, e_eval, e_eval)
                else:
                    ev_preds = model(e_eval, None)  # no mask in evaluation mode
                ev_preds = ev_preds[:, -1, -no_outcomes:] * (-1.0)  # change for stocks
                eval_loss = criterion(ev_preds, eval_targets)
                eval_losses.append(eval_loss)
            model.train()
            if eval_loss < lowest_loss:
                lowest_loss = eval_loss
                if stocks_train is not None:
                    data_type = "stocks"
                elif linear_data:
                    data_type = "lin_data"
                else:
                    data_type = "nonlin_data"
                if model_type == "transformer":
                    att = "transformer"
                elif model.softmax_att:
                    att = "softmax_attn"
                else:
                    att = "linear_attn"
                path = f"models/{att}-{data_type}.pth"
                torch.save(model.state_dict(), path)
            print(f"Step {step}, Train Loss: {loss.item():.3f}")
            print(f"Step {step}, Eval Loss: {eval_loss:.3f}")

In [18]:
# Now let's explore training the model
import torch.optim as optim

# Train
optimizer = optim.Adam(mha.parameters(), lr=lr)
criterion = torch.nn.MSELoss()

training_steps = 1000

train(
    mha,
    optimizer,
    criterion,
    eval_data=eval_data,
    training_steps=training_steps,
    linear_data=True,
    model_type="attn",
)

Training on cpu.


  0%|          | 0/1001 [00:00<?, ?it/s]


NameError: name 'get_reg_data' is not defined

In [15]:
lr = 5e-4
# Let's do the same but with non linear data
eval_nl_data = get_nonlinear_data(no_tasks=M, feature_size=feature_size, no_examples=N)
e_eval_nl = torch.tensor(eval_nl_data[0]).float()

# Create a MultiHeadAttentionBlock
mha_nl = MultiHeadAttentionBlock(
    d_model=feature_size + 1, heads=1, dropout=0.0, softmax_att=False
)  # (batch_size, seq_len, d_model)

# Forward pass pre override
out_nl = mha_nl(e_eval_nl, e_eval_nl, e_eval_nl)

# Compare the output to the targets
eval_nl_targets = eval_nl_data[1][:, -1]
eval_nl_preds = out_nl[:, -1, -1] * (-1.0)

loss_nl = compute_loss(eval_nl_preds.detach().numpy(), eval_nl_targets)
print(f"Loss pre override for M: {M}, N: {N} is {loss_nl:.3f}.")

# Override the weights of the model
override_weights(mha_nl.w_q, la_params, "query")
override_weights(mha_nl.w_k, la_params, "key")
override_weights(mha_nl.w_v, la_params, "value")
override_weights(mha_nl.w_o, la_params, "linear")

# Forward pass
out_nl = mha_nl(e_eval_nl, e_eval_nl, e_eval_nl)

# Compare the output to the targets
eval_nl_targets = eval_nl_data[1][:, -1]
eval_nl_preds = out_nl[:, -1, -1] * (-1.0)

loss_nl = compute_loss(eval_nl_preds.detach().numpy(), eval_nl_targets)
print(f"Loss with GD weights for M: {M}, N: {N} is {loss_nl:.3f}.")

Loss pre override for M: 10, N: 1000 is 1059.016.
Loss with GD weights for M: 10, N: 1000 is 0.603.


In [16]:
lr = 5e-4
optimizer = optim.Adam(mha_nl.parameters(), lr=lr)
criterion = torch.nn.MSELoss()

training_steps = 3000

# Now let's explore training the model
train(
    mha_nl,
    optimizer,
    criterion,
    eval_data=eval_nl_data,
    training_steps=training_steps,
    linear_data=False,
)

Training on cuda.
Step 0, Train Loss: 1.222
Step 0, Eval Loss: 8.459
Step 100, Train Loss: 1.940
Step 100, Eval Loss: 2.947
Step 200, Train Loss: 1.526
Step 200, Eval Loss: 1.124
Step 300, Train Loss: 2.347
Step 300, Eval Loss: 1.023
Step 400, Train Loss: 1.603
Step 400, Eval Loss: 1.079
Step 500, Train Loss: 1.361
Step 500, Eval Loss: 1.208
Step 600, Train Loss: 2.761
Step 600, Eval Loss: 1.065
Step 700, Train Loss: 0.738
Step 700, Eval Loss: 1.183
Step 800, Train Loss: 1.057
Step 800, Eval Loss: 1.669
Step 900, Train Loss: 2.104
Step 900, Eval Loss: 1.173
Step 1000, Train Loss: 1.093
Step 1000, Eval Loss: 1.194
Step 1100, Train Loss: 0.666
Step 1100, Eval Loss: 1.440
Step 1200, Train Loss: 1.182
Step 1200, Eval Loss: 1.303
Step 1300, Train Loss: 1.037
Step 1300, Eval Loss: 1.566
Step 1400, Train Loss: 1.827
Step 1400, Eval Loss: 1.525
Step 1500, Train Loss: 0.792
Step 1500, Eval Loss: 1.274
Step 1600, Train Loss: 1.173
Step 1600, Eval Loss: 1.125
Step 1700, Train Loss: 1.082
Step 170

In [11]:
# Finally let's use softmax attention
# Create a MultiHeadAttentionBlock
mha_nl_sa = MultiHeadAttentionBlock(
    d_model=feature_size + 1, heads=1, dropout=0.0, softmax_att=True
)  # (batch_size, seq_len, d_model)

optimizer = optim.Adam(mha_nl_sa.parameters(), lr=lr)
criterion = torch.nn.MSELoss()

training_steps = 1000

# Training the model
train(
    mha_nl_sa,
    optimizer,
    criterion,
    eval_data=None,
    training_steps=training_steps,
    linear_data=False,
)

Training on cuda.
Step 0, Train Loss: 0.778
Step 0, Eval Loss: 0.747
Step 100, Train Loss: 0.854
Step 100, Eval Loss: 0.677
Step 200, Train Loss: 0.770
Step 200, Eval Loss: 0.567
Step 300, Train Loss: 0.759
Step 300, Eval Loss: 0.455
Step 400, Train Loss: 0.794
Step 400, Eval Loss: 0.432
Step 500, Train Loss: 0.550
Step 500, Eval Loss: 0.414
Step 600, Train Loss: 0.581
Step 600, Eval Loss: 0.449
Step 700, Train Loss: 1.445
Step 700, Eval Loss: 0.418
Step 800, Train Loss: 0.475
Step 800, Eval Loss: 0.403
Step 900, Train Loss: 0.499
Step 900, Eval Loss: 0.399
Step 1000, Train Loss: 0.486
Step 1000, Eval Loss: 0.427


In [29]:
from scratch_transformer import (
    LayerNormalization,
    FeedForwardBlock,
    ResidualConnection,
    EncoderBlock,
    MultiHeadAttentionBlock,
)
import torch
import torch.nn as nn
from data import get_nonlinear_data

feature_size = 10
output_size = 1
M = 10
N = 1000
lr = 1e-4
dropout = 0.2
mask = None

# get the data
eval_data = get_nonlinear_data(no_tasks=M, feature_size=feature_size, no_examples=N)
e = torch.tensor(eval_data[0]).float()


# MLP dimension usually 4 times the d_model
# Residual connection already contains layer normalizations

# Start with Self Attention
mha = MultiHeadAttentionBlock(
    d_model=feature_size + 1, heads=1, dropout=dropout, softmax_att=True
)  # (batch_size, seq_len, d_model)


# Feed Forward
ff = FeedForwardBlock(
    d_model=feature_size + 1, d_ff=4 * (feature_size + 1), dropout=dropout
)  # (batch_size, seq_len, d_model


# Create an EncoderBlock
eb = EncoderBlock(
    self_attention_block=mha,
    feed_forward_block=ff,
    dropout=dropout,
)

In [30]:
training_steps = 1000
optimizer = optim.Adam(eb.parameters(), lr=lr)
criterion = torch.nn.MSELoss()

total_params = sum(p.numel() for p in eb.parameters())
print(f"Total number of parameters: {total_params}")

# Training the model
train(
    eb,
    optimizer,
    criterion,
    eval_data=None,
    training_steps=training_steps,
    linear_data=False,
    model_type="transformer",
    mask=mask,
)

Training on cpu.
Step 0, Train Loss: 37.812
Step 0, Eval Loss: 5.308
Step 100, Train Loss: 12.215
Step 100, Eval Loss: 5.267
Step 200, Train Loss: 7.803
Step 200, Eval Loss: 5.223
Step 300, Train Loss: 4.062
Step 300, Eval Loss: 5.186
Step 400, Train Loss: 2.614
Step 400, Eval Loss: 5.146
Step 500, Train Loss: 5.747
Step 500, Eval Loss: 5.104
Step 600, Train Loss: 24.885
Step 600, Eval Loss: 5.065
Step 700, Train Loss: 11.340
Step 700, Eval Loss: 5.025
Step 800, Train Loss: 25.919
Step 800, Eval Loss: 4.989
Step 900, Train Loss: 7.570
Step 900, Eval Loss: 4.948
Step 1000, Train Loss: 15.817
Step 1000, Eval Loss: 4.906


In [6]:
import pandas as pd
import numpy as np

stocks = pd.read_csv("data/stocks.csv")
stocks.head()


Unnamed: 0,DATE,LAST,OPEN,LOW,HIGH,3M IMPLIED VOL,SHORT INTEREST RATIO,Ticker,tweet_sentiment,news_sentiment
0,2006-02-06,126.6,126.44,126.17,126.8,,,SPY,,
1,2006-02-07,125.48,126.3,125.4,126.66,,,SPY,,
2,2006-02-08,126.62,125.88,125.6,126.78,,,SPY,,
3,2006-02-09,126.41,126.85,126.37,127.6,,,SPY,,
4,2006-02-10,126.64,126.42,125.45,127.13,,,SPY,,


In [7]:
spy = stocks.loc[
    stocks["Ticker"] == "SPY", ["DATE", "LAST", "OPEN", "LOW", "HIGH", "3M IMPLIED VOL"]
].dropna()
spy.reset_index(drop=True, inplace=True)


In [8]:
import numpy as np

# Select all columns except 'DATE'
cols = [col for col in spy.columns if col not in ["DATE", "3M IMPLIED VOL"]]

# Apply the function to each element of the selected columns
spy[cols] = spy[cols].apply(lambda x: np.log(x / 200))

# Normalize the '3M IMPLIED VOL' column
spy["3M IMPLIED VOL"] = spy["3M IMPLIED VOL"] / spy["3M IMPLIED VOL"].std()


In [9]:
# create y
spy["price_1d"] = spy["LAST"].shift(-1)
spy["price_5d"] = spy["LAST"].shift(-5)
spy["price_10d"] = spy["LAST"].shift(-10)
spy["price_20d"] = spy["LAST"].shift(-20)
spy["open_1d"] = spy["OPEN"].shift(-1)


In [10]:
# divide the data into periods of 251 days
n = 251
spy["period"] = spy.index // n


In [11]:
# for each period, separate the last 20 rows to avoid lookahead bias and use as eval data
spy_val = spy.groupby("period").tail(20)
spy = (
    spy.groupby("period")
    .apply(
        lambda x: x.iloc[:-20], include_groups=True
    )  # include groups to later create test query
    .reset_index(drop=True)
)
spy_val = spy_val.dropna()


  spy.groupby("period")


In [12]:
# Now let's fill the last entry of each period with 0.0
columns = ["price_1d", "price_5d", "price_10d", "price_20d", "open_1d"]
last_entry = spy.groupby("period").apply(
    lambda x: x.last_valid_index(), include_groups=False
)
# spy_targets = np.array(spy.loc[last_entry, :].drop(columns=["DATE", "period"]))
spy_targets = np.array(spy.loc[last_entry, columns])
spy.loc[last_entry, columns] = 0.0

# Let's fill y data for spy_val with 0.0
last_entry = spy_val.groupby("period").apply(
    lambda x: x.last_valid_index(), include_groups=False
)
spy_val_targets = np.array(spy_val.loc[last_entry, columns])
spy_val.loc[last_entry, columns] = 0.0

spy.groupby("period").last().head()


Unnamed: 0_level_0,DATE,LAST,OPEN,LOW,HIGH,3M IMPLIED VOL,price_1d,price_5d,price_10d,price_20d,open_1d
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,2015-04-15,0.050836,0.049028,0.048552,0.05373,2.654142,0.0,0.0,0.0,0.0,0.0
1,2016-04-13,0.039225,0.034401,0.033628,0.039701,2.837641,0.0,0.0,0.0,0.0,0.0
2,2017-04-11,0.161523,0.160843,0.154179,0.162034,2.174546,0.0,0.0,0.0,0.0,0.0
3,2018-04-11,0.276722,0.275622,0.275318,0.283825,3.048831,0.0,0.0,0.0,0.0,0.0
4,2019-04-10,0.36565,0.363844,0.362244,0.365996,2.474409,0.0,0.0,0.0,0.0,0.0


In [13]:
# We're going to treat each period as a context
spy = spy.drop(columns=["DATE"])
spy_val = spy_val.drop(columns=["DATE"])
groups = [group.drop(columns=["period"]).values for _, group in spy.groupby("period")]
groups_val = [
    group.drop(columns=["period"]).values for _, group in spy_val.groupby("period")
]
spy_np = np.array(groups)
spy_val_np = np.array(groups_val)

# Define stock eval data
spy_eval = (spy_val_np, spy_val_targets)
# Define stock train data
spy_train = (spy_np, spy_targets)
# eval_targets = torch.tensor(spy_eval[1]).float()


In [9]:
import torch


def create_look_ahead_mask(size):
    mask = torch.triu(torch.ones(size, size), diagonal=1)
    return mask  # (seq_len, seq_len)


mask = create_look_ahead_mask(spy_np.shape[1])


# def create_look_ahead_mask(batch_size, seq_len):
#     mask = torch.triu(torch.ones((batch_size, seq_len, seq_len)), diagonal=1)
#     return mask  # (batch_size, seq_len, seq_len)

# # Masks for the transformer
# mask = create_look_ahead_mask(spy_np.shape[0], spy_np.shape[1])


In [16]:
from scratch_transformer import (
    LayerNormalization,
    FeedForwardBlock,
    ResidualConnection,
    EncoderBlock,
    MultiHeadAttentionBlock,
    Encoder,
)
import torch
import torch.nn as nn

feature_size = 10
output_size = 1
dropout = 0.2
mask = mask
heads = 5
layers = 12

# convert to tensor
e = torch.tensor(spy_eval[0]).float()
et = torch.tensor(spy_train[0]).float()


# MLP dimension usually 4 times the d_model
# Residual connection already contains layer normalizations

# Start with Self Attention
mha = MultiHeadAttentionBlock(
    d_model=feature_size,
    heads=heads,
    dropout=dropout,
    softmax_att=True,
)  # (batch_size, seq_len, d_model)

# out = mha(e, e, e)
out = mha(et, et, et, mask)

# Feed Forward
ff = FeedForwardBlock(
    d_model=feature_size, d_ff=4 * feature_size, dropout=dropout
)  # (batch_size, seq_len, d_model


# Create an EncoderBlock
eb = EncoderBlock(
    self_attention_block=mha,
    feed_forward_block=ff,
    dropout=dropout,
)

encoder_blocks = []
for _ in range(layers):
    encoder_self_attention_block = MultiHeadAttentionBlock(
        d_model=feature_size, heads=heads, dropout=dropout, softmax_att=True
    )
    encoder_feed_forward_block = FeedForwardBlock(
        d_model=feature_size, d_ff=4 * feature_size, dropout=dropout
    )
    encoder_block = EncoderBlock(
        self_attention_block=encoder_self_attention_block,
        feed_forward_block=encoder_feed_forward_block,
        dropout=dropout,
    )
    encoder_blocks.append(encoder_block)

# Don't worry about Encoder, it's just predefined in scratch_transformer
decoder = Encoder(
    nn.ModuleList(encoder_blocks),
)

# out = decoder(et, mask)

for p in decoder.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

# Compare the output to the targets
# eval_targets = spy_eval[1]
# eval_preds = out[:, -1, -5:] * (-1.0)

# loss = compute_loss(eval_preds.detach().numpy(), eval_targets)
# print(f"Loss is {loss:.3f}.")


In [17]:
import torch.optim as optim

# Train
lr = 1e-3
training_steps = 1000
optimizer = optim.Adam(decoder.parameters(), lr=lr)
criterion = torch.nn.MSELoss()

total_params = sum(p.numel() for p in decoder.parameters())
print(f"Total number of parameters: {total_params}")

# Training the model
train(
    decoder,
    optimizer,
    criterion,
    eval_data=spy_eval,
    training_steps=training_steps,
    linear_data=False,
    model_type="transformer",
    mask=mask,
    stocks_train=spy_train,
)


Total number of parameters: 15530
Training on cpu.


  0%|          | 1/1001 [00:00<07:17,  2.29it/s]

Step 0, Train Loss: 0.604
Step 0, Eval Loss: 0.282


 10%|█         | 101/1001 [00:35<05:25,  2.76it/s]

Step 100, Train Loss: 0.087
Step 100, Eval Loss: 0.061


 20%|██        | 201/1001 [01:09<04:35,  2.90it/s]

Step 200, Train Loss: 0.077
Step 200, Eval Loss: 0.053


 30%|███       | 301/1001 [01:44<04:01,  2.90it/s]

Step 300, Train Loss: 0.026
Step 300, Eval Loss: 0.010


 40%|████      | 401/1001 [02:19<03:26,  2.91it/s]

Step 400, Train Loss: 0.018
Step 400, Eval Loss: 0.008


 50%|█████     | 501/1001 [02:53<02:53,  2.88it/s]

Step 500, Train Loss: 0.014
Step 500, Eval Loss: 0.008


 60%|██████    | 601/1001 [03:27<02:19,  2.87it/s]

Step 600, Train Loss: 0.014
Step 600, Eval Loss: 0.006


 70%|███████   | 701/1001 [04:02<01:43,  2.91it/s]

Step 700, Train Loss: 0.014
Step 700, Eval Loss: 0.005


 80%|████████  | 801/1001 [04:45<01:25,  2.33it/s]

Step 800, Train Loss: 0.009
Step 800, Eval Loss: 0.004


 90%|█████████ | 901/1001 [05:22<00:38,  2.62it/s]

Step 900, Train Loss: 0.006
Step 900, Eval Loss: 0.004


100%|██████████| 1001/1001 [05:59<00:00,  2.79it/s]

Step 1000, Train Loss: 0.013
Step 1000, Eval Loss: 0.004





In [18]:
# load the transformer-stocks.pth model
decoder.load_state_dict(torch.load("models/transformer-stocks.pth"))
criterion = torch.nn.MSELoss()

# Evaluate the model
decoder.eval()
with torch.no_grad():
    e_eval = torch.tensor(spy_eval[0]).float()
    eval_preds = decoder(e_eval, None)[:, -1, -5:] * (-1.0)
    eval_targets = torch.tensor(spy_eval[1]).float()
    eval_loss = criterion(eval_preds, eval_targets)

print(f"Eval Loss: {eval_loss:.3f}")


Eval Loss: 0.004


In [19]:
np.exp(eval_preds) * 200

tensor([[224.5926, 220.6326, 227.9468, 223.9325, 221.0671],
        [224.0863, 220.0589, 227.5154, 224.0515, 219.7355],
        [242.0759, 240.4400, 241.9553, 237.2371, 241.6922],
        [271.1071, 261.0004, 259.0979, 260.3919, 259.6585],
        [288.6564, 282.7476, 274.6498, 274.1552, 289.5774],
        [290.2488, 304.7810, 283.4862, 304.2762, 295.5924],
        [468.8124, 454.3061, 468.3952, 461.0709, 461.0333],
        [439.3058, 432.7989, 441.1549, 438.1042, 433.3932],
        [454.2195, 442.6006, 455.5841, 449.9601, 446.8123],
        [262.3158, 254.4384, 253.8929, 254.0308, 253.7223],
        [291.6357, 284.6928, 276.4556, 277.0117, 289.1633],
        [274.5508, 268.7983, 264.2328, 276.4927, 260.3329],
        [441.7336, 433.7585, 444.6514, 441.3233, 433.8806],
        [438.2299, 436.6476, 442.7344, 441.9445, 434.5273],
        [442.7309, 431.7016, 444.6029, 441.0671, 433.7714]])

In [20]:
np.exp(eval_targets) * 200

tensor([[212.2100, 212.8800, 212.4600, 211.6300, 211.2400],
        [206.5600, 204.9100, 209.2800, 212.0800, 207.2900],
        [239.3800, 235.8200, 240.6100, 243.7800, 239.3500],
        [272.0200, 272.2400, 273.3600, 277.3700, 270.3400],
        [288.1000, 287.7000, 282.1400, 287.6500, 285.6200],
        [292.4400, 284.9700, 294.8800, 319.3400, 291.0900],
        [422.1200, 410.2800, 415.2800, 422.6000, 419.8900],
        [413.8100, 392.7500, 391.8600, 417.3900, 424.5500],
        [412.6300, 412.1300, 419.2300, 427.9200, 408.9100],
        [265.6400, 263.7600, 270.3900, 263.2000, 265.5500],
        [287.1800, 288.2900, 289.4500, 291.1800, 286.7800],
        [251.8300, 274.0300, 279.1000, 290.4800, 245.1900],
        [400.6100, 408.5200, 415.8700, 420.0600, 398.4000],
        [458.7000, 451.0300, 438.2900, 417.2700, 460.3400],
        [403.7000, 407.6000, 413.4700, 412.4100, 404.0900]])

In [28]:
# compare to XGBoost
from xgboost import XGBRegressor

# Create the model
model = XGBRegressor(
    n_estimators=10000,
    max_depth=100,
    learning_rate=0.01,
    objective="reg:squarederror",
    n_jobs=-1,
)

train = spy_train[0].reshape(-1, 10)
test = spy_eval[0].reshape(-1, 10)

x_train = train[:, :5]
y_train = train[:, 5:]

x_test = test[:, :5]
y_test = test[:, 5:]

# Fit the model
model.fit(x_train, y_train)

# Make predictions
y_pred = model.predict(x_test)

# Calculate the MSE
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse:.3f}")


MSE: 0.013
