In [1]:
# !git clone https://github.com/cervs257/transformers
# %cd transformers/few_shot_learning

Cloning into 'transformers'...
remote: Enumerating objects: 74, done.[K
remote: Counting objects: 100% (74/74), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 74 (delta 29), reused 61 (delta 18), pack-reused 0[K
Receiving objects: 100% (74/74), 8.98 MiB | 26.14 MiB/s, done.
Resolving deltas: 100% (29/29), done.
/content/transformers/few_shot_learning


In [144]:
from scratch_transformer import MultiHeadAttentionBlock
from data import create_weights, get_reg_data, get_nonlinear_data
import numpy as np

# Problem dimensions and optimiser settings. These are notebook-level globals:
# train() further down reads M, N and feature_size directly.
# feature_size -> `feature_size` argument of the data generators
# M            -> `no_tasks`
# N            -> `no_examples`
feature_size = 10
output_size = 1
M = 10
N = 1000
lr = 1e-3

# linear attention params override
# (la_params is indexed later as la_params[<key>]["w"] by override_weights)
la_params = create_weights(feature_size, output_size, N, lr)

# get the data
# eval_data is indexed later as eval_data[0] (model input) and eval_data[1] (targets)
eval_data = get_reg_data(no_tasks=M, feature_size=feature_size, no_examples=N)


# Create a MultiHeadAttentionBlock
# d_model is feature_size + 1 — presumably the features plus the target column; TODO confirm
mha = MultiHeadAttentionBlock(
    d_model=feature_size + 1, heads=1, dropout=0.0, softmax_att=False
)  # (batch_size, seq_len, d_model)

In [35]:
import torch


# Next, override the model's weights with the ones from `la_params`, so that the forward pass performs gradient descent (GD)
def override_weights(model, new_params, w_name):
    """
    Replace `model.weight` with a matrix taken from `new_params`.

    param model: object exposing a `weight` tensor attribute (in this notebook
        the `w_q`, `w_k`, `w_v` and `w_o` layers of the attention block)
    param new_params: nested mapping; the matrix is read from
        `new_params["Transformer_gd/multi_head_attention/" + w_name]["w"]`
    param w_name: str, suffix of the parameter key, e.g. "query"
    raises KeyError: if the key or its "w" entry is missing from `new_params`
    """
    key = "Transformer_gd/multi_head_attention/" + w_name
    current = model.weight
    # Create the replacement with the layer's own dtype AND device; otherwise a
    # layer that already lives on an accelerator would silently end up with a
    # CPU weight and fail on its next forward pass.
    replacement = torch.tensor(
        new_params[key]["w"], dtype=current.dtype, device=current.device
    )
    # Assigning to .data swaps the values while keeping the same Parameter
    # object, so optimizers created from model.parameters() keep tracking it.
    model.weight.data = replacement


# Override the weights of the model
# Load the query / key / value / output projections from la_params, in that order.
for _layer, _key in (
    (mha.w_q, "query"),
    (mha.w_k, "key"),
    (mha.w_v, "value"),
    (mha.w_o, "linear"),
):
    override_weights(_layer, la_params, _key)

In [156]:
def compute_loss(preds, targets):
    """
    Return half the summed squared error divided by `targets.shape[0]`.

    The sum runs over every element of `targets - preds`, while the divisor is
    only the length of the first axis of `targets`.
    """
    residual = targets - preds
    summed_sq_error = np.sum(residual**2)
    return 0.5 * summed_sq_error / targets.shape[0]

In [37]:
# Model input: first element of eval_data as a float32 tensor.
e_eval = torch.tensor(eval_data[0]).float()

# Forward pass
# The same tensor is passed for all three arguments — presumably query, key
# and value (self-attention); confirm against MultiHeadAttentionBlock.forward.
out = mha(e_eval, e_eval, e_eval)

# Compare the output to the targets
# Targets: last entry along axis 1 of eval_data[1].
eval_targets = eval_data[1][:, -1]
# Prediction: last feature of the last sequence position, sign-flipped.
# NOTE(review): the sign flip presumably matches the convention of the weights
# built by create_weights — confirm.
eval_preds = out[:, -1, -1] * (-1.0)

In [38]:
# compute_loss works on NumPy arrays, so detach the predictions from the
# autograd graph before converting them.
loss = compute_loss(eval_preds.detach().numpy(), eval_targets)
print(f"Loss for M: {M}, N: {N} is {loss:.3f}.")

Loss for M: 10, N: 1000 is 0.484.


In [39]:
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR


def train(
    model,
    optimizer,
    criterion,
    eval_data=None,
    training_steps=1000,
    linear_data=False,
    model_type="attn",
    mask=None,
):
    """
    Train `model` on freshly sampled tasks, evaluating every 100 steps.

    A new batch is drawn at every step from `get_reg_data` (when `linear_data`
    is true) or `get_nonlinear_data`, using the notebook-level globals `M`,
    `feature_size` and `N`. The prediction is the negated last feature of the
    last sequence position of the model output. Whenever the evaluation loss
    improves, the model's state dict is written to
    `models/{att}-{data_type}.pth` (the directory is created if needed).

    param model: module to train; moved to CUDA when available, else CPU
    param optimizer: optimizer already bound to `model`'s parameters
    param criterion: callable `(preds, targets)` returning a scalar tensor
    param eval_data: optional pair indexed as `eval_data[0]` (inputs) and
        `eval_data[1]` (targets); sampled once when None
    param training_steps: index of the last step; steps 0..training_steps
        inclusive are run
    param linear_data: bool, selects `get_reg_data` over `get_nonlinear_data`
    param model_type: str, "attn" or "transformer"
    param mask: forwarded to the model when `model_type` is "transformer"
    return: list of float evaluation losses, one entry per evaluation
    """
    import os  # local import: only needed to create the checkpoint directory

    assert model_type in [
        "attn",
        "transformer",
    ], "model_type must be 'attn' or 'transformer'"
    eval_losses = []
    lowest_loss = float("inf")

    # Move the model to device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print(f"Training on {device}.")

    # One generator is used for both the evaluation set and every training batch.
    sample_tasks = get_reg_data if linear_data else get_nonlinear_data

    def forward(batch):
        # Attention blocks take (q, k, v); the transformer takes (x, mask).
        if model_type == "attn":
            return model(batch, batch, batch)
        return model(batch, mask)

    # The checkpoint name depends only on the arguments, so build it once.
    data_type = "lin_data" if linear_data else "nonlin_data"
    if model_type == "transformer":
        att = "transformer"
    elif model.softmax_att:
        att = "softmax_attn"
    else:
        att = "linear_attn"
    path = f"models/{att}-{data_type}.pth"

    # Get the evaluation data if it is not provided
    if eval_data is None:
        eval_data = sample_tasks(no_tasks=M, feature_size=feature_size, no_examples=N)
    assert eval_data is not None, "No evaluation data provided."
    e_eval = torch.tensor(eval_data[0]).float().to(device)
    eval_targets = torch.tensor(eval_data[1][:, -1]).float().to(device)

    # Define lr scheduler: halve the learning rate every 1000 scheduler steps
    scheduler = StepLR(optimizer, step_size=1000, gamma=0.5)

    for step in tqdm(range(training_steps + 1)):
        # Generate train data
        train_data = sample_tasks(no_tasks=M, feature_size=feature_size, no_examples=N)
        e_train = torch.tensor(train_data[0]).float().to(device)
        targets = torch.tensor(train_data[1][:, -1]).float().to(device)

        # Forward pass
        optimizer.zero_grad()
        preds = forward(e_train)[:, -1, -1] * (-1.0)
        loss = criterion(preds, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Evaluate
        if step % 100 == 0:
            model.eval()
            with torch.no_grad():
                ev_preds = forward(e_eval)[:, -1, -1] * (-1.0)
                # Store a plain float so the history holds no device tensors.
                eval_loss = criterion(ev_preds, eval_targets).item()
            model.train()
            eval_losses.append(eval_loss)
            if eval_loss < lowest_loss:
                lowest_loss = eval_loss
                # torch.save does not create missing parent directories.
                os.makedirs(os.path.dirname(path), exist_ok=True)
                torch.save(model.state_dict(), path)
            print(f"Step {step}, Train Loss: {loss.item():.3f}")
            print(f"Step {step}, Eval Loss: {eval_loss:.3f}")

    return eval_losses

In [40]:
# Now let's explore training the model
import torch.optim as optim

# Train
# Fine-tune the attention block whose weights were overridden above, on
# linear-regression tasks, evaluating on the fixed eval_data.
optimizer = optim.Adam(mha.parameters(), lr=lr)
# NOTE(review): MSELoss averages the squared error, whereas compute_loss above
# halves it, so the values printed by train() are presumably not on the same
# scale as the loss printed before training — confirm before comparing.
criterion = torch.nn.MSELoss()

training_steps = 1000

train(
    mha,
    optimizer,
    criterion,
    eval_data=eval_data,
    training_steps=training_steps,
    linear_data=True,
    model_type="attn",
)

Training on cpu.


  0%|          | 3/1001 [00:00<01:59,  8.35it/s]

Step 0, Train Loss: 2.049
Step 0, Eval Loss: 0.908


 10%|█         | 104/1001 [00:05<00:44, 19.97it/s]

Step 100, Train Loss: 0.101
Step 100, Eval Loss: 0.040


 20%|██        | 203/1001 [00:10<00:40, 19.58it/s]

Step 200, Train Loss: 0.111
Step 200, Eval Loss: 0.011


 30%|███       | 302/1001 [00:14<00:34, 19.98it/s]

Step 300, Train Loss: 0.020
Step 300, Eval Loss: 0.007


 40%|████      | 403/1001 [00:19<00:28, 20.99it/s]

Step 400, Train Loss: 0.013
Step 400, Eval Loss: 0.015


 50%|█████     | 502/1001 [00:24<00:24, 20.18it/s]

Step 500, Train Loss: 0.016
Step 500, Eval Loss: 0.016


 60%|██████    | 604/1001 [00:29<00:17, 22.06it/s]

Step 600, Train Loss: 0.030
Step 600, Eval Loss: 0.008


 70%|███████   | 703/1001 [00:34<00:14, 19.95it/s]

Step 700, Train Loss: 0.005
Step 700, Eval Loss: 0.011


 80%|████████  | 805/1001 [00:38<00:09, 21.48it/s]

Step 800, Train Loss: 0.028
Step 800, Eval Loss: 0.015


 90%|█████████ | 904/1001 [00:43<00:04, 21.22it/s]

Step 900, Train Loss: 0.018
Step 900, Eval Loss: 0.011


100%|██████████| 1001/1001 [00:47<00:00, 21.12it/s]

Step 1000, Train Loss: 0.009
Step 1000, Eval Loss: 0.015





In [15]:
lr = 5e-4
# Let's do the same but with non linear data
eval_nl_data = get_nonlinear_data(no_tasks=M, feature_size=feature_size, no_examples=N)
e_eval_nl = torch.tensor(eval_nl_data[0]).float()
eval_nl_targets = eval_nl_data[1][:, -1]

# Create a MultiHeadAttentionBlock
mha_nl = MultiHeadAttentionBlock(
    d_model=feature_size + 1, heads=1, dropout=0.0, softmax_att=False
)  # (batch_size, seq_len, d_model)


def _nl_forward_and_loss(model):
    """Run `model` on the nonlinear eval batch; return (output, predictions, loss)."""
    raw_out = model(e_eval_nl, e_eval_nl, e_eval_nl)
    # Prediction: negated last feature of the last sequence position.
    preds = raw_out[:, -1, -1] * (-1.0)
    return raw_out, preds, compute_loss(preds.detach().numpy(), eval_nl_targets)


# Forward pass pre override
out_nl, eval_nl_preds, loss_nl = _nl_forward_and_loss(mha_nl)
print(f"Loss pre override for M: {M}, N: {N} is {loss_nl:.3f}.")

# Override the weights of the model
for _layer, _key in (
    (mha_nl.w_q, "query"),
    (mha_nl.w_k, "key"),
    (mha_nl.w_v, "value"),
    (mha_nl.w_o, "linear"),
):
    override_weights(_layer, la_params, _key)

# Forward pass with the overridden weights
out_nl, eval_nl_preds, loss_nl = _nl_forward_and_loss(mha_nl)
print(f"Loss with GD weights for M: {M}, N: {N} is {loss_nl:.3f}.")

Loss pre override for M: 10, N: 1000 is 1059.016.
Loss with GD weights for M: 10, N: 1000 is 0.603.


In [16]:
# Fine-tune the overridden linear-attention block on the nonlinear tasks.
lr = 5e-4
optimizer = optim.Adam(mha_nl.parameters(), lr=lr)
criterion = torch.nn.MSELoss()

training_steps = 3000

# Now let's explore training the model
# model_type is left at its default ("attn"); evaluation uses the fixed
# eval_nl_data while every training batch is sampled inside train().
train(
    mha_nl,
    optimizer,
    criterion,
    eval_data=eval_nl_data,
    training_steps=training_steps,
    linear_data=False,
)

Training on cuda.
Step 0, Train Loss: 1.222
Step 0, Eval Loss: 8.459
Step 100, Train Loss: 1.940
Step 100, Eval Loss: 2.947
Step 200, Train Loss: 1.526
Step 200, Eval Loss: 1.124
Step 300, Train Loss: 2.347
Step 300, Eval Loss: 1.023
Step 400, Train Loss: 1.603
Step 400, Eval Loss: 1.079
Step 500, Train Loss: 1.361
Step 500, Eval Loss: 1.208
Step 600, Train Loss: 2.761
Step 600, Eval Loss: 1.065
Step 700, Train Loss: 0.738
Step 700, Eval Loss: 1.183
Step 800, Train Loss: 1.057
Step 800, Eval Loss: 1.669
Step 900, Train Loss: 2.104
Step 900, Eval Loss: 1.173
Step 1000, Train Loss: 1.093
Step 1000, Eval Loss: 1.194
Step 1100, Train Loss: 0.666
Step 1100, Eval Loss: 1.440
Step 1200, Train Loss: 1.182
Step 1200, Eval Loss: 1.303
Step 1300, Train Loss: 1.037
Step 1300, Eval Loss: 1.566
Step 1400, Train Loss: 1.827
Step 1400, Eval Loss: 1.525
Step 1500, Train Loss: 0.792
Step 1500, Eval Loss: 1.274
Step 1600, Train Loss: 1.173
Step 1600, Eval Loss: 1.125
Step 1700, Train Loss: 1.082
Step 170

In [11]:
# Finally let's use softmax attention
# Create a MultiHeadAttentionBlock
# Same configuration as the linear-attention block except softmax_att=True;
# the weights are NOT overridden here, so training starts from the block's own
# initialisation.
mha_nl_sa = MultiHeadAttentionBlock(
    d_model=feature_size + 1, heads=1, dropout=0.0, softmax_att=True
)  # (batch_size, seq_len, d_model)

# NOTE(review): `lr` is not set in this cell; it is whatever an earlier cell
# last assigned in the kernel.
optimizer = optim.Adam(mha_nl_sa.parameters(), lr=lr)
criterion = torch.nn.MSELoss()

training_steps = 1000

# Training the model
# eval_data=None makes train() sample its own evaluation set with
# get_nonlinear_data (linear_data=False).
train(
    mha_nl_sa,
    optimizer,
    criterion,
    eval_data=None,
    training_steps=training_steps,
    linear_data=False,
)

Training on cuda.
Step 0, Train Loss: 0.778
Step 0, Eval Loss: 0.747
Step 100, Train Loss: 0.854
Step 100, Eval Loss: 0.677
Step 200, Train Loss: 0.770
Step 200, Eval Loss: 0.567
Step 300, Train Loss: 0.759
Step 300, Eval Loss: 0.455
Step 400, Train Loss: 0.794
Step 400, Eval Loss: 0.432
Step 500, Train Loss: 0.550
Step 500, Eval Loss: 0.414
Step 600, Train Loss: 0.581
Step 600, Eval Loss: 0.449
Step 700, Train Loss: 1.445
Step 700, Eval Loss: 0.418
Step 800, Train Loss: 0.475
Step 800, Eval Loss: 0.403
Step 900, Train Loss: 0.499
Step 900, Eval Loss: 0.399
Step 1000, Train Loss: 0.486
Step 1000, Eval Loss: 0.427


In [29]:
from scratch_transformer import (
    LayerNormalization,
    FeedForwardBlock,
    ResidualConnection,
    EncoderBlock,
    MultiHeadAttentionBlock,
)
import torch
import torch.nn as nn
from data import get_nonlinear_data

# Settings for the single encoder-block experiment. These rebind the
# notebook-level globals (feature_size, M, N, lr, ...) that train() reads.
feature_size = 10
output_size = 1
M = 10
N = 1000
lr = 1e-4
dropout = 0.2
mask = None

# get the data
eval_data = get_nonlinear_data(no_tasks=M, feature_size=feature_size, no_examples=N)
e = torch.tensor(eval_data[0]).float()


# MLP dimension usually 4 times the d_model
# Residual connection already contains layer normalizations

# Start with Self Attention
# NOTE: this rebinds `mha`, replacing the attention block created earlier in the notebook.
mha = MultiHeadAttentionBlock(
    d_model=feature_size + 1, heads=1, dropout=dropout, softmax_att=True
)  # (batch_size, seq_len, d_model)


# Feed Forward
ff = FeedForwardBlock(
    d_model=feature_size + 1, d_ff=4 * (feature_size + 1), dropout=dropout
)  # (batch_size, seq_len, d_model)


# Create an EncoderBlock
# Combines the attention and feed-forward blocks built above.
eb = EncoderBlock(
    self_attention_block=mha,
    feed_forward_block=ff,
    dropout=dropout,
)

In [30]:
# Train the encoder block on nonlinear tasks.
training_steps = 1000
# `optim` (torch.optim) is imported in an earlier cell.
optimizer = optim.Adam(eb.parameters(), lr=lr)
criterion = torch.nn.MSELoss()

total_params = sum(p.numel() for p in eb.parameters())
print(f"Total number of parameters: {total_params}")

# Training the model
# model_type="transformer" makes train() call eb(x, mask) instead of the
# three-argument attention call; eval_data=None means train() samples its own
# evaluation set rather than using the eval_data created with the model.
train(
    eb,
    optimizer,
    criterion,
    eval_data=None,
    training_steps=training_steps,
    linear_data=False,
    model_type="transformer",
    mask=mask,
)

Training on cpu.
Step 0, Train Loss: 37.812
Step 0, Eval Loss: 5.308
Step 100, Train Loss: 12.215
Step 100, Eval Loss: 5.267
Step 200, Train Loss: 7.803
Step 200, Eval Loss: 5.223
Step 300, Train Loss: 4.062
Step 300, Eval Loss: 5.186
Step 400, Train Loss: 2.614
Step 400, Eval Loss: 5.146
Step 500, Train Loss: 5.747
Step 500, Eval Loss: 5.104
Step 600, Train Loss: 24.885
Step 600, Eval Loss: 5.065
Step 700, Train Loss: 11.340
Step 700, Eval Loss: 5.025
Step 800, Train Loss: 25.919
Step 800, Eval Loss: 4.989
Step 900, Train Loss: 7.570
Step 900, Eval Loss: 4.948
Step 1000, Train Loss: 15.817
Step 1000, Eval Loss: 4.906


In [53]:
import pandas as pd

# Load the stock table (path is relative to the notebook's working directory)
# and preview the first rows.
stocks = pd.read_csv("data/stocks.csv")
stocks.head()


Unnamed: 0,DATE,LAST,OPEN,LOW,HIGH,3M IMPLIED VOL,SHORT INTEREST RATIO,Ticker,tweet_sentiment,news_sentiment
0,2006-02-06,126.6,126.44,126.17,126.8,,,SPY,,
1,2006-02-07,125.48,126.3,125.4,126.66,,,SPY,,
2,2006-02-08,126.62,125.88,125.6,126.78,,,SPY,,
3,2006-02-09,126.41,126.85,126.37,127.6,,,SPY,,
4,2006-02-10,126.64,126.42,125.45,127.13,,,SPY,,


In [137]:
# Keep only the SPY rows and the price / implied-vol columns, discard rows with
# any missing value, and renumber the index from 0.
spy = (
    stocks.loc[
        stocks["Ticker"] == "SPY",
        ["DATE", "LAST", "OPEN", "LOW", "HIGH", "3M IMPLIED VOL"],
    ]
    .dropna()
    .reset_index(drop=True)
)


In [138]:
# create y
# create y: LAST shifted 1, 5, 10 and 20 rows ahead, plus the next row's OPEN
spy = spy.assign(
    **{f"price_{h}d": spy["LAST"].shift(-h) for h in (1, 5, 10, 20)},
    open_1d=spy["OPEN"].shift(-1),
)


In [139]:
# divide the data into periods of 251 days
# Number of rows per period — presumably about one trading year; confirm.
n = 251
# Integer-divide the row index by n so that rows 0..n-1 get period 0, the next
# n rows period 1, and so on. This relies on the index being 0..len-1, as left
# by the reset_index above.
spy["period"] = spy.index // n


In [140]:
# for each period, drop the last 20 rows to avoid lookahead bias
# For each period, drop the last 20 rows to avoid lookahead bias: the longest
# target (price_20d) looks 20 rows ahead.
# cumcount(ascending=False) numbers the rows of each period from the end
# (last row = 0), so keeping values >= 20 removes exactly the last 20 rows and
# empties any period with 20 rows or fewer. Unlike groupby(...).apply with
# include_groups=True, this raises no pandas deprecation warning, and the
# "period" column is kept because the frame is only filtered.
spy = spy.loc[
    spy.groupby("period").cumcount(ascending=False) >= 20
].reset_index(drop=True)


  spy.groupby("period")


In [141]:
# Now let's fill the last entry of each period with 0.0
# Target columns whose value in the last row of every period becomes the
# quantity to predict.
columns = ["price_1d", "price_5d", "price_10d", "price_20d", "open_1d"]
# Index label of the last row with a non-missing value in each period.
last_entry = spy.groupby("period").apply(
    lambda x: x.last_valid_index(), include_groups=False
)
# Save the true targets BEFORE they are blanked out below.
# NOTE(review): this cell is not idempotent — re-running it after the
# assignment below would read back the zeros instead of the real targets.
spy_targets = np.array(spy.loc[last_entry, columns])
spy.loc[last_entry, columns] = 0.0
# Preview: last row of each period (target columns now show 0.0).
spy.groupby("period").last().head()


Unnamed: 0_level_0,DATE,LAST,OPEN,LOW,HIGH,3M IMPLIED VOL,price_1d,price_5d,price_10d,price_20d,open_1d
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,2015-04-15,210.43,210.05,209.95,211.04,14.0229,0.0,0.0,0.0,0.0,0.0
1,2016-04-13,208.0008,207.0,206.84,208.1,14.9924,0.0,0.0,0.0,0.0,0.0
2,2017-04-11,235.06,234.9,233.34,235.18,11.489,0.0,0.0,0.0,0.0,0.0
3,2018-04-11,263.76,263.47,263.39,265.64,16.1082,0.0,0.0,0.0,0.0,0.0
4,2019-04-10,288.29,287.77,287.31,288.3899,13.0733,0.0,0.0,0.0,0.0,0.0


In [142]:
# We're going to treat each period as a context
# DATE is not a model feature. NOTE(review): re-running this cell raises a
# KeyError because DATE has already been dropped from `spy`.
spy = spy.drop(columns=["DATE"])
# One 2-D array (rows x features) per period, without the "period" column.
groups = [group.drop(columns=["period"]).values for _, group in spy.groupby("period")]
# Stack the periods into one array; later indexed as spy_np.shape[2] for the
# feature count. NOTE(review): this assumes every period has the same number
# of rows — confirm for the final, possibly shorter, period.
spy_np = np.array(groups)


In [162]:
from scratch_transformer import (
    LayerNormalization,
    FeedForwardBlock,
    ResidualConnection,
    EncoderBlock,
    MultiHeadAttentionBlock,
    Encoder,
)
import torch
import torch.nn as nn

# Model settings for the stock data. feature_size is rebound here, which also
# changes the global that train() reads.
feature_size = spy_np.shape[2]
output_size = 1
lr = 1e-4
dropout = 0.2
mask = None
heads = 2
layers = 2

# convert to tensor
e = torch.tensor(spy_np).float()


# MLP dimension usually 4 times the d_model
# Residual connection already contains layer normalizations

# Start with Self Attention
mha = MultiHeadAttentionBlock(
    d_model=feature_size, heads=heads, dropout=dropout, softmax_att=True
)  # (batch_size, seq_len, d_model)

# Feed Forward
ff = FeedForwardBlock(
    d_model=feature_size, d_ff=4 * feature_size, dropout=dropout
)  # (batch_size, seq_len, d_model)


# Create an EncoderBlock
eb = EncoderBlock(
    self_attention_block=mha,
    feed_forward_block=ff,
    dropout=dropout,
)

# Stack of `layers` independent encoder blocks.
# NOTE(review): softmax_att is not passed here, so these attention blocks use
# the class default, unlike `mha` above — confirm this is intended.
encoder_blocks = []
for _ in range(layers):
    encoder_self_attention_block = MultiHeadAttentionBlock(
        d_model=feature_size, heads=heads, dropout=dropout
    )
    encoder_feed_forward_block = FeedForwardBlock(
        d_model=feature_size, d_ff=4 * feature_size, dropout=dropout
    )
    encoder_block = EncoderBlock(
        self_attention_block=encoder_self_attention_block,
        feed_forward_block=encoder_feed_forward_block,
        dropout=dropout,
    )
    encoder_blocks.append(encoder_block)

# Don't worry about Encoder, it's just predefined in scratch_transformer
decoder = Encoder(
    nn.ModuleList(encoder_blocks),
)

# Xavier-initialise every parameter with more than one dimension.
for p in decoder.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

# Evaluate with dropout disabled and without building an autograd graph;
# otherwise the reported loss changes from run to run because dropout=0.2 is
# active in training mode. Training mode is restored afterwards.
decoder.eval()
with torch.no_grad():
    out = decoder(e, mask)
decoder.train()
# Compare the output to the targets
eval_targets = spy_targets
# The last five features of the last row of each period are the target columns.
eval_preds = out[:, -1, -5:] * (-1.0)

loss = compute_loss(eval_preds.detach().numpy(), eval_targets)
print(f"Loss is {loss:.3f}.")


Loss is 295493.369.


In [None]:
# NOTE(review): this cell optimises `eb` (the single encoder block), not the
# stacked `decoder` that was evaluated on the stock data.
training_steps = 1000
optimizer = optim.Adam(eb.parameters(), lr=lr)
criterion = torch.nn.MSELoss()

total_params = sum(p.numel() for p in eb.parameters())
print(f"Total number of parameters: {total_params}")

# Training the model
# NOTE(review): with eval_data=None and linear_data=False, train() draws both
# training and evaluation data from get_nonlinear_data using the globals M, N
# and feature_size; the stock array (spy_np / spy_targets) is never passed in.
# Confirm whether training on the stock data was intended.
train(
    eb,
    optimizer,
    criterion,
    eval_data=None,
    training_steps=training_steps,
    linear_data=False,
    model_type="transformer",
    mask=mask,
)
