# Evaluating the Performance of the Trained Transformer

In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from data_gen import get_data, format_data
from models import TransformerModel
import wandb
import yaml
from munch import Munch
import time

# Parse the YAML experiment configuration into `args`; later cells read it
# with attribute access (e.g. `args.data.N`, `args.model.n_layer`).
with open("configs/model_selection.yaml", mode="r") as yaml_file:
    args = Munch.fromYAML(yaml_file)

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Rebuild the transformer from the hyperparameters stored in the config and
# place it on the selected device.
model = TransformerModel(
    n_dims=len(args.data.data_alphas) + 1,
    n_positions=args.data.N,
    n_layer=args.model.n_layer,
    n_head=args.model.n_head,
    n_embd=args.model.n_embd,
).to(device)

# Map the checkpoint onto the same device the model lives on instead of a
# hard-coded CPU, so the cell behaves the same on CPU-only and GPU machines.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints
# (consider weights_only=True if the installed PyTorch supports it).
model.load_state_dict(
    torch.load("models/model16867753992.pth", map_location=device)
)

# Switch to evaluation mode; as the cell's last expression this also displays
# the module summary.
model.eval()

TransformerModel(
  (_read_in): Linear(in_features=3, out_features=64, bias=True)
  (_backbone): GPT2Model(
    (wte): Embedding(50257, 64)
    (wpe): Embedding(21, 64)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-2): 3 x GPT2Block(
        (ln_1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  )
  (_read_out): Linear(in_features=64, out_features=1, bias=True)
)

In [4]:
# Generate the raw data from the settings in the `data` section of the config.
data_dict = get_data(
    alphas=args.data.data_alphas,
    N=args.data.N,
    d_d=args.data.d_d,
    train_samp_per_class=args.data.train_samp_per_class,
)

# format_data is given the configured sample count divided by the number of
# alphas (truncated to an int) and returns the alphas, inputs and targets.
alphas, X, y = format_data(
    data_dict,
    train_samp_per_class=int(
        args.data.train_samp_per_class / len(args.data.data_alphas)
    ),
)

Alphas: torch.Size([10000]), X: torch.Size([10000, 21, 3]), y: torch.Size([10000])


## Comparing to Ridge

In [5]:
def get_manual_select_model(X):
    """Placeholder for the manual model-selection baseline (not implemented yet).

    Args:
        X: Inputs to predict on.
            NOTE(review): presumably the same tensor that is fed to the
            transformer -- confirm when the baseline is written.

    Raises:
        NotImplementedError: Always. Failing loudly here avoids silently
            returning ``None`` and crashing later inside the loss function.
    """
    # TODO: implement the manual model-selection baseline and return its predictions.
    raise NotImplementedError(
        "get_manual_select_model is a stub; implement the manual "
        "model-selection baseline before computing its loss."
    )

In [7]:
loss_fn = nn.MSELoss()

# Evaluation only: disable autograd so no graph is built for the whole dataset,
# feed the inputs on the model's device, and bring the predictions back to the
# device of `y` so the loss below compares tensors on the same device.
with torch.no_grad():
    pred = model(X.to(device)).to(y.device)
print(pred.shape)

# NOTE(review): nn.MSELoss broadcasts mismatched shapes (with a warning) --
# confirm that pred.shape printed above equals y.shape before trusting the loss.
model_selector_loss = loss_fn(pred, y).item()

# TODO: alpha_1_loss and alpha_2_loss currently repeat the transformer's loss;
# replace `pred` with the per-alpha baseline predictions once they exist.
alpha_1_loss = loss_fn(pred, y).item()
alpha_2_loss = loss_fn(pred, y).item()

# NOTE(review): get_manual_select_model is still a stub in this notebook, so
# this line cannot produce a loss until that baseline is implemented.
manual_model_selection_loss = loss_fn(get_manual_select_model(X), y).item()

KeyboardInterrupt: 

## Does it generalize?