In [1]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import numpy as np

from tqdm import tqdm

from callformer.transformer import ModelDimensions, CallFormer
from callformer.tokenizer import Tokenizer

import pickle
from copy import copy
from datetime import date

DATA_PATH = "full_samples.pkl"
MODEL_PATH = "drive/MyDrive/search_ai/callformer_model.chkpt"


with open(DATA_PATH, "rb") as f:
    full_samples = pickle.load(f)

tokenizer = Tokenizer()

token_samples = []

tokens = [{"call": "<|searchnotes|>",
           "args": []},
           {"call": "<|summarize|>"}]

for sample in full_samples:
    search_start_date = ""
    if sample[2][0] != -1:
        search_start_date = f'"{date(year=sample[2][0], month=sample[2][1], day=sample[2][2]).strftime("%Y-%m-%d")}"'
    call_string = (
        f'{tokens[0]["call"]}'
        f'({search_start_date})'
        f'{tokens[1]["call"]}'
        )
    toks = tokenizer.encode(call_string)
    token_samples.append((
                         sample[0], 
                         sample[1],
                         sample[2], 
                         torch.from_numpy(np.array(sample[3])).unsqueeze(0).float(),
                         toks))
    


class CallFormerDataset(Dataset):
    def __init__(self, samples, model_dims: ModelDimensions):
        self.samples = samples
        self.n_ctx = model_dims.n_ctx
        
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        seq = self.samples[idx][-1]
        full_seq = torch.hstack( (
                        torch.Tensor([tokenizer.sot]), 
                        seq,
                        torch.Tensor([tokenizer.eot]),))
        input = F.pad(full_seq[:-2], (0, self.n_ctx - full_seq.shape[-1]+2), value=tokenizer.pad)
        target = F.pad(full_seq[2:], (0, self.n_ctx - full_seq.shape[-1]+2), value=tokenizer.pad)
        
        embedding = self.samples[idx][-2]

        assert embedding.ndim in (2, 3)
        if embedding.ndim == 2:
            embedding = embedding.unsqueeze(0)

        return embedding, input.to(torch.long), target.to(torch.long)



In [2]:
STATE_SIZE = token_samples[0][-2].shape[-1]

model_dims = ModelDimensions(
                n_vocab=tokenizer.vocab_size,
                n_ctx=100,
                n_state=STATE_SIZE,
                n_head=8,
                n_layer=2)

model = CallFormer(model_dims)

In [6]:
ds = CallFormerDataset(token_samples, model_dims)
dloader = DataLoader(ds, batch_size=2, shuffle=True)

loss_fn = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

epochs = 10

min_loss = np.inf

for n in range(epochs):
    for embedding, input, target in tqdm(dloader):
        logits = model.decoder(input, embedding)
        result = logits[:, 1:, :]
        
        # check that `result` doesn't contain and nan values
        if torch.isnan(result).any():
            print (result)
            assert False

        optimizer.zero_grad()
        loss = loss_fn(result.mT, target[...,:-1])
        loss.backward()
        optimizer.step()

        if loss.item() < min_loss:
            min_loss = loss.item()
            print(min_loss)
            #save_model(model, optimizer, "callformer.pth")

  0%|          | 1/3250 [00:02<2:17:37,  2.54s/it]

87.88435363769531


  0%|          | 4/3250 [00:14<3:00:59,  3.35s/it]

43.215335845947266


  0%|          | 5/3250 [00:16<2:44:56,  3.05s/it]

40.26648712158203


  0%|          | 8/3250 [00:25<2:38:25,  2.93s/it]

32.140499114990234


  1%|          | 21/3250 [01:31<5:44:50,  6.41s/it]

20.347829818725586


  1%|          | 27/3250 [02:49<8:32:48,  9.55s/it] 

20.111425399780273


  1%|          | 32/3250 [03:20<6:04:56,  6.80s/it]

16.273664474487305


  1%|          | 38/3250 [06:12<8:44:38,  9.80s/it] 


KeyboardInterrupt: 