In [2]:
import sys
sys.dont_write_bytecode = True
import torch
import torch.nn as nn
import torch.nn.functional as F
import export
from models import *
from transformers import AutoTokenizer
import pandas as pd
import torch.optim as optim

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [4]:
# GPT-2 tokenizer supplies the vocabulary used for both training and decoding.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Size the model's vocabulary from the tokenizer instead of repeating the
# literal 50257, so the two cannot drift apart if the tokenizer is swapped.
# NOTE(review): the remaining positional arguments (128, 64, 2, True) are
# defined by models.LSTMForNLP, which is not visible here — confirm their
# meaning against that class before changing them.
model = LSTMForNLP(tokenizer.vocab_size, 128, 64, 2, True).to(device)

In [5]:
# Display the tokenizer's vocabulary size as the cell output.
tokenizer.vocab_size

50257

In [6]:
# Token-level classification loss over the vocabulary.
criterion = nn.CrossEntropyLoss()
# NOTE(review): lr=0.1 is two orders of magnitude above AdamW's default of
# 1e-3 — confirm this is intentional before training on more than a toy set.
optimizer = optim.AdamW(params=model.parameters(), lr=0.1, weight_decay=0.01)

In [7]:
def prepare(x, sep_token_id = 26, eos_token_id = 50256):
    """Turn one (first text, second text) pair into a next-token training example.

    The two texts are tokenized with the notebook-level ``tokenizer`` and joined
    as ``first + [sep_token_id] + second + [eos_token_id]``.

    Args:
        x: Indexable pair; ``x[0]`` and ``x[1]`` are the strings to encode.
        sep_token_id: Token id inserted between the two texts. The default 26
            appears as ';' in the decoded sample shown later in this notebook.
        eos_token_id: Token id appended at the end. The default 50256 appears
            as '<|endoftext|>' in that same decoded sample.

    Returns:
        dict with ``"input_ids"`` (all tokens but the last) and ``"labels"``
        (all tokens but the first), each an int64 tensor with a leading
        batch dimension of 1.
    """
    token_ids = tokenizer.encode(x[0]) + [ sep_token_id ] + tokenizer.encode(x[1]) + [ eos_token_id ]
    # Build the tensor as int64 directly: torch.Tensor(...) creates float32
    # first, which cannot represent every integer above 2**24 exactly.
    tokens = torch.tensor([ token_ids ], dtype=torch.long)
    # Shift by one position so labels[t] is the token following input_ids[t].
    return { "input_ids": tokens[:, :-1], "labels": tokens[:, 1:] }

# One training example per CSV row; prepare() reads the first two fields of
# each row (presumably the question and its answer — confirm against the file).
dataset = list(map(prepare, pd.read_csv("questions.csv").to_numpy()))

In [8]:
# NOTE(review): this rebinds `dataset` to a single hand-written pair, replacing
# the examples loaded from questions.csv, so the training loop below only ever
# sees this one example. Skip this cell to train on the CSV data.
dataset = [ prepare(["Hello", "bye"]) ]

In [9]:
# Train for a fixed number of epochs, one example per optimizer step, and
# report the mean loss of each epoch.
for epoch in range(500):
    train_loss = 0

    for batch in dataset:
        # Reset the state attribute before each example (presumably the
        # recurrent hidden state cached on the model — confirm in models.py).
        model.last_hx = None

        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids)
        # Flatten (batch, seq, vocab) logits and (batch, seq) labels so the
        # loss is computed per token.
        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
        train_loss += loss.item()

        loss.backward()
        # Clipping rescales the .grad tensors in place, so it must run after
        # backward() has populated them and before the optimizer consumes them.
        nn.utils.clip_grad_norm_(model.parameters(), .1)
        optimizer.step()
        optimizer.zero_grad()

    # Guard the mean against an empty dataset.
    train_loss /= max(len(dataset), 1)

    print("Epoch: {} \t train loss: {}".format(epoch, train_loss))

Epoch: 0 	 train loss: 10.840571403503418
Epoch: 1 	 train loss: 8.43688678741455
Epoch: 2 	 train loss: 2.9861838817596436
Epoch: 3 	 train loss: 0.24526067078113556
Epoch: 4 	 train loss: 0.020272158086299896
Epoch: 5 	 train loss: 0.005461444612592459
Epoch: 6 	 train loss: 0.003999364096671343
Epoch: 7 	 train loss: 0.0033545626793056726
Epoch: 8 	 train loss: 0.002719772281125188
Epoch: 9 	 train loss: 0.002066652989014983
Epoch: 10 	 train loss: 0.0015058689750730991
Epoch: 11 	 train loss: 0.0010789656080305576
Epoch: 12 	 train loss: 0.0007736051338724792
Epoch: 13 	 train loss: 0.0005608846549876034
Epoch: 14 	 train loss: 0.0004135586495976895
Epoch: 15 	 train loss: 0.0003111102560069412
Epoch: 16 	 train loss: 0.0002390084118815139
Epoch: 17 	 train loss: 0.0001875032321549952
Epoch: 18 	 train loss: 0.00015005072054918855
Epoch: 19 	 train loss: 0.0001223657454829663
Epoch: 20 	 train loss: 0.00010155083873542026
Epoch: 21 	 train loss: 8.566072938265279e-05
Epoch: 22 	 tr

In [10]:
@torch.no_grad()
def generate(self, idx, max_new_tokens = 100, temperature = 1.0, top_k = None, eos_token_id = None):
    """Autoregressively sample tokens from a model, one token per step.

    Args:
        self: The model. It is called as ``self(tokens)`` and must return logits
            indexable as ``[batch, position, vocab]``; its ``last_hx``
            attribute is reset before every forward pass.
        idx: Integer tensor of prompt token ids with shape (batch, seq).
        max_new_tokens: Upper bound on the number of tokens appended.
        temperature: Positive divisor applied to the logits before sampling.
        top_k: If given, only the ``top_k`` highest logits stay sampleable.
        eos_token_id: If given, a sequence is finished once it emits this id.
            Finished sequences are padded with ``eos_token_id`` and generation
            stops as soon as every sequence in the batch has finished.

    Returns:
        The prompt with the sampled tokens appended along dimension 1. An
        empty prompt (sequence length 0) is returned unchanged.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be > 0, got {}".format(temperature))

    idx = idx.to(next(self.parameters()).device)
    if idx.size(1) == 0:
        return idx

    # Per-sequence flag, set once that sequence has produced eos_token_id.
    finished = torch.zeros((idx.size(0), 1), dtype=torch.bool, device=idx.device)

    for _ in range(max_new_tokens):
        # The whole sequence is re-fed every step, so start from a clean state.
        self.last_hx = None
        # Defensive copy kept from the original (presumably guards against the
        # forward pass mutating its input — confirm in models.py).
        _idx = idx.clone()
        logits = self(_idx)
        # Only the prediction at the last position matters for the next token.
        logits = logits[:, -1, :] / temperature
        if top_k is not None:
            kth_best = torch.topk(logits, min(top_k, logits.size(-1))).values[:, [-1]]
            logits = logits.masked_fill(logits < kth_best, -float('Inf'))

        next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
        if eos_token_id is not None:
            # Keep already-finished rows pinned to EOS, then update the flags.
            next_token = next_token.masked_fill(finished, eos_token_id)
            finished = finished | (next_token == eos_token_id)
        idx = torch.cat((idx, next_token), dim=1)
        if eos_token_id is not None and bool(finished.all()):
            break

    return idx

def prompt(self, prompt, tokenizer, max_new_tokens = 100):
    """Complete a text prompt with the model and return the decoded string.

    Args:
        self: Model forwarded to ``generate``.
        prompt: Text to continue.
        tokenizer: Object providing ``encode``, ``decode`` and ``eos_token_id``.
        max_new_tokens: Maximum number of tokens to sample.

    Returns:
        The decoded text of the prompt followed by the sampled continuation.
    """
    prompt_ids = tokenizer.encode(prompt, return_tensors="pt")
    # Sample at temperature 1.0 without top-k, stopping at the tokenizer's EOS.
    completed_ids = generate(
        self,
        prompt_ids,
        max_new_tokens=max_new_tokens,
        temperature=1.0,
        top_k=None,
        eos_token_id=tokenizer.eos_token_id,
    )
    # Drop the size-1 dimensions before decoding.
    flat_ids = completed_ids.squeeze()
    return tokenizer.decode(flat_ids)

In [11]:
# Sample a continuation of "Hello" from the trained model and show the decoded text.
prompt(model, "Hello", tokenizer)

'Hello;bye<|endoftext|>'

In [12]:
# Optional: uncomment to also save the trained weights as a PyTorch state dict.
# torch.save(model.state_dict(), "model.pth")

In [13]:
# Inspect the entry names that export.lstm_for_nlp produces for this model.
export.lstm_for_nlp(model).keys()

dict_keys(['embedding.weight', 'lstm.cells.0.ih.weight', 'lstm.cells.0.ih.bias', 'lstm.cells.0.hh.weight', 'lstm.cells.0.hh.bias', 'lstm.cells.1.ih.weight', 'lstm.cells.1.ih.bias', 'lstm.cells.1.hh.weight', 'lstm.cells.1.hh.bias', 'out_proj.weight', 'out_proj.bias'])

In [14]:
# Convert the model with export.lstm_for_nlp and pass the result to
# export.export together with "model.mdl" (presumably the output file path —
# confirm in the project's export module).
export.export(export.lstm_for_nlp(model), "model.mdl")