In [1]:
import retnet
import torch
import torch.nn as nn
import transformer
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel


[2023-11-15 12:59:13,020] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  from pandas import MultiIndex, Int64Index


In [3]:
# Baseline cell: build the project's Transformer (layers=12, hidden_dim=768,
# FFN = 4x hidden, heads=12) sized against the GPT-2 tokenizer, then report
# how many parameters are trainable.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
vocab_size = len(tokenizer)

layers = heads = 12
hidden_dim = 768
ffn_size = 4 * 768

# The sixth positional argument (0.1) is presumably the dropout rate — TODO
# confirm against transformer.Transformer's signature.
net = (
    transformer.Transformer(layers, hidden_dim, ffn_size, heads, vocab_size, 0.1)
    .to(device)
)
print(
    "Num parameters:",
    sum(param.numel() for param in net.parameters() if param.requires_grad),
)

Num parameters: 162299473


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# RetNet with layers=12, hidden_dim=768, FFN = 4x hidden, heads=12 and
# double_v_dim disabled, built only to count trainable parameters.
# This is not a 1.3B configuration: the recorded run of this cell reports
# 169,358,929 trainable parameters, so the printed label names the actual
# configuration instead of a model size.
# Relies on `tokenizer` having been created by an earlier cell.
layers = 12
hidden_dim = 768
ffn_size = hidden_dim * 4
heads = 12
vocab_size = len(tokenizer)

net = retnet.RetNet(layers, hidden_dim, ffn_size, heads, vocab_size, dropout=0.1, double_v_dim=False).to(device)
print(
    "RetNet (ffn=4x hidden, double_v_dim=False) parameters:",
    sum(p.numel() for p in net.parameters() if p.requires_grad),
)

1.3B model: 169358929


In [3]:

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# RetNet variant with a narrower FFN (2x hidden) and double_v_dim enabled,
# built only to count trainable parameters. The recorded run of this cell
# reports 162,281,041 trainable parameters, so this is not a 1.3B model; the
# printed label names the configuration so it can be told apart from the
# ffn=4x / double_v_dim=False cell.
layers = 12
hidden_dim = 768
ffn_size = hidden_dim * 2
heads = 12

net = retnet.RetNet(layers, hidden_dim, ffn_size, heads, len(tokenizer), double_v_dim=True)
print(
    "RetNet (ffn=2x hidden, double_v_dim=True) parameters:",
    sum(p.numel() for p in net.parameters() if p.requires_grad),
)

1.3B model: 162281041


In [9]:
import tqdm
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from functools import partial
import gc
def evaluate(model, tokenizer, nsamples=40, seqlen=2048):
    """Compute perplexity on the WikiText-2 (raw) test split.

    The test rows are joined with blank lines, tokenized as one long sequence,
    and scored in consecutive, non-overlapping windows of ``seqlen`` tokens.

    Args:
        model: Language model to score. It must expose ``.device`` and
            ``.eval()``, and ``model(input_ids)`` must return an object with a
            ``.logits`` attribute indexed as (batch, position, vocab).
        tokenizer: Callable accepting ``(text, return_tensors='pt')`` and
            returning an object with an ``.input_ids`` tensor.
        nsamples: Maximum number of windows to score. Defaults to 40.
        seqlen: Number of tokens per window. Defaults to 2048.

    Returns:
        A 0-dim tensor holding ``exp`` of the mean next-token cross-entropy
        over the scored windows.

    Raises:
        ValueError: If ``nsamples < 1``, ``seqlen < 2``, or the tokenized text
            is shorter than one full window.
    """
    if nsamples < 1 or seqlen < 2:
        raise ValueError("nsamples must be >= 1 and seqlen must be >= 2")

    dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
    encoded = tokenizer("\n\n".join(dataset['text']), return_tensors='pt')
    input_ids = encoded.input_ids.to(model.device)

    # Only full windows are scored: a short or empty trailing slice would be
    # weighted as if it held `seqlen` tokens (or produce a NaN loss).
    full_windows = input_ids.size(1) // seqlen
    if full_windows == 0:
        raise ValueError(
            f"tokenized text has {input_ids.size(1)} tokens, fewer than one "
            f"window of seqlen={seqlen}"
        )
    if full_windows < nsamples:
        import warnings

        warnings.warn(
            f"only {full_windows} full windows of {seqlen} tokens are available; "
            f"scoring {full_windows} instead of the requested {nsamples}"
        )
        nsamples = full_windows

    model = model.eval()
    loss_fct = nn.CrossEntropyLoss()

    nlls = []
    with torch.no_grad():
        for i in tqdm.tqdm(range(nsamples), desc="evaluating..."):
            batch = input_ids[:, i * seqlen:(i + 1) * seqlen]
            lm_logits = model(batch).logits
            # Position t predicts token t + 1, so drop the last logit and the
            # first label to align them.
            shift_logits = lm_logits[:, :-1, :].contiguous().float()
            shift_labels = batch[:, 1:]
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.reshape(-1),
            )
            # `loss` is already a per-token mean for this window. Scaling by
            # seqlen here and dividing by nsamples * seqlen below reduces to
            # the mean of the per-window losses, since every window is full.
            nlls.append(loss.float() * seqlen)

    return torch.exp(torch.stack(nlls).sum() / (nsamples * seqlen))

In [10]:
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel

# Load the GPT-2 tokenizer and the raw WikiText-2 test split, then tokenize
# the whole split as one sequence (rows joined by a blank line).
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

testenc = load_dataset(
    "wikitext",
    "wikitext-2-raw-v1",
    split="test",
)
testenc2 = tokenizer(
    "\n\n".join(testenc["text"]),
    return_tensors="pt",
)

[2023-11-11 16:18:47,638] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  from pandas import MultiIndex, Int64Index


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.48k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/6.84k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.72M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (287644 > 1024). Running this sequence through the model will result in indexing errors


In [13]:
# Shape of the tokenized split held in `testenc2`; the recorded run shows
# (1, 287644), i.e. one row containing the whole joined text.
testenc2.input_ids.shape

torch.Size([1, 287644])

In [14]:
# Tokenizer length, the value used as vocab_size when the models are built;
# the recorded run shows 50257.
len(tokenizer)

50257

In [12]:
# Peek at the first few raw rows only; displaying the whole text column
# floods the cell output with thousands of rows.
testenc[:5]['text']

['',
 ' = Robert Boulter = \n',
 '',
 ' Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy \'s Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . \n',
 ' In 2006 , Boulter starred alongside Whishaw in the play Citizenship written by Mark Ravenhill 

In [15]:
# Probe: encoding of the "\n\n" separator used when joining dataset rows
# (recorded: a single token, id 628).
tokenizer("\n\n", return_tensors='pt')

{'input_ids': tensor([[628]]), 'attention_mask': tensor([[1]])}

In [16]:
# Probe: encoding of a single newline (recorded: a single token, id 198).
tokenizer("\n", return_tensors='pt')

{'input_ids': tensor([[198]]), 'attention_mask': tensor([[1]])}

In [17]:
# Probe: encoding of the empty string (recorded: no tokens, shape (1, 0)).
# Relevant because the dataset's text column contains empty rows.
tokenizer("", return_tensors='pt')

{'input_ids': tensor([], size=(1, 0)), 'attention_mask': tensor([], size=(1, 0))}

In [18]:
# Probe: four consecutive newlines (recorded: ids [628, 628], i.e. two
# "\n\n" tokens).
tokenizer("\n\n\n\n", return_tensors='pt')

{'input_ids': tensor([[628, 628]]), 'attention_mask': tensor([[1, 1]])}

In [19]:
# Probe: three consecutive newlines (recorded: ids [628, 198], i.e. one
# "\n\n" token followed by one "\n" token).
tokenizer("\n\n\n", return_tensors='pt')

{'input_ids': tensor([[628, 198]]), 'attention_mask': tensor([[1, 1]])}

In [20]:
# Probe: a newline that follows ordinary text (recorded: ids [31373, 0, 198];
# the trailing 198 matches the id recorded for a lone "\n").
tokenizer("hello!\n", return_tensors='pt')

{'input_ids': tensor([[31373,     0,   198]]), 'attention_mask': tensor([[1, 1, 1]])}

In [33]:
# Rebinds `testenc` to the WikiText-2 validation split, replacing the test
# split loaded earlier under the same name.
# NOTE(review): this cell's execution count (33) is later than that of the
# tokenization cell that follows it (26), so recorded outputs further down may
# not correspond to this split — re-run top to bottom to confirm.
testenc = load_dataset('wikitext', 'wikitext-2-raw-v1', split='validation')

In [26]:
# Tokenize whichever split `testenc` currently holds as one sequence (rows
# joined by a blank line); this overwrites the earlier `testenc2`.
testenc2 = tokenizer("\n\n".join(testenc['text']), return_tensors='pt')

In [27]:
# NOTE(review): the recorded shape (1, 2428601) comes from execution 27, which
# predates the validation-split load (execution 33). It is roughly 8x the
# 287644 tokens recorded for the test split, so it presumably reflects a
# different, larger split — re-run top to bottom to confirm.
testenc2.input_ids.shape

torch.Size([1, 2428601])