In [1]:
# imports
import torch
import torch.nn as nn
from torch.nn import functional as F

In [2]:
with open('resources/input.txt') as inp:
    text = inp.read()    

In [3]:
print(f"Corpus size : {len(text)}")
print(text[:1000])

Corpus size : 1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in

In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(' '.join(chars))
print(vocab_size)


   ! $ & ' , - . 3 : ; ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z
65


In [5]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [6]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [7]:
# split train/val 
train_percent = 0.9
train_n = int(0.9*len(data))
train_data = data[:train_n]
val_data = data[train_n:]

In [8]:
# block_size => max length or context size on which the transformer is trained
# in a block_size of 8, we have for example 9 training examples are packed.
# For example :
# example 1 => input : token1, output: token2
# example 2 => input : token1 & token2, output: token3
# example 2 => input : token1 & token2 & token3, output: token3
#....
# we train on all the 8 examples, not just for efficiency reasons, 
# but because we want our transformer to be aware of contexts as little as 1 and as big as block size
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [9]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [26]:
# the batch dimension
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences that we processing in parallel
block_size = 8 # what is the max context lenght for the predictions

def get_batch(split):
    """
    split: 'train' or 'val'
    """
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    # Question: How would just picking up random integer makes sure that that we cover all examples in one epoch?
    x = torch.stack([data[i: i+block_size] for i in ix])
    y = torch.stack([data[i+1: i+1+block_size] for i in ix])
    return x, y

xb, yb = get_batch('train')
print(f"{xb=}")
print(xb)
print(yb.shape)
print(f"{yb=}")

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

xb=tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
torch.Size([4, 8])
yb=tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44

In [10]:
torch.randint(12, (10,))

tensor([ 2,  2,  5,  1, 10,  9,  8,  5,  4,  5])

In [30]:
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size) -> None:
        super().__init__()
        # This is similar to the bigram model that we build in the makemore, can be interpreted as probability distribution of all the tokens given the previous token
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # idx and targets are both (B, T) tensors of the integers (input examples that we created above)
        logits = self.token_embedding_table(idx) # (B, T, C)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        # loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, idx, max_tokens):
        for _ in range(max_tokens):
            # idx is a (B, T) tensor of integers
            logits, loss = self(idx)
            
            # pluck out the last time dimension of the logits tensor as we only need that to for the next token prediction
            logits = logits[:, -1, :] # (B, C)

            # we want to sample from the probability distribution of the last token
            probs = F.softmax(logits, dim=-1)

            # sample from the distribution
            next_token = torch.multinomial(probs, num_samples=1)

            # append the next token to the input
            idx = torch.cat([idx, next_token], dim=-1)
        return idx



In [31]:
model = BigramLanguageModel(vocab_size)
logits, loss = model(xb, yb)
print(f"{logits=}, \n{logits.shape=}")
print(f"{loss.item()=}, \n{loss.shape=}")

logits=tensor([[-1.5101, -0.0948,  1.0927,  ..., -0.6126, -0.6597,  0.7624],
        [ 0.3323, -0.0872, -0.7470,  ..., -0.6716, -0.9572, -0.9594],
        [ 0.2475, -0.6349, -1.2909,  ...,  1.3064, -0.2256, -1.8305],
        ...,
        [-2.1910, -0.7574,  1.9656,  ..., -0.3580,  0.8585, -0.6161],
        [ 0.5978, -0.0514, -0.0646,  ..., -1.4649, -2.0555,  1.8275],
        [-0.6787,  0.8662, -1.6433,  ...,  2.3671, -0.7775, -0.2586]],
       grad_fn=<ViewBackward0>), 
logits.shape=torch.Size([32, 65])
loss.item()=4.878634929656982, 
loss.shape=torch.Size([])


In [41]:
# a reasonable think to kick of the generation is to start with the 0 token, because that is a newline or space in our vocabulary
# also, we only want to feed to 1X1 tensor to the model because we are only interested in the last token and we don't want to do extra computation of providing multiple examples from the same sentence as we did in the training

idx = torch.zeros(1, 1, dtype=torch.long)
print(decode(model.generate(idx, max_tokens=300)[0].tolist()))


AS:
O:
or endeny lly mis ong. areichy, Sar relllathan.
DIVIABave e y s,
LARIO:
PUpror o fof; se, hincoftre hausphe ulviteg brin'spooud prorkeapiscu l--Sombll.


Toure ave gorteanknd f cawe thoweere I:
AI:
I thitoantic wes. araduartha thorsthe cut
Wise IOWeve; g kes wowis,

Thnd hepite
RIUS:
nd byve 


In [33]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [38]:
# training loop to train our bigram model
batch_size = 32
for step in range(10000):
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"{loss.item()=} for {step=}")

loss.item()=2.7563703060150146 for step=0
loss.item()=2.6439034938812256 for step=1
loss.item()=2.843132734298706 for step=2
loss.item()=2.7639353275299072 for step=3
loss.item()=2.8054544925689697 for step=4
loss.item()=2.7736196517944336 for step=5
loss.item()=2.6826980113983154 for step=6
loss.item()=2.841294288635254 for step=7
loss.item()=2.8600258827209473 for step=8
loss.item()=2.795305013656616 for step=9
loss.item()=2.6759917736053467 for step=10
loss.item()=2.7781710624694824 for step=11
loss.item()=2.6532485485076904 for step=12
loss.item()=2.8755288124084473 for step=13
loss.item()=2.779254198074341 for step=14
loss.item()=2.7350516319274902 for step=15
loss.item()=2.762739658355713 for step=16
loss.item()=2.7352328300476074 for step=17
loss.item()=2.7938601970672607 for step=18
loss.item()=2.685953140258789 for step=19
loss.item()=2.7322537899017334 for step=20
loss.item()=2.726649761199951 for step=21
loss.item()=2.806256055831909 for step=22
loss.item()=2.747533798217773

# Transformer Model

### Mathematical trick about how self-attention can be implemented

In [43]:
torch.manual_seed(1337)
B, T, C = 4, 8, 10
x = torch.randn(B, T, C)
print(f"{x.shape=}")

x.shape=torch.Size([4, 8, 10])


In [44]:
# version1 : we want to use the probdist of the preceding tokens to incorporate the context for self-attenion
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] # returns a tensor of shape (t, C)
        xbow[b, t] = torch.mean(xprev, dim=0)

In [53]:
# we can use torch.tril which returns a lower triangular matrix of the input matrix and use that to compute the mean
a = torch.ones(3, 3)
a = torch.tril(a)
print(f"{a=}")
a = a / torch.sum(a, dim=1, keepdim=True)
print(f"{a=}")

b = torch.arange(2, 8).view(3, 2).float()
print(f"{b=}")
c = a @ b # matrix multiplication of (3, 3) with (3, 2) => (3, 2), it dot products every row with every column of second matrix and save the sum in a_row, b_col position
print(f"{c=}")

a=tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
a=tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b=tensor([[2., 3.],
        [4., 5.],
        [6., 7.]])
c=tensor([[2., 3.],
        [3., 4.],
        [4., 5.]])


In [58]:
# version2 : Now we can try to vectorize the self-attention type computation using the same approach
# our weight matrix which will give us the mean is (T, T) matrix
wgt = torch.tril(torch.ones(T, T))
wgt = wgt / torch.sum(wgt, dim=1, keepdim=True)
print(f"{wgt=}")

xbow2 = wgt @ x # (T, T) @ (B, T, C) ----> uses broadcasting to convert this to a (B, T, T) @ (B, T, C) => (B, T, C), so for each batch dim B, it will do (T, T) @ (T, C) => (T,C)
print(f"{xbow2.shape=}")
torch.allclose(xbow, xbow2)

wgt=tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])
xbow2.shape=torch.Size([4, 8, 10])


True

In [60]:
# version3 : we use the following version because we want these affinities and probabilties to be learned my our model. 
#So, instead of settting them all to one, our wgt vector will be a learnable parameter in the self-attention layer, and this vector will define which past tokens are important for the current token
tril = torch.tril(torch.ones(T, T))
wgt = torch.zeros((T, T))
wgt = wgt.masked_fill(tril == 0, float('-inf'))
wgt = F.softmax(wgt, dim=1)
xbow3 = wgt @ x
torch.allclose(xbow, xbow3)

True

In [65]:
# version 4: the actuall self-attention implementation using key and query vectors.
# every token emits a key and query vector and we compute the dot product of the query vector of the current token with the key vector of all the previous tokens 
#and use that to compute the attention weights
tril = torch.tril(torch.ones(T, T))

head_size = 16 # tunable hyperparameter
keys = nn.Linear(C, head_size, bias=False)
queries = nn.Linear(C, head_size, bias=False)
values = nn.Linear(C, head_size, bias=False)

# we want to compute the dot product of the query vector of the current token with the key vector of all the previous tokens
k = keys(x) # (B, T, C) @ (C, head_size) => (B, T, head_size)
q = queries(x) # (B, T, C) @ (C, head_size) => (B, T, head_size)
v = values(x) # (B, T, C) @ (C, head_size) => (B, T, head_size)

wei = q @ k.transpose(-1, -2) # (B, T, head_size) @ (B, head_size, T) => (B, T, T) , where H is the head_size

# we want to mask the upper triangular matrix
wei = wei.masked_fill(tril == 0, float('-inf')) # this is very crucial in case of decoder block, because we don't want the decoder to look into the future tokens
# however, for encoder we can skip this step because we want the encoder to look into the future tokens to get the context

wei = F.softmax(wei, dim=-1) # (B, T, T)

# we also need to decay the wei matrix by the square root of the head_size
# this is done in order to make sure that the dot product of the query and key vectors are not too large
# if the product is too large the result of softmax is very peaky, which means we will have very high affinity between with one token and very low affinity between the rest of the tokens
# this happens because the order of variance of wei is of the order of head_size, so we divide the wei by the square root of head_size to make sure that the variance of wei is of the order of a gaussian, i.e. 0 mean and 1 variance
wei = wei * (head_size ** -0.5)

out = wei @ v # (B, T, T) @ (B, T, H) => (B, T, H)

In [15]:
# single head of self-attention
class Head(nn.Module):
    """
    head_layer_effective shape => n_embd, head_size
    Operation: (B, T, n_embd) @ (n_embd, head_size) => (B, T, head_size)
    """

    def __init__(self, block_size, n_embd, head_size) -> None:
        """
        block_size = block size used for input
        n_embd = channels in the input that we get
        """
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        """
        Input Tensor shape: B, T, n_embd
        output tensor shape: B, T, head_size
        """
        B, T, n_embd = x.shape
        k = self.key(x) # (B, T, n_embd) @ (n_embd, head_size) => (B, T, head_size)
        q = self.query(x)
        v = self.value(x)

        wei = q @ k.transpose(-1, -2) * (n_embd**-.5) # (B, T, head_size) @ (B, head_size, T) => (B, T, T)
        wei = wei.masked_fill(self.tril == 0, float('-inf'))

        wei = F.softmax(wei, dim=-1)

        embd = wei @ v # (B, T, T) @ (B, T, head_size) => (B, T, head_size)
        return embd

In [79]:
class MultiHeadAttention(nn.Module):
    def __init__(self, block_size, n_embd, num_heads, head_size) -> None:
        super().__init__()
        self.heads = nn.ModuleList([Head(block_size, n_embd, head_size) for _ in range(num_heads)])
        # Each head is effectively of size (n_embd, head_size)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1) # each head does: B, T, n_embd) @ (n_embd, head_size) => (B, T, head_size)
        # we can concatenate heads by using torch.cat([(B, T, head_size)], dim=-1) on the last dimension
        return out

In [90]:
torch.manual_seed(1337)

class TransformerModel(nn.Module):

    def __init__(self, vocab_size, block_size, n_embd) -> None:
        super().__init__()
        # This is similar to the bigram model that we build in the makemore, can be interpreted as probability distribution of all the tokens given the previous token
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.pos_embedding_table = nn.Embedding(block_size, n_embd)
        # self.sa_head = Head(block_size, n_embd, n_embd)
        # define head size and in case of multiple heads and set the single head size based on num of heads used
        head_size = n_embd
        n_heads = 4
        single_head_size = head_size // n_heads
        # print(head_size, n_heads, single_head_size)
        self.sa_multihead = MultiHeadAttention(block_size, n_embd, n_heads, single_head_size)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.vocal_size = vocab_size
        self.block_size = block_size
        self.n_embd= n_embd

    def forward(self, idx, targets=None):
        # idx and targets are both (B, T) tensors of the integers (input examples that we created above)
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx) # (B, T, C)
        pos_emb = self.pos_embedding_table(torch.arange(T, device=idx.device)) # (T, C)
        x = tok_emb + pos_emb # (B, T, C) + (T, C) ---> (B, T, C) + (1, T, C) ---> (B, T, C) + (B, T, C) => (B, T, C)
        # embd = self.sa_head(x) # (B, T, C) => (B, T, head_size)
        # print(f"{x.shape=}")
        embd = self.sa_multihead(x) # (B, T, C) => (B, T, heads*head_size)
        # print(embd.shape)
        logits = self.lm_head(embd) # (B, T, vocab_size)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        # loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, idx, max_tokens):
        for _ in range(max_tokens):
            # idx is a (B, T) tensor of integers
            idx_cond = idx[:, -self.block_size:]
            logits, loss = self(idx_cond)
            
            # pluck out the last time dimension of the logits tensor as we only need that to for the next token prediction
            logits = logits[:, -1, :] # (B, C)

            # we want to sample from the probability distribution of the last token
            probs = F.softmax(logits, dim=-1)

            # sample from the distribution
            next_token = torch.multinomial(probs, num_samples=1)

            # append the next token to the input
            idx = torch.cat([idx, next_token], dim=-1)
        return idx

In [85]:
# global variables
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 32
block_size = 8
model = TransformerModel(vocab_size, block_size, n_embd)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [87]:
# training loop to train our transformer model
batch_size = 32
max_iters = 5000
train_losses = []
val_losses = []
for step in range(5000):
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"{loss.item()=} for {step=}")
    train_losses.append(loss.item())
for _ in range(eval_iters):
    with torch.no_grad():
        xb, yb = get_batch('val')
        _, val_loss = model(xb, yb)
        val_losses.append(val_loss.item())
        print(f"{loss.item()=} for {step=}")

print(f'Final train loss={torch.tensor(train_losses[:100]).mean().item()}')
print(f'Final val loss={torch.tensor(val_losses[:100]).mean().item()}')

x.shape=torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
loss.item()=2.1637213230133057 for step=0
x.shape=torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
loss.item()=2.267672300338745 for step=1
x.shape=torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
loss.item()=2.3851919174194336 for step=2
x.shape=torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
loss.item()=2.316465139389038 for step=3
x.shape=torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
loss.item()=2.1163268089294434 for step=4
x.shape=torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
loss.item()=2.348674774169922 for step=5
x.shape=torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
loss.item()=2.227454423904419 for step=6
x.shape=torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
loss.item()=2.30926251411438 for step=7
x.shape=torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
loss.item()=2.114501714706421 for step=8
x.shape=torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
loss.item()=2.2056920528411865 for step=9
x.shape=torch.Size([32, 8, 

In [93]:
print(decode(model.generate(torch.zeros(1, 32, dtype=torch.long), max_tokens=5000)[0].tolist()))

x.shape=torch.Size([1, 8, 32])
torch.Size([1, 8, 32])
x.shape=torch.Size([1, 8, 32])
torch.Size([1, 8, 32])
x.shape=torch.Size([1, 8, 32])
torch.Size([1, 8, 32])
x.shape=torch.Size([1, 8, 32])
torch.Size([1, 8, 32])
x.shape=torch.Size([1, 8, 32])
torch.Size([1, 8, 32])
x.shape=torch.Size([1, 8, 32])
torch.Size([1, 8, 32])
x.shape=torch.Size([1, 8, 32])
torch.Size([1, 8, 32])
x.shape=torch.Size([1, 8, 32])
torch.Size([1, 8, 32])
x.shape=torch.Size([1, 8, 32])
torch.Size([1, 8, 32])
x.shape=torch.Size([1, 8, 32])
torch.Size([1, 8, 32])
x.shape=torch.Size([1, 8, 32])
torch.Size([1, 8, 32])
x.shape=torch.Size([1, 8, 32])
torch.Size([1, 8, 32])
x.shape=torch.Size([1, 8, 32])
torch.Size([1, 8, 32])
x.shape=torch.Size([1, 8, 32])
torch.Size([1, 8, 32])
x.shape=torch.Size([1, 8, 32])
torch.Size([1, 8, 32])
x.shape=torch.Size([1, 8, 32])
torch.Size([1, 8, 32])
x.shape=torch.Size([1, 8, 32])
torch.Size([1, 8, 32])
x.shape=torch.Size([1, 8, 32])
torch.Size([1, 8, 32])
x.shape=torch.Size([1, 8, 32