In [3]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-07-31 15:17:45--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2023-07-31 15:17:45 (13.2 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [2]:
import os
import pathlib
import torch

In [3]:
# load the whole corpus into memory as one string so we can inspect it
with open('./dataset/input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [4]:
# peek at the first 1000 characters to see what the raw corpus looks like
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [5]:
# the vocabulary: every distinct character that occurs in the text, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [6]:
# lookup tables between characters and their integer ids (the position in `chars`)
itos = dict(enumerate(chars))                # integer id -> character
stoi = {ch: i for i, ch in itos.items()}     # character  -> integer id


def encode(s):
    """Encoder: take a string, output the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer ids, output the string they spell."""
    return ''.join(itos[i] for i in l)


# round trip: encoding and then decoding gives back the original string
print(encode('hii there'))
print(decode(encode('hii there')))


[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [7]:
# let's now encode the entire text dataset and store it in a single 1-D torch.Tensor of token ids
data = torch.tensor(encode(text), dtype=torch.long)

print(data.shape, data.dtype)
print(data[:1000]) # the 1000 characters we looked at earlier will look like this to the GPT


torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [8]:
# Hold out the tail of the corpus for validation: the first 90% is train, the rest is val
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

In [9]:
# We never feed the whole dataset to the model at once - that would be computationally prohibitive.
# Instead we train on arbitrary chunks of it. The maximum length of such a chunk is called the block size.
# Let's look at the first chunk of the train data.
block_size = 8
train_data[:block_size+1]

# We take block_size + 1 tokens because there are 8 individual examples packed in there. What does that mean?
# In the context of 18, 47 comes next.
# In the context of 18 and 47, 56 comes next.
# In the context of 18, 47 and 56, 57 comes next, and so on.


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [10]:
# inputs are the first block_size tokens; targets are the same window shifted one step to the right
x = train_data[:block_size]
y = train_data[1:block_size+1]

# spell out every (context, target) example that is packed into this single chunk
for t, target in enumerate(y):
    context = x[:t+1]
    print(f"when input is {context}, target is {target}")

when input is tensor([18]), target is 47
when input is tensor([18, 47]), target is 56
when input is tensor([18, 47, 56]), target is 57
when input is tensor([18, 47, 56, 57]), target is 58
when input is tensor([18, 47, 56, 57, 58]), target is 1
when input is tensor([18, 47, 56, 57, 58,  1]), target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), target is 58


In [11]:
# We train on all 8 of the above examples, with context lengths from 1 all the way up to block size, because we want the transformer to get used to seeing contexts of every length, from as little as one token up to block size.

# This is also useful at inference time: the transformer can start predicting from as little as one token of context, and once the context grows beyond block size we have to start truncating it, because the model can never receive more than block size tokens of context. (not super clear?)

# So far we have looked at the time dimension of the tensors that are going to be fed to the model.
# There is one more dimension to care about, and that is the batch dimension.
# When we process these chunks, we stack multiple chunks into a mini-batch held in a single tensor. This is done purely for computational efficiency: GPUs are good at parallel processing, hence mini-batches. The chunks in a batch are processed independently and they don't talk to each other.


In [12]:
# This is how one batch is created and what it looks like
torch.manual_seed(1337)

batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    """Sample a random mini-batch of input windows and their next-token targets.

    Reads the notebook-level `train_data` / `val_data` tensors and the
    `batch_size` / `block_size` settings at call time.

    Args:
        split: 'train' samples from `train_data`; any other value samples from `val_data`.

    Returns:
        (x, y): two stacked tensors with `batch_size` rows of `block_size` tokens each.
        `y` is `x` shifted one position to the right, so y[b, t] is the token that
        follows the context x[b, :t+1].
    """
    source = val_data if split != 'train' else train_data

    # One random window start per sequence. randint's upper bound is exclusive, so
    # start + block_size + 1 never runs past the end of `source`.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    x = torch.stack([source[s:s + block_size] for s in starts])
    y = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return x, y

xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')

print('----')


# So this 4 x 8 array contains a total of 32 training examples!
# They are completely independent as far as the transformer is concerned.
# Spelled out one by one, these examples are:

for b, (x_row, y_row) in enumerate(zip(xb, yb)): # batch dimension
    for t in range(block_size): # time dimension
        context = x_row[:t+1]
        target = y_row[t]
        print(f"when input is {context.tolist()} the target: {target}")


inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1, 58] the target: 46
when input is [44, 53

In [20]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# seed torch's global RNG so the random weight init and the sampling below are repeatable
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model over token ids (characters in this notebook).

    The only parameter is a (vocab_size, vocab_size) lookup table. Row i holds
    the logits - raw, unnormalised scores - for which token follows token i.
    Every prediction therefore depends on the current token alone; no earlier
    context is taken into account.

    Shape legend used in the comments:
        B = batch size  = number of independent sequences processed in parallel
        T = time        = context length (at most block size during training)
        C = channels    = embedding size, which here equals vocab_size
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: number of distinct tokens. It is used both as the number
                of rows (one per current token) and as the row width (one logit
                per candidate next token).
        """
        super().__init__()
        # nn.Embedding is a thin wrapper around a learnable weight tensor of shape
        # (num_embeddings, embedding_dim); indexing it with integer ids returns the
        # matching rows. Compared with a bare tensor of the same shape it is NOT more
        # memory-efficient (the full dense table is stored either way). What it adds is
        # convenience: the weight is registered as a module parameter, so
        # .parameters(), optimizers and state_dict pick it up automatically, and the
        # lookup accepts index tensors of any shape.
        # Example: nn.Embedding(num_embeddings=10, embedding_dim=5) holds a (10, 5) table.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets = None):
        """Score every candidate next token for each position of `idx`.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor with the true next token for
                every position of `idx`.

        Returns:
            A `(logits, loss)` pair.
            Without targets: logits has shape (B, T, C) and loss is None.
            With targets: logits is returned flattened to (B*T, C) - the layout
            F.cross_entropy expects - and loss is the cross-entropy averaged
            over all B*T positions.
        """
        # Plucks out one table row per token id, e.g. id 24 selects row [24, :].
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            return logits, None

        # F.cross_entropy wants the class dimension second, so fold batch and time
        # into a single leading dimension.
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)

        # cross_entropy applies log-softmax to the logits internally and returns
        # the negative log-likelihood of the true next tokens.
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend every sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: how many tokens to append to each sequence.

        Returns:
            (B, T + max_new_tokens) integer tensor: the input context followed
            by the newly sampled tokens.
        """
        for _ in range(max_new_tokens):
            # Call the module itself rather than .forward(): nn.Module.__call__ is
            # what runs registered hooks, .forward() silently skips them.
            logits, _ = self(idx)

            # focus only on the last time step
            logits = logits[:, -1, :]  # (B, C)

            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)

            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

            # append the sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx

    

# build the model and push one batch through it as a smoke test
m = BigramLanguageModel(vocab_size)
# call the module itself, not .forward(): nn.Module.__call__ is what runs registered hooks
logits, loss = m(xb, yb)
print(logits.shape)  # flattened to (B*T, vocab_size) because targets were supplied
print(loss)


# sample 100 tokens from the still untrained model, starting from a single token with id 0
idx = torch.zeros((1, 1), dtype = torch.long)
print(decode(m.generate(idx = idx, max_new_tokens=100)[0].tolist()))
    

> [0;32m/var/folders/lr/1stpmkw94f1497nf96vdnf_m0000gn/T/ipykernel_1532/3520260071.py[0m(41)[0;36mforward[0;34m()[0m
[0;32m     39 [0;31m        [0;31m# T = block size = context lenght = time dimension[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     40 [0;31m        [0;31m# idx and targets are both (B,T) tensor of integers[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 41 [0;31m        [0mlogits[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mtoken_embedding_table[0m[0;34m([0m[0midx[0m[0;34m)[0m [0;31m# (B,T,C) # it plucks out that particular row for index 24 for eg i.e similar to  [24, :][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     42 [0;31m[0;34m[0m[0m
[0m[0;32m     43 [0;31m        [0;31m# logits are the name given to the representation of the set of predicted possible characters for the next character in the 65x65 table.[0m[0;34m[0m[0;34m[0m[0m
[0m
torch.Size([32, 8])
tensor([[ 1, 57, 59, 52,  6,  0, 31, 53],
        [58, 46, 43,  1, 41, 47, 58, 63],


In [15]:
# create a PyTorch optimizer over all of the model's parameters
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)

# Typically good lr is 1e-4
# The optimizer object takes the gradients computed by backward() and uses them to update the parameters

In [16]:
# get_batch reads this notebook-level setting, so from here on every batch has 32 sequences
batch_size = 32
for steps in range(10000):

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss; call the module itself (not .forward()) so nn.Module hooks run
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True) # clear the gradients left over from the previous step
    loss.backward() # compute the gradients for all the parameters
    optimizer.step() # use those gradients to update the parameters


# loss on the last sampled batch only, so this is a noisy estimate
print(loss.item())


2.3796486854553223


In [19]:
# show the first row (one sequence of token ids) of the current batch xb
xb[0]

tensor([ 1, 57, 59, 52,  6,  0, 31, 53])

In [31]:
# sample 400 new tokens from the trained model, starting from a single token with id 0, and decode them to text
start = torch.zeros((1, 1), dtype = torch.long)
generated = m.generate(idx = start, max_new_tokens=400)
print(decode(generated[0].tolist()))


llo br. ave aviasurf my, mayo t ivee iuedrd whar ksth y h bora s be hese, woweee; the! KI 'de, ulseecherd d o blllando;

Whe, oraingofof win!
RIfans picspeserer hee tha,
TOFonk? me ain ckntoty dedo bo'llll st ta d:
ELIS me hurf lal y, ma dus pe athouo
By bre ndy; by s afreanoo adicererupa anse tecorro llaus a!
OLeneerithesinthengove fal amas trr
TI ar I t, mes, n sar; my w, fredeeyong
THek' merer,


### Mathematical Trick in Self Attention

In [8]:
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
torch.manual_seed(42)
# lower-triangular ones with every row divided by its own sum:
# row t of `a` then averages rows 0..t of whatever it multiplies
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=', a, '--', 'b=', b, '--', 'c=', c, sep='\n')

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [59]:
# row sums of the lower-triangular mask: row t holds t+1 ones, so these are the divisors that turn sums into averages
torch.sum(torch.tril(torch.ones(3, 3)), 1, keepdim=True)

tensor([[1.],
        [2.],
        [3.]])

In [61]:
# consider the following toy example: a random (B, T, C) tensor standing in for token features

torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [78]:
# version 1: explicit loops
# We want xbow[b,t] = mean_{i<=t} x[b,i], i.e. every position becomes the average of itself and all earlier positions
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t+1,C): positions 0..t of sequence b
        xbow[b,t] = xprev.mean(dim=0)

In [79]:
# version 2: using matrix multiply for a weighted aggregation
mask = torch.ones(T, T).tril()
wei = mask / mask.sum(dim=1, keepdim=True) # every row sums to 1 -> row t averages positions 0..t
xbow2 = wei @ x # (T, T), broadcast over the batch, @ (B, T, C) ----> (B, T, C)
torch.allclose(xbow, xbow2)

True

In [81]:
# wei is left un-normalised here (the rows are NOT divided by their sums), so the same
# triangular matmul yields running sums instead of running means:
# row t of the result is x[b, 0] + ... + x[b, t].
wei = torch.tril(torch.ones(T, T))
wei @ x # (T, T), broadcast over the batch, @ (B, T, C) ----> (B, T, C)

tensor([[[ 0.1808, -0.0700],
         [-0.1789, -0.9852],
         [ 0.4469, -0.9597],
         [ 1.4014, -0.8953],
         [ 1.7626,  0.2725],
         [ 0.4127, -0.2376],
         [ 0.6486, -0.4774],
         [-0.2725,  1.0659]],

        [[ 1.3488, -0.1396],
         [ 1.6346,  0.8255],
         [-0.4025,  1.3186],
         [ 1.0845,  1.9096],
         [ 1.2105,  0.3470],
         [ 0.0504,  0.0121],
         [ 0.4982, -0.7895],
         [ 2.0218,  1.7191]],

        [[-0.6631, -0.2513],
         [ 0.3470, -0.1297],
         [ 0.5054,  1.0043],
         [-0.6484,  0.7059],
         [-1.1560, -0.2180],
         [-0.6093, -1.7128],
         [-1.8150, -1.1410],
         [-2.4123, -1.8347]],

        [[ 1.6455, -0.8030],
         [ 2.9969, -1.0789],
         [ 1.4861,  1.0259],
         [ 4.2491, -0.7207],
         [ 5.7007, -2.2310],
         [ 6.5218, -2.4425],
         [ 7.3007, -0.9092],
         [ 8.9105, -1.3124]]])

In [83]:
# the same product for a single batch element: (T, T) @ (T, C) ----> (T, C)
wei @ x[1]

tensor([[ 1.3488, -0.1396],
        [ 1.6346,  0.8255],
        [-0.4025,  1.3186],
        [ 1.0845,  1.9096],
        [ 1.2105,  0.3470],
        [ 0.0504,  0.0121],
        [ 0.4982, -0.7895],
        [ 2.0218,  1.7191]])

In [4]:
from torch.nn import functional as F

# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
# Start from all-zero affinities. After masking and softmax this gives a uniform average over the past,
# but we don't actually want it to stay uniform: different tokens will find different other tokens
# interesting, and we want that to be data dependent.
# For example, if I am a vowel then maybe I am looking for consonants in my past, and I want to know what
# those consonants are and have that information flow to me.
# So I want to gather information from the past, but in a data-dependent way. This is the problem that self-attention solves.
affinities = torch.zeros((T,T))
# positions above the diagonal (the future) get -inf, which softmax turns into a weight of exactly 0
wei = F.softmax(affinities.masked_fill(tril == 0, float('-inf')), dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

NameError: name 'xbow' is not defined

In [6]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [16]:
# version 4: self-attention!

# The way self-attention solves this is the following.
# Every single node/token at each position emits two vectors: a query and a key.
# The query vector is, roughly speaking, "what am I looking for?", and the key vector is, roughly speaking, "what do I contain?".
# The way we get affinities between the tokens in a sequence is to take a dot product between the keys and the queries.
# So my query is dot-producted with the keys of all the tokens, and those dot products become wei.
# A dot product essentially measures how similar two vectors are to each other, scaled by their magnitudes (the strength).
# So it tells me how similar my query is to the key of each other token, and how strongly; where the match is strong I get to learn more about that token than about the other tokens in the sequence.
# Tokens with higher similarity (larger dot product) will have higher attention weights in the final attention distribution.
# So it is essentially just finding similarity at scale in different contexts and different settings, with the final weights effectively serving as a compression algorithm.

# Now there is one more component: the value. Instead of aggregating the raw token vector x directly, we aggregate a separate value vector computed from that token.
# So in a way it's like: here's what I'm looking for (my query), here's what I contain (my key), and if your query matches my key then this is the value that I will give to you. Thus the original x of a token stays private to that token.


# Also, attention can be seen as a communication mechanism in a directed graph, carried out in a data-dependent manner.
# Every node has some vector of information, and it gets to aggregate information via a weighted sum from all of the nodes that point to it; the weights are data dependent.
# This can be applied to any directed graph.
# Also, these vectors have no notion of space (position). If you want one, you need to add it yourself. This is what we do when we calculate the positional embeddings and add them to the token vectors.

# Also, the sequences in a batch don't talk to each other. In this analogy it is like 4 separate pools of 8 nodes each, so 32 nodes are being computed at once.

# This is called self-attention because the key, query and value all come from the same source (the same x).
# In principle, attention can be much more general:
# say the queries are produced from x, but the keys and values come from a whole separate source.
# That's cross-attention. It is used when there is a separate pool of nodes we'd like to pull information from into our nodes.

torch.manual_seed(1337)
B, T, C = 4, 8, 32 # batch, time, channels
x = torch.randn(B, T, C)

# Let's see a single head perform self-attention
head_size = 16
# three bias-free projections of the same input x (created in this order so the seeded init is unchanged)
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k, q, v = key(x), query(x), value(x) # each (B, T, 16)

# raw affinities: every query dotted with every key of the same sequence
scores = torch.matmul(q, k.transpose(-2, -1)) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

# causal mask: position t may only attend to positions <= t, so nothing flows from the future into the past
tril = torch.ones(T, T).tril()
wei = F.softmax(scores.masked_fill(tril == 0, float('-inf')), dim=-1) # every row sums to 1

# aggregate the value vectors (not the raw x) with the attention weights
out = torch.matmul(wei, v) # (B, T, T) @ (B, T, 16) ---> (B, T, 16)

torch.Size([4, 8, 16])

In [17]:
# the attention weights, (B, T, T): zero above the diagonal and every row sums to 1
wei

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
         [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
         [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
         [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1687, 0.8313, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2477, 0.0514, 0.7008, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4410, 0.0957, 0.3747, 0.0887, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0069, 0.0456, 0.0300, 0.7748, 0.1427, 0.0000, 0.0000, 0.0000],
         [0.0660, 0.089

In [18]:
# (B, T, head_size)
out.shape

torch.Size([4, 8, 16])

In [19]:
# the head's output: for every position, the wei-weighted sum of the value vectors
out

tensor([[[-1.5713e-01,  8.8009e-01,  1.6152e-01, -7.8239e-01, -1.4289e-01,
           7.4676e-01,  1.0068e-01, -5.2395e-01, -8.8726e-01,  1.9067e-01,
           1.7616e-01, -5.9426e-01, -4.8124e-01, -4.8599e-01,  2.8623e-01,
           5.7099e-01],
         [ 6.7643e-01, -5.4770e-01, -2.4780e-01,  3.1430e-01, -1.2798e-01,
          -2.9521e-01, -4.2962e-01, -1.0891e-01, -4.9282e-02,  7.2679e-01,
           7.1296e-01, -1.1639e-01,  3.2665e-01,  3.4315e-01, -7.0975e-02,
           1.2716e+00],
         [ 4.8227e-01, -1.0688e-01, -4.0555e-01,  1.7696e-01,  1.5811e-01,
          -1.6967e-01,  1.6217e-02,  2.1509e-02, -2.4903e-01, -3.7725e-01,
           2.7867e-01,  1.6295e-01, -2.8951e-01, -6.7610e-02, -1.4162e-01,
           1.2194e+00],
         [ 1.9708e-01,  2.8561e-01, -1.3028e-01, -2.6552e-01,  6.6781e-02,
           1.9535e-01,  2.8074e-02, -2.4511e-01, -4.6466e-01,  6.9287e-02,
           1.5284e-01, -2.0324e-01, -2.4789e-01, -1.6213e-01,  1.9474e-01,
           7.6778e-01],
    

In [14]:
# the lower-triangular matrix of ones used as the mask: entry [t, i] is 1 when i <= t, else 0
tril = torch.tril(torch.ones(T, T)) 
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])