In [1]:
exec(open('init_notebook.py').read())

current working dirF:\ml-from-scratch


In [2]:
%load_ext autoreload
%autoreload 2

# Resources
1. Generative Deep Learning by David Foster
2. https://github.com/karpathy/minGPT/blob/master/demo.ipynb

GPT is a **decoder** only model (pretraining model to be used by different decoders). We will build a casual masked language model (Casual MLM)

# PipeLine
1. Preprocessing Text -> Tokenize -> Sentence Clipper/Padder -> Input Dataset
2. Input Dataset -> token embedding + positional embedding -> input embedding
3. Input Sequence Embeddings -> self-attention A(Q,K,V) -> single head
4. Single Heads -> concat attentions -> Multihead
5. Training
   

## Encoder
5. multi-head + query -> layer-norm -> FFNs -> layer-norm -> output next token

In GPT, both token and positional embeddings are learned.


**This is the GPT implementation in Pytorch. The book has the tensorflow+keras implementation. Borrowed some ideas from minGPT**
### Dependencies:
1. Pytorch 2.x (should work in 1.x, too, as we are using as less as possible from torch)
2. Pandas
3. scikit-learn
4. tqdm
5. numpy
6. matplotlib

In [72]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
import pandas as pd
import os
import math
import re
import string
from typing import *
import numpy.typing as npt
from tqdm import tqdm

from sklearn.feature_extraction.text import CountVectorizer
import heapq

# Custom types
Index = int
Char = str
Path = str
Sentence = str
TokenizedSentence = List[int]
Word = str
TokenId = int
Corpus = List[Sentence]
TokenizedCorpus = List[TokenizedSentence]
Vocabulary = Dict[Word, TokenId]

# Torch types, we use numpy typing
SequenceLength = int
EmbeddingDim = int
Vector = npt.NDArray 
Embedding = Vector
SequenceEmbedding = npt.NDArray[Vector] # [seqLen, embeddingDim]
BatchSequenceEmbedding = npt.NDArray[SequenceEmbedding] # [batch, seqlen, embeddingDim]

# Attention types
Query = Vector
Queries = npt.NDArray[Query]
BatchQueries = npt.NDArray[Queries]
Key = Vector
Keys = npt.NDArray[Key]
BatchKeys = npt.NDArray[Keys]
Value = Vector
Values = npt.NDArray[Value]
BatchValues = npt.NDArray[Values]
AttentionMask = npt.NDArray[SequenceEmbedding] # [batch, seqlen, embeddingDim]

Attention = Value # a single weighted value embedding for a query
Attentions = npt.NDArray[Attention] # for all the queries of a sentence
BatchAttentions = npt.NDArray[Attentions] # for a batch of sentences, batch, seqlen, seqlen

AttentionLogit = Vector # each query has d_k attention scores. these are logits, not softmax weights
AttentionLogits = npt.NDArray[AttentionLogit]
BatchAttentionLogits = npt.NDArray[AttentionLogits]

In [4]:
# Configurations
dataDir = "E:/Datasets/wine-reviews"
vocabSize = 10_000
sentenceLength = 100
embeddingDim = 50

In [5]:
#######################################################################################################
############# The Pipeline class which is the orchestration of each step of the process ###############
#######################################################################################################
class Pipeline:
    def __init__(self, vocabSize: int, sentenceLength: int, embeddingDim: int):
        self.vocabSize = vocabSize
        self.sentenceLength = sentenceLength
        self.embeddingDim = embeddingDim
        pass
        
    def save(self, directory: Path):
        pass
        
    def preprocess(self, corpus: Corpus) -> Corpus:
        pass
    def tokenize(self, corpus: Corpus, vocabSize: int) -> Tuple[Vocabulary, TokenizedCorpus]:
        pass
    def createDatasets(self, tokenizedCorpus: TokenizedCorpus, sentenceLength) -> Tuple[Dataset, Dataset]:
        pass
    
pipeline = Pipeline(vocabSize, sentenceLength, embeddingDim)

## 1.1 Preprocessing
1. read text,
2. pad punctuations with spaces to convert them to individual words.                                                                                                                                                                                                                                                                       

In [6]:
df = pd.read_csv(os.path.join(dataDir, "winemag-data-130k-v2.csv"))

In [7]:
# df.head()

In [8]:
corpus = df["description"].tolist()

In [9]:
# data[:10]

In [10]:
class PreProcessorText:
    
    def padPunk(self, s: Sentence) -> Sentence: # this does not work well. We will do BPE next. isn't -> becomes bad is ' t. now t is a word! It's it ' s
        s = re.sub(f"([{string.punctuation}])", r" \1 ", s) #to words
        s = re.sub(' +', ' ', s) # multiple spaces to one
        return s
    
    def smallCase(self, corpus: List[Sentence]) -> List[Sentence]:
        return [s.lower() for s in corpus]

    def __call__(self, corpus: List[Sentence]) -> List[Sentence]:
        corpus = [self.padPunk(s) for s in corpus] 
        return self.smallCase(corpus)
        
pipeline.preprocess = lambda corpus: PreProcessorText()(corpus)
    
    

In [11]:
corpus = pipeline.preprocess(corpus)

In [12]:
corpus[:3]

["aromas include tropical fruit , broom , brimstone and dried herb . the palate isn ' t overly expressive , offering unripened apple , citrus and dried sage alongside brisk acidity . ",
 "this is ripe and fruity , a wine that is smooth while still structured . firm tannins are filled out with juicy red berry fruits and freshened with acidity . it ' s already drinkable , although it will certainly be better from 2016 . ",
 'tart and snappy , the flavors of lime flesh and rind dominate . some green pineapple pokes through , with crisp acidity underscoring the flavors . the wine was all stainless - steel fermented . ']

## 1.2 Tokenize
1. vocabulary
2. tokenize sentences

In [13]:
class SentenceTokenizer:
    
    def getVocabulary(self, corpus: Corpus, vocabSize: int) -> Vocabulary:
        vectorizer = CountVectorizer().fit(corpus)
        topWords = heapq.nlargest(vocabSize-1, vectorizer.vocabulary_, key=lambda w: vectorizer.vocabulary_[w])
        wordToToken = {}
        for idx, w in enumerate(topWords):
            wordToToken[w] = idx
    
        wordToToken['UNK'] = len(wordToToken)
        print(f"Created a vocabulary with top {len(wordToToken)} words from {len(vectorizer.vocabulary_)} words with UNK as the last word")
        return wordToToken

    def tokenizeSentence(self, sentence: Sentence, vocabulary: Vocabulary) -> TokenizedSentence:
        words = sentence.split()
        tSen = [
            vocabulary[w] if w in vocabulary
            else vocabulary["UNK"]
                for w in words
        ]
        return tSen

    def build(self, corpus: Corpus, vocabSize: int) -> Tuple[Vocabulary, TokenizedCorpus]:
        vocabulary = self.getVocabulary(corpus, vocabSize)
        tokenizedCorpus = [self.tokenizeSentence(s, vocabulary) for s in corpus]
        return vocabulary, tokenizedCorpus

    def __call__(self, corpus: Corpus, vocabSize: int) -> Tuple[Vocabulary, TokenizedCorpus]:
        return self.build(corpus, vocabSize)

pipeline.tokenize = lambda corpus: SentenceTokenizer()(corpus, pipeline.vocabSize)
        



In [14]:
vocabulary, tokenizedCorpus = pipeline.tokenize(corpus)
# sentenceTokenizer = SentenceTokenizer()
# vocabulary, tokenizedCorpus = sentenceTokenizer.build(corpus, vocabSize)
vocabulary["the"]

Created a vocabulary with top 10000 words from 31274 words with UNK as the last word


3415

In [15]:
print(tokenizedCorpus[0]) # mostl UNKs as we have over 30 words

[9999, 9999, 2608, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 3415, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 1901, 9999, 9999, 9999, 9999, 9999, 7486, 9999, 9999, 9999, 9999]


## 1.3 Dataset
1. fixed length sentences
2. torch format

In [16]:
class SentenceDataset(Dataset):
    def __init__(self, tokenizedCorpus: TokenizedCorpus, split: str, length: int, vocabSize: int):
        assert split in {'train', 'test'} # borrowed from minGPT
        self.split = split
        self.length = length
        self.vocabSize = vocabSize
        self.corpus = tokenizedCorpus.copy()

        if self.split == 'train':
            self.size = int(len(self.corpus) * 0.7)
        else:
            self.size = len(self.corpus) - int(len(self.corpus) * 0.7)
            
        self.ids = self._generateIds()
        self._reshapeCorpus()
        

    def _reshapeCorpus(self):
        # converts to fixed length sentences by clipping or padding
        for idx in tqdm(self.ids, desc=f"reshaping {self.split} corpus"):
            self.corpus[idx] = self._reshapeSentence(self.corpus[idx])
        pass

    def _reshapeSentence(self, sentence: TokenizedSentence) -> TokenizedSentence:
        if len(sentence) >= self.length:
            return sentence[:self.length]
        # we pad with zeros. # zero not in vocab
        return sentence + [0] * (self.length - len(sentence))
        

    def _generateIds(self) -> List[Index]:
        # we just get the top for train and bot for test.
        if self.split == 'train':
            return list(range(self.size))
        else:
            start = int(len(self.corpus) * 0.7) # train index ends before start
            return [i+start for i in range(self.size)]

    def getVocabSize(self) -> int:
        return self.vocabSize

    def getBlockSize(self) -> int:
        # as this is an encoder. we feed n inputs, n-1 outputs, and then read outputs? need more clarifications.
        return self.vocabSize * 2 - 1
        

    def __len__(self):
        return self.size

    def __getitem__(self, idx: Index) -> TokenizedSentence:
        return self.corpus[self.ids[idx]] # ids can be sparse

pipeline.createDatasets = lambda tokenizedCorpus: (SentenceDataset(tokenizedCorpus, "train", pipeline.sentenceLength, pipeline.vocabSize), SentenceDataset(tokenizedCorpus, "test", pipeline.sentenceLength, pipeline.vocabSize))
    
        

In [17]:
# trainSet = SentenceDataset(tokenizedCorpus, "train", 100)
# testSet = SentenceDataset(tokenizedCorpus, "test", 100)
trainSet, testSet = pipeline.createDatasets(tokenizedCorpus)

reshaping train corpus: 100%|████████████████████████████████████████████████| 90979/90979 [00:00<00:00, 295381.48it/s]
reshaping test corpus: 100%|█████████████████████████████████████████████████| 38992/38992 [00:00<00:00, 158501.11it/s]


In [18]:
assert trainSet.ids[-1] + 1 == testSet.ids[0]
assert testSet.ids[-1] + 1 == len(tokenizedCorpus)

## 2. Embedding
We will learn both token embedding and positional encoding. However, we can also use pretrained embedding and trigonometric functions (sine, cosine) from the original Transformer for faster training

In [19]:
# tokenEmbedding = nn.Embedding(vocabSize, embeddingDim)
# posEmbedding = nn.Embedding(sentenceLength, embeddingDim) # number of positions. the dim is the same as tokens as they will be summed
pipeline.createEmbedding = lambda : (nn.Embedding(pipeline.vocabSize, pipeline.embeddingDim), nn.Embedding(pipeline.sentenceLength, pipeline.embeddingDim))
tokenEmbedding, posEmbedding = pipeline.createEmbedding()

In [20]:
tokenEmbedding, posEmbedding

(Embedding(10000, 50), Embedding(100, 50))

## 3. The Single Head
**From this point on, everything will have a batch dimension at 0**
1. Input Embedding
2. Self-Attention

### 3.1 Input Embedding
1. Convert the sentence token batch to embedding batch
   

In [83]:
class InputProcessor:
    def embed(self, batch: List[TokenizedSentence], tokenEmbedding: nn.Embedding, posEmbedding: nn.Embedding) -> BatchSequenceEmbedding:
        sentenceEmbeddings = []
        for sentence in batch:
            sentenceEmbeddings.append(self.embedSentence(sentence, tokenEmbedding, posEmbedding))
        
        return torch.stack(sentenceEmbeddings)

    def embedSentence(self, sentenceTokens:TokenizedSentence, tokenEmbedding: nn.Embedding, posEmbedding: nn.Embedding) -> SequenceEmbedding:
        positions = torch.tensor(range(len(sentenceTokens)))
        posEmbeddings = posEmbedding(positions)
        tokenEmbeddings = tokenEmbedding(torch.tensor(sentenceTokens))
        # return tokenEmbeddings * math.sqrt(len(sentenceTokens)) + posEmbeddings # we just add a weight to the token embeddings.
        return tokenEmbeddings + posEmbeddings # we just add no weight to the token embeddings.


def testInputProcessor():
    class Embedding:
        def __call__(self, indices: Iterable[int]) -> SequenceEmbedding:
            embeddings = []
            for i in indices:
                embeddings.append([i] * 3)
            return torch.tensor(embeddings)
            
    embedding = Embedding() # everything is a index repeated 3 times
    inputProcessor = InputProcessor()
    batch = [
                [1, 2],
                [3, 4]
    ]

    gotEmbeddings = inputProcessor.embed(batch, embedding, embedding)
    expectedEmbeddings = torch.tensor(
        [
            [
                [1, 1, 1],
                [3, 3, 3] # pos 1 adds 1
            ],
            [
                [3, 3, 3],
                [5, 5, 5] # pos 1 adds 1
            ]
        ]
    ) 
    # print("got", gotEmbeddings)
    # print("expected", expectedEmbeddings)
    assert np.allclose(gotEmbeddings, expectedEmbeddings)
    print("All good")

testInputProcessor()
    


All good


## 3.2 Self-Attention - Test Driven
1. Mask is added after attention scores are computed to save computation?

In [144]:

class CasualSingleHead(nn.Module):
    def __init__(self, d_tokenEmbedding: int, hiddenSize: int):
        super().__init__()

        self.d_tokenEmbedding = d_tokenEmbedding
        self.hiddenSize = hiddenSize # == d_q, d_k, d_v
        self.d_k = self.hiddenSize
        # stacked ffns for Q, K, V projections. Or we could use three different linear layers which will be executed sequentially. but this may have issues for batch inputs to a linear layer.
        self.qkvProjections = nn.Linear(d_tokenEmbedding, 3 * hiddenSize) # [0- hiddenSize] for q, [hiddenSize - 2* hiddenSize] for k, [2* hiddenSize - 3 * hiddenSize] for v.
        self.attentionDropout = nn.Dropout(0.2)
        self.residualDropout = nn.Dropout(0.2)
        self.skipLayerNormFirst = nn.LayerNorm(self.hiddenSize, eps=1e-6)
        self.skipLayerNormLast = nn.LayerNorm(self.hiddenSize, eps=1e-6)
        self.ffn1 = nn.Linear(self.hiddenSize, self.hiddenSize * 2)
        self.ffn2 = nn.Linear(self.hiddenSize * 2, self.hiddenSize)
        
        

    def forward(self, batch: BatchSequenceEmbedding, mask: Optional[AttentionMask]) -> Tuple[BatchAttentions, BatchAttentionLogits]:
        # Steps
        # 1. input -> Q, K, V
        # 2. Q, K, V -> Attention
        # 3. optional Contact heads -> project on Q
        # 3. skip connection with Q
        # 4. layer norm
        # 5. FFNs
        # 6. Skip connection with previous norm
        # 7. layer norm
        batchSize, seqLen, tokenEmbeddingSize = batch.size()
        batchQueries, batchKeys, batchValues = self.getBatchQKV(batch) 
        attention, attentionLogits = self.scaledDotProduct(batchQueries, batchKeys, batchValues, mask)
        attention  = self.attentionDropout(attention)
        
        # 3. skip connection with Q
        attention += batchQueries 
        # 4. layer norm
        norm1 = self.skipLayerNormFirst(attention)
        
        # 5. FFNs on norm1
        attention = self.ffn1(norm1)
        attention = self.ffn2(attention)
        
        # 6. Skip connection with previous norm
        attention += norm1
        # 7. layer norm
        norm2 = self.skipLayerNormLast(attention)

        return norm2, attentionLogits
        

    def getBatchQKV(self, batch: BatchSequenceEmbedding) -> Tuple[BatchQueries, BatchKeys, BatchValues]:
         # each split will have hiddenSize number. First two dims are batch and seqlen, we split at dim 2 which is the embedding dim.
        # we have a single head, so no need to do anything
        return self.qkvProjections(batch).split(self.hiddenSize, dim=2)
        
    def scaledDotProduct(self, Q: BatchQueries, K: BatchKeys, V: BatchValues, mask: Optional[AttentionMask]) -> Tuple[BatchAttentions, BatchAttentionLogits]:

        # some definitions to help understand the dimensions
        # T = seqLen = one input sequence length. T is a convention
        # seqLen = number of queries
        # seqLen = number of keys in this implementation as every token is attending to every token in the sequence including itself
        # seqLen = number of values in this implementation as every token is attending to every token in the sequence including itself
        # d_q = size of a query vector
        # d_k = size of a key vector
        # d_v = size of a value vector
        # in our implementation, d_q == d_k == d_v
        # so each input batch is of size = (batch, T, d_q)
        
        
        # Three steps of self.attention
        # 1. QK_T
        # 2. Apply attention mask on QK_T
        # 3. softmax
        # 4. calculate attention value QK_T V
        
        # qkT = torch.matmul(Q, K.transpose(-2, -1) # batch, seqlen, d_k. we transpose the last two dimensions of K
        # batch, seqlen, d_k. we transpose the last two dimensions of K
        # @ is a matmul operator 
        QK_T = Q @ K.transpose(-2, -1) # size = batch, T, T, we have T queries, each query as T keys.
        QK_T /= math.sqrt(self.d_k) # scaled

        if mask is not None:
            # now there are three ways we can apply mask, and 1 way we cannot
            # we cannot zero out the mask values as our weight loggits can be negative, so the softmax will significantly change if we zero out the masked loggits
            # we can set masked attentions to -inf with masked_fill 
            # or we can add -inf to the masked positions
            # with fill
            maskCondition = mask == 0
            QK_T = QK_T.masked_fill(maskCondition, -1.0e9) # may we get overflow? depends on our float size
            # with add
            # QK_T -= (1 - mask) * 1e9

        attnWeights = F.softmax(QK_T, dim=-1) # softmax for each query scores. last dim has each query scores
        # @ == matmul # batch, 
        attn = attnWeights @ V # batch, seqlen , d_v # each query has 1 attn-value vector
        return attn, QK_T 

        
        
        

In [147]:
# Self Attention Tests

def testSingleHead():

    # no mask test, batch_size = 1
    singleHead = CasualSingleHead(sentenceLength, hiddenSize=3) 
    Q = torch.tensor([[[1, 2, 3], [4, 5, 6]]], dtype=torch.float) #(batch=1, seqlen=2, d_q=3)
    K = torch.tensor([[[1, 1, 1], [2, 2, 2]]], dtype=torch.float) #(batch=1, seqlen=2, d_k=3)
    V = torch.tensor([[[1, 1, 1], [2, 2, 2]]], dtype=torch.float) #(batch=1, seqlen=2, d_v=3)
    attentions, attentionLogits = singleHead.scaledDotProduct(Q, K, V, None)

    d_k = 3
    expectedLogits = torch.tensor([[[ 6., 12.], [15., 30.]]]) / math.sqrt(d_k)
    expectedAttentions = F.softmax(expectedLogits, dim=-1) @ V
    
    # print("got", attentionLogits)
    # print("expected", expectedLogits)
    
    assert np.allclose(attentionLogits, expectedLogits)
    assert np.allclose(attentions, expectedAttentions)

    # with mask test, batch_size = 1

    mask = torch.tensor([[[1, 0], [1, 1]]]) #(batch=1, seqlen=2, seqlen=2) # first query attends to first key, second query attends to first and second key
    attentions, attentionLogits = singleHead.scaledDotProduct(Q, K, V, mask)
    
    expectedLogits = torch.tensor([[[ 6., 12.], [15., 30.]]]) / math.sqrt(d_k)
    expectedLogits[0][0][1] = -1.0e9 # first batch, first query, second mask
    expectedAttentions = F.softmax(expectedLogits, dim=-1) @ V
    
    # print("got", attentionLogits)
    # print("expected", expectedLogits)
    assert np.allclose(attentionLogits, expectedLogits)
    assert np.allclose(attentions, expectedAttentions)

    # no mask test, batch_size = 2
    singleHead = SingleHead(sentenceLength, hiddenSize=3) 
    Q = torch.tensor([[[1, 2, 3], [4, 5, 6]],
                     [[10, 20, 30], [40, 50, 60]]], dtype=torch.float) #(batch=2, seqlen=2, d_q=3)
    K = torch.tensor([[[1, 1, 1], [2, 2, 2]],
                     [[-1, -1, -1], [-2, -2, -2]]], dtype=torch.float) #(batch=2, seqlen=2, d_k=3)
    V = torch.tensor([[[1, 1, 1], [2, 2, 2]],
                     [[-1, -1, -1], [-2, -2, -2]]], dtype=torch.float) #(batch=2, seqlen=2, d_v=3)
    attentions, attentionLogits = singleHead.scaledDotProduct(Q, K, V, None)

    d_k = 3
    expectedLogits = torch.tensor([[[ 6., 12.], [15., 30.]],
                                  [[ -60., -120.],[-150., -300.]]]) / math.sqrt(d_k)
    expectedAttentions = F.softmax(expectedLogits, dim=-1) @ V

    # print("got", attentionLogits)
    # print("expected", expectedLogits)
    
    assert np.allclose(attentionLogits, expectedLogits)
    assert np.allclose(attentions, expectedAttentions)



def testProjections():

    batchSize = 1
    seqLen = 2
    hiddenSize = 10
    d_tokenEmbedding = 2
    batch = torch.tensor([[[1, 1], [2, 2]]], dtype=torch.float) # batch, seq len, input_dim
    
    singleHead = CasualSingleHead(d_tokenEmbedding, hiddenSize=hiddenSize) 
    
    layer = singleHead.qkvProjections # d_input = 2, d_qkv = 3
    output = layer(batch)
    expectQ, expectK, expectV = output.split(10, dim=2)
    
    
    batchQueries, batchKeys, batchValues = singleHead.getBatchQKV(batch)
    assert batchSize == batchQueries.shape[0]
    assert seqLen == batchQueries.shape[1]
    assert hiddenSize == batchQueries.shape[2]
    
    assert batchSize == batchKeys.shape[0]
    assert seqLen == batchKeys.shape[1]
    assert hiddenSize == batchKeys.shape[2]
    
    assert batchSize == batchValues.shape[0]
    assert seqLen == batchValues.shape[1]
    assert hiddenSize == batchValues.shape[2]

    assert torch.allclose(expectQ, batchQueries)
    assert torch.allclose(expectK, batchKeys)
    assert torch.allclose(expectV, batchValues)

def testForward():
    batchSize = 1
    seqLen = 2
    hiddenSize = 10
    d_tokenEmbedding = 2
    batch = torch.tensor([[[1, 1], [2, 2]]], dtype=torch.float) # batch, seq len, input_dim
    singleHead = CasualSingleHead(d_tokenEmbedding, hiddenSize=hiddenSize) 
    attention, attentionLoggits = singleHead.forward(batch, None)
    assert attention.shape == (batchSize, seqLen, hiddenSize)
    assert attentionLoggits.shape == (batchSize, seqLen, seqLen)
    
    
def test():
    testSingleHead()
    testProjections()
    testForward()
    print("All good")

test()

torch.Size([1, 2, 10])
torch.Size([1, 2, 10]) torch.Size([1, 2, 10])
All good


In [111]:
layer = nn.Linear(2, 3*10) # d_input = 2, d_qkv = 3
batch = torch.tensor([[[1, 1], [2, 2]]], dtype=torch.float) # batch, seq len, input_dim
output = layer(batch)
splits = output.split(10, dim=2)
# now each split has batch, seqlen,  10 dimensions
# output, splits
print(output.shape, len(splits), splits[0].shape)
singleHead = CasualSingleHead(2, hiddenSize=10) 
singleHead(batch)

torch.Size([1, 2, 30]) 3 torch.Size([1, 2, 10])


In [100]:
# Q = torch.tensor([[[1, 2, 3], [4, 5, 6]]], dtype=torch.float) #(seqlen, embeddingDim)
# K = torch.tensor([[[1, 1, 1], [2, 2, 2]]], dtype=torch.float)
# Q, K
# Q = torch.tensor([[[1, 2, 3], [4, 5, 6]],
#                  [[10, 20, 30], [40, 50, 60]]], dtype=torch.float) #(batch=2, seqlen=2, d_q=3)
# K = torch.tensor([[[1, 1, 1], [2, 2, 2]],
#                  [[-1, -1, -1], [-2, -2, -2]]], dtype=torch.float) #(batch=2, seqlen=2, d_k=3)
# V = torch.tensor([[[1, 1, 1], [2, 2, 2]],
#                  [[-1, -1, -1], [-2, -2, -2]]], dtype=torch.float) #(batch=2, seqlen=2, d_v=3)
# torch.matmul(Q, K.transpose(-2, -1))

In [40]:
singleHead = CasualSingleHead(sentenceLength, hiddenSize=3) 
Q = torch.tensor([[[1, 2, 3], [4, 5, 6]]], dtype=torch.float) #(batch=1, seqlen=2, d_q=3)
K = torch.tensor([[[1, 1, 1], [2, 2, 2]]], dtype=torch.float) #(batch=1, seqlen=2, d_k=3)
V = torch.tensor([[[1, 1, 1], [2, 2, 2]]], dtype=torch.float) #(batch=1, seqlen=2, d_v=3)
attentions, attentionLogits = singleHead.scaledDotProduct(Q, K, V, None)
attentionLogits

QK_T torch.Size([1, 2, 2])


tensor([[[ 3.4641,  6.9282],
         [ 8.6603, 17.3205]]])

In [None]:
singleHead.forward(