In [1]:
import torch
import torch.nn as nn
import numpy as np

from fastai.text.all import *

In [55]:
path = untar_data(URLs.IMDB)

In [3]:
get_imdb = partial(get_text_files,folders=['train','test','unsup'])

dls_lm = DataBlock(
    blocks=TextBlock.from_folder(path, is_lm=True),
    get_items=get_imdb, splitter=RandomSplitter(0.1)
).dataloaders(path, path=path, bs=128, seq_len=80)



In [84]:
def attention_score(q, k, v):
    dim = q.size(-1)
    score = torch.bmm(q, k.transpose(1,2)) / np.sqrt(dim)
    attention_weights = F.softmax(score, dim=-1)
    return torch.bmm(attention_weights,v)

class AttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)

    def forward(self, hidden_state):
        attn_outputs = attention_score(
            self.q(hidden_state), self.k(hidden_state), self.v(hidden_state))
        return attn_outputs

class MultiHeadAttention(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads):
        super().__init__()
        embed_dim = embed_dim
        num_heads = num_heads
        head_dim = embed_dim // num_heads
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
        )
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state):
        embed_state = self.embedding(hidden_state)
        x = torch.cat([h(embed_state) for h in self.heads], dim=-1)
        print("x")
        x = self.output_linear(x)
        print("x")
        return x

"""class AttentionHead(Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers):
        super().__init__()
        self.query = nn.Linear(embed_dim, num_heads)
        self.key = nn.Linear(embed_dim, num_heads)
        self.value = nn.Linear(embed_dim, num_heads)
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state):
        embed_state = self.embedding(hidden_state)
        print("1")
        print(embed_state.shape)
        attention = attention_score(self.query(embed_state), self.key(embed_state), self.value(embed_state))
        print("2")
        print(attention.shape)
        output = self.fc(attention)
        print("3")
        return output """

'class AttentionHead(Module):\n    def __init__(self, vocab_size, embed_dim, num_heads, num_layers):\n        super().__init__()\n        self.query = nn.Linear(embed_dim, num_heads)\n        self.key = nn.Linear(embed_dim, num_heads)\n        self.value = nn.Linear(embed_dim, num_heads)\n        self.embedding = nn.Embedding(vocab_size, embed_dim)\n        self.fc = nn.Linear(embed_dim, embed_dim)\n\n    def forward(self, hidden_state):\n        embed_state = self.embedding(hidden_state)\n        print("1")\n        print(embed_state.shape)\n        attention = attention_score(self.query(embed_state), self.key(embed_state), self.value(embed_state))\n        print("2")\n        print(attention.shape)\n        output = self.fc(attention)\n        print("3")\n        return output '

In [78]:
len(dls_lm.vocab)

60008

In [83]:
dls_lm.vocab[0:20]

['xxunk',
 'xxpad',
 'xxbos',
 'xxeos',
 'xxfld',
 'xxrep',
 'xxwrep',
 'xxup',
 'xxmaj',
 'the',
 '.',
 ',',
 'and',
 'a',
 'of',
 'to',
 'is',
 'it',
 'in',
 'i']

In [97]:
def scaled_dot_product_attention(query, key, value):
    dim_k = query.size(-1)
    scores = torch.bmm(query, key.transpose(1, 2)) / np.sqrt(dim_k)
    weights = F.softmax(scores, dim=-1)
    return torch.bmm(weights, value)

In [98]:
class AttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)

    def forward(self, hidden_state):
        attn_outputs = scaled_dot_product_attention(
            self.q(hidden_state), self.k(hidden_state), self.v(hidden_state))
        return attn_outputs

In [99]:
class MultiHeadAttention(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads):
        super().__init__()
        embed_dim = embed_dim
        num_heads = num_heads
        head_dim = embed_dim // num_heads
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
        )
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state):
        embed_state = self.embedding(hidden_state)
        x = torch.cat([h(embed_state) for h in self.heads], dim=-1)
        x = self.output_linear(x)
        return x

In [100]:
model = MultiHeadAttention(
    vocab_size=len(dls_lm.vocab), 
    embed_dim=256, 
    num_heads=4
)

learn = Learner(
    dls_lm, 
    model, 
    loss_func=CrossEntropyLossFlat(), 
    metrics=[accuracy]
)

learn.fit_one_cycle(1, 1e-3)

epoch,train_loss,valid_loss,accuracy,time


IndexError: Target 2541 is out of bounds.

In [58]:
len(dls_lm.vocab)

60008

In [48]:
files = get_text_files(path, folders=['train','test','unsup'])

#print(len(files))

txt = files[0].open().read()

spacy = WordTokenizer()
toks = first(spacy([txt]))

tkn = Tokenizer(spacy)

txts = L(o.open().read() for o in files[:2])

toks = tkn(txt)

toks200 = txts[:2].map(tkn)

num = Numericalize()
num.setup(toks200)
coll_repr(num.vocab,20) 

nums = num(toks)[:20]

nums200 = toks200.map(num)

indices = torch.randint(0, 20, (4, 20))

output = model(indices)

print(len(output))
print(output[0])
print(output[1])

tensor([[16, 17,  2,  2,  6,  1,  9, 10,  9, 12,  5, 11,  6, 12, 15,  7, 17, 12,
         11,  8],
        [ 8, 19,  1,  7,  0,  8,  8, 12, 14, 14, 15, 15, 17, 13,  6, 15,  3, 10,
         10,  7],
        [17,  9, 13, 13, 10, 15,  5,  3,  4, 15, 12,  2,  2,  3,  7,  2, 10, 14,
          3,  4],
        [ 4, 15,  0, 15,  6, 13, 19,  1, 17,  4, 15,  4, 13, 17,  6, 19, 11, 14,
          1, 12]])
4
tensor([[ 1.3166,  1.2653, -0.8882,  ...,  0.6736, -0.5486, -0.1169],
        [ 0.5207, -1.0598, -0.8658,  ..., -2.5326, -0.4499, -2.5706],
        [ 0.5527, -1.1022, -0.1875,  ..., -0.1967, -1.6810, -0.1650],
        ...,
        [ 0.2932,  1.2106, -0.4803,  ..., -0.6262, -1.1011,  0.8535],
        [-0.4808,  0.9224,  0.5651,  ...,  1.6599,  1.1682, -0.4029],
        [ 0.2960, -0.9986, -1.1205,  ..., -1.2373, -1.2838, -0.0358]],
       grad_fn=<SelectBackward0>)
tensor([[ 0.2960, -0.9986, -1.1205,  ..., -1.2373, -1.2838, -0.0358],
        [ 0.2618, -0.2529, -0.3142,  ..., -0.4964,  0.8957, -0.

In [26]:
print(output[0])

TensorText([-2.4876e-01,  8.7750e-01, -9.8756e-01,  2.0200e-01,  6.0552e-01,
            -3.6406e-02, -3.3233e-01, -1.5803e+00,  8.4406e-01, -4.9262e-02,
             1.3237e-01, -7.7748e-01,  1.6111e+00, -5.6500e-01,  1.2172e+00,
             7.7553e-01, -5.7634e-01, -1.8688e-02,  5.3741e-01, -1.4699e+00,
            -1.5635e-01,  4.3277e-01,  9.5384e-01, -7.5215e-01,  6.8795e-01,
             1.4515e+00,  2.8927e-01, -5.3780e-01,  7.7532e-01, -3.5651e-01,
             4.2777e-01,  3.0546e-01,  5.6941e-01,  1.2695e+00,  1.2101e+00,
            -1.0118e+00,  1.6662e-01, -1.5973e-01,  6.7527e-01, -1.4960e+00,
            -1.2805e+00,  1.8608e+00,  1.0499e+00, -4.6016e-01, -4.6964e-01,
            -3.4260e-01, -8.3802e-02, -3.3977e-02, -3.3420e-01, -2.0656e-01,
            -7.8265e-01,  3.7368e-01, -4.8485e-01, -5.0912e-01, -3.7323e-01,
            -1.2227e+00,  7.4206e-01, -1.0810e+00,  3.4904e-01, -2.0306e-01,
             4.3522e-01, -6.9151e-01, -1.1948e+00,  2.1300e+00,  1.4522e+00,

In [28]:
print(len(output[1]))

256


In [49]:
output.shape

torch.Size([4, 20, 256])

In [51]:
q = k = v = output
dim = q.size(-1)
print(q.shape)
print(k.transpose(1,2).shape)
score = torch.bmm(q, k.transpose(1,2)) / np.sqrt(dim)
attention_weights = F.softmax(score, dim=-1)
result =  torch.bmm(attention_weights,v)

torch.Size([4, 20, 256])
torch.Size([4, 256, 20])


In [37]:
import torch
import torch.nn as nn

# Example parameters
vocab_size = 10  # Size of the vocabulary
embedding_dim = 50  # Dimension of the word embeddings
sequence_length = 20  # Length of each sequence (number of tokens)
batch_size = 5  # Number of sequences in a batch

# Initialize an nn.Embedding layer
embedding_layer = nn.Embedding(vocab_size, embedding_dim)

# Generate a 2D tensor of indices representing tokenized sequences
# Example indices tensor with shape (batch_size, sequence_length)
indices = torch.randint(0, vocab_size, (batch_size, sequence_length))
print(indices)

# Pass the indices tensor through the embedding layer to get word embeddings
# Output tensor will have shape (batch_size, sequence_length, embedding_dim)
embeddings = embedding_layer(indices)

print(embeddings.shape)


tensor([[9, 6, 4, 3, 8, 4, 3, 4, 1, 6, 9, 1, 6, 0, 1, 1, 7, 6, 1, 2],
        [0, 5, 0, 4, 9, 7, 9, 9, 6, 8, 8, 0, 6, 2, 8, 3, 4, 7, 0, 5],
        [7, 6, 2, 4, 2, 8, 2, 6, 8, 9, 0, 9, 3, 8, 0, 7, 4, 6, 2, 7],
        [5, 5, 2, 0, 6, 2, 3, 0, 2, 3, 8, 2, 1, 6, 4, 2, 5, 2, 5, 9],
        [2, 4, 4, 6, 8, 4, 3, 6, 2, 4, 0, 0, 7, 6, 2, 7, 7, 3, 0, 7]])
torch.Size([5, 20, 50])
