In [26]:
import time
import copy
from typing import Tuple

import torch

from torch import Tensor

import math


class SelfAttentionHead(torch.nn.Module):
    def __init__(self, embedding_dim, query_dim, key_dim, value_dim):
        super().__init__()
        self.query_dim = query_dim
        self.key_dim = key_dim
        self.value_dim = value_dim
        self.Wq = torch.nn.Linear(embedding_dim, query_dim, bias=False)
        self.Wk = torch.nn.Linear(embedding_dim, key_dim, bias=False)
        self.Wv = torch.nn.Linear(embedding_dim, value_dim, bias=False)

    def forward(self, x):
        q = torch.matmul(x, torch.transpose(self.Wq.weight, 0, 1))
        k = torch.matmul(x, torch.transpose(self.Wk.weight, 0, 1))
        v = torch.matmul(x, torch.transpose(self.Wv.weight.data, 0, 1))

        energy = torch.matmul(q, k.transpose(1, 2))
        normalized_energy = torch.softmax(energy / math.sqrt(self.key_dim), dim=2)
        out = torch.matmul(normalized_energy, v)

        return out


class MultiHeadAttention(torch.nn.Module):
    def __init__(self, nheads, embedding_dim, query_dim, key_dim, value_dim):
        super().__init__()
        self.attention_heads = torch.nn.ModuleList([
            SelfAttentionHead(embedding_dim, query_dim, key_dim, value_dim)
            for _ in range(nheads)
        ])
        self.Wo = torch.nn.Linear(nheads * value_dim, embedding_dim)

    def forward(self, x):
        output = torch.cat(tuple(ah(x) for ah in self.attention_heads), dim=2)
        output = self.Wo(output)
        return output


class EncoderLayer(torch.nn.Module):
    def __init__(self, nheads, embedding_dim, query_dim, key_dim, value_dim):
        super().__init__()

        self.multi_head_attention = MultiHeadAttention(nheads, embedding_dim, query_dim, key_dim, value_dim)
        self.norm1 = torch.nn.LayerNorm(embedding_dim)

        self.fully_connected = torch.nn.Sequential(
            torch.nn.Linear(embedding_dim, embedding_dim * 4),
            torch.nn.ReLU(),
            torch.nn.Linear(embedding_dim * 4, embedding_dim * 4),
            torch.nn.ReLU(),
            torch.nn.Linear(embedding_dim * 4, embedding_dim),
        )

        self.norm2 = torch.nn.LayerNorm(embedding_dim)

    def forward(self, x):
        x = x + self.multi_head_attention(x)
        x = self.norm1(x)
        x = x + self.fully_connected(x)
        x = self.norm2(x)

        return x


class PositionalEncoding(torch.nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = torch.nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)


class Encoder(torch.nn.Module):
    def __init__(self, n_tokens, num_encoder_layers, nheads, embedding_dim, query_dim, key_dim, value_dim, max_len):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.embedding = torch.nn.Embedding(n_tokens, embedding_dim)
        self.encoder_layers = torch.nn.ModuleList([
            EncoderLayer(nheads, embedding_dim, query_dim, key_dim, value_dim)
            for _ in range(num_encoder_layers)
        ])
        
        self.pe = PositionalEncoding(d_model=embedding_dim, max_len=embedding_dim // 2)
        self.classifier = torch.nn.Linear(max_len * embedding_dim, 2)

    def forward(self, x):
        # print('applying embedding to:', x.shape)
        x = self.embedding(x) * math.sqrt(self.embedding_dim)
        # print('applying pos encoding to:', x.shape)
        x = self.pe(x)
        # print('applying encoder layers to:', x.shape)
        for layer in self.encoder_layers:
            x = layer(x)
        # print('applying classifier to:', x.shape)
        x = self.classifier(x.flatten(start_dim=1))
        x = torch.softmax(x, dim=1)
        return x

In [27]:
import datasets
from transformers import AutoTokenizer


torch.manual_seed(42)
tok = AutoTokenizer.from_pretrained('distilbert-base-uncased')

ntokens = len(tok.get_vocab().keys())
model = Encoder(ntokens, num_encoder_layers=3, nheads=8, embedding_dim=200, query_dim=64, key_dim=64, value_dim=64, max_len=256)


data = datasets.load_dataset('imdb').with_format('torch').shuffle(seed=42)
train_data = data['train']
test_data = data['test']


Found cached dataset imdb (/home/ubuntu/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/ubuntu/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-01ad04b69ceba701.arrow
Loading cached shuffled indices for dataset at /home/ubuntu/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-f85c25fbaaf4c75e.arrow
Loading cached shuffled indices for dataset at /home/ubuntu/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-bfdb1e053999b3b1.arrow


In [28]:

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-04)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

MAX = 16

def train(model: torch.nn.Module, epochs: int) -> None:
    model.train()  # turn on train mode

    for epoch in range(1, epochs + 1):
        total_loss = 0.
        log_interval = 1
        batch_size = 5
        start_time = time.time()

        for i in range(0, MAX, batch_size):
            batch = train_data[i:i+batch_size]
            tokenized = tok.batch_encode_plus(batch['text'], max_length=256,
                                            padding='max_length', truncation=True, return_tensors='pt')['input_ids']
            output = model(tokenized)
            oh = torch.nn.functional.one_hot(batch['label'], num_classes=2).float()
            loss = criterion(output, oh)
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

            total_loss += loss.item()

            if (i // batch_size) % log_interval == 0 and i > 0:
                lr = scheduler.get_last_lr()[0]
                ms_per_batch = (time.time() - start_time) * 1000 / log_interval
                cur_loss = total_loss / log_interval
                ppl = math.exp(cur_loss)
                print(f'| epoch {epoch:3d} | {i // batch_size:5.0f}/{MAX//batch_size:5.0f} batches | '
                    f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                    f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
                total_loss = 0
                start_time = time.time()
        print('======')
        scheduler.step()

In [29]:
train(model, 20)

| epoch   1 |     1/    3 batches | lr 0.00 | ms/batch 1053.31 | loss  1.31 | ppl     3.70
| epoch   1 |     2/    3 batches | lr 0.00 | ms/batch 517.44 | loss  0.73 | ppl     2.09
| epoch   1 |     3/    3 batches | lr 0.00 | ms/batch 517.33 | loss  0.71 | ppl     2.03
| epoch   2 |     1/    3 batches | lr 0.00 | ms/batch 1041.34 | loss  0.63 | ppl     1.88
| epoch   2 |     2/    3 batches | lr 0.00 | ms/batch 525.41 | loss  0.38 | ppl     1.46
| epoch   2 |     3/    3 batches | lr 0.00 | ms/batch 518.73 | loss  0.34 | ppl     1.41
| epoch   3 |     1/    3 batches | lr 0.00 | ms/batch 1059.99 | loss  0.63 | ppl     1.87
| epoch   3 |     2/    3 batches | lr 0.00 | ms/batch 515.86 | loss  0.33 | ppl     1.39
| epoch   3 |     3/    3 batches | lr 0.00 | ms/batch 515.93 | loss  0.32 | ppl     1.37
| epoch   4 |     1/    3 batches | lr 0.00 | ms/batch 1033.88 | loss  0.63 | ppl     1.87
| epoch   4 |     2/    3 batches | lr 0.00 | ms/batch 521.93 | loss  0.31 | ppl     1.37
| epoc

KeyboardInterrupt: 

In [25]:

tokenized = tok.encode_plus(test_data[0]['text'], max_length=256,
                                  padding='max_length', truncation=True, return_tensors='pt')['input_ids']
print(tokenized.shape)

torch.Size([1, 256])


In [19]:
predicted = model(tokenized).argmax(dim=1)

RuntimeError: The size of tensor a (100) must match the size of tensor b (50) at non-singleton dimension 0

In [15]:
actual = torch.tensor([test_data[i]['label'].item() for i in range(0, 100)])

In [283]:
sum(actual)

tensor(47)

In [284]:
sum(predicted == actual)

tensor(52)

In [255]:
model = torch.load('/home/ubuntu/models/my-transformer-trained')

In [256]:
for i in range(20):
    tokenized = tok.batch_encode_plus([test_data[i]['text']], max_length=256,
                                    padding='max_length', truncation=True, return_tensors='pt')['input_ids']
    print(model(tokenized).argmax(dim=1), test_data[i]['label'])

AttributeError: 'Encoder' object has no attribute 'encoder_layers'