In [None]:
%pip install transformers datasets -q
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [60]:

class PositionEmbed(nn.Module):
    
    def __init__(self, dim_model, max_len=512) -> None:
        super().__init__()
        
        # embedding is a matrix of size (max_len, dim_model)
        # for each possible position i, j contains the sinusoid of frequency i / 10000^(2j/dim_model)
        pe = torch.zeros(max_len, dim_model)
        pe.requires_grad = False
        
        # create a 2D tensor with the position indices
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = (torch.arange(0, dim_model, 2, dtype=torch.float) * -(math.log(10000.0) / dim_model)).exp()

        # for each 2 entries, starting at 0, we get a sin and cos activation
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # get the position embeddings for all tokens up to the current position idx
        return self.pe[:, :x.size(1)]

class BERTEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_size=512, dropout=0.1) -> None:
        super().__init__()
        print("Embedding size: ", vocab_size, embed_size, dropout)
        
        self.token = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        embedding_dim = self.token.embedding_dim
        print("Embedding dim: ", embedding_dim)
        self.position = PositionEmbed(dim_model=embedding_dim)
        self.dropout = nn.Dropout(p=dropout)
        self.embed_size = embed_size

    def forward(self, sequence):
        x = self.token(sequence)
        print("X shape: ", x.shape, self.position(x).shape)
        x = x + self.position(x)
        x = self.dropout(x)
        return x

# Compute a single attention head
class Attention(nn.Module):
    
    # matrix multiplication of query and key, then scaled by the square root of the dimension of the query
    def forward(self, query, key, value, mask=None, dropout=None):
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(query.size(-1))
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        p_attn = F.softmax(scores, dim=-1)

        if dropout is not None:
            p_attn = dropout(p_attn)

        return torch.matmul(p_attn, value), p_attn

class MultiHeadAttention(nn.Module):
    def __init__(self, attn_heads, hidden, dropout=0.1) -> None:
        super().__init__()
        assert hidden % attn_heads == 0

        # We assume d_v always equals d_k
        self.d_k = hidden // attn_heads
        self.h = attn_heads

        # linear layers for query, key and value
        self.linear_layers = nn.ModuleList([nn.Linear(hidden, hidden) for _ in range(3)])
        # final linear layer for output
        self.output_linear = nn.Linear(hidden, hidden)
        
        # attention - performed per batch of queries
        self.attention = Attention()

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        
        # linear projection from hidden to d_k * h
        # i.e. for each linear layer, we get the query, key and value
        # these represent the linear layer for each head
        query, key, value = [l(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2) for l, x in zip(self.linear_layers, (query, key, value))]

        # compute attention for all heads in a batch
        x, attention = self.attention(query, key, value, mask=mask, dropout=self.dropout)

        # concatenate all heads
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)

        # apply final linear layer
        return self.output_linear(x)

class SublayerConnection(nn.Module):
    
    def __init__(self, hidden, dropout) -> None:
        super(SublayerConnection, self).__init__()
        self.norm = nn.LayerNorm(hidden)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))

# Feed forward layer, with dropout and GELU activation
class PositionwiseFeedForward(nn.Module):
    
    def __init__(self, hidden, feed_forward_hidden, dropout=0.1) -> None:
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(hidden, feed_forward_hidden)
        self.w_2 = nn.Linear(feed_forward_hidden, hidden)
        self.dropout = nn.Dropout(p=dropout)
        # gelu is the same as RELU with a slight dip before 0
        self.activation = nn.GELU()

    def forward(self, x):
        return self.w_2(self.dropout(self.activation(self.w_1(x))))

class TransformerBlock(nn.Module):
    
    def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout) -> None:
        super().__init__()
        self.attention = MultiHeadAttention(attn_heads, hidden)
        self.feed_forward = PositionwiseFeedForward(hidden, feed_forward_hidden, dropout)
        self.input_sublayer = SublayerConnection(hidden, dropout)
        self.output_sublayer = SublayerConnection(hidden, dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):
        x = self.input_sublayer(x, lambda _x: self.attention(_x, _x, _x, mask))
        x = self.output_sublayer(x, self.feed_forward)
        return self.dropout(x)

class BERT(nn.Module):
    
    def __init__(self, vocab_size, hidden=768, n_layers=12, attn_heads=12, dropout=0.1):
        super().__init__()
        self.hidden = hidden
        self.n_layers = n_layers
        self.attn_heads = attn_heads 

        self.feed_forward_hidden = hidden * 4 # 4 is hyperparameter

        print("BERT model: ", vocab_size, hidden, n_layers, attn_heads, dropout)
        self.embedding = BERTEmbedding(vocab_size, hidden, dropout)

        self.transformer_blocks = nn.ModuleList(
            [TransformerBlock(hidden, attn_heads, hidden*4, dropout) for _ in range(n_layers)]
        )

        # masked LM
        self.linear = nn.Linear(hidden, vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        
        # attention mask for padded token
        # torch.ByteTensor([batch_size, 1, seq_len, seq_len)
        mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)

        # get the embedding for the input sequence
        x = self.embedding(x)

        for transformer in self.transformer_blocks:
            x = transformer(x, mask)

        # masked LM
        x = self.softmax(self.linear(x))
        
        return x

In [51]:
from datasets import load_dataset
from transformers import BertTokenizer

dataset = load_dataset("Gustavosta/Stable-Diffusion-Prompts")
tokenizer: BertTokenizer = BertTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
dataset = dataset.map(lambda x: tokenizer(x["Prompt"], truncation=True, padding="max_length", max_length=512), batched=True)
dataset = dataset.with_format("torch")

Using custom data configuration Gustavosta--Stable-Diffusion-Prompts-d22aeec0ba2a9fdb
Found cached dataset parquet (C:/Users/coold/.cache/huggingface/datasets/Gustavosta___parquet/Gustavosta--Stable-Diffusion-Prompts-d22aeec0ba2a9fdb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\coold\.cache\huggingface\datasets\Gustavosta___parquet\Gustavosta--Stable-Diffusion-Prompts-d22aeec0ba2a9fdb\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-32927d571d57457b.arrow
Loading cached processed dataset at C:\Users\coold\.cache\huggingface\datasets\Gustavosta___parquet\Gustavosta--Stable-Diffusion-Prompts-d22aeec0ba2a9fdb\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-e66740f71a08c0b8.arrow


In [61]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader
import numpy as np
import tqdm
from IPython.display import display

class ScheduledOptim():
    '''A simple wrapper class for learning rate scheduling'''

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_model, -0.5)

    def step_and_update_lr(self):
        "Step with the inner optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Zero out the gradients by the inner optimizer"
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        return np.min([
            np.power(self.n_current_steps, -0.5),
            np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])

    def _update_learning_rate(self):
        ''' Learning rate scheduling per step '''

        self.n_current_steps += 1
        lr = self.init_lr * self._get_lr_scale()

        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr


class BERTTrainer:
    
    def __init__(self, bert: BERT, vocab_size: int,
                train_dataloader: DataLoader, test_dataloader: DataLoader = None,
                lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
                with_cuda: bool = True, cuda_devices=None, log_freq: int = 10):
        """
        :param bert: BERT model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param test_dataloader: test dataset data loader [can be None]
        :param lr: learning rate of optimizer
        :param betas: Adam optimizer betas
        :param weight_decay: Adam optimizer weight decay param
        :param with_cuda: traning with cuda
        :param log_freq: logging frequency of the batch iteration
        """

        # Setup cuda device for BERT training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")
        print("Device:", self.device)
        print("CUDA Available: ", torch.cuda.is_available())

        # This BERT model will be saved every epoch
        # Initialize the BERT Language Model, with BERT model
        self.model = bert.to(self.device)

        # Distributed GPU training if CUDA can detect more than 1 GPU
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model, device_ids=cuda_devices)

        # Setting the train and test data loader
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim, self.model.hidden, n_warmup_steps=warmup_steps)

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = nn.NLLLoss(ignore_index=0)

        self.log_freq = log_freq

        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        str_code = "train" if train else "test"

        # Setting the tqdm progress bar
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")

        avg_loss = 0.0

        for i, data in data_iter:
            # 0. batch_data will be sent into the device(GPU or cpu)
            input_ids = data["input_ids"].to(self.device)

            # 1. forward the next_sentence_prediction and masked_lm model
            mask_lm_output = self.model.forward(input_ids)

            # 2-2. NLLLoss of predicting masked token word
            loss = self.criterion(mask_lm_output.transpose(1, 2), data["input_ids"])

            # 3. backward and optimization only in train
            if train:
                self.optim_schedule.zero_grad()
                loss.backward()
                self.optim_schedule.step_and_update_lr()

            # next sentence prediction accuracy
            avg_loss += loss.item()

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "loss": loss.item()
            }

            if i % self.log_freq == 0:
                data_iter.write(str(post_fix))

        print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter))

    def save(self, epoch, file_path="output/bert_trained.model"):
        """
        Saving the current BERT model on file_path

        :param epoch: current epoch number
        :param file_path: model output path which gonna be file_path+"ep%d" % epoch
        :return: final_output_path
        """
        output_path = file_path + ".ep%d" % epoch
        torch.save(self.bert.cpu(), output_path)
        self.bert.to(self.device)
        print("EP:%d Model Saved on:" % epoch, output_path)
        return output_path


In [62]:
from argparse import Namespace
from torch.utils.data import DataLoader

args = Namespace(
    hidden=256,
    batch_size=64,
    layers=8,
    attn_heads=8,
    adam_weight_decay=0.01,
    adam_beta1=0.9,
    epochs=10,
    log_freq=10,
    adam_beta2=0.999,
    cuda_devices=[0],
    num_workers=4,
    lr=1e-3,
    with_cuda=True,
)

print("Building BERT model")

bert = BERT(
    tokenizer.vocab_size,
    hidden=args.hidden,
    n_layers=args.layers,
    attn_heads=args.attn_heads,
)

train_dataloader = DataLoader(
    dataset["train"], batch_size=args.batch_size, num_workers=args.num_workers
)
test_dataloader = DataLoader(
    dataset["test"], batch_size=args.batch_size, num_workers=args.num_workers
)

print("Creating BERT Trainer")
trainer = BERTTrainer(
    bert,
    tokenizer.vocab_size,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    lr=args.lr,
    betas=(args.adam_beta1, args.adam_beta2),
    weight_decay=args.adam_weight_decay,
    log_freq=args.log_freq,
    with_cuda=args.with_cuda,
    cuda_devices=args.cuda_devices,
)

for epoch in range(args.epochs):
    trainer.train(epoch)
    trainer.save(epoch, args.output_path)

    if test_dataloader is not None:
        trainer.test(epoch)


Building BERT model
BERT model:  30522 256 8 8 0.1
Embedding size:  30522 256 0.1
Embedding dim:  256
Creating BERT Trainer
Device: cpu
CUDA Available:  False
Total Parameters: 21975866


EP_train:0:   0%|| 0/1152 [00:00<?, ?it/s]

X shape:  torch.Size([64, 512, 256]) torch.Size([1, 512, 256])
