In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Libraries

In [2]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
from datasets import load_dataset, DatasetDict, load_from_disk
import torch
from torch import tensor, einsum
from torch.nn import Module, Sequential, Softmax, GELU, Linear, Dropout, LayerNorm, Embedding, ModuleList, DataParallel
import math
import numpy as np
import matplotlib.pyplot as plt

2024-08-01 00:28:09.236763: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-01 00:28:09.236910: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-01 00:28:09.354169: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# The Transformer Architecture and Encoder Definitions
#### Training in Kaggle, Did not import the python files

In [3]:
class Attention(Module):
    def __init__(self, embedding_dim, num_heads, *args, **kwargs) -> None:
        assert embedding_dim % num_heads == 0, "The embedding dimension must be divisible by the number of heads"
        super().__init__(*args, **kwargs)
        self.embed_dim = embedding_dim
        self.n_head = num_heads
        self.head_dim = embedding_dim // num_heads
        
        self.queryLinear = Linear(self.head_dim, self.head_dim, bias=False)
        self.keyLinear = Linear(self.head_dim, self.head_dim, bias=False)
        self.valueLinear = Linear(self.head_dim, self.head_dim, bias=False)
        
        self.fc = Linear(self.head_dim, self.embed_dim)
        
        self.sft = Softmax(dim=-1)
        
    def forward(self, queries, keys, values, mask=None):
        # first reshape the tensors to be divided into heads
        qs, ks, vs = queries.shape, keys.shape, values.shape
        assert qs[0] == ks[0] and ks[0] == vs[0], "The batch size should be the same across all passed tensors"
        assert ks[1] == vs[1], "The sequence length should be the same across all the keys and the values"
        assert qs[2] == self.embed_dim and ks[2] == self.embed_dim and vs[2] == self.embed_dim, f"The embedding size should be equal to {self.embed_dim} across all tensors"
        
        queries = queries.reshape((qs[0], qs[1], self.n_head, self.head_dim))
        keys = keys.reshape((ks[0], ks[1], self.n_head, self.head_dim))
        values = values.reshape((vs[0], vs[1], self.n_head, self.head_dim))
        
        queries = self.queryLinear(queries)
        keys = self.keyLinear(keys)
        values = self.valueLinear(values)
        
        # b the batch size
        # q, k the lengths of the sequences
        # n the number of heads
        # h the head dim
        product = einsum("bqnh,bknh->bnqk", queries, keys)
        
        if mask is not None:
            expanded_mask = mask.unsqueeze(1).unsqueeze(1).expand(-1, self.n_head, qs[1], -1)
            product = product.masked_fill(expanded_mask == 0, -1e4)
        
        insight = self.sft(product / math.sqrt(self.embed_dim))
        
        # b the batch size
        # q, k, v the lengths of the sequences (k == v)
        # n the number of heads
        # h the head dim
        output = einsum("bnqk,bknh->bqnh", insight, values)
        
        return output.reshape((qs[0], qs[1], self.embed_dim))
    


class TransformerBlock(Module):
    def __init__(self, embedding_dim, num_heads, ffd_expansion = 4, dropout=0.2, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.attention = Attention(embedding_dim, num_heads)
        self.norm1 = LayerNorm(embedding_dim)
        self.ffd = Sequential(
            Linear(embedding_dim, embedding_dim * ffd_expansion),
            GELU(),
            Linear(embedding_dim*ffd_expansion, embedding_dim)
        )
        self.norm2 = LayerNorm(embedding_dim)
        self.dropout = Dropout(dropout)
        
    
    def forward(self, input, mask=None):
        attention = self.attention(input, input, input, mask)
        output1 = self.dropout(self.norm1(attention + input))
        output2 = self.ffd(output1) + output1
        output = self.dropout(self.norm2(output2))
        return output

In [4]:
def positionalEncoding(seq_len, embed_dim, n = 10000):
    assert embed_dim%2 == 0, "The embedding dimension must be even"
    
    pos = seq_len
    d = embed_dim
    
    positions = torch.arange(0, pos).unsqueeze(1)
    powers = torch.pow(n, torch.arange(0, d//2)/d)
    embeddings = torch.zeros((pos, d))
    
    embed_in = positions / powers
    
    embeddings[:, 0::2] = torch.sin(embed_in)
    embeddings[:, 1::2] = torch.cos(embed_in)
    
    
    return embeddings




class Encoder(Module):
    def __init__(self, vocab_size, embedding_dim, n_layers, n_heads, max_length, ffd = 4, dropout = 0.2, device = 'cpu', *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.max_length = max_length
        self.vocab_size = vocab_size
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.positionalEncoding = positionalEncoding(max_length, embedding_dim)
        self.layers = ModuleList(
            [TransformerBlock(embedding_dim, n_heads, ffd, dropout) for _ in range(n_layers)]
        )
        self.dropout = Dropout(dropout)
        self.head = Linear(embedding_dim, vocab_size)
        
    def forward(self, x, mask=None):
        output = self.dropout(self.embedding(x) + self.positionalEncoding.unsqueeze(dim=0).expand(x.shape[0], self.max_length, self.embedding_dim).to(x.device))
        
        for layer in self.layers:
            output = layer(output, mask)
            
        return self.head(output)

# Loading and Tokenizing a fraction of the BookCorpus Dataset

In [5]:
num_proc = os.cpu_count()
print(f"Number of cores : {num_proc}")

dataset = load_dataset("bookcorpus/bookcorpus", trust_remote_code=True, num_proc=num_proc)

fraction_size = 0.1
dataset["train"] = dataset['train'].train_test_split(test_size=fraction_size, shuffle=False)['test']


tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=True)
print(f"Fast Tokenizer: {tokenizer.is_fast}")

max_length = 64
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], return_special_tokens_mask=True, padding="max_length", truncation=True, max_length=max_length)

batch_size=5000
tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=batch_size, num_proc=num_proc, remove_columns=dataset["train"].column_names)

Number of cores : 4


Downloading builder script:   0%|          | 0.00/3.25k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/18.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Setting num_proc from 4 back to 1 for the train split to disable multiprocessing as it only contains one shard.


Generating train split:   0%|          | 0/74004228 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Fast Tokenizer: True


  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/7400423 [00:00<?, ? examples/s]

In [6]:
train_test_split = tokenized_dataset["train"].train_test_split(test_size=0.05)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.1
)

from torch.utils.data import DataLoader

num_cpus = 2

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=256,
    collate_fn=data_collator,
    num_workers=num_cpus,
    pin_memory=True
)

test_dataloader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=256,
    collate_fn=data_collator,
    num_workers=num_cpus,
    pin_memory=True
)

# Garbage Collection For Interactive Sessions

In [7]:
# import gc

# del model
# del optimizer

# for obj in gc.get_objects():
#     try:
#         if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
#             del obj
#     except:
#         pass

# torch.cuda.empty_cache()

# gc.collect()

# torch.cuda.reset_max_memory_allocated()
# torch.cuda.reset_max_memory_cached()
# torch.cuda.reset_peak_memory_stats()
# torch.cuda.reset_accumulated_memory_stats()

# Instantiate the model

In [8]:
vocab_size = tokenizer.vocab_size
embed_size = 768
num_layers = 6
num_heads = 8
dropout = 0.1
expansion = 4
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = Encoder(vocab_size, embed_size, num_layers, num_heads, max_length, expansion, dropout, device).to(device)

if torch.cuda.device_count() > 1:
    model = DataParallel(model)

# Defining Hyperparameters and Training

In [9]:
from transformers import get_scheduler
from tqdm.auto import tqdm
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW

# Hyperparameters
accumulation_steps = 2  # Number of steps to accumulate gradients
num_epochs = 1

# Loss function and optimizer
criterion = CrossEntropyLoss(ignore_index=-100)  # Ignore the masked tokens
optimizer = AdamW(model.parameters(), lr=5e-4)

# Learning rate scheduler
num_training_steps = len(train_dataloader) // accumulation_steps * num_epochs
num_warmup_steps = int(0.1 * num_training_steps)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

for epoch in range(num_epochs): 

    model.train()
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}")
    
    optimizer.zero_grad()
    for step, batch in enumerate(progress_bar):
        input_ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids, mask)
        loss = criterion(outputs.view(-1, tokenizer.vocab_size), labels.view(-1))

        loss.backward()

        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            lr_scheduler.step()
            
        progress_bar.set_postfix(loss=loss.item())

    print(f"Epoch {epoch + 1} completed with training loss: {loss.item()}")

    model.eval()
    eval_loss = 0
    num_eval_steps = len(test_dataloader)

    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids)
            loss = criterion(outputs.view(-1, tokenizer.vocab_size), labels.view(-1))
            eval_loss += loss.item()

    avg_eval_loss = eval_loss / num_eval_steps
    print(f"Epoch {epoch + 1} evaluation loss: {avg_eval_loss}")

print("Training completed")


  self.pid = os.fork()


Epoch 1:   0%|          | 0/27463 [00:01<?, ?it/s]

Epoch 1 completed with training loss: 3.4281160831451416
Epoch 1 evaluation loss: 3.536606493332574
Training completed


# Saving the Model

In [10]:
torch.save(model.state_dict(), '/kaggle/working/mlm_model.pth')