In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

from transformers import AutoTokenizer, DataCollatorForLanguageModeling
from datasets import load_dataset, DatasetDict, load_from_disk
import torch
from torch import tensor, einsum
from torch.nn import Module, Sequential, Softmax, GELU, Linear, Dropout, LayerNorm, Embedding, ModuleList, DataParallel
import math
import numpy as np
from collections import OrderedDict


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

2024-08-02 01:35:15.978511: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-02 01:35:15.978673: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-02 01:35:16.174940: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


/kaggle/input/gpt-34/pytorch/default/1/mlm_model.pth


# The Transformer Architecture and Encoder Definitions
#### Training in Kaggle, Did not import the python files

In [2]:
class Attention(Module):
    def __init__(self, embedding_dim, num_heads, *args, **kwargs) -> None:
        assert embedding_dim % num_heads == 0, "The embedding dimension must be divisible by the number of heads"
        super().__init__(*args, **kwargs)
        self.embed_dim = embedding_dim
        self.n_head = num_heads
        self.head_dim = embedding_dim // num_heads
        
        self.queryLinear = Linear(self.head_dim, self.head_dim, bias=False)
        self.keyLinear = Linear(self.head_dim, self.head_dim, bias=False)
        self.valueLinear = Linear(self.head_dim, self.head_dim, bias=False)
        
        self.fc = Linear(self.head_dim, self.embed_dim)
        
        self.sft = Softmax(dim=-1)
        
    def forward(self, queries, keys, values, mask=None):
        # first reshape the tensors to be divided into heads
        qs, ks, vs = queries.shape, keys.shape, values.shape
        assert qs[0] == ks[0] and ks[0] == vs[0], "The batch size should be the same across all passed tensors"
        assert ks[1] == vs[1], "The sequence length should be the same across all the keys and the values"
        assert qs[2] == self.embed_dim and ks[2] == self.embed_dim and vs[2] == self.embed_dim, f"The embedding size should be equal to {self.embed_dim} across all tensors"
        
        queries = queries.reshape((qs[0], qs[1], self.n_head, self.head_dim))
        keys = keys.reshape((ks[0], ks[1], self.n_head, self.head_dim))
        values = values.reshape((vs[0], vs[1], self.n_head, self.head_dim))
        
        queries = self.queryLinear(queries)
        keys = self.keyLinear(keys)
        values = self.valueLinear(values)
        
        # b the batch size
        # q, k the lengths of the sequences
        # n the number of heads
        # h the head dim
        product = einsum("bqnh,bknh->bnqk", queries, keys)
        
        if mask is not None:
            expanded_mask = mask.unsqueeze(1).unsqueeze(1).expand(-1, self.n_head, qs[1], -1)
            product = product.masked_fill(expanded_mask == 0, -1e4)
        
        insight = self.sft(product / math.sqrt(self.embed_dim))
        
        # b the batch size
        # q, k, v the lengths of the sequences (k == v)
        # n the number of heads
        # h the head dim
        output = einsum("bnqk,bknh->bqnh", insight, values)
        
        return output.reshape((qs[0], qs[1], self.embed_dim))
    


class TransformerBlock(Module):
    def __init__(self, embedding_dim, num_heads, ffd_expansion = 4, dropout=0.2, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.attention = Attention(embedding_dim, num_heads)
        self.norm1 = LayerNorm(embedding_dim)
        self.ffd = Sequential(
            Linear(embedding_dim, embedding_dim * ffd_expansion),
            GELU(),
            Linear(embedding_dim*ffd_expansion, embedding_dim)
        )
        self.norm2 = LayerNorm(embedding_dim)
        self.dropout = Dropout(dropout)
        
    
    def forward(self, input, mask=None):
        attention = self.attention(input, input, input, mask)
        output1 = self.dropout(self.norm1(attention + input))
        output2 = self.ffd(output1) + output1
        output = self.dropout(self.norm2(output2))
        return output
    
def positionalEncoding(seq_len, embed_dim, n = 10000):
    assert embed_dim%2 == 0, "The embedding dimension must be even"
    
    pos = seq_len
    d = embed_dim
    
    positions = torch.arange(0, pos).unsqueeze(1)
    powers = torch.pow(n, torch.arange(0, d//2)/d)
    embeddings = torch.zeros((pos, d))
    
    embed_in = positions / powers
    
    embeddings[:, 0::2] = torch.sin(embed_in)
    embeddings[:, 1::2] = torch.cos(embed_in)
    
    
    return embeddings




class Encoder(Module):
    def __init__(self, vocab_size, embedding_dim, n_layers, n_heads, max_length, ffd = 4, dropout = 0.2, device = 'cpu', *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.max_length = max_length
        self.vocab_size = vocab_size
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.positionalEncoding = positionalEncoding(max_length, embedding_dim)
        self.layers = ModuleList(
            [TransformerBlock(embedding_dim, n_heads, ffd, dropout) for _ in range(n_layers)]
        )
        self.dropout = Dropout(dropout)
        self.head = Linear(embedding_dim, vocab_size)
        
    def forward(self, x, mask=None):
        output = self.dropout(self.embedding(x) + self.positionalEncoding.unsqueeze(dim=0).expand(x.shape[0], self.max_length, self.embedding_dim).to(x.device))
        
        for layer in self.layers:
            output = layer(output, mask)
            
        return self.head(output)

In [3]:
num_proc = os.cpu_count()

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=True)
print(f"Fast Tokenizer: {tokenizer.is_fast}")

max_length = 64

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Fast Tokenizer: True


# Instantiate the model

In [4]:
vocab_size = tokenizer.vocab_size
embed_size = 768
num_layers = 6
num_heads = 8
dropout = 0.1
expansion = 4
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
model = Encoder(vocab_size, embed_size, num_layers, num_heads, max_length, expansion, dropout, device).to(device)

# Create a new ordered dict to store the new state dict without 'module.' prefix
def remove_module_prefix(state_dict):
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        if k.startswith('module.'):
            new_state_dict[k[7:]] = v
        else:
            new_state_dict[k] = v
    return new_state_dict

state_dict = remove_module_prefix(torch.load("/kaggle/input/gpt-34/pytorch/default/1/mlm_model.pth", map_location='cpu'))
model.load_state_dict(state_dict)

cpu


<All keys matched successfully>

# Utility for Masking Random Words in a sentence

In [5]:
from random import randint
from collections import OrderedDict

def mask_random_word(tokenizer, sentences):
    masked_sentences = []

    for sentence in sentences:
        encoding = tokenizer(sentence, return_tensors="pt", truncation=True, return_special_tokens_mask=True, padding="max_length", max_length=max_length)
        if len(encoding["input_ids"]) == 0:
            continue
        
        words = encoding.word_ids()
        i = len(words) - 1
        while i >= 0:
            if words[i] != None:
                break
            i -= 1
        r = i
        while True:
            r = randint(1, i)
            if words[r] is not None:
                break
        
        i = words[r]
        sen = {}
        sen["word_id"] = i
        start, end = encoding.word_to_chars(i)
        sen["word"] = sentence[start:end]
        sen["original"] = sentence
        for idx, k in enumerate(words):
            if k == i:
                encoding["input_ids"][0][idx] = tokenizer.mask_token_id
        sen["masked"] = sentence[:start] + tokenizer.mask_token + sentence[end:]
        sen["encoding"] = encoding
        
        masked_sentences.append(sen)
    
    return masked_sentences


# Testing the Utility against Transformers' Data Collator for MLM

In [6]:
f = mask_random_word(tokenizer, ["On a sunny afternoon, the children decided to play in the park, enjoying the fresh air and warm sunshine."])
for i in f:
    print(i["original"])
    print(i["masked"])
    print(i["word"])
    print(i["word_id"])
    print(i["encoding"]["input_ids"])
    print(i["encoding"]["attention_mask"])    

On a sunny afternoon, the children decided to play in the park, enjoying the fresh air and warm sunshine.
On a sunny afternoon, the children decided to play in the park, enjoying the fresh air and warm [MASK].
sunshine
20
tensor([[  101,  2006,  1037, 11559,  5027,  1010,  1996,  2336,  2787,  2000,
          2377,  1999,  1996,  2380,  1010,  9107,  1996,  4840,  2250,  1998,
          4010,   103,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


In [7]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.1
)

sentence = ["On a sunny afternoon, the children decided to play in the park, enjoying the fresh air and warm sunshine."]

encoding = tokenizer(sentence, return_tensors="pt", truncation=True, return_special_tokens_mask=True, padding="max_length", max_length=max_length)

print(data_collator([encoding]))

{'input_ids': tensor([[[  101,  2006,  1037, 11559,  5027,  1010,  1996,  2336,  2787,  2000,
           2377,  1999,  1996,  2380,  1010,  9107,  1996,  4840,  2250,  1998,
           4010,  9609,  1012,   102,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0]]]), 'token_type_ids': tensor([[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]), 'attention_mask': tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

# Function to Predict Masked Words

In [26]:
def predict_masked_words(model, tokenizer, masked_sentences):
    model.eval()
    predicted_sentences = []

    for masked_tokens in masked_sentences:
        # Convert tokens to input IDs
        input_ids = masked_tokens["encoding"]["input_ids"]
        mask = masked_tokens["encoding"]["attention_mask"]
        
        # Get predictions
        with torch.no_grad():
            outputs = model(input_ids, mask)
        
        # Get the predicted tokens
        predicted_ids = torch.argmax(outputs, dim=-1).squeeze()
        predicted_tokens = tokenizer.decode(predicted_ids[input_ids[0] == tokenizer.mask_token_id])
        
        print(f"Original : {masked_tokens['original']}")
        print(f'Masked : {masked_tokens["masked"]}')
        print(f"Predicted Word : {predicted_tokens}\n")
    

In [27]:
sentences = [
    "The quick brown fox jumps over the lazy dog, while the dog continues to sleep without a care in the world.",
    "On a sunny afternoon, the children decided to play in the park, enjoying the fresh air and warm sunshine.",
    "The artist spent hours painting the beautiful landscape, capturing every detail with precise brushstrokes and vibrant colors.",
    "After a long day at work, Sarah enjoyed relaxing with a good book and a cup of hot tea.",
    "The chef prepared a delicious three-course meal, using fresh ingredients from the local farmers' market.",
    "During the summer vacation, the family traveled to the mountains, where they hiked, fished, and enjoyed nature.",
    "The teacher explained the complex math problem, ensuring that every student understood the steps required to solve it.",
    "Every morning, John goes for a jog around the neighborhood, enjoying the peacefulness before the hustle and bustle begins.",
    "The company announced its plans to expand internationally, bringing its innovative products to new markets around the world.",
    "At the museum, visitors marveled at the ancient artifacts, learning about the history and culture of civilizations long gone.",
    "The new smartphone boasts a variety of features, including a high-resolution camera, long battery life, and fast processing speed.",
    "During the concert, the band played their greatest hits, energizing the crowd and creating an unforgettable experience.",
    "The scientist conducted experiments to test the hypothesis, meticulously recording data and analyzing the results.",
    "On weekends, Maria loves to bake cookies and cakes, sharing her delicious creations with friends and family.",
    "The small town held an annual festival, featuring local food vendors, live music, and various fun activities for all ages.",
    "With determination and hard work, Emily managed to save enough money to buy her first car.",
    "The novel's plot twists and turns kept readers on the edge of their seats, eagerly turning pages to see what happens next.",
    "At the beach, children built sandcastles, collected seashells, and played in the gentle waves.",
    "The volunteer organization worked tirelessly to provide aid and support to communities affected by natural disasters.",
    "The sports team trained rigorously, aiming to improve their skills and achieve victory in the upcoming championship.",
    "In the early morning hours, the fisherman set out on his boat, hoping for a bountiful catch.",
    "The new park features walking trails, playgrounds, and picnic areas, making it a popular spot for families.",
    "With a clear sky and mild temperatures, it was the perfect day for a leisurely bike ride through the countryside.",
    "The movie's stunning visual effects and gripping storyline captivated audiences, making it a box office hit.",
    "During the gardening season, the community garden flourished, with residents planting and harvesting various fruits and vegetables.",
    "The engineer designed a revolutionary new product, aiming to solve common problems with innovative technology.",
    "On a chilly evening, the family gathered around the fireplace, sharing stories and enjoying hot cocoa.",
    "The book club met monthly, discussing their latest read and sharing insights and perspectives on the story.",
    "The startup company quickly gained popularity, attracting investors and customers with its unique approach and cutting-edge products.",
    "At the zoo, visitors observed exotic animals from around the world, learning about their habitats and behaviors."
]


# Results

In [28]:
predict_masked_words(model, tokenizer, mask_random_word(tokenizer, sentences))

Original : The quick brown fox jumps over the lazy dog, while the dog continues to sleep without a care in the world.
Masked : The quick brown fox jumps over the lazy dog, while the [MASK] continues to sleep without a care in the world.
Predicted Word : man

Original : On a sunny afternoon, the children decided to play in the park, enjoying the fresh air and warm sunshine.
Masked : On a sunny afternoon, [MASK] children decided to play in the park, enjoying the fresh air and warm sunshine.
Predicted Word : the

Original : The artist spent hours painting the beautiful landscape, capturing every detail with precise brushstrokes and vibrant colors.
Masked : The artist spent hours painting the beautiful landscape, capturing every detail with precise brushstrokes and [MASK] colors.
Predicted Word : the

Original : After a long day at work, Sarah enjoyed relaxing with a good book and a cup of hot tea.
Masked : After a long day [MASK] work, Sarah enjoyed relaxing with a good book and a cup of 