# Pretrained Char Embeddings

In [None]:
# NOTE: Is there a embedding dataset of only words in the web?? Yes

# Load a .txt file separated by spaces where the first colulms is the character and the other columns corresponds
# to the word embedding

# Load the dataset
import pandas as pd
import numpy as np


# Load char embeddings
embeddings_path = 'glove.840B.300d-char.txt'

print('Processing pretrained character embeds...')
char_embeddings = {}
with open(embeddings_path, 'r') as f:
    for line in f:
        line_split = line.strip().split(" ")
        vec = torch.tensor(np.array(line_split[1:], dtype=float))
        char = line_split[0]
        char_embeddings[char] = vec

# For mask use underscore
token_mask = char_embeddings["_"]

# For padding use point
token_pad = char_embeddings["."]

# For CLS used to return an cumulated information for the whole word use #
token_cls = char_embeddings["#"]

# Filter Character Embeddings for char from a-z and three special tokens
char_embeddings = {char: vec for char, vec in char_embeddings.items() if char in "abcdefghijklmnopqrstuvwxyz"}
# Assing char indeces from 0 to 25 to the letters and 26 to 28 to the special tokens
char_indices = {char: i for i, char in enumerate(char_embeddings.keys())}

char_embeddings["_"] = token_mask
char_embeddings["."] = token_pad
char_embeddings["#"] = token_cls

char_indices["_"] = 26
char_indices["."] = 27
char_indices["#"] = 28

# Create an embedding matrix E
embedding_matrix = torch.zeros((len(char_embeddings), 300))
#embedding_matrix = np.random.uniform(-1, 1, (len(chars), 300))
for char, i in char_indices.items():
    #print ("{}, {}".format(char, i))
    embedding_vector = char_embeddings.get(char)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

embedding_matrix.shape

## BERT from scratch

In [96]:
import torch
from torch.nn import Module
from transformers import BertConfig, BertModel, BertTokenizer
from transformers.models.bert.modeling_bert import BertEmbeddings, BertEncoder, BertPooler
from torch.utils.data import Dataset, DataLoader

# Custom Character-Level Tokenizer
class CharTokenizer:
    def __init__(self):
        self.vocab = {char: idx for idx, char in enumerate("abcdefghijklmnopqrstuvwxyz", start=1)}
        self.vocab["[PAD]"] = 0
        self.vocab["[UNK]"] = len(self.vocab)
        self.vocab["[CLS]"] = len(self.vocab)
        self.vocab["[MASK]"] = len(self.vocab)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        print("Vocab size: ", len(self.vocab))

    def tokenize(self, text):
        return list(text.lower())
    
    def get_ids(self, tokens):
        return [self.vocab.get(t, self.vocab["[UNK]"]) for t in tokens]

    def encode(self, text, max_length):
        tokens = self.tokenize(text)
        token_ids = self.get_ids(tokens)
        return [self.vocab["[CLS]"]] + token_ids[:max_length] + [self.vocab["[PAD]"]] * (max_length - len(token_ids) - 1)
    
    def decode(self, token_ids):
        return ''.join([self.inv_vocab.get(t, "[UNK]") for t in token_ids if t != 0])
    
# Custom Dataset
class WordDataset(Dataset):
    def __init__(self, words, tokenizer, max_length):
        self.words = words
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.words)
    
    def mlm_masking_word(self, sentence):
        # Tokenize the entire sentence
        # tokenized = self.tokenizer(sentence, return_tensors="pt", add_special_tokens=False)
        # tokens = tokenized["input_ids"].squeeze(0)  # Shape: (seq_len,)

        tokens = self.tokenizer.tokenize(sentence)
        token_ids = torch.tensor(self.tokenizer.get_ids(tokens))

        # Generate random probabilities for each token
        probs = torch.rand(token_ids.shape)

        # 15% of the tokens will be considered for masking
        mask_prob = probs < 0.15
        # print("0.15 Masked:",  mask_prob)

        # Initialize labels (original tokens for masked positions, 0 otherwise)
        labels = torch.where(mask_prob, token_ids, torch.zeros_like(token_ids))
        # print("labels:", labels)

        # 80% of masked tokens will be replaced with [MASK]
        mask_replace_prob = torch.rand(token_ids.shape)
        masked_tokens = torch.where(
            mask_prob & (mask_replace_prob < 0.8), 
            torch.tensor(self.tokenizer.vocab['[MASK]']), 
            token_ids
        )
        # print("80% from masked: ", mask_prob & (mask_replace_prob < 0.8))
        # print(masked_tokens)

        # 10% of masked tokens will be replaced with random tokens
        random_replace_prob = torch.rand(token_ids.shape)
        random_tokens = torch.randint(len(self.tokenizer.vocab), token_ids.shape)
        final_tokens = torch.where(
            mask_prob & (mask_replace_prob >= 0.8) & (random_replace_prob < 0.5),
            random_tokens,
            masked_tokens
        )
        # print("10% from masked: ", mask_prob & (mask_replace_prob >= 0.8) & (random_replace_prob < 0.5))
        # print(final_tokens)

        # Tokens not selected for masking remain unchanged
        # final_tokens = torch.where(mask_prob, final_tokens, token_ids)
        # print(final_tokens)

        # Adding special tokens ids and correcting labels
        return self.add_special_tokens(final_tokens, labels)
    

    def add_special_tokens(self, token_ids, labels):
        # Create CLS and PAD tokens
        cls_token = torch.tensor([self.tokenizer.vocab["[CLS]"]])
        pad_token = torch.tensor([self.tokenizer.vocab["[PAD]"]])

        # Add CLS token and truncate or pad token_ids
        truncated_tokens = token_ids[:self.max_length]
        padded_tokens = torch.cat([cls_token, truncated_tokens, pad_token.repeat(self.max_length - truncated_tokens.size(0) - 1)])

        # Add 0 for CLS and PAD tokens to labels
        zero_label = torch.tensor([0])
        truncated_labels = labels[:self.max_length]
        padded_labels = torch.cat([zero_label, truncated_labels, zero_label.repeat(self.max_length - truncated_labels.size(0) - 1)])

        # Outputs
        final_tokens = padded_tokens  # Shape: (max_length,)
        labels = padded_labels         # Shape: (max_length,)
        return final_tokens, labels


    def __getitem__(self, idx):
        word = self.words[idx]

        input_ids, labels = self.mlm_masking_word(word)

        attention_mask = torch.where(input_ids != self.tokenizer.vocab["[PAD]"], 1, 0)

        output = {"bert_input": input_ids,
                  "bert_label": labels,
                  "attention_mask": attention_mask}
        
        return output

# Head using embedding layer
# If I use the 300 Glove embeddings I can use the embedding layer to predict the word
# class MLMHead(torch.nn.Module):
#     def __init__(self, embedding_layer):
#         """
#         :param embedding_layer: Embedding layer from the model
#         """
#         super().__init__()
#         # Use the embedding layer's weight matrix for the linear layer
#         self.linear = torch.nn.Linear(embedding_layer.word_embeddings.weight.size(1),
#                                        embedding_layer.word_embeddings.weight.size(0))
#         self.linear.weight = embedding_layer.word_embeddings.weight  # Share weights
#         self.softmax = torch.nn.LogSoftmax(dim=-1)

#     def forward(self, x):
#         return self.softmax(self.linear(x))

class MLMHead(torch.nn.Module):
    """
    predicting origin token from masked input sequence
    n-class classification problem, n-class = vocab_size
    """

    def __init__(self, hidden, vocab_size):
        """
        :param hidden: output size of BERT model
        :param vocab_size: total vocab size
        """
        super().__init__()
        self.linear = torch.nn.Linear(hidden, vocab_size)
        self.softmax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, x):
        return self.softmax(self.linear(x))

# Custom BERT Architecture with Configurable Layers
class CustomBERT(Module):
    def __init__(self, vocab_size, hidden_size, num_hidden_layers, num_attention_heads, max_position_embeddings, intermediate_size):
        super(CustomBERT, self).__init__()
        config = BertConfig(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            max_position_embeddings=max_position_embeddings,
            intermediate_size=intermediate_size,
        )
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.mlm_head = MLMHead(hidden_size, vocab_size)
        # self.pooler = BertPooler(config)

    def forward(self, input_ids, attention_mask):
        embeddings = self.embeddings(input_ids=input_ids)

        # NOTE: I have to add to dimension in between for the attention mask
        # because it will be used to calculatation the attention scores
        attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

        # Encoder ouputs can return the embeddings in each layer, but here
        # I only interested in the last hidden state
        encoder_outputs = self.encoder(embeddings, attention_mask=attention_mask, return_dict=True)

        # Pooler is used to get the CLS token embedding and apply 
        # a linear transformation to it + tanh activation
        # output = self.pooler(encoder_outputs.last_hidden_state)

        # MLM head output
        output = self.mlm_head(encoder_outputs.last_hidden_state)

        # return encoder_outputs.last_hidden_state, output
        return output

In [105]:
import tqdm
import math


class BERTTrainer:
    def __init__(
        self, 
        model, 
        train_dataloader, 
        test_dataloader=None, 
        lr= 1e-4,
        weight_decay=0.01,
        betas=(0.9, 0.999),
        log_freq=10,
        device='cuda'
        ):

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model.to(self.device)
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-param
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = torch.nn.NLLLoss(ignore_index=0)
        self.log_freq = log_freq
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
    
    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        
        avg_loss = 0.0
        total_correct = 0  # To track correct predictions
        total_masked = 0   # To track total masked tokens

        mode = "train" if train else "test"

        # progress bar
        data_iter = tqdm.tqdm(
            enumerate(data_loader),
            desc="EP_%s:%d" % (mode, epoch),
            total=len(data_loader),
            bar_format="{l_bar}{r_bar}"
        )

        for i, data in data_iter:

            # batch_data will be sent into the device(GPU or cpu)
            data = {key: value.to(self.device) for key, value in data.items()}

            # forward the model
            mask_lm_output = self.model.forward(data["bert_input"], data["attention_mask"])

            # NLLLoss of predicting masked token word
            # transpose to (m, vocab_size, seq_len) vs (m, seq_len)
            # criterion(mask_lm_output.view(-1, mask_lm_output.size(-1)), data["bert_label"].view(-1))
            
            # NOTE: the mask_lm_output will return -log probability values,
            # then the criterion will only average the values of the masked tokens
            loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])

            # backward and optimization only in train
            if train:
                # self.optim_schedule.zero_grad()
                self.optimizer.zero_grad()
                loss.backward()
                # self.optim_schedule.step_and_update_lr()
                self.optimizer.step()

            # Update average loss
            avg_loss += loss.item()

            # Calculate predictions and accuracy
            predictions = torch.argmax(mask_lm_output, dim=-1)  # Shape: (batch_size, seq_len)
            correct = (predictions == data["bert_label"]) & (data["bert_label"] != 0)  # Exclude padding
            total_correct += correct.sum().item()
            total_masked += (data["bert_label"] != 0).sum().item()  # Exclude padding tokens

            # Calculate perplexity
            # NOTE: perplexity is not well defined for masked language models like BERT (see summary of the models).
            perplexity = math.exp(avg_loss / (i + 1))

            # Calculate masked token accuracy
            accuracy = total_correct / total_masked if total_masked > 0 else 0

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": round(avg_loss / (i + 1), 3),
                "loss": round(loss.item(), 3),
                "perplexity": round(perplexity, 3),
                "accuracy": round(accuracy, 3)
            }

            if i % self.log_freq == 0:
                data_iter.write(str(post_fix))


        # Final epoch logging
        final_accuracy = total_correct / total_masked if total_masked > 0 else 0
        # print(
        #     f"EP{epoch}, {mode}: \
        #     avg_loss={avg_loss / len(data_iter):.4f}, \
        #     perplexity={math.exp(avg_loss / len(data_iter)):.4f}, \
        #     accuracy={final_accuracy:.4f}"
        # )

In [None]:
# TODO: ideas para limpiar la data
# 1. remover palabras con 1 o 2 caracteres
# 2. obtener las raices de las palabras (stemming) -> tokenizer????
# 

In [116]:
class SuffixTree:
    def __init__(self):
        self.nodes = {0: {}}  # Root node
        self.num = 0  # Node counter

    def add_suffix(self, suffix):
        current_node = 0
        for char in suffix:
            if char not in self.nodes[current_node]:
                self.num += 1
                self.nodes[current_node][char] = self.num
                self.nodes[self.num] = {}
            current_node = self.nodes[current_node][char]

    def find_common_substrings(self, min_count=2, min_length=3):
        result = []
        stack = [(0, "")]
        while stack:
            node, path = stack.pop()
            if len(path) >= min_length and len(self.nodes[node]) >= min_count:
                result.append(path)
            for char, child_node in self.nodes[node].items():
                stack.append((child_node, path + char))
        return result


def generate_subwords(word_list, min_count=2, min_length=3):
    # Build a suffix tree
    tree = SuffixTree()
    for word in word_list:
        for i in range(len(word)):
            tree.add_suffix(word[i:])

    # Find frequent substrings
    common_substrings = tree.find_common_substrings(min_count, min_length)

    # Generate subwords by splitting words at common substrings
    subwords = set()
    for word in word_list:
        splits = [word]
        for substring in common_substrings:
            splits = [part for split in splits for part in split.split(substring) if part]
            subwords.update(splits)
        subwords.update(splits)

    # Filter out very short subwords
    subwords = {subword for subword in subwords if len(subword) >= min_length}

    return list(subwords)


# Example dataset
# raw_word_list = [
#     "replaying", "submarine", "unbelievable", "disconnected", "transformation", "antibacterial",
#     "automation", "endless", "happiness", "autonomous", "misunderstand"
# ]



# Generate subwords
subwords = generate_subwords(words, min_count=2, min_length=3)

# Save subwords to a file
output_file = "algorithmic_subwords.txt"
with open(output_file, "w") as f:
    for subword in subwords:
        f.write(f"{subword}\n")

print(f"Subwords saved to {output_file}:")
print(subwords)

KeyboardInterrupt: 

In [109]:
# Configuration
hidden_size = 128 # 300 for using Glove embeddings
num_hidden_layers = 4  # Change as needed
num_attention_heads = 4
max_position_embeddings = 32  # Max word length
intermediate_size = 512
max_word_length = 32
batch_size = 16

# Initialize Components
tokenizer = CharTokenizer()
vocab_size = len(tokenizer.vocab)
model = CustomBERT(
    vocab_size=vocab_size, # 4 special tokens
    hidden_size=hidden_size,
    num_hidden_layers=num_hidden_layers,
    num_attention_heads=num_attention_heads,
    max_position_embeddings=max_position_embeddings,
    intermediate_size=intermediate_size,
)


# # Example Dataset
# words = ["apple", "banana", "cherry", "date"]

# Load the dataset in a list from a .txt file
words = []
with open('words_250000_train.txt', 'r') as f:
    for line in f:
        words.append(line.strip())

train_data = WordDataset(words, tokenizer, max_word_length)
train_loader = DataLoader(train_data,
                          batch_size=batch_size, 
                          shuffle=True,
                          pin_memory=True)

Vocab size:  30


In [110]:
len(train_data)

227300

In [108]:
bert_trainer = BERTTrainer(model, train_loader, device='cpu')
epochs = 20

for epoch in range(epochs):
  bert_trainer.train(epoch)

Total Parameters: 805406


EP_train:0: 100%|| 1/1 [00:00<00:00, 46.51it/s]


{'epoch': 0, 'iter': 0, 'avg_loss': 3.663, 'loss': 3.663, 'perplexity': 38.97, 'accuracy': 0.0}


EP_train:1: 100%|| 1/1 [00:00<00:00, 63.48it/s]


{'epoch': 1, 'iter': 0, 'avg_loss': 3.296, 'loss': 3.296, 'perplexity': 26.999, 'accuracy': 0.0}


EP_train:2: 100%|| 1/1 [00:00<00:00, 61.96it/s]


{'epoch': 2, 'iter': 0, 'avg_loss': 3.191, 'loss': 3.191, 'perplexity': 24.32, 'accuracy': 0.0}


EP_train:3: 100%|| 1/1 [00:00<00:00, 51.27it/s]


{'epoch': 3, 'iter': 0, 'avg_loss': 3.578, 'loss': 3.578, 'perplexity': 35.812, 'accuracy': 0.0}


EP_train:4: 100%|| 1/1 [00:00<00:00, 62.68it/s]


{'epoch': 4, 'iter': 0, 'avg_loss': 3.588, 'loss': 3.588, 'perplexity': 36.177, 'accuracy': 0.0}


EP_train:5: 100%|| 1/1 [00:00<00:00, 58.93it/s]


{'epoch': 5, 'iter': 0, 'avg_loss': 2.541, 'loss': 2.541, 'perplexity': 12.694, 'accuracy': 0.0}


EP_train:6: 100%|| 1/1 [00:00<00:00, 63.34it/s]


{'epoch': 6, 'iter': 0, 'avg_loss': 3.241, 'loss': 3.241, 'perplexity': 25.559, 'accuracy': 0.0}


EP_train:7: 100%|| 1/1 [00:00<00:00, 64.09it/s]


{'epoch': 7, 'iter': 0, 'avg_loss': 3.507, 'loss': 3.507, 'perplexity': 33.335, 'accuracy': 0.0}


EP_train:8: 100%|| 1/1 [00:00<00:00, 63.43it/s]


{'epoch': 8, 'iter': 0, 'avg_loss': 2.503, 'loss': 2.503, 'perplexity': 12.224, 'accuracy': 0.0}


EP_train:9: 100%|| 1/1 [00:00<00:00, 68.36it/s]


{'epoch': 9, 'iter': 0, 'avg_loss': 2.882, 'loss': 2.882, 'perplexity': 17.848, 'accuracy': 0.0}


EP_train:10: 100%|| 1/1 [00:00<00:00, 68.80it/s]


{'epoch': 10, 'iter': 0, 'avg_loss': 3.442, 'loss': 3.442, 'perplexity': 31.238, 'accuracy': 0.0}


EP_train:11: 100%|| 1/1 [00:00<00:00, 66.41it/s]


{'epoch': 11, 'iter': 0, 'avg_loss': 2.925, 'loss': 2.925, 'perplexity': 18.642, 'accuracy': 0.2}


EP_train:12: 100%|| 1/1 [00:00<00:00, 67.27it/s]


{'epoch': 12, 'iter': 0, 'avg_loss': 2.99, 'loss': 2.99, 'perplexity': 19.878, 'accuracy': 0.143}


EP_train:13: 100%|| 1/1 [00:00<00:00, 68.61it/s]


{'epoch': 13, 'iter': 0, 'avg_loss': 1.499, 'loss': 1.499, 'perplexity': 4.476, 'accuracy': 1.0}


EP_train:14: 100%|| 1/1 [00:00<00:00, 60.64it/s]


{'epoch': 14, 'iter': 0, 'avg_loss': nan, 'loss': nan, 'perplexity': nan, 'accuracy': 0}


EP_train:15: 100%|| 1/1 [00:00<00:00, 68.91it/s]


{'epoch': 15, 'iter': 0, 'avg_loss': 3.983, 'loss': 3.983, 'perplexity': 53.67, 'accuracy': 0.0}


EP_train:16: 100%|| 1/1 [00:00<00:00, 72.89it/s]


{'epoch': 16, 'iter': 0, 'avg_loss': 1.727, 'loss': 1.727, 'perplexity': 5.623, 'accuracy': 1.0}


EP_train:17: 100%|| 1/1 [00:00<00:00, 72.46it/s]


{'epoch': 17, 'iter': 0, 'avg_loss': 2.716, 'loss': 2.716, 'perplexity': 15.123, 'accuracy': 0.0}


EP_train:18: 100%|| 1/1 [00:00<00:00, 57.23it/s]


{'epoch': 18, 'iter': 0, 'avg_loss': 2.67, 'loss': 2.67, 'perplexity': 14.447, 'accuracy': 0.333}


EP_train:19: 100%|| 1/1 [00:00<00:00, 72.07it/s]

{'epoch': 19, 'iter': 0, 'avg_loss': 1.594, 'loss': 1.594, 'perplexity': 4.923, 'accuracy': 1.0}



