##  Data Preparation

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset
import os
import random
import torch

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

# Set a seed for all libraries
set_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load datasets 
df_train = pd.read_csv('data/to_mask_train.csv', converters={'word': str})
df_val = pd.read_csv('data/masked_val.csv', converters={'masked_word': str, 'labels': str, 'previous_guesses': eval})
df_test = pd.read_csv('data/masked_test.csv', converters={'masked_word': str, 'labels': str, 'previous_guesses': eval})

print(df_train.head())

# Add spaces to the word in training
df_train["word"] = df_train["word"].apply(lambda x: ' '.join(list(x)))

# Add spaces to the masked_word and labels
df_val["masked_word"] = df_val["masked_word"].apply(lambda x: ' '.join(list(x)))
df_val["labels"] = df_val["labels"].apply(lambda x: ' '.join(list(x)))
df_test["masked_word"] = df_test["masked_word"].apply(lambda x: ' '.join(list(x)))
df_test["labels"] = df_test["labels"].apply(lambda x: ' '.join(list(x)))

# Print the shapes of the datasets
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

          word  word_length  unique_chars
0      timpani            7             6
1       worsle            6             6
2        yinst            5             5
3  grangerized           11             8
4      matatua            7             4
(204570, 3)
(22730, 3)
(130436, 3)


In [3]:
df_train.head()

Unnamed: 0,word,word_length,unique_chars
0,t i m p a n i,7,6
1,w o r s l e,6,6
2,y i n s t,5,5
3,g r a n g e r i z e d,11,8
4,m a t a t u a,7,4


In [4]:
df_val.head()

Unnamed: 0,masked_word,labels,previous_guesses
0,h e _ _ c l _ t _ s,h e r a c l i t u s,"[0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, ..."
1,_ e _ o d _ _ _,m e r o d a c h,"[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
2,_ _ g _ _ _,i n g i r t,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,_ _ _ z u o k a,s h i z u o k a,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, ..."
4,m _ _ t _ _ _ _ n n e _ _ e d,m u l t i c h a n n e l l e d,"[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, ..."


In [5]:
# Gather all the data in a DatasetDict
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)
df_val.reset_index(drop=True, inplace=True)

data = DatasetDict({
    "train": Dataset.from_pandas(df_train[["word"]]),
    "valid": Dataset.from_pandas(df_val),
    "test": Dataset.from_pandas(df_test)
})

data

DatasetDict({
    train: Dataset({
        features: ['word'],
        num_rows: 204570
    })
    valid: Dataset({
        features: ['masked_word', 'labels', 'previous_guesses'],
        num_rows: 22730
    })
    test: Dataset({
        features: ['masked_word', 'labels', 'previous_guesses'],
        num_rows: 130436
    })
})

In [6]:
import random
import torch
from transformers import DataCollatorForLanguageModeling, BertTokenizer
from torch.utils.data import DataLoader

class CustomDataCollatorForMLM(DataCollatorForLanguageModeling):
    def __init__(self, tokenizer, mlm_probability=None, max_length=42, prior_frequencies_path = None):
        if mlm_probability is None:
            mlm_probability = random.uniform(0.3, 0.8)
        super().__init__(tokenizer=tokenizer, mlm_probability=mlm_probability)
        self.max_length = max_length
        self.char_to_index = {chr(i + ord('a')): i for i in range(26)}

        self.prior_frequencies = None
        # Load prior frequencies information
        if prior_frequencies_path is not None:
            self.prior_frequencies = torch.tensor(pd.read_csv(prior_frequencies_path).to_numpy(), dtype=torch.float32)
            print("Loaded Prior Information: ", self.prior_frequencies.shape)
        

    
    def __call__(self, examples):

        # Convert characters to their corresponding numerical labels
        labels = []
        words = []
        
        # Get a tensor of the shape (len(examples), 26) with the prior probabilities of each character
        prior_probs_batch = torch.full((len(examples), 26), 1/26, dtype=torch.float32)

        for i, example in enumerate(examples):
            example = example['word']

            # Adding prior information
            if (self.prior_frequencies is not None) and len(example) <= self.prior_frequencies.shape[0]:
                prior_probs_batch[i,:] = self.prior_frequencies[len(example)-1,:]

            row_label = [self.char_to_index[char] for char in example.split()]
            labels.append(row_label)
            words.append(example)

        # Pad labels to the same length (considering special tokens at the beginning and end of each label)
        labels = [ [-100] + row_label + [-100] * (self.max_length - len(row_label) - 1) for row_label in labels]
        labels = torch.tensor(labels)

        # Create a one-hot vector for the labels without consider the -100 values
        prev_guess = torch.zeros((labels.shape[0], 26), dtype=torch.int64)
        for i, label in enumerate(labels):
            prev_guess[i][label[label != -100]] = 1

            # NOTE: I could add until 5 random previous guesses to the prev_guess tensor
            #  to account for maximum number of mistakes made when playing Hangman
            random_guesses = torch.tensor([i for i in range(26) if i not in label])

            # Take from 0 to 5 values from the random guesses randomly
            if len(random_guesses) > 5:
                random_guesses = random_guesses[torch.randperm(len(random_guesses))[:5]]
                prev_guess[i][random_guesses] = 1

        # Tokenize and pad the input examples
        batch = self.tokenizer(words, truncation=True, padding='max_length', return_tensors="pt", max_length=self.max_length)
        
        # Get the input_ids and apply masking
        input_ids = batch["input_ids"]
        # labels = input_ids.clone()

        for i, input_id in enumerate(input_ids):

            # Get unique tokens
            unique_tokens = torch.unique(input_id)
            
            # Filter Special Tokens by setting probability to 0.0
            special_tokens_mask = self.tokenizer.get_special_tokens_mask(unique_tokens, already_has_special_tokens=True)
            probabilities = torch.full(unique_tokens.shape, self.mlm_probability)
            probabilities.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
            
            # Filter the tokens to mask
            tokens_to_mask = unique_tokens[torch.bernoulli(probabilities).bool()]

            # Mask all instances of the chosen tokens
            masked_indices = torch.zeros(input_id.shape, dtype=torch.bool)
            for token in tokens_to_mask:
                masked_indices[(input_id == token)] = True

            labels[i][~masked_indices] = -100  # We only compute loss on masked tokens

            # 80% of the time, replace masked input tokens with tokenizer.mask_token ([MASK])
            indices_replaced = torch.bernoulli(torch.full(input_id.shape, 0.8)).bool() & masked_indices
            input_ids[i][indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

            # 10% of the time, replace masked input tokens with random word
            # indices_random = torch.bernoulli(torch.full(input_id.shape, 0.5)).bool() & masked_indices & ~indices_replaced
            # random_words = torch.randint(26, input_id.shape, dtype=torch.long)
            # input_ids[i][indices_random] = random_words[indices_random]

            # The rest of the time (10% of the time) we keep the masked input tokens unchanged

        # Set to zero the values masked in the labels
        for i, label in enumerate(labels):
            prev_guess[i][label[label != -100]] = 0

        batch["input_ids"] = input_ids
        batch["labels"] = labels
        batch["prev_guess"] = prev_guess
        batch["prior_probs"] = prior_probs_batch

        return batch


In [7]:
# TOY EXAMPLE
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Initialize the custom data collator
data_collator = CustomDataCollatorForMLM(tokenizer, 
                                         mlm_probability=0.5, 
                                         max_length=40, 
                                         prior_frequencies_path="data/total_rel_freq.csv")

# Toy dataset
toy_data = {
    "word": ["h e l l o h e l l o h e l l o h e l l o h e l l o h e l l o", "w o r l d", "p a r i s", "b e r l i n"]
}
df_temp = pd.DataFrame(toy_data)
print(df_temp)

# Tokenize the toy dataset
TensorDataset = Dataset.from_pandas(df_temp)
print(TensorDataset)
toy_dataloader = DataLoader(TensorDataset, batch_size=2, collate_fn=data_collator)

# Print the first batch
for batch in toy_dataloader:
    print(batch['input_ids'])
    print(batch['labels'])
    print(batch['prev_guess'])
    print(batch['prior_probs'])
    break

Loaded Prior Information:  torch.Size([28, 26])
                                                word
0  h e l l o h e l l o h e l l o h e l l o h e l ...
1                                          w o r l d
2                                          p a r i s
3                                        b e r l i n
Dataset({
    features: ['word'],
    num_rows: 4
})
tensor([[ 101, 1044, 1041, 1048, 1048,  103, 1044, 1041, 1048, 1048,  103, 1044,
         1041, 1048, 1048,  103, 1044, 1041, 1048, 1048, 1051, 1044, 1041, 1048,
         1048,  103, 1044, 1041, 1048, 1048,  103,  102,    0,    0,    0,    0,
            0,    0,    0,    0],
        [ 101,  103,  103, 1054,  103, 1040,  102,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]])
tensor([[-100, -100, -100, -100, -100,   14, -100, -100, -100, -100,   14, -100

In [8]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Initialize the custom data collator
data_collator = CustomDataCollatorForMLM(tokenizer, mlm_probability=0.5, max_length=42, prior_frequencies_path="data/total_rel_freq.csv")

train_dataloader = DataLoader(
    data['train'], 
    batch_size=64,
    collate_fn = data_collator
)

# Prepare a batch using the custom data collator
for batch in train_dataloader:
    print("input_ids", batch['input_ids'].dtype)
    print("labels", batch['labels'].dtype)
    print("prev_guess", batch['prev_guess'].dtype)
    print("prior_probs", batch['prior_probs'].dtype)

    print("input_ids", batch['input_ids'][:10,:10], batch['input_ids'].shape)
    print("labels", batch['labels'][:10,:10], batch['labels'].shape)
    print("prev_guess", batch['prev_guess'][:10,:10], batch['prev_guess'].shape)
    print("prior_probs", batch['prior_probs'][:10,:10], batch['prior_probs'].shape)
    break



Loaded Prior Information:  torch.Size([28, 26])
input_ids torch.int64
labels torch.int64
prev_guess torch.int64
prior_probs torch.float32
input_ids tensor([[ 101, 1056,  103, 1049, 1052, 1037,  103,  103,  102,    0],
        [ 101, 1059,  103, 1054,  103, 1048,  103,  102,    0,    0],
        [ 101, 1061,  103, 1050, 1055, 1056,  102,    0,    0,    0],
        [ 101, 1043,  103, 1037,  103, 1043, 1041,  103, 1045, 1062],
        [ 101, 1049, 1037,  103, 1037,  103, 1057, 1037,  102,    0],
        [ 101, 1044, 1045, 1055, 1056, 1051, 1054, 1045,  103,  103],
        [ 101, 1054,  103, 1039,  103, 1048, 1048,  103, 1039, 1056],
        [ 101,  103,  103,  103, 1061, 1049, 1049,  103, 1040, 1045],
        [ 101, 1055, 1056,  103,  103,  103,  102,    0,    0,    0],
        [ 101, 1055, 1041, 1054, 1045,  103, 1051, 1039,  103, 1054]]) torch.Size([64, 42])
labels tensor([[-100, -100,    8, -100,   15, -100,   13,    8, -100, -100],
        [-100, -100,   14, -100,   18, -100,    4, -1

In [9]:
import torch
from transformers import BertTokenizer

class FixedTokenizer:
    def __init__(self, tokenizer, max_length=42, prior_frequencies_path = None):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.char_to_index = {chr(i + ord('a')): i for i in range(26)}

        self.prior_frequencies = None
        # Load prior frequencies information
        if prior_frequencies_path is not None:
            self.prior_frequencies = torch.tensor(pd.read_csv(prior_frequencies_path).to_numpy(), dtype=torch.float32)
            print("Loaded Prior Information: ", self.prior_frequencies.shape)

    def __call__(self, examples):

        masked_word = examples['masked_word']
        labels_word = examples['labels']
        prev_guesses = examples['previous_guesses']

        # Create a labels tensor with all the values to -100
        labels_batch = torch.full((len(masked_word),self.max_length,), -100, dtype=torch.int64)

        # Get a tensor of the shape (len(examples), 26) with the prior probabilities of each character
        prior_probs_batch = torch.full((len(masked_word), 26), 1/26, dtype=torch.float32)

        for i in range(len(masked_word)):


            if (self.prior_frequencies is not None) and len(masked_word[i].split()) <= self.prior_frequencies.shape[0]:
                prior_probs_batch[i,:] = self.prior_frequencies[len(masked_word[i].split())-1,:]

            # Replace the underscore in masked_word with the special [MASK] token
            masked_word[i] = masked_word[i].replace('_', '[MASK]')

        # Tokenize the masked_words
        batch = self.tokenizer(masked_word, truncation=True, padding='max_length', return_tensors="pt", max_length=self.max_length)

        # Create the labels per word
        for i in range(len(masked_word)):

            # Split labels_word[i] into a list of characters considering they are separated by spaces
            labels_word[i] = labels_word[i].split()
            masked_word[i] = masked_word[i].split()


            # Convert labels_word to their corresponding numerical labels
            for j, char in enumerate(labels_word[i]):
                if masked_word[i][j] == '[MASK]':
                    labels_batch[i][j + 1] = self.char_to_index[char]

        prev_guesses_batch = torch.tensor(prev_guesses, dtype=torch.int64)

        batch['prev_guess'] = prev_guesses_batch
        batch['labels'] = labels_batch
        batch['prior_probs'] = prior_probs_batch

        return batch

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Example usage with first 9 word
df = df_val[:9]

data_set = Dataset.from_pandas(df)

# Initialize the data collator
tokenizer = FixedTokenizer(tokenizer, max_length=42, prior_frequencies_path="data/total_rel_freq.csv")

# Process the examples
data_set = data_set.map(tokenizer, batched=True, batch_size=3)
data_set.set_format("torch")
data_set = data_set.remove_columns(['masked_word','previous_guesses', 'token_type_ids'])

print(data_set)

print(data_set['input_ids'][0])
print(data_set['labels'][0])
print(data_set['prev_guess'][0])

# Print the shapes of the processed data
print(data_set['input_ids'].shape)
print(data_set['labels'].shape)
print(data_set['prev_guess'].shape)
print(data_set['prior_probs'].shape)

# Data loader
val_dataloader = DataLoader(
    data_set, 
    batch_size=3,
    shuffle=False
)

# Check the first batch of the validation data loader
for batch in val_dataloader:
    print("input_ids", batch['input_ids'].dtype)
    print("labels", batch['labels'].dtype)
    print("prev_guess", batch['prev_guess'].dtype)
    print("prior_probs", batch['prior_probs'].dtype)

    print("input_ids", batch['input_ids'][0], batch['input_ids'].shape)
    print("labels", batch['labels'][0], batch['labels'].shape)
    print("prev_guess", batch['prev_guess'][0], batch['prev_guess'].shape)
    print("prior_probs", batch['prior_probs'][0], batch['prior_probs'].shape)
    break


Loaded Prior Information:  torch.Size([28, 26])


Map: 100%|██████████| 9/9 [00:00<00:00, 659.09 examples/s]

Dataset({
    features: ['labels', 'input_ids', 'attention_mask', 'prev_guess', 'prior_probs'],
    num_rows: 9
})
tensor([ 101, 1044, 1041,  103,  103, 1039, 1048,  103, 1056,  103, 1055,  102,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0])
tensor([-100, -100, -100,   17,    0, -100, -100,    8, -100,   20, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100])
tensor([0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 0])
torch.Size([9, 42])
torch.Size([9, 42])
torch.Size([9, 26])
torch.Size([9, 26])
input_ids torch.int64
labels torch.int64
prev_guess torch.int64
prior_probs torch.float32
input_ids tensor([ 101, 1044, 1041,  103,  103, 10




In [10]:
# Tokenize the Validation and Test datasets

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Initialize the data collator
tokenizer = FixedTokenizer(tokenizer, max_length=42, prior_frequencies_path="data/total_rel_freq.csv")


# Tokenize the val dataset
tokenized_val_data = data['valid'].map(tokenizer, batched=True)
tokenized_val_data.set_format("torch")
tokenized_val_data = tokenized_val_data.remove_columns(['masked_word','previous_guesses', 'token_type_ids'])

# Tokenize the test dataset
# tokenized_test_data = data['test'].map(tokenizer, batched=True, batch_size=3)
# tokenized_test_data.set_format("torch")
# tokenized_test_data = tokenized_test_data.remove_columns(['masked_word','previous_guesses', 'token_type_ids', 'attention_mask'])

Loaded Prior Information:  torch.Size([28, 26])


Map: 100%|██████████| 22730/22730 [00:03<00:00, 6719.72 examples/s]


# Desing the model for fine tuning

In [11]:
from datasets import load_dataset,Dataset,DatasetDict
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
import pandas as pd

In [19]:
class HangmanNet(nn.Module):
  def __init__(self,checkpoint, vocab_size = 26, hidden_ffn_size = 410, unfreeze_layers = 0, alpha = 0.5): 
    super(HangmanNet,self).__init__() 
    self.num_labels = vocab_size
    self.alpha = alpha 

    #Load Model with given checkpoint and extract its body
    self.model = AutoModel.from_pretrained(checkpoint,config=AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True))
    
    # Freeze all layers in the BERT model
    for param in self.model.parameters():
        param.requires_grad = False

    # Unfreeze the last `unfreeze_layers` layers
    if unfreeze_layers > 0:
        for layer in self.model.encoder.layer[-unfreeze_layers:]:
            for param in layer.parameters():
                param.requires_grad = True
        

    self.dropout = nn.Dropout(0.1)

    self.classifier = nn.Sequential(
        nn.Linear(768 + vocab_size, hidden_ffn_size),
        nn.ReLU(),
        nn.Linear(hidden_ffn_size, 26)
    )
    
    # self.classifier = nn.Linear(768 + vocab_size,vocab_size) # load and initialize weights
  
  def forward(self, input_ids=None, attention_mask=None, labels=None, prev_guess=None,
              token_type_ids=None, prior_probs=None):
      outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
      
      sequence_output = outputs.last_hidden_state  # (batch_size, sequence_length, hidden_size)
      sequence_output = self.dropout(sequence_output)

      # Concatenate the previous guesses to the sequence_output
      # (batch_size, sequence_length, hidden_size + vocab_size)
      sequence_output = torch.cat((sequence_output, prev_guess.unsqueeze(1).repeat(1, sequence_output.shape[1], 1)), dim=2)
      # sequence_output = torch.cat((sequence_output, prior_probs.unsqueeze(1).repeat(1, sequence_output.shape[1], 1)), dim=2)

      logits = self.classifier(sequence_output)  # (batch_size, sequence_length, num_labels)

      loss = None
      if labels is not None:
          loss_fct = nn.CrossEntropyLoss()
          loss_aux_fct = nn.MSELoss()

          # AUXILIARY LOSS
          # Compute the auxiliary loss as the mse between the probabilities of active logits and the prior probabilities
          # NOTE: softmax will lead the infinities to zero
          mask_active_logits = labels.view(-1) != -100
          active_logits = logits.view(-1, self.num_labels)[mask_active_logits]
          prior_probs = prior_probs.unsqueeze(1).repeat(1, sequence_output.shape[1], 1)
          active_prior_probs = prior_probs.view(-1, self.num_labels)[mask_active_logits]
          
          active_probs_model = torch.softmax(active_logits, dim=-1)
          
          # NOTE: I'm calculating the loss without masking the previous guesses
          auxiliar_loss = loss_aux_fct(active_probs_model, active_prior_probs)

          # MAIN CROSS ENTROPY LOSS
          # Mask the logits to zero out probabilities of previously guessed characters by considering the one-hot encoding in prev guesses
          mask_prev_guess = prev_guess.unsqueeze(1).repeat(1, sequence_output.shape[1], 1) == 1
          logits[mask_prev_guess] = -float("inf")

          # NOTE: I don't need to mask logits or labels, because labels are already set with the value -100, which will be cancel by the loss function
          masking_loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))


          loss = masking_loss + self.alpha * auxiliar_loss
        
      return TokenClassifierOutput(logits=logits, loss=loss, hidden_states=outputs.hidden_states,attentions=outputs.attentions)


In [20]:
# Check the model output
device = "cpu"
checkpoint = "bert-base-uncased"
model = HangmanNet(checkpoint=checkpoint, vocab_size = 26, unfreeze_layers = 1).to(device)

# Prepare a batch using the custom data collator
for batch in train_dataloader:

    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    prev_guess = batch['prev_guess']
    prior_probs = batch['prior_probs']

    # Forward pass
    outputs = model(input_ids=input_ids, 
                    attention_mask=attention_mask, 
                    labels=labels, 
                    prev_guess=prev_guess,
                    prior_probs=prior_probs)
    print(outputs.loss)
    print(outputs.logits.shape)

    break




tensor(2.8497, grad_fn=<AddBackward0>)
torch.Size([64, 42, 26])


In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint = "bert-base-uncased"
unfreeze_layers = 2
alpha = 0.5
model = HangmanNet(checkpoint=checkpoint, vocab_size = 26, unfreeze_layers = unfreeze_layers, alpha=alpha).to(device)

# Print the trainable parameters of the model
print("Trainable parameters:")
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)  

Trainable parameters:
model.encoder.layer.10.attention.self.query.weight
model.encoder.layer.10.attention.self.query.bias
model.encoder.layer.10.attention.self.key.weight
model.encoder.layer.10.attention.self.key.bias
model.encoder.layer.10.attention.self.value.weight
model.encoder.layer.10.attention.self.value.bias
model.encoder.layer.10.attention.output.dense.weight
model.encoder.layer.10.attention.output.dense.bias
model.encoder.layer.10.attention.output.LayerNorm.weight
model.encoder.layer.10.attention.output.LayerNorm.bias
model.encoder.layer.10.intermediate.dense.weight
model.encoder.layer.10.intermediate.dense.bias
model.encoder.layer.10.output.dense.weight
model.encoder.layer.10.output.dense.bias
model.encoder.layer.10.output.LayerNorm.weight
model.encoder.layer.10.output.LayerNorm.bias
model.encoder.layer.11.attention.self.query.weight
model.encoder.layer.11.attention.self.query.bias
model.encoder.layer.11.attention.self.key.weight
model.encoder.layer.11.attention.self.key.bia

In [22]:
from transformers import AdamW, get_scheduler

lr = 0.00005
optimizer = AdamW(model.parameters(), lr=lr)

num_epochs = 50
num_training_steps = num_epochs * len(train_dataloader)
# lr_scheduler = get_scheduler(
#     "linear",
#     optimizer=optimizer,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps,
# )
print(num_training_steps)

159850




In [23]:
device

device(type='cuda')

In [24]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

mlm_probability = 0.5
data_collator = CustomDataCollatorForMLM(tokenizer, 
                                        #  mlm_probability=mlm_probability, 
                                         max_length=42, 
                                         prior_frequencies_path="data/total_rel_freq.csv")

Loaded Prior Information:  torch.Size([28, 26])


In [25]:
train_dataloader = DataLoader(
    data["train"], 
    batch_size=64,
    num_workers=8,
    collate_fn=data_collator,
    pin_memory=True)

# Probably add shuffle
    
eval_dataloader = DataLoader(tokenized_val_data,
                             batch_size=64,
                             num_workers=8,
                             pin_memory=True)

In [26]:
# Define a new metric, which is more representative of how actually the network is playing Hangman
def accuracy_unique_char(logits, labels):
    batch_size, _, _ = logits.shape
    correct_predictions = 0

    for i in range(batch_size):
        # Extract the logits and labels for the current sequence
        logits_seq = logits[i]
        labels_seq = labels[i]

        # Identify the indices of active tokens (not -100)
        active_indices = labels_seq != -100

        # Get the logits of the active tokens
        active_logits = logits_seq[active_indices]

        if active_logits.shape[0] == 0:
            continue  # skip sequences with no active tokens

        # Get the class with the highest probability among active tokens
        max_prob_class = torch.argmax(active_logits, dim=-1)

        # Get the actual labels of the active tokens
        active_labels = labels_seq[active_indices].unique()

        # Get the probabilities of the active tokens
        max_prob = torch.max(active_logits, dim=-1).values

        # Get the index of the maximum probability
        max_prob_index = torch.argmax(max_prob)

        # Take a greedy choose and take the largest probability
        max_prob_char = max_prob_class[max_prob_index]

        if max_prob_char in active_labels:
            correct_predictions += 1

    return correct_predictions

In [27]:
import torch
from torch.utils.tensorboard import SummaryWriter
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score
import os

set_seed(42)

# Initialize TensorBoard writer
save_dir = "models/prior_prob_models"
save_dir = os.path.join(save_dir, f"model_lr_{lr}_mlm_prob_{mlm_probability}_alpha_{alpha}_unfreeze_{unfreeze_layers}")
print(f"Saving model to: {save_dir}")
writer = SummaryWriter(log_dir=save_dir)

# Initialize metrics storage
train_losses = []
val_losses = []
val_accuracies = []
val_f1_scores = []
val_accuracies_hangman = []

# Define the save frequency
save_frequency = 5  # Save model every n epochs, adjust this as needed

# Load the model checkpoint if available
checkpoint_path = "path_to_checkpoint"  # Provide the path to your saved checkpoint
start_epoch = 0

prev_val_loss = 1000
prev_train_loss = 1000

if os.path.exists(checkpoint_path):
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch']
    train_losses = checkpoint['train_losses']
    val_losses = checkpoint['val_losses']
    val_accuracies = checkpoint['val_accuracies']
    val_f1_scores = checkpoint['val_f1_scores']
    val_accuracies_hangman = checkpoint['val_accuracies_hangman']
    print(f"Loaded checkpoint from epoch {start_epoch}")

for epoch in range(start_epoch, num_epochs):
    model.train()
    total_train_loss = 0

    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}/{num_epochs}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        # lr_scheduler.step()
        optimizer.zero_grad()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)

    # Validation
    model.eval()
    total_val_loss = 0
    all_predictions = []
    all_references = []

    # Intialize to calculate our new metric
    correct_predictions = 0
    total_sequences = 0

    for batch in tqdm(eval_dataloader, desc=f"Validation Epoch {epoch+1}/{num_epochs}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        total_val_loss += loss.item()

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        flat_predictions = predictions.view(-1)
        flat_references = batch["labels"].view(-1)
        flat_attention_mask = batch["labels"].view(-1) != -100

        active_predictions = flat_predictions[flat_attention_mask]
        active_references = flat_references[flat_attention_mask]

        all_predictions.extend(active_predictions.cpu().numpy())
        all_references.extend(active_references.cpu().numpy())

        # Calculate the custom accuracy
        accuracy_hangman = accuracy_unique_char(logits, batch["labels"])
        correct_predictions += accuracy_hangman
        total_sequences += logits.shape[0]

    avg_val_loss = total_val_loss / len(eval_dataloader)
    val_losses.append(avg_val_loss)

    val_accuracy = accuracy_score(all_references, all_predictions)
    val_f1 = f1_score(all_references, all_predictions, average="macro")
    val_accuracy_hangman = correct_predictions / total_sequences

    val_accuracies.append(val_accuracy)
    val_f1_scores.append(val_f1)
    val_accuracies_hangman.append(val_accuracy_hangman)

    # Log metrics to TensorBoard
    writer.add_scalars('Loss', {'train_loss': avg_train_loss, 'val_loss' : avg_val_loss}, epoch)
    writer.add_scalar('Accuracy/Validation', val_accuracy, epoch)
    writer.add_scalar('F1/Validation', val_f1, epoch)
    writer.add_scalar('Accuracy/Validation_Hangman', val_accuracy_hangman, epoch)
    writer.add_scalar('Learning Rate', optimizer.param_groups[0]['lr'], epoch)
    writer.add_scalars('Loss Change: ', {'train_loss_change': prev_train_loss - avg_train_loss, 
                                        'val_loss_change': prev_val_loss - avg_val_loss}, epoch)


    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Training Loss: {avg_train_loss:.4f}")
    print(f"Validation Loss: {avg_val_loss:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Validation F1 Score: {val_f1:.4f}")
    print(f"Validation Accuracy Hangman: {val_accuracy_hangman:.4f}")
    print(f"Learning Rate: {optimizer.param_groups[0]['lr']:.8f}")
    print(f"Loss Change: {prev_train_loss - avg_train_loss:.4f} (train), {prev_val_loss - avg_val_loss:.4f} (val)")

    prev_train_loss = avg_train_loss
    prev_val_loss = avg_val_loss
    
    # Save the model every n epochs
    if (epoch + 1) % save_frequency == 0:
        model_save_path = os.path.join(save_dir, f"model_epoch_{epoch+1}.pth")
        checkpoint = {
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_losses': train_losses,
            'val_losses': val_losses,
            'val_accuracies': val_accuracies,
            'val_f1_scores': val_f1_scores,
            'val_accuracies_hangman': val_accuracies_hangman
        }
        torch.save(checkpoint, model_save_path)
        print(f"Model saved to {model_save_path}")

# Save the final model
model_save_path = os.path.join(save_dir, "model_final.pth")
checkpoint = {
    'epoch': epoch + 1,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'train_losses': train_losses,
    'val_losses': val_losses,
    'val_accuracies': val_accuracies,
    'val_f1_scores': val_f1_scores,
    'val_accuracies_hangman': val_accuracies_hangman
}
torch.save(checkpoint, model_save_path)
print(f"Final model saved to {model_save_path}")

# Close the TensorBoard writer
writer.close()

Saving model to: models/prior_prob_models/model_lr_5e-05_mlm_prob_0.5_alpha_0.5_unfreeze_2


Training Epoch 1/50: 100%|██████████| 3197/3197 [05:21<00:00,  9.94it/s]
Validation Epoch 1/50: 100%|██████████| 356/356 [00:41<00:00,  8.66it/s]


Epoch 1/50
Training Loss: 1.7738
Validation Loss: 2.1317
Validation Accuracy: 0.2964
Validation F1 Score: 0.2021
Validation Accuracy Hangman: 0.5658
Learning Rate: 0.00005000
Loss Change: 998.2262 (train), 997.8683 (val)


Training Epoch 2/50:   7%|▋         | 225/3197 [00:26<05:51,  8.46it/s]


KeyboardInterrupt: 