In [2]:
import numpy as np
import pandas as pd
from utils.formats import load_hdf

import torch
from torch import nn
import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader

from datasets import Dataset
from transformers import PreTrainedModel, AutoModel, AutoTokenizer, RemBertForSequenceClassification 
from transformers import TrainingArguments, Trainer
from transformers.configuration_utils import PretrainedConfig

### Huggingface model - [REMBERT](https://huggingface.co/docs/transformers/model_doc/rembert)

* Input Embedding Dimensionality cannot be too big. 
* Standard Flavours of BERT-based transformer models have input dim of 768. PPMi + Retrofitting takes too long to produce input embedding vectors.

In [3]:
MODEL_NAME = "google/rembert"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = RemBertForSequenceClassification.from_pretrained(MODEL_NAME) # Specify num_labels for your task
vars(model)

Some weights of RemBertForSequenceClassification were not initialized from the model checkpoint at google/rembert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'training': False,
 '_parameters': {},
 '_buffers': {},
 '_non_persistent_buffers_set': set(),
 '_backward_pre_hooks': OrderedDict(),
 '_backward_hooks': OrderedDict(),
 '_is_full_backward_hook': None,
 '_forward_hooks': OrderedDict(),
 '_forward_hooks_with_kwargs': OrderedDict(),
 '_forward_hooks_always_called': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_forward_pre_hooks_with_kwargs': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_state_dict_pre_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_load_state_dict_post_hooks': OrderedDict(),
 '_modules': {'rembert': RemBertModel(
    (embeddings): RemBertEmbeddings(
      (word_embeddings): Embedding(250300, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): RemBertEncoder(
      (embedding_h

In [4]:
vocab = tokenizer.get_vocab()
vocab_size = len(vocab)
embedding_dim = model.config.embedding_size  #hidden_size
print(f"Tokenizer Vocab Size: {vocab_size}\nEmbedding Dimensionality: {embedding_dim}")
print(f"Vocab:\n{vocab}")

Tokenizer Vocab Size: 250300
Embedding Dimensionality: 256
Vocab:


#### Load Retrofitted PPMI word embeddings for ALBERT (dim=128)

* Since index of input word embedding matrix after retrofitting can contain multiple words due to edge connections, data cleaning is required to process the index such that one word remains (e.g. `/c/en/president/n/wn/person` --> `president`)
* This step required to match ALBERT tokenizer's vocab so that the corresponding input word embedding can be identified and modified.

In [5]:
input_embedding = load_hdf("data/conceptnet_api/retrofit/retrofitted-rembert-256")
input_embedding_df = input_embedding.reset_index()
input_embedding_df['vocab'] = input_embedding_df['index'].str.extract(r'/c/en/(\w+)/?')
input_embedding_df.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,247,248,249,250,251,252,253,254,255,vocab
0,/c/en/chair_meeting,-0.001501,-0.000252,-0.000206,-0.00025,-8.1e-05,-0.000465,-0.000822,0.000558,0.000229,...,-0.028524,-0.038507,0.051363,-0.052692,-0.054144,-0.052064,-0.045782,-0.061692,-0.096739,chair_meeting
1,/c/en/chairperson,-0.001415,-0.000238,-0.000194,-0.000236,-7.6e-05,-0.000438,-0.000775,0.000526,0.000216,...,-0.026888,-0.036298,0.048417,-0.04967,-0.051039,-0.049078,-0.043156,-0.058154,-0.09119,chairperson
2,/c/en/chair,-0.001485,-0.000249,-0.000204,-0.000247,-8e-05,-0.00046,-0.000813,0.000552,0.000226,...,-0.028224,-0.038101,0.050822,-0.052137,-0.053574,-0.051516,-0.0453,-0.061043,-0.09572,chair
3,/c/en/chairperson/n,-0.001398,-0.000235,-0.000192,-0.000233,-7.5e-05,-0.000433,-0.000766,0.00052,0.000213,...,-0.026576,-0.035877,0.047855,-0.049093,-0.050446,-0.048508,-0.042655,-0.057479,-0.090131,chairperson
4,/c/en/president/n/wn/person,-0.002333,-0.227359,-0.083315,0.173619,-0.006744,-0.047225,0.015108,-0.064455,-0.209561,...,-0.02259,-0.030496,0.040678,-0.04173,-0.04288,-0.041233,-0.036258,-0.048858,-0.076613,president


In [6]:
# convert retrofit ppimi word embedding into numpy matrix form
input_embedding_matrix = input_embedding.to_numpy()
print(input_embedding_matrix.shape)
input_embedding_matrix

(4081, 256)


array([[-0.00150068, -0.000252  , -0.0002063 , ..., -0.04578204,
        -0.06169234, -0.09673853],
       [-0.00141461, -0.00023755, -0.00019447, ..., -0.04315634,
        -0.05815415, -0.09119038],
       [-0.00148487, -0.00024935, -0.00020413, ..., -0.04529985,
        -0.06104258, -0.09571967],
       ...,
       [-0.00143514, -0.000241  , -0.00019729, ..., -0.04378274,
        -0.05899824, -0.09251397],
       [-0.00143514, -0.000241  , -0.00019729, ..., -0.04378274,
        -0.05899824, -0.09251397],
       [-0.00143514, -0.000241  , -0.00019729, ..., -0.04378274,
        -0.05899824, -0.09251397]], shape=(4081, 256), dtype=float32)

In [7]:
# Get Embedding Weights of ALBERT model
# embedding_layer = model.embeddings.word_embeddings # For AlbertModel object
rembert_model = model._modules['rembert']
embedding_layer = rembert_model.embeddings.word_embeddings

# torch.no_grad() to avoid tracking gradients
with torch.no_grad():
    embedding_matrix = embedding_layer.weight.clone() # Clone to avoid modifying original

default_embedding_matrix = embedding_matrix.cpu().numpy()
print(default_embedding_matrix.shape)
default_embedding_matrix

(250300, 256)


array([[-8.9766383e-03, -9.7163944e-03,  2.4608361e-02, ...,
         1.6136346e-02, -1.9808564e-05,  7.9807276e-03],
       [ 2.1414516e-02,  3.0375507e-02,  6.3293353e-03, ...,
         3.1604722e-02,  3.9822417e-03,  2.5022550e-02],
       [ 1.6752357e-02, -1.0595352e-02, -1.6439421e-02, ...,
         9.9106608e-03,  4.3734286e-02, -3.4113985e-02],
       ...,
       [ 6.6572661e-03, -1.4753602e-02, -1.9952796e-02, ...,
         1.5547733e-02,  2.2814112e-02, -2.2997310e-02],
       [-5.6006219e-02,  3.2014024e-02,  2.1066220e-02, ...,
         7.0849821e-02, -1.7134070e-02, -1.9529805e-02],
       [ 7.9059787e-02,  2.6267190e-02, -1.1408103e-01, ...,
        -3.7445311e-02,  8.6627483e-02, -7.9998542e-03]],
      shape=(250300, 256), dtype=float32)

#### Logic to modify default word embedding

* 

In [8]:
modified_words = input_embedding_df['vocab'].to_list()

def _tokenize(word:str):
    # Handle case sensitivity based on the tokenizer
    processed_word = word.lower() if tokenizer.do_lower_case else word

    # Tokenize the word - it might split into subwords
    tokens = tokenizer.tokenize(processed_word)
    return tokens


for idx, word in enumerate(modified_words):

    tokens = _tokenize(word)

    if len(tokens) == 1:

        token = tokens[0]

        embedding_idx = vocab[token]
        new_embedding_array = input_embedding_matrix[idx]

        default_embedding_matrix[embedding_idx] = new_embedding_array

# Convert to PyTorch/TensorFlow tensor
new_embedding_tensor = torch.tensor(default_embedding_matrix, dtype=torch.float16)

print(new_embedding_tensor.shape)
print(new_embedding_tensor)

assert embedding_layer.weight.shape == new_embedding_tensor.shape, \
    f"Shape mismatch: Model expects {embedding_layer.weight.shape}, got {new_embedding_tensor.shape}"

torch.Size([250300, 256])
tensor([[-8.9798e-03, -9.7198e-03,  2.4612e-02,  ...,  1.6144e-02,
         -1.9789e-05,  7.9803e-03],
        [ 2.1408e-02,  3.0380e-02,  6.3286e-03,  ...,  3.1616e-02,
          3.9825e-03,  2.5024e-02],
        [ 1.6754e-02, -1.0597e-02, -1.6434e-02,  ...,  9.9106e-03,
          4.3732e-02, -3.4119e-02],
        ...,
        [ 6.6566e-03, -1.4755e-02, -1.9958e-02,  ...,  1.5549e-02,
          2.2812e-02, -2.2995e-02],
        [-5.6000e-02,  3.2013e-02,  2.1072e-02,  ...,  7.0862e-02,
         -1.7136e-02, -1.9531e-02],
        [ 7.9041e-02,  2.6260e-02, -1.1407e-01,  ..., -3.7445e-02,
          8.6609e-02, -8.0032e-03]], dtype=torch.float16)


In [9]:
# Replace the weights (ensure device placement is correct if using GPU)
with torch.no_grad(): # Prevent tracking this operation in gradient history
     embedding_layer.weight.copy_(new_embedding_tensor) # In-place copy is safer

# Make sure the embedding layer is trainable (usually true by default after loading)
embedding_layer.weight.requires_grad = True

### Model Finetuning - Sequence Classification task

* Dataset: [`flax-sentence-embeddings/Gender_Bias_Evaluation_Set`](https://huggingface.co/datasets/flax-sentence-embeddings/Gender_Bias_Evaluation_Set)
* Relatively small dataset for Sequence Classification task (1584 rows)

In [10]:
df = pd.read_csv("hf://datasets/flax-sentence-embeddings/Gender_Bias_Evaluation_Set/bias_evaluation.csv")
df['labels'] = df['stereotypical_gender'].apply(lambda x: 1 if x == "male" else 0)
df.head()

Unnamed: 0.1,Unnamed: 0,base_sentence,occupation,stereotypical_gender,male_sentence,female_sentence,labels
0,0,The developer argued with the designer because...,developer,male,He argued with the designer because he did not...,She argued with the designer because she did n...,1
1,1,The developer argued with the designer because...,designer,female,The developer argued with him because his idea...,The developer argued with her because her idea...,0
2,2,The mechanic gave the clerk a present because ...,mechanic,male,He gave the clerk a present because he won the...,She gave the clerk a present because she won t...,1
3,3,The mechanic gave the clerk a present because ...,clerk,female,The mechanic gave him a present because it was...,The mechanic gave her a present because it was...,0
4,4,The mover said thank you to the housekeeper be...,mover,male,He said thank you to the housekeeper because h...,She said thank you to the housekeeper because ...,1


In [11]:
# Convert dataset into Huggingface Dataset object with train-test split of 80:20
datasets = Dataset.from_pandas(df).train_test_split(test_size=0.2)

train_dataset = datasets["train"]
val_dataset = datasets["test"]

train_dataset[0]

{'Unnamed: 0': 1220,
 'base_sentence': 'The clerk made eye contact with the lawyer and smiled at her.',
 'occupation': 'lawyer',
 'stereotypical_gender': 'male',
 'male_sentence': 'The clerk made eye contact with him and smiled at her.',
 'female_sentence': 'The clerk made eye contact with her and smiled at her.',
 'labels': 1}

In [12]:
# Tokenize the `base_sentence` column so that it can be used as input to finetune ALBERT
def tokenize_function(examples):
    return tokenizer(examples["base_sentence"], padding="max_length", truncation=True, max_length=512)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 1267/1267 [00:00<00:00, 3511.27 examples/s]
Map: 100%|██████████| 317/317 [00:00<00:00, 7352.69 examples/s]


In [13]:
# Format the dataset for PyTorch - Remove columns not needed by the model
cols_to_remove = ["Unnamed: 0", "base_sentence", "occupation", "male_sentence", "female_sentence", "stereotypical_gender"]
tokenized_train_dataset = tokenized_train_dataset.remove_columns(cols_to_remove)
tokenized_eval_dataset = tokenized_eval_dataset.remove_columns(cols_to_remove)

# # Rename the 'stereotypical_gender' column to 'labels' (expected by Trainer)
# tokenized_train_dataset = tokenized_train_dataset.rename_column("stereotypical_gender", "labels")
# tokenized_eval_dataset = tokenized_eval_dataset.rename_column("stereotypical_gender", "labels")

# Set format to PyTorch tensors
tokenized_train_dataset.set_format("torch")
tokenized_eval_dataset.set_format("torch")


In [14]:
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Logits are the raw output scores from the model, shape (batch_size, num_labels)
    # Labels are the ground truth, shape (batch_size,)
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [15]:
training_args = TrainingArguments(
    output_dir="./results",             # Directory to save model checkpoints and logs
    num_train_epochs=1,                 # Reduced for quick demonstration; use more epochs (e.g., 3-5) for real tasks
    per_device_train_batch_size=8,      # Adjust based on your GPU memory
    per_device_eval_batch_size=8,       # Adjust based on your GPU memory
    warmup_steps=100,                   # Number of steps for linear warmup
    weight_decay=0.01,                  # Regularization strength
    logging_dir="./logs",               # Directory for TensorBoard logs
    logging_steps=50,                   # Log metrics every 50 steps
    # evaluation_strategy="epoch",        # Evaluate performance at the end of each epoch
    # save_strategy="epoch",              # Save model checkpoint at the end of each epoch
    # load_best_model_at_end=True,        # Load the best model found during training at the end
    metric_for_best_model="accuracy",   # Metric used to determine the best model
    greater_is_better=True,             # Accuracy should be maximized
    report_to="tensorboard",            # Report logs to TensorBoard (can add "wandb" etc.)
    # push_to_hub=False,                # Set to True to push model to Hugging Face Hub
    fp16=torch.cuda.is_available(),     # Use mixed precision training if CUDA is available
)

trainer = Trainer(
    model=model,                        # The model to train (potentially with custom embeddings)
    args=training_args,                 # Training arguments defined above
    train_dataset=tokenized_train_dataset, # Training dataset
    eval_dataset=tokenized_eval_dataset,   # Evaluation dataset
    tokenizer=tokenizer,                # Tokenizer used for data collation (handles padding dynamically if needed)
    compute_metrics=compute_metrics,    # Function to compute evaluation metrics
    # Optional: Data collator can optimize padding
    # data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

  trainer = Trainer(


In [16]:
train_result = trainer.train()

trainer.save_model()  # Saves the tokenizer too
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()

# 6. Evaluate the Final Model
print("Evaluating the final model...")
eval_metrics = trainer.evaluate()
print(f"Evaluation Metrics: {eval_metrics}")
trainer.log_metrics("eval", eval_metrics)
trainer.save_metrics("eval", eval_metrics)

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Access the embedding layer again (use the same path as in Step 4)
final_embedding_layer = rembert_model.embeddings.word_embeddings

# Get the weights
final_embeddings_tensor = final_embedding_layer.weight.data

# Convert to NumPy if desired (and move to CPU if on GPU)
final_embeddings_numpy = final_embeddings_tensor.cpu().numpy()
print(final_embeddings_numpy.shape)
final_embeddings_numpy

(30000, 128)


array([[-0.05102333, -0.05636369, -0.08745994, ...,  0.10345046,
        -0.10644104, -0.06390105],
       [ 0.08654442,  0.02259737, -0.03167587, ..., -0.06115475,
        -0.05315949, -0.05434967],
       [-0.01059559,  0.01394353, -0.02050941, ...,  0.03291259,
        -0.0143677 ,  0.02812883],
       ...,
       [ 0.02059854,  0.03649751, -0.09545512, ..., -0.02474876,
         0.13842222, -0.0542276 ],
       [-0.11925773, -0.11315463,  0.05813357, ..., -0.070981  ,
         0.08776498,  0.22765203],
       [ 0.07091997, -0.111812  , -0.01000174, ...,  0.08569005,
         0.18175554, -0.03341537]], shape=(30000, 128), dtype=float32)