In [68]:
import numpy as np
import pandas as pd
from utils.formats import load_hdf

import torch
from torch import nn
import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader

from datasets import Dataset
from transformers import PreTrainedModel, AutoModel, AutoTokenizer, AutoModelForSequenceClassification # Or your task-specific model
from transformers import TrainingArguments, Trainer
from transformers.configuration_utils import PretrainedConfig
from transformers import AlbertModel

### Huggingface model - [ALBERT](https://huggingface.co/docs/transformers/model_doc/albert#transformers.AlbertForPreTraining)

* Input Embedding Dimensionality cannot be too big. 
* Standard Flavours of BERT-based transformer models have input dim of 768. PPMi + Retrofitting takes too long to produce input embedding vectors.

In [69]:
MODEL_NAME = "albert/albert-base-v1"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME) # Specify num_labels for your task
# model = AlbertModel.from_pretrained(MODEL_NAME, torch_dtype=torch.float16) # Specify num_labels for your task

vocab = tokenizer.get_vocab()
vocab_size = len(vocab)
embedding_dim = model.config.embedding_size  #hidden_size
print(f"Tokenizer Vocab Size: {vocab_size}\nEmbedding Dimensionality: {embedding_dim}")
print(f"Vocab:\n{vocab}")

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizer Vocab Size: 30000
Embedding Dimensionality: 128
Vocab:


#### Load Retrofitted PPMI word embeddings for ALBERT (dim=128)

* Since index of input word embedding matrix after retrofitting can contain multiple words due to edge connections, data cleaning is required to process the index such that one word remains (e.g. `/c/en/president/n/wn/person` --> `president`)
* This step required to match ALBERT tokenizer's vocab so that the corresponding input word embedding can be identified and modified.

In [70]:
input_embedding = load_hdf("data/conceptnet_api/retrofit/retrofitted-albert-128")
input_embedding_df = input_embedding.reset_index()
input_embedding_df['vocab'] = input_embedding_df['index'].str.extract(r'/c/en/(\w+)/?')
input_embedding_df.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,119,120,121,122,123,124,125,126,127,vocab
0,/c/en/help_child,-0.000251,-0.010002,-0.006968,0.000776,0.0041,0.001745,-0.005115,-0.001407,3e-06,...,-0.043209,-0.044639,0.018009,-0.033039,-0.048886,0.055936,0.052766,0.071421,0.092698,help_child
1,/c/en/adult,-0.000234,-0.007938,-0.006174,0.000723,0.003105,0.001632,-0.003933,-0.001316,2e-06,...,-0.045798,-0.042103,0.058996,-0.069027,-0.040135,0.050881,0.049507,0.067248,0.077322,adult
2,/c/en/man,-0.00025,-0.005036,0.014826,0.000771,-0.01862,0.001739,0.024272,-0.001402,3e-06,...,-0.081502,-0.044615,0.222954,-0.129706,-0.03082,0.051938,0.053575,0.056078,0.090643,man
3,/c/en/sign_contract,-0.000251,-0.010131,-0.006857,0.00078,0.004132,0.001748,-0.005139,-0.00141,3e-06,...,-0.043298,-0.044735,0.018026,-0.033055,-0.048998,0.056059,0.05288,0.071576,0.092903,sign_contract
4,/c/en/dress_herself,-0.000251,-0.010131,-0.006857,0.00078,0.004132,0.001748,-0.005139,-0.00141,3e-06,...,-0.043298,-0.044735,0.018026,-0.033055,-0.048998,0.056059,0.05288,0.071576,0.092903,dress_herself


In [71]:
# convert retrofit ppimi word embedding into numpy matrix form
input_embedding_matrix = input_embedding.to_numpy()
print(input_embedding_matrix.shape)
input_embedding_matrix

(4081, 128)


array([[-0.00025054, -0.01000199, -0.006968  , ...,  0.05276616,
         0.07142138,  0.09269828],
       [-0.00023437, -0.00793784, -0.00617399, ...,  0.04950697,
         0.06724799,  0.07732166],
       [-0.00024969, -0.0050365 ,  0.01482603, ...,  0.05357489,
         0.0560779 ,  0.0906431 ],
       ...,
       [-0.00021273, -0.00120958, -0.00064862, ...,  0.04435906,
         0.05975946,  0.09372985],
       [-0.00021273, -0.00120958, -0.00064862, ...,  0.04435906,
         0.05975946,  0.09372985],
       [-0.00021273, -0.00120958, -0.00064862, ...,  0.04435906,
         0.05975946,  0.09372985]], shape=(4081, 128), dtype=float32)

In [72]:
# Get Embedding Weights of ALBERT model
# embedding_layer = model.embeddings.word_embeddings # For AlbertModel object
albert_model = model._modules['albert']
embedding_layer = albert_model.embeddings.word_embeddings

# torch.no_grad() to avoid tracking gradients
with torch.no_grad():
    embedding_matrix = embedding_layer.weight.clone() # Clone to avoid modifying original

default_embedding_matrix = embedding_matrix.cpu().numpy()
print(default_embedding_matrix.shape)
default_embedding_matrix

(30000, 128)


array([[-0.05101773, -0.05638105, -0.08745944, ...,  0.10348055,
        -0.1064435 , -0.06387638],
       [ 0.08651973,  0.02260554, -0.03166365, ..., -0.06117148,
        -0.05314829, -0.0543424 ],
       [-0.01066898,  0.01375878, -0.02094011, ...,  0.03288412,
        -0.01413923,  0.02857986],
       ...,
       [ 0.02059551,  0.03651065, -0.09545734, ..., -0.0247529 ,
         0.13839178, -0.05422436],
       [-0.11926416, -0.11318762,  0.05813185, ..., -0.07097802,
         0.08779413,  0.22770554],
       [ 0.07093989, -0.11180934, -0.01000072, ...,  0.08569918,
         0.1817395 , -0.03343155]], shape=(30000, 128), dtype=float32)

#### Logic to modify default word embedding

* 

In [73]:
modified_words = input_embedding_df['vocab'].to_list()

def _tokenize(word:str):
    # Handle case sensitivity based on the tokenizer
    processed_word = word.lower() if tokenizer.do_lower_case else word

    # Tokenize the word - it might split into subwords
    tokens = tokenizer.tokenize(processed_word)
    return tokens

modification_cache = dict() # store idx and words that were modified. 
for idx, word in enumerate(modified_words):

    tokens = _tokenize(word)

    if len(tokens) == 1:

        token = tokens[0]

        embedding_idx = vocab[token]

        modification_cache['/c/en/' + word] = embedding_idx

        new_embedding_array = input_embedding_matrix[idx]

        default_embedding_matrix[embedding_idx] = new_embedding_array

# Convert to PyTorch/TensorFlow tensor
new_embedding_tensor = torch.tensor(default_embedding_matrix, dtype=torch.float16)

print(new_embedding_tensor.shape)
print(new_embedding_tensor)

assert embedding_layer.weight.shape == new_embedding_tensor.shape, \
    f"Shape mismatch: Model expects {embedding_layer.weight.shape}, got {new_embedding_tensor.shape}"

torch.Size([30000, 128])
tensor([[-0.0510, -0.0564, -0.0875,  ...,  0.1035, -0.1064, -0.0639],
        [ 0.0865,  0.0226, -0.0317,  ..., -0.0612, -0.0532, -0.0544],
        [-0.0107,  0.0138, -0.0209,  ...,  0.0329, -0.0141,  0.0286],
        ...,
        [ 0.0206,  0.0365, -0.0955,  ..., -0.0247,  0.1384, -0.0542],
        [-0.1193, -0.1132,  0.0581,  ..., -0.0710,  0.0878,  0.2277],
        [ 0.0709, -0.1118, -0.0100,  ...,  0.0857,  0.1818, -0.0334]],
       dtype=torch.float16)


In [74]:
# Replace the weights (ensure device placement is correct if using GPU)
with torch.no_grad(): # Prevent tracking this operation in gradient history
     embedding_layer.weight.copy_(new_embedding_tensor) # In-place copy is safer

# Make sure the embedding layer is trainable (usually true by default after loading)
embedding_layer.weight.requires_grad = True

### Model Finetuning - Sequence Classification task

* Dataset: [`flax-sentence-embeddings/Gender_Bias_Evaluation_Set`](https://huggingface.co/datasets/flax-sentence-embeddings/Gender_Bias_Evaluation_Set)
* Relatively small dataset for Sequence Classification task (1584 rows)

In [75]:
df = pd.read_csv("hf://datasets/flax-sentence-embeddings/Gender_Bias_Evaluation_Set/bias_evaluation.csv")
df['labels'] = df['stereotypical_gender'].apply(lambda x: 1 if x == "male" else 0)
df.head()

Unnamed: 0.1,Unnamed: 0,base_sentence,occupation,stereotypical_gender,male_sentence,female_sentence,labels
0,0,The developer argued with the designer because...,developer,male,He argued with the designer because he did not...,She argued with the designer because she did n...,1
1,1,The developer argued with the designer because...,designer,female,The developer argued with him because his idea...,The developer argued with her because her idea...,0
2,2,The mechanic gave the clerk a present because ...,mechanic,male,He gave the clerk a present because he won the...,She gave the clerk a present because she won t...,1
3,3,The mechanic gave the clerk a present because ...,clerk,female,The mechanic gave him a present because it was...,The mechanic gave her a present because it was...,0
4,4,The mover said thank you to the housekeeper be...,mover,male,He said thank you to the housekeeper because h...,She said thank you to the housekeeper because ...,1


In [76]:
# Convert dataset into Huggingface Dataset object with train-test split of 80:20
datasets = Dataset.from_pandas(df).train_test_split(test_size=0.2)

train_dataset = datasets["train"]
val_dataset = datasets["test"]

train_dataset[0]

{'Unnamed: 0': 581,
 'base_sentence': 'The analyst sent a report to the editor because he put in a request yesterday.',
 'occupation': 'editor',
 'stereotypical_gender': 'female',
 'male_sentence': 'The analyst sent a report to him because he put in a request yesterday.',
 'female_sentence': 'The analyst sent a report to her because she put in a request yesterday.',
 'labels': 0}

In [77]:
# Tokenize the `base_sentence` column so that it can be used as input to finetune ALBERT
def tokenize_function(examples):
    return tokenizer(examples["base_sentence"], padding="max_length", truncation=True, max_length=512)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 1267/1267 [00:00<00:00, 4669.65 examples/s]
Map: 100%|██████████| 317/317 [00:00<00:00, 4920.14 examples/s]


In [78]:
# Format the dataset for PyTorch - Remove columns not needed by the model
cols_to_remove = ["Unnamed: 0", "base_sentence", "occupation", "male_sentence", "female_sentence", "stereotypical_gender"]
tokenized_train_dataset = tokenized_train_dataset.remove_columns(cols_to_remove)
tokenized_eval_dataset = tokenized_eval_dataset.remove_columns(cols_to_remove)

# # Rename the 'stereotypical_gender' column to 'labels' (expected by Trainer)
# tokenized_train_dataset = tokenized_train_dataset.rename_column("stereotypical_gender", "labels")
# tokenized_eval_dataset = tokenized_eval_dataset.rename_column("stereotypical_gender", "labels")

# Set format to PyTorch tensors
tokenized_train_dataset.set_format("torch")
tokenized_eval_dataset.set_format("torch")


In [79]:
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Logits are the raw output scores from the model, shape (batch_size, num_labels)
    # Labels are the ground truth, shape (batch_size,)
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [80]:
training_args = TrainingArguments(
    output_dir="./results",             # Directory to save model checkpoints and logs
    num_train_epochs=1,                 # Reduced for quick demonstration; use more epochs (e.g., 3-5) for real tasks
    per_device_train_batch_size=8,      # Adjust based on your GPU memory
    per_device_eval_batch_size=8,       # Adjust based on your GPU memory
    warmup_steps=100,                   # Number of steps for linear warmup
    weight_decay=0.01,                  # Regularization strength
    logging_dir="./logs",               # Directory for TensorBoard logs
    logging_steps=50,                   # Log metrics every 50 steps
    # evaluation_strategy="epoch",        # Evaluate performance at the end of each epoch
    # save_strategy="epoch",              # Save model checkpoint at the end of each epoch
    # load_best_model_at_end=True,        # Load the best model found during training at the end
    metric_for_best_model="accuracy",   # Metric used to determine the best model
    greater_is_better=True,             # Accuracy should be maximized
    report_to="tensorboard",            # Report logs to TensorBoard (can add "wandb" etc.)
    # push_to_hub=False,                # Set to True to push model to Hugging Face Hub
    fp16=torch.cuda.is_available(),     # Use mixed precision training if CUDA is available
)

trainer = Trainer(
    model=model,                        # The model to train (potentially with custom embeddings)
    args=training_args,                 # Training arguments defined above
    train_dataset=tokenized_train_dataset, # Training dataset
    eval_dataset=tokenized_eval_dataset,   # Evaluation dataset
    tokenizer=tokenizer,                # Tokenizer used for data collation (handles padding dynamically if needed)
    compute_metrics=compute_metrics,    # Function to compute evaluation metrics
    # Optional: Data collator can optimize padding
    # data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

  trainer = Trainer(


In [81]:
train_result = trainer.train()

trainer.save_model()  # Saves the tokenizer too
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()

# 6. Evaluate the Final Model
print("Evaluating the final model...")
eval_metrics = trainer.evaluate()
print(f"Evaluation Metrics: {eval_metrics}")
trainer.log_metrics("eval", eval_metrics)
trainer.save_metrics("eval", eval_metrics)

Step,Training Loss
50,0.6359
100,0.0775
150,0.0548


***** train metrics *****
  epoch                    =        1.0
  total_flos               =    28199GF
  train_loss               =     0.2429
  train_runtime            = 0:00:25.05
  train_samples_per_second =     50.567
  train_steps_per_second   =      6.346
Evaluating the final model...


Evaluation Metrics: {'eval_loss': 0.07681675255298615, 'eval_accuracy': 0.9905362776025236, 'eval_runtime': 1.7748, 'eval_samples_per_second': 178.609, 'eval_steps_per_second': 22.537, 'epoch': 1.0}
***** eval metrics *****
  epoch                   =        1.0
  eval_accuracy           =     0.9905
  eval_loss               =     0.0768
  eval_runtime            = 0:00:01.77
  eval_samples_per_second =    178.609
  eval_steps_per_second   =     22.537


In [82]:
# Access the embedding layer again (use the same path as in Step 4)
final_embedding_layer = albert_model.embeddings.word_embeddings

# Get the weights
final_embeddings_tensor = final_embedding_layer.weight.data

# Convert to NumPy if desired (and move to CPU if on GPU)
final_embeddings_numpy = final_embeddings_tensor.cpu().numpy()
print(final_embeddings_numpy.shape)
final_embeddings_numpy

(30000, 128)


array([[-0.05102333, -0.05636369, -0.08745995, ...,  0.10345047,
        -0.10644105, -0.06390106],
       [ 0.08654442,  0.02259737, -0.03167587, ..., -0.06115475,
        -0.05315949, -0.05434968],
       [-0.01078623,  0.01407648, -0.02098901, ...,  0.03270918,
        -0.01420622,  0.02845   ],
       ...,
       [ 0.02059854,  0.03649751, -0.09545513, ..., -0.02474876,
         0.13842224, -0.05422761],
       [-0.11925774, -0.11315463,  0.05813357, ..., -0.07098101,
         0.08776499,  0.22765204],
       [ 0.07091998, -0.111812  , -0.01000174, ...,  0.08569006,
         0.18175556, -0.03341537]], shape=(30000, 128), dtype=float32)

In [83]:
conceptnet_finetune_embeddings = dict()

for concept, idx in modification_cache.items():
    conceptnet_finetune_embeddings[concept] = final_embeddings_numpy[idx].tolist()

conceptnet_finetune_embeddings_df = pd.DataFrame.from_dict(conceptnet_finetune_embeddings, orient='index')
print(conceptnet_finetune_embeddings_df.shape)
conceptnet_finetune_embeddings_df.head()

(983, 128)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
/c/en/adult,-0.001131,-0.00703,-0.002296,0.003641,0.004207,0.007873,-0.001393,-0.006351,1.2e-05,0.011558,...,0.15246,-0.146967,-0.198356,-0.264637,0.271473,-0.278798,0.2683,0.23583,0.317858,0.498271
/c/en/man,4.2e-05,0.000893,-0.005074,0.000428,0.00085,0.001315,-0.001378,-0.000974,-0.000269,0.002604,...,0.006402,-0.03968,-0.040587,-0.039853,-0.541756,0.0206,0.027237,0.049466,0.058071,0.076559
/c/en/sheep,-0.000216,-0.001003,-0.000991,-0.002836,-0.009216,0.0015,0.008629,-0.00121,2e-06,0.002203,...,0.029082,-0.027984,-0.03781,-0.04965,0.051634,-0.053129,0.051115,0.044951,0.060544,0.094967
/c/en/character,-0.000216,-0.001989,-0.000668,0.000732,-0.006279,0.001506,0.008827,-0.001215,2e-06,0.002211,...,0.074033,-0.168206,-0.044615,0.788054,-0.067746,-0.02275,0.048887,0.042509,0.048033,0.088314
/c/en/pacifist,-0.000229,-0.001253,-0.000686,0.002687,-0.002437,0.001595,0.002428,-0.001286,2e-06,0.00234,...,0.030913,-0.029753,-0.04019,-0.052763,0.054868,-0.056455,0.054319,0.047758,0.064328,0.100948


In [84]:
conceptnet_finetune_embeddings_df.to_hdf(path_or_buf="data/ml_finetune/retrofitted-custom-albert-128", key='mat', encoding='utf-8')