In [3]:
import numpy as np
import pandas as pd
from utils.formats import load_hdf

import torch
from torch import nn
import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader

from datasets import Dataset
from transformers import PreTrainedModel, AutoModel, AutoTokenizer, DistilBertForSequenceClassification 
from transformers import TrainingArguments, Trainer
from transformers.configuration_utils import PretrainedConfig

### Huggingface model - [REMBERT](https://huggingface.co/docs/transformers/model_doc/rembert)

* Input Embedding Dimensionality cannot be too big. 
* Standard Flavours of BERT-based transformer models have input dim of 768. PPMi + Retrofitting takes too long to produce input embedding vectors.

In [6]:
MODEL_NAME = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME) # Specify num_labels for your task
# model = AlbertModel.from_pretrained(MODEL_NAME, torch_dtype=torch.float16) # Specify num_labels for your task

vocab = tokenizer.get_vocab()
vocab_size = len(vocab)
embedding_dim = model.config.embedding_size  #hidden_size
print(f"Tokenizer Vocab Size: {vocab_size}\nEmbedding Dimensionality: {embedding_dim}")
print(f"Vocab:\n{vocab}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: 'DistilBertConfig' object has no attribute 'embedding_size'

In [8]:
vars(model)

{'training': False,
 '_parameters': {},
 '_buffers': {},
 '_non_persistent_buffers_set': set(),
 '_backward_pre_hooks': OrderedDict(),
 '_backward_hooks': OrderedDict(),
 '_is_full_backward_hook': None,
 '_forward_hooks': OrderedDict(),
 '_forward_hooks_with_kwargs': OrderedDict(),
 '_forward_hooks_always_called': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_forward_pre_hooks_with_kwargs': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_state_dict_pre_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_load_state_dict_post_hooks': OrderedDict(),
 '_modules': {'distilbert': DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
  

#### Load Retrofitted PPMI word embeddings for ALBERT (dim=128)

* Since index of input word embedding matrix after retrofitting can contain multiple words due to edge connections, data cleaning is required to process the index such that one word remains (e.g. `/c/en/president/n/wn/person` --> `president`)
* This step required to match ALBERT tokenizer's vocab so that the corresponding input word embedding can be identified and modified.

In [6]:
input_embedding = load_hdf("data/conceptnet_api/retrofit/retrofitted-albert-128")
input_embedding_df = input_embedding.reset_index()
input_embedding_df['vocab'] = input_embedding_df['index'].str.extract(r'/c/en/(\w+)/?')
input_embedding_df.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,119,120,121,122,123,124,125,126,127,vocab
0,/c/en/chair_meeting,-0.001778,0.007031,0.002296,0.00364,0.004209,-0.007875,0.001394,0.006352,0.004029,...,0.146947,0.198373,-0.264603,0.27145,-0.278929,-0.268215,-0.235852,-0.317816,-0.498361,chair_meeting
1,/c/en/chairperson,-0.001778,0.007031,0.002296,0.00364,0.004209,-0.007875,0.001394,0.006352,0.004029,...,0.146947,0.198373,-0.264603,0.27145,-0.278929,-0.268215,-0.235852,-0.317816,-0.498361,chairperson
2,/c/en/chair,-0.001778,0.007031,0.002296,0.00364,0.004209,-0.007875,0.001394,0.006352,0.004029,...,0.146947,0.198373,-0.264603,0.27145,-0.278929,-0.268215,-0.235852,-0.317816,-0.498361,chair
3,/c/en/chairperson/n,-0.001778,0.007031,0.002296,0.00364,0.004209,-0.007875,0.001394,0.006352,0.004029,...,0.146947,0.198373,-0.264603,0.27145,-0.278929,-0.268215,-0.235852,-0.317816,-0.498361,chairperson
4,/c/en/president/n/wn/person,-0.001778,0.007031,0.002296,0.00364,0.004209,-0.007875,0.001394,0.006352,0.004029,...,0.146947,0.198373,-0.264603,0.27145,-0.278929,-0.268215,-0.235852,-0.317816,-0.498361,president


In [8]:
# convert retrofit ppimi word embedding into numpy matrix form
input_embedding_matrix = input_embedding.to_numpy()
print(input_embedding_matrix.shape)
input_embedding_matrix

(4081, 128)


array([[-1.7784444e-03,  7.0306961e-03,  2.2962685e-03, ...,
        -2.3585208e-01, -3.1781605e-01, -4.9836105e-01],
       [-1.7784444e-03,  7.0306961e-03,  2.2962685e-03, ...,
        -2.3585208e-01, -3.1781605e-01, -4.9836105e-01],
       [-1.7784444e-03,  7.0306961e-03,  2.2962685e-03, ...,
        -2.3585208e-01, -3.1781605e-01, -4.9836105e-01],
       ...,
       [-3.5266747e-04,  1.3941947e-03,  4.5535257e-04, ...,
        -4.6769723e-02, -6.3023269e-02, -9.8825537e-02],
       [-3.5266747e-04,  1.3941947e-03,  4.5535257e-04, ...,
        -4.6769723e-02, -6.3023269e-02, -9.8825537e-02],
       [-3.5266747e-04,  1.3941947e-03,  4.5535257e-04, ...,
        -4.6769723e-02, -6.3023269e-02, -9.8825537e-02]],
      shape=(4081, 128), dtype=float32)

In [11]:
# Get Embedding Weights of ALBERT model
# embedding_layer = model.embeddings.word_embeddings # For AlbertModel object
albert_model = model._modules['albert']
embedding_layer = albert_model.embeddings.word_embeddings

# torch.no_grad() to avoid tracking gradients
with torch.no_grad():
    embedding_matrix = embedding_layer.weight.clone() # Clone to avoid modifying original

default_embedding_matrix = embedding_matrix.cpu().numpy()
print(default_embedding_matrix.shape)
default_embedding_matrix

(30000, 128)


array([[-0.05101773, -0.05638105, -0.08745944, ...,  0.10348055,
        -0.1064435 , -0.06387638],
       [ 0.08651973,  0.02260554, -0.03166365, ..., -0.06117148,
        -0.05314829, -0.0543424 ],
       [-0.01066898,  0.01375878, -0.02094011, ...,  0.03288412,
        -0.01413923,  0.02857986],
       ...,
       [ 0.02059551,  0.03651065, -0.09545734, ..., -0.0247529 ,
         0.13839178, -0.05422436],
       [-0.11926416, -0.11318762,  0.05813185, ..., -0.07097802,
         0.08779413,  0.22770554],
       [ 0.07093989, -0.11180934, -0.01000072, ...,  0.08569918,
         0.1817395 , -0.03343155]], shape=(30000, 128), dtype=float32)

#### Logic to modify default word embedding

* 

In [14]:
modified_words = input_embedding_df['vocab'].to_list()

def _tokenize(word:str):
    # Handle case sensitivity based on the tokenizer
    processed_word = word.lower() if tokenizer.do_lower_case else word

    # Tokenize the word - it might split into subwords
    tokens = tokenizer.tokenize(processed_word)
    return tokens


for idx, word in enumerate(modified_words):

    tokens = _tokenize(word)

    if len(tokens) == 1:

        token = tokens[0]

        embedding_idx = vocab[token]
        new_embedding_array = input_embedding_matrix[idx]

        default_embedding_matrix[embedding_idx] = new_embedding_array

# Convert to PyTorch/TensorFlow tensor
new_embedding_tensor = torch.tensor(default_embedding_matrix, dtype=torch.float16)

print(new_embedding_tensor.shape)
print(new_embedding_tensor)

assert embedding_layer.weight.shape == new_embedding_tensor.shape, \
    f"Shape mismatch: Model expects {embedding_layer.weight.shape}, got {new_embedding_tensor.shape}"

torch.Size([30000, 128])
tensor([[-0.0510, -0.0564, -0.0875,  ...,  0.1035, -0.1064, -0.0639],
        [ 0.0865,  0.0226, -0.0317,  ..., -0.0612, -0.0532, -0.0544],
        [-0.0107,  0.0138, -0.0209,  ...,  0.0329, -0.0141,  0.0286],
        ...,
        [ 0.0206,  0.0365, -0.0955,  ..., -0.0247,  0.1384, -0.0542],
        [-0.1193, -0.1132,  0.0581,  ..., -0.0710,  0.0878,  0.2277],
        [ 0.0709, -0.1118, -0.0100,  ...,  0.0857,  0.1818, -0.0334]],
       dtype=torch.float16)


In [15]:
# Replace the weights (ensure device placement is correct if using GPU)
with torch.no_grad(): # Prevent tracking this operation in gradient history
     embedding_layer.weight.copy_(new_embedding_tensor) # In-place copy is safer

# Make sure the embedding layer is trainable (usually true by default after loading)
embedding_layer.weight.requires_grad = True

### Model Finetuning - Sequence Classification task

* Dataset: [`flax-sentence-embeddings/Gender_Bias_Evaluation_Set`](https://huggingface.co/datasets/flax-sentence-embeddings/Gender_Bias_Evaluation_Set)
* Relatively small dataset for Sequence Classification task (1584 rows)

In [16]:
df = pd.read_csv("hf://datasets/flax-sentence-embeddings/Gender_Bias_Evaluation_Set/bias_evaluation.csv")
df['labels'] = df['stereotypical_gender'].apply(lambda x: 1 if x == "male" else 0)
df.head()

Unnamed: 0.1,Unnamed: 0,base_sentence,occupation,stereotypical_gender,male_sentence,female_sentence,labels
0,0,The developer argued with the designer because...,developer,male,He argued with the designer because he did not...,She argued with the designer because she did n...,1
1,1,The developer argued with the designer because...,designer,female,The developer argued with him because his idea...,The developer argued with her because her idea...,0
2,2,The mechanic gave the clerk a present because ...,mechanic,male,He gave the clerk a present because he won the...,She gave the clerk a present because she won t...,1
3,3,The mechanic gave the clerk a present because ...,clerk,female,The mechanic gave him a present because it was...,The mechanic gave her a present because it was...,0
4,4,The mover said thank you to the housekeeper be...,mover,male,He said thank you to the housekeeper because h...,She said thank you to the housekeeper because ...,1


In [17]:
# Convert dataset into Huggingface Dataset object with train-test split of 80:20
datasets = Dataset.from_pandas(df).train_test_split(test_size=0.2)

train_dataset = datasets["train"]
val_dataset = datasets["test"]

train_dataset[0]

{'Unnamed: 0': 1022,
 'base_sentence': "The developer needed the clerk 's help with paperwork and asked him.",
 'occupation': 'clerk',
 'stereotypical_gender': 'female',
 'male_sentence': "The developer needed him 's help with paperwork and asked him.",
 'female_sentence': "The developer needed her 's help with paperwork and asked him.",
 'labels': 0}

In [18]:
# Tokenize the `base_sentence` column so that it can be used as input to finetune ALBERT
def tokenize_function(examples):
    return tokenizer(examples["base_sentence"], padding="max_length", truncation=True, max_length=512)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 1267/1267 [00:00<00:00, 4439.69 examples/s]
Map: 100%|██████████| 317/317 [00:00<00:00, 8325.68 examples/s]


In [19]:
# Format the dataset for PyTorch - Remove columns not needed by the model
cols_to_remove = ["Unnamed: 0", "base_sentence", "occupation", "male_sentence", "female_sentence", "stereotypical_gender"]
tokenized_train_dataset = tokenized_train_dataset.remove_columns(cols_to_remove)
tokenized_eval_dataset = tokenized_eval_dataset.remove_columns(cols_to_remove)

# # Rename the 'stereotypical_gender' column to 'labels' (expected by Trainer)
# tokenized_train_dataset = tokenized_train_dataset.rename_column("stereotypical_gender", "labels")
# tokenized_eval_dataset = tokenized_eval_dataset.rename_column("stereotypical_gender", "labels")

# Set format to PyTorch tensors
tokenized_train_dataset.set_format("torch")
tokenized_eval_dataset.set_format("torch")


In [20]:
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Logits are the raw output scores from the model, shape (batch_size, num_labels)
    # Labels are the ground truth, shape (batch_size,)
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [25]:
training_args = TrainingArguments(
    output_dir="./results",             # Directory to save model checkpoints and logs
    num_train_epochs=1,                 # Reduced for quick demonstration; use more epochs (e.g., 3-5) for real tasks
    per_device_train_batch_size=8,      # Adjust based on your GPU memory
    per_device_eval_batch_size=8,       # Adjust based on your GPU memory
    warmup_steps=100,                   # Number of steps for linear warmup
    weight_decay=0.01,                  # Regularization strength
    logging_dir="./logs",               # Directory for TensorBoard logs
    logging_steps=50,                   # Log metrics every 50 steps
    # evaluation_strategy="epoch",        # Evaluate performance at the end of each epoch
    # save_strategy="epoch",              # Save model checkpoint at the end of each epoch
    # load_best_model_at_end=True,        # Load the best model found during training at the end
    metric_for_best_model="accuracy",   # Metric used to determine the best model
    greater_is_better=True,             # Accuracy should be maximized
    report_to="tensorboard",            # Report logs to TensorBoard (can add "wandb" etc.)
    # push_to_hub=False,                # Set to True to push model to Hugging Face Hub
    fp16=torch.cuda.is_available(),     # Use mixed precision training if CUDA is available
)

trainer = Trainer(
    model=model,                        # The model to train (potentially with custom embeddings)
    args=training_args,                 # Training arguments defined above
    train_dataset=tokenized_train_dataset, # Training dataset
    eval_dataset=tokenized_eval_dataset,   # Evaluation dataset
    tokenizer=tokenizer,                # Tokenizer used for data collation (handles padding dynamically if needed)
    compute_metrics=compute_metrics,    # Function to compute evaluation metrics
    # Optional: Data collator can optimize padding
    # data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

  trainer = Trainer(


In [26]:
train_result = trainer.train()

trainer.save_model()  # Saves the tokenizer too
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()

# 6. Evaluate the Final Model
print("Evaluating the final model...")
eval_metrics = trainer.evaluate()
print(f"Evaluation Metrics: {eval_metrics}")
trainer.log_metrics("eval", eval_metrics)
trainer.save_metrics("eval", eval_metrics)

Step,Training Loss
50,0.7085
100,0.1665
150,0.021


***** train metrics *****
  epoch                    =        1.0
  total_flos               =    28199GF
  train_loss               =     0.2818
  train_runtime            = 0:00:24.58
  train_samples_per_second =     51.531
  train_steps_per_second   =      6.467
Evaluating the final model...


Evaluation Metrics: {'eval_loss': 0.00017151095380540937, 'eval_accuracy': 1.0, 'eval_runtime': 1.7117, 'eval_samples_per_second': 185.193, 'eval_steps_per_second': 23.368, 'epoch': 1.0}
***** eval metrics *****
  epoch                   =        1.0
  eval_accuracy           =        1.0
  eval_loss               =     0.0002
  eval_runtime            = 0:00:01.71
  eval_samples_per_second =    185.193
  eval_steps_per_second   =     23.368


In [27]:
# Access the embedding layer again (use the same path as in Step 4)
final_embedding_layer = albert_model.embeddings.word_embeddings

# Get the weights
final_embeddings_tensor = final_embedding_layer.weight.data

# Convert to NumPy if desired (and move to CPU if on GPU)
final_embeddings_numpy = final_embeddings_tensor.cpu().numpy()
print(final_embeddings_numpy.shape)
final_embeddings_numpy

(30000, 128)


array([[-0.05102333, -0.05636369, -0.08745994, ...,  0.10345046,
        -0.10644104, -0.06390105],
       [ 0.08654442,  0.02259737, -0.03167587, ..., -0.06115475,
        -0.05315949, -0.05434967],
       [-0.01059559,  0.01394353, -0.02050941, ...,  0.03291259,
        -0.0143677 ,  0.02812883],
       ...,
       [ 0.02059854,  0.03649751, -0.09545512, ..., -0.02474876,
         0.13842222, -0.0542276 ],
       [-0.11925773, -0.11315463,  0.05813357, ..., -0.070981  ,
         0.08776498,  0.22765203],
       [ 0.07091997, -0.111812  , -0.01000174, ...,  0.08569005,
         0.18175554, -0.03341537]], shape=(30000, 128), dtype=float32)

In [7]:
from transformers import MobileBertConfig, MobileBertModel

# Initializing a MobileBERT configuration
configuration = MobileBertConfig()

# Initializing a model (with random weights) from the configuration above
model = MobileBertModel(configuration)

# Accessing the model configuration
configuration = model.config

vocab = tokenizer.get_vocab()
vocab_size = len(vocab)
embedding_dim = model.config.hidden_size
print(vocab_size, embedding_dim)

30000 512


In [12]:
from transformers import RemBertModel, RemBertConfig

# Initializing a RemBERT rembert style configuration
configuration = RemBertConfig()

# Initializing a model from the rembert style configuration
model = RemBertModel(configuration)

vocab = tokenizer.get_vocab()
vocab_size = len(vocab)
embedding_dim = model.config.input_embedding_size 
print(vocab_size, embedding_dim)

250680 256


In [13]:
# Example for BERT-like models (check your specific model structure)
embedding_layer = model.bert.embeddings.word_embeddings # Adjust path as needed (e.g., model.roberta...)

# Check dimensions match
print(embedding_layer.weight.shape)

torch.Size([30522, 768])


### Custom Dataset with Pre-Computed Embeddings

In [None]:
class EmbeddingDataset(Dataset):
    def __init__(self, embeddings, labels):
        """
        Args:
            embeddings: Pre-computed embeddings (numpy array or torch tensor)
                       Shape: (num_samples, seq_length, embedding_dim)
            labels: Corresponding labels
        """
        self.embeddings = torch.tensor(embeddings, dtype=torch.float)
        self.labels = torch.tensor(labels, dtype=torch.long)
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'input_embeddings': self.embeddings[idx],
            'labels': self.labels[idx]
        }

In [10]:
def convert_labels_to_int(labels):
    """
    Convert various label formats to integer labels
    
    Args:
        labels: Could be strings, one-hot, etc.
    
    Returns:
        numpy array of integer labels
    """
    if isinstance(labels[0], str):
        # String labels to integers
        unique_labels = sorted(set(labels))
        label_to_int = {label: i for i, label in enumerate(unique_labels)}
        return np.array([label_to_int[label] for label in labels])
    elif len(labels.shape) > 1 and labels.shape[1] > 1:
        # One-hot to integers
        return np.argmax(labels, axis=1)
    else:
        # Already integers or binary
        return labels.astype(int)
    
labels = convert_labels_to_int(input_embedding.index.to_list())
labels

array([ 593,  595,  590, ..., 2518, 2514, 2516], shape=(4081,))

In [11]:
class EmbeddingDataset(Dataset):
    def __init__(self, embeddings, labels):
        """
        Args:
            embeddings: Pre-computed embeddings (numpy array or torch tensor)
                       Shape: (num_samples, seq_length, embedding_dim)
            labels: Corresponding labels
        """
        self.embeddings = torch.tensor(embeddings, dtype=torch.float)
        self.labels = torch.tensor(labels, dtype=torch.long)
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'input_embeddings': self.embeddings[idx],
            'labels': self.labels[idx]
        }
    
dataset = EmbeddingDataset(embedding_array, labels)

In [22]:
class EmbeddingConfig(PretrainedConfig):
    def __init__(self, embedding_dim=300, **kwargs):
        super().__init__(**kwargs)
        self.embedding_dim = embedding_dim

class EmbeddingModel(PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.transformer = AutoModel.from_pretrained("bert-base-uncased")
        self.classifier = nn.Linear(config.embedding_dim, config.num_labels)
        
    def forward(self, input_embeddings, attention_mask=None, labels=None):
        # Bypass the embedding layer and use pre-computed embeddings
        outputs = self.transformer(
            inputs_embeds=input_embeddings,
            attention_mask=attention_mask
        )
        
        # Use [CLS] token for classification
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])
        
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits.view(-1, self.config.num_labels), labels.view(-1))
            
        return {
            'loss': loss,
            'logits': logits,
            'hidden_states': outputs.hidden_states
        }

# Initialize model
config = EmbeddingConfig(embedding_dim=300, num_labels=4081)
model = EmbeddingModel(config)

In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

train_loader = DataLoader(dataset, batch_size=16, shuffle=True)

model.train()
for epoch in range(3):
    for batch in train_loader:
        optimizer.zero_grad()
        
        inputs = batch['input_embeddings'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_embeddings=inputs, labels=labels)
        loss = outputs['loss']
        loss.backward()
        optimizer.step()
        
        print(f"Loss: {loss.item()}")

ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
class EmbeddingTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
    
    def _prepare_inputs(self, inputs):
        prepared = {}
        for k, v in inputs.items():
            if k == 'input_embeddings':
                prepared['inputs_embeds'] = v.to(self.args.device)
            elif isinstance(v, torch.Tensor):
                prepared[k] = v.to(self.args.device)
            else:
                prepared[k] = v
        return prepared

# Usage with TrainingArguments
training_args = TrainingArguments(
    output_dir="./embedding_results",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
)

trainer = EmbeddingTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()