In [1]:
# !pip install transformers
# !pip install numpy==1.26.0
# !pip install tensorflow[and-cuda]
# !pip install torch torchvision torchaudio
# !pip install --upgrade ipywidgets
# !pip install tf-keras
# !pip install pandas
# !pip install scikit-learn
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [47]:
!nvidia-smi

Mon Mar  3 02:46:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   72C    P0             32W /   70W |    5294MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!git clone https://github.com/avyas21/interpretablellm.git

Cloning into 'interpretablellm'...
remote: Enumerating objects: 55, done.[K
remote: Counting objects: 100% (55/55), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 55 (delta 18), reused 33 (delta 8), pack-reused 0 (from 0)[K
Receiving objects: 100% (55/55), 6.44 MiB | 6.47 MiB/s, done.
Resolving deltas: 100% (18/18), done.


In [3]:
%cd interpretablellm
!ls

/content/interpretablellm
baseline.ipynb	baseline_prompt.txt  data  README.md  setup_dataset.ipynb


## Data Preprocessing

In [4]:
import pandas as pd

In [5]:
train_data = pd.read_csv("data/train_data.csv")
test_data = pd.read_csv("data/test_data.csv")

In [6]:
POSITIVE_WORDS = ["positive", "great", "good", "happy", "amazing", "fantastic", "yes"]
NEGATIVE_WORDS = ["negative", "bad", "sad", "terrible", "horrible", "no", "critical"]

In [7]:
def convert_lbl_to_int(label):
    if label.lower() in POSITIVE_WORDS:
        return 1
    if label.lower() in NEGATIVE_WORDS:
        return 0
    return -1

## Baseline Model

In [8]:
import numpy as np
print(np.__version__)
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, BertConfig
from transformers import pipeline
from sklearn.metrics import f1_score
from torchinfo import summary
import random

1.26.4


In [9]:
def get_prompt(review):
  with open("baseline_prompt.txt", "r") as file:
    prompt = file.read()
  prompt = prompt.replace("<REVIEW>", review)
  return prompt

print(get_prompt("Test Prompt"))

Given a book review, classify it as expressing a positive or negative sentiment.
Review: Test Prompt
This review is either positive or negative sentiment. If one had to chosen, the sentiment in the review is [MASK].



In [10]:
# Load the BERT model and tokenizer
baseline_model = BertModel.from_pretrained("bert-base-uncased")
baseline_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
baseline_unmasker = pipeline('fill-mask', model='bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

In [11]:
def predict(review, unmasker):
  prompt = get_prompt(review)
  predictions = unmasker(prompt)
  valid_predictions = POSITIVE_WORDS + NEGATIVE_WORDS

  for prediction in predictions:
    if(prediction['token_str'] in valid_predictions):
      return prediction['token_str']

  for prediction in predictions:
    print(prediction['token_str'])

  sentiment = ["positive", "negative"]
  #If not found, lets predict random
  return "NOT FOUND" #random.choice(sentiment)

In [12]:
def predict_baseline(df, model, tokenizer, unmasker):
    predictions = []
    labels = []
    for idx, row in df.iterrows():
        input_text = row['Review']
        prediction = predict(input_text, unmasker)
        predictions.append(convert_lbl_to_int(prediction))
        labels.append(convert_lbl_to_int(row['Sentiment']))
    return predictions, labels



In [13]:
def score_baseline(baseline_model, df, baseline_tokenizer, baseline_unmasker):
    predictions, labels = predict_baseline(df, baseline_model, baseline_tokenizer, baseline_unmasker)
    values, counts = np.unique(np.array(predictions), return_counts=True)

    for v, c in zip(values, counts):
        print(f"Value: {v}, Count: {c}")

    return f1_score(labels, predictions, average='micro')


In [14]:
train_baseline_f1 = score_baseline(baseline_model, train_data, baseline_tokenizer, baseline_unmasker)
print(train_baseline_f1)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


KeyboardInterrupt: 

In [46]:
test_baseline_f1 = score_baseline(baseline_model, test_data, baseline_tokenizer, baseline_unmasker)
print(test_baseline_f1)

AttributeError: 'str' object has no attribute 'eval'

## Probe Models

In [14]:
import torch.nn as nn
import torch
from transformers import AutoTokenizer, BertModel
import pandas as pd

class CustomBERTModel(nn.Module):
    def __init__(self, n):
        super(CustomBERTModel, self).__init__()

        self.bert=  BertModel.from_pretrained("bert-base-uncased")
        self.bert.encoder.layer = nn.ModuleList(self.bert.encoder.layer[:n])

        for param in self.bert.parameters():
            param.requires_grad = False

        self.dropout = nn.Dropout(0.5)

        ### New layers:
        self.linear1 = nn.Linear(768, 256)
        self.linear2 = nn.Linear(256, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, ids, mask):
        output = self.bert(input_ids=ids, attention_mask=mask, output_hidden_states=True)

        linear1_output = self.linear1(output.last_hidden_state[:,0,:])
        linear2_output = self.linear2(self.dropout(linear1_output))
        sigmoid_output = self.sigmoid(linear2_output)
        return sigmoid_output


In [15]:
def get_custom_bert_model(num_bert_layers):
  gpu_available = torch.cuda.is_available()
  model = CustomBERTModel(num_bert_layers)

  if gpu_available:
    return model.to(torch.device("cuda"))
  return model

In [16]:
summary(get_custom_bert_model(4))

Layer (type:depth-idx)                                       Param #
CustomBERTModel                                              --
├─BertModel: 1-1                                             --
│    └─BertEmbeddings: 2-1                                   --
│    │    └─Embedding: 3-1                                   (23,440,896)
│    │    └─Embedding: 3-2                                   (393,216)
│    │    └─Embedding: 3-3                                   (1,536)
│    │    └─LayerNorm: 3-4                                   (1,536)
│    │    └─Dropout: 3-5                                     --
│    └─BertEncoder: 2-2                                      --
│    │    └─ModuleList: 3-6                                  (28,351,488)
│    └─BertPooler: 2-3                                       --
│    │    └─Linear: 3-7                                      (590,592)
│    │    └─Tanh: 3-8                                        --
├─Dropout: 1-2                                         

In [17]:
def get_inputs_labels(df):
    inputs = []
    labels = []
    for idx, row in train_data.iterrows():
        input_text = row['Review']
        inputs.append(input_text)
        labels.append(1 if row['Sentiment'] == 'positive' else 0)
    return inputs, labels


In [18]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label)
        }

In [19]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
inputs, labels = get_inputs_labels(train_data)
dataset = TextDataset(inputs, labels, tokenizer, 512)
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

In [20]:
def predict(model, df):
    model.eval()
    predictions = []
    labels = []
    gpu_available = torch.cuda.is_available()
    with torch.no_grad():
        for idx, row in df.iterrows():
            input_text = row['Review']
            encoding = tokenizer(input_text, add_special_tokens=True, max_length = 512, padding='max_length', truncation=True, return_tensors='pt')
            if gpu_available:
              input_ids = encoding['input_ids'].cuda()
              attention_mask = encoding['attention_mask'].cuda()
              prediction = model(encoding['input_ids'].cuda(), encoding['attention_mask'].cuda())
            else:
              input_ids = encoding['input_ids']
              attention_mask = encoding['attention_mask']
              prediction = model(encoding['input_ids'], encoding['attention_mask'])
            predictions.append(1 if prediction > 0.5 else 0)
            labels.append(convert_lbl_to_int(row['Sentiment']))

    return predictions, labels

In [21]:
def score_model(model, train_df, test_df):
    train_f1 = None
    test_f1 = None

    if train_df is not None:
        train_predictions, train_labels = predict(model, train_df)
        train_f1 = f1_score(train_labels, train_predictions, average='micro')

    if test_df is not None:
        test_predictions, test_labels = predict(model, test_df)
        test_f1 = f1_score(test_labels, test_predictions, average='micro')

    return train_f1, test_f1

# print(score_custom_model(model, train_data, test_data))

In [22]:
def get_loss(model, df):
    inputs, labels = get_inputs_labels(df)


In [23]:
print(torch.cuda.is_available())

True


In [24]:
def train_model(model, epochs, dataloader, train_df, learning_rate = 1e-3, calc_train_f1 = True):
    criterion = nn.BCELoss() ## If required define your own criterion
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr = learning_rate)
    gpu_available = torch.cuda.is_available()

    for epoch in range(epochs):
        for batch in dataloader:
            targets = np.array(batch['label'])
            targets = torch.tensor(np.expand_dims(targets,axis=1)).float()
            optimizer.zero_grad()
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']

            if gpu_available:
              targets = targets.cuda()
              input_ids = input_ids.cuda()
              attention_mask = attention_mask.cuda()

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

        if calc_train_f1:
          train_f1, _ = score_model(model, train_df, None)
          print("Epoch: " + str(epoch) + " F1: " + str(train_f1) + " LOSS: " + str(loss))

In [25]:
def score_all_probe_models(dataloader, train_df, test_df, epochs, max_n, learning_rate = 1e-3):
    model_scores = []
    for n in range(1,max_n + 1):
        print("N: " + str(n))
        model = get_custom_bert_model(n)
        train_model(model, epochs, dataloader, train_df, learning_rate)
        _, test_f1 = score_model(model, None, test_df)
        print("TEST F1: " + str(test_f1))
        model_scores.append([n, test_f1])

    return model_scores

In [26]:
model_scores = score_all_probe_models(dataloader, train_data, test_data, 30, 10, learning_rate = 1e-2)

N: 1


KeyboardInterrupt: 

## Scalar Mixing Weights

In [27]:
import torch.nn as nn
import torch
from transformers import AutoTokenizer, BertModel
import pandas as pd

class ScalarMixingWeightModel(nn.Module):
    def __init__(self, n, i):
        super(ScalarMixingWeightModel, self).__init__()

        self.bert= BertModel.from_pretrained("bert-base-uncased")
        self.bert.encoder.layer = nn.ModuleList(self.bert.encoder.layer[:n])
        self.n = n
        self.layer_weights = nn.Parameter(torch.ones(self.n))
        self.softmax = nn.Softmax(dim=0)
        self.i = i

        self.gamma = nn.Parameter(torch.ones(1))
        self.dropout = nn.Dropout(0.5)

        for param in self.bert.parameters():
            param.requires_grad = False

        ### New layers:
        if self.i == 1:
          self.linear1 = nn.Linear(768, 1)
        elif self.i == 2:
          self.linear1 = nn.Linear(768, 384)
          self.linear2 = nn.Linear(384, 1)
        elif self.i == 3:
          self.linear1 = nn.Linear(768, 384)
          self.linear2 = nn.Linear(384, 192)
          self.linear3 = nn.Linear(192, 1)
        elif self.i == 4:
          self.linear1 = nn.Linear(768, 384)
          self.linear2 = nn.Linear(384, 192)
          self.linear3 = nn.Linear(192, 96)
          self.linear4 = nn.Linear(96, 1)

        self.sigmoid = nn.Sigmoid()

    def forward(self, ids, mask):
        outputs = self.bert(input_ids=ids, attention_mask=mask, output_hidden_states=True)

        hidden_states = outputs.hidden_states[1:1 + self.n]

        normalized_weights = self.softmax(self.layer_weights)
        scalar_mixing_weight = self.gamma * sum(normalized_weights[i] * hidden_states[i] for i in range(self.n))

        if self.i == 1:
          linear1_output = self.dropout(self.linear1(scalar_mixing_weight[:, 0, :]))
          sigmoid_output = self.sigmoid(linear1_output)
        elif self.i == 2:
          linear1_output = self.linear1(scalar_mixing_weight[:, 0, :])
          linear2_output = self.linear2(self.dropout(linear1_output))
          sigmoid_output = self.sigmoid(linear2_output)
        elif self.i == 3:
          linear1_output = self.linear1(scalar_mixing_weight[:, 0, :])
          linear2_output = self.linear2(self.dropout(linear1_output))
          linear3_output = self.linear3(self.dropout(linear2_output))
          sigmoid_output = self.sigmoid(linear3_output)
        elif self.i == 4:
          linear1_output = self.linear1(scalar_mixing_weight[:, 0, :])
          linear2_output = self.linear2(self.dropout(linear1_output))
          linear3_output = self.linear3(self.dropout(linear2_output))
          linear4_output = self.linear4(self.dropout(linear3_output))
          sigmoid_output = self.sigmoid(linear4_output)

        return sigmoid_output


In [28]:
def get_scalar_mixing_model(num_bert_layers, i):
  gpu_available = torch.cuda.is_available()
  model = ScalarMixingWeightModel(num_bert_layers, i)

  if gpu_available:
    return model.to(torch.device("cuda"))

  return model

In [29]:
weights = {}
for i in range(1,11):
  test_scalar = get_scalar_mixing_model(i,1)
  train_model(test_scalar, 10, dataloader, train_data, learning_rate = 0.1)
  print(i, ":", test_scalar.layer_weights)
  weights[i] = test_scalar.layer_weights

Epoch: 0 F1: 0.592375 LOSS: tensor(0.7060, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Epoch: 1 F1: 0.586375 LOSS: tensor(0.7573, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Epoch: 2 F1: 0.708625 LOSS: tensor(0.5033, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Epoch: 3 F1: 0.708375 LOSS: tensor(0.6218, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Epoch: 4 F1: 0.737875 LOSS: tensor(0.5680, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Epoch: 5 F1: 0.690375 LOSS: tensor(0.4901, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Epoch: 6 F1: 0.738 LOSS: tensor(0.5440, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Epoch: 7 F1: 0.66975 LOSS: tensor(0.4664, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Epoch: 8 F1: 0.673 LOSS: tensor(0.5569, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Epoch: 9 F1: 0.734625 LOSS: tensor(0.4967, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
1 : Parameter con

In [32]:
lrs = [0.1, 0.01, 0.001, 0.0001]
for lr in lrs:
  for i in range(1,5):
    test_scalar = get_scalar_mixing_model(2, i)
    train_model(test_scalar, 10, dataloader, train_data, learning_rate = lr)
    print("Learning rate:", lr, "Number of layers", i)

Epoch: 0 F1: 0.5 LOSS: tensor(0.6902, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Epoch: 1 F1: 0.579 LOSS: tensor(0.6880, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Epoch: 2 F1: 0.747375 LOSS: tensor(0.6665, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Epoch: 3 F1: 0.76175 LOSS: tensor(0.4873, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Epoch: 4 F1: 0.754875 LOSS: tensor(0.4700, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Epoch: 5 F1: 0.771375 LOSS: tensor(0.3286, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Epoch: 6 F1: 0.817375 LOSS: tensor(0.4258, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Epoch: 7 F1: 0.808375 LOSS: tensor(0.3882, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Epoch: 8 F1: 0.7935 LOSS: tensor(0.6432, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Epoch: 9 F1: 0.827375 LOSS: tensor(0.3705, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Learning rate: 0.1 Nu

## Analysis

In [None]:
summary(baseline_model)

Layer (type:depth-idx)                                  Param #
BertModel                                               --
├─BertEmbeddings: 1-1                                   --
│    └─Embedding: 2-1                                   (23,440,896)
│    └─Embedding: 2-2                                   (393,216)
│    └─Embedding: 2-3                                   (1,536)
│    └─LayerNorm: 2-4                                   (1,536)
│    └─Dropout: 2-5                                     --
├─BertEncoder: 1-2                                      --
│    └─ModuleList: 2-6                                  --
│    │    └─BertLayer: 3-1                              (7,087,872)
│    │    └─BertLayer: 3-2                              (7,087,872)
│    │    └─BertLayer: 3-3                              (7,087,872)
│    │    └─BertLayer: 3-4                              (7,087,872)
│    │    └─BertLayer: 3-5                              (7,087,872)
│    │    └─BertLayer: 3-6            

In [None]:
summary( get_custom_bert_model(10))

Layer (type:depth-idx)                             Param #
CustomBERTModel                                    --
├─BertEmbeddings: 1-1                              --
│    └─Embedding: 2-1                              (23,440,896)
│    └─Embedding: 2-2                              (393,216)
│    └─Embedding: 2-3                              (1,536)
│    └─LayerNorm: 2-4                              (1,536)
│    └─Dropout: 2-5                                --
├─ModuleList: 1-2                                  --
│    └─BertLayer: 2-6                              --
│    │    └─BertAttention: 3-1                     (2,363,904)
│    │    └─BertIntermediate: 3-2                  (2,362,368)
│    │    └─BertOutput: 3-3                        (2,361,600)
│    └─BertLayer: 2-7                              --
│    │    └─BertAttention: 3-4                     (2,363,904)
│    │    └─BertIntermediate: 3-5                  (2,362,368)
│    │    └─BertOutput: 3-6                        (2,361,6