In [8]:
!pip install transformers
!pip install numpy==1.26.0
!pip install tensorflow[and-cuda]
!pip install torch torchvision torchaudio
!pip install --upgrade ipywidgets
!pip install tf-keras
!pip install pandas
!pip install scikit-learn
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [None]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [1]:
!git clone https://github.com/avyas21/interpretablellm.git

Cloning into 'interpretablellm'...
remote: Enumerating objects: 43, done.[K
remote: Counting objects: 100% (43/43), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 43 (delta 11), reused 34 (delta 8), pack-reused 0 (from 0)[K
Receiving objects: 100% (43/43), 6.41 MiB | 15.90 MiB/s, done.
Resolving deltas: 100% (11/11), done.


In [2]:
%cd interpretablellm
!ls

/content/interpretablellm
baseline.ipynb	baseline_prompt.txt  data  README.md  setup_dataset.ipynb


## Data Preprocessing

In [3]:
import pandas as pd

In [4]:
train_data = pd.read_csv("data/train_data.csv")
test_data = pd.read_csv("data/test_data.csv")

In [5]:
POSITIVE_WORDS = ["positive", "great", "good", "happy", "amazing", "fantastic", "yes"]
NEGATIVE_WORDS = ["negative", "bad", "sad", "terrible", "horrible", "no", "critical"]

In [6]:
def convert_lbl_to_int(label):
    if label.lower() in POSITIVE_WORDS:
        return 1
    if label.lower() in NEGATIVE_WORDS:
        return 0
    return -1

## Baseline Model

In [9]:
import numpy as np
print(np.__version__)
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
from transformers import pipeline
from sklearn.metrics import f1_score
from torchinfo import summary
import random

1.26.4


In [10]:
def get_prompt(review):
  with open("baseline_prompt.txt", "r") as file:
    prompt = file.read()
  prompt = prompt.replace("<REVIEW>", review)
  return prompt

print(get_prompt("Test Prompt"))

Given a book review, classify it as expressing a positive or negative sentiment.
Review: Test Prompt
This review is either positive or negative sentiment. If one had to chosen, the sentiment in the review is [MASK].



In [11]:
# Load the BERT model and tokenizer
baseline_model = BertModel.from_pretrained("bert-base-uncased")
baseline_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
baseline_unmasker = pipeline('fill-mask', model='bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

In [None]:
def predict(review, unmasker):
  prompt = get_prompt(review)
  predictions = unmasker(prompt)
  valid_predictions = POSITIVE_WORDS + NEGATIVE_WORDS

  for prediction in predictions:
    if(prediction['token_str'] in valid_predictions):
      return prediction['token_str']

  for prediction in predictions:
    print(prediction['token_str'])

  sentiment = ["positive", "negative"]
  #If not found, lets predict random
  return "NOT FOUND" #random.choice(sentiment)

In [None]:
def predict_baseline(df, model, tokenizer, unmasker):
    predictions = []
    labels = []
    for idx, row in df.iterrows():
        input_text = row['Review']
        prediction = predict(input_text, unmasker)
        predictions.append(convert_lbl_to_int(prediction))
        labels.append(convert_lbl_to_int(row['Sentiment']))
    return predictions, labels



In [None]:
def score_baseline(baseline_model, df, baseline_tokenizer, baseline_unmasker):
    predictions, labels = predict_baseline(df, baseline_model, baseline_tokenizer, baseline_unmasker)
    values, counts = np.unique(np.array(predictions), return_counts=True)

    for v, c in zip(values, counts):
        print(f"Value: {v}, Count: {c}")

    return f1_score(labels, predictions, average='micro')


In [None]:
train_baseline_f1 = score_baseline(baseline_model, train_data, baseline_tokenizer, baseline_unmasker)
print(train_baseline_f1)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


ridiculous
wrong
silly
right
absurd
Value: -1, Count: 1
Value: 0, Count: 4542
Value: 1, Count: 3457
0.67075


In [None]:
test_baseline_f1 = score_baseline(baseline_model, test_data, baseline_tokenizer, baseline_unmasker)
print(test_baseline_f1)

Value: 0, Count: 1143
Value: 1, Count: 857
0.6425


## Probe Models

In [12]:
import torch.nn as nn
import torch
from transformers import AutoTokenizer, BertModel
import pandas as pd

class CustomBERTModel(nn.Module):
    def __init__(self, bert_embedding_layer, bert_encoder_layers):
        super(CustomBERTModel, self).__init__()
        self.bert_embedding_layer = bert_embedding_layer
        self.bert_encoder_layers = bert_encoder_layers
        self.dropout = nn.Dropout(0.5)

        # Freeze the embedding layer
        for param in self.bert_embedding_layer.parameters():
            param.requires_grad = False

        for param in self.bert_encoder_layers.parameters():
            param.requires_grad = False

        ### New layers:
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(768, 256)
        self.linear2 = nn.Linear(256, 1)
        self.linear3 = nn.Linear(512, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, ids, mask):
        embedding_output = self.bert_embedding_layer(ids)
        encoder_outputs = embedding_output
        for i, layer_module in enumerate(self.bert_encoder_layers):
          encoder_outputs = layer_module(encoder_outputs)[0]

        linear1_output = self.dropout(self.linear1(encoder_outputs))
        linear2_output = self.dropout(self.linear2(linear1_output))
        flattened_output = self.flatten(linear2_output)
        linear3_output = self.dropout(self.linear3(flattened_output))
        sigmoid_output = self.sigmoid(linear3_output)
        return sigmoid_output


In [13]:
def get_custom_bert_model(num_bert_layers):
  gpu_available = torch.cuda.is_available()
  model = CustomBERTModel(baseline_model.embeddings, baseline_model.encoder.layer[:num_bert_layers])

  if gpu_available:
    return model.to(torch.device("cuda"))
  return model

In [14]:
summary(get_custom_bert_model(4))

Layer (type:depth-idx)                             Param #
CustomBERTModel                                    --
├─BertEmbeddings: 1-1                              --
│    └─Embedding: 2-1                              (23,440,896)
│    └─Embedding: 2-2                              (393,216)
│    └─Embedding: 2-3                              (1,536)
│    └─LayerNorm: 2-4                              (1,536)
│    └─Dropout: 2-5                                --
├─ModuleList: 1-2                                  --
│    └─BertLayer: 2-6                              --
│    │    └─BertAttention: 3-1                     (2,363,904)
│    │    └─BertIntermediate: 3-2                  (2,362,368)
│    │    └─BertOutput: 3-3                        (2,361,600)
│    └─BertLayer: 2-7                              --
│    │    └─BertAttention: 3-4                     (2,363,904)
│    │    └─BertIntermediate: 3-5                  (2,362,368)
│    │    └─BertOutput: 3-6                        (2,361,6

In [15]:
# bert_seq_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# print(inputs)
# with torch.no_grad():
#     logits = bert_seq_model(**inputs).logits
#     print(logits.shape)

# predicted_class_id = logits.argmax().item()
# print(bert_seq_model.config.id2label[predicted_class_id])

# summary(bert_seq_model)

In [16]:
def get_inputs_labels(df):
    inputs = []
    labels = []
    for idx, row in train_data.iterrows():
        input_text = row['Review']
        inputs.append(input_text)
        labels.append(1 if row['Sentiment'] == 'positive' else 0)
    return inputs, labels


In [17]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label)
        }

In [18]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
inputs, labels = get_inputs_labels(train_data)
dataset = TextDataset(inputs, labels, tokenizer, 512)
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

In [19]:
def predict(model, df):
    model.eval()
    predictions = []
    labels = []
    gpu_available = torch.cuda.is_available()
    with torch.no_grad():
        for idx, row in df.iterrows():
            input_text = row['Review']
            encoding = tokenizer(input_text, add_special_tokens=True, max_length = 512, padding='max_length', truncation=True, return_tensors='pt')
            if gpu_available:
              input_ids = encoding['input_ids'].cuda()
              attention_mask = encoding['attention_mask'].cuda()
              prediction = model(encoding['input_ids'].cuda(), encoding['attention_mask'].cuda())
            else:
              input_ids = encoding['input_ids']
              attention_mask = encoding['attention_mask']
              prediction = model(encoding['input_ids'], encoding['attention_mask'])
            predictions.append(1 if prediction > 0.5 else 0)
            labels.append(convert_lbl_to_int(row['Sentiment']))

    return predictions, labels

In [20]:
def score_model(model, train_df, test_df):
    train_f1 = None
    test_f1 = None

    if train_df is not None:
        train_predictions, train_labels = predict(model, train_df)
        train_f1 = f1_score(train_labels, train_predictions, average='micro')

    if test_df is not None:
        test_predictions, test_labels = predict(model, test_df)
        test_f1 = f1_score(test_labels, test_predictions, average='micro')

    return train_f1, test_f1

# print(score_custom_model(model, train_data, test_data))

In [21]:
def get_loss(model, df):
    inputs, labels = get_inputs_labels(df)


In [22]:
print(torch.cuda.is_available())

True


In [23]:
def train_model(model, epochs, dataloader, train_df):
    criterion = nn.BCELoss() ## If required define your own criterion
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
    gpu_available = torch.cuda.is_available()

    for epoch in range(epochs):
        for batch in dataloader:
            targets = np.array(batch['label'])
            targets = torch.tensor(np.expand_dims(targets,axis=1)).float()
            optimizer.zero_grad()
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']

            if gpu_available:
              targets = targets.cuda()
              input_ids = input_ids.cuda()
              attention_mask = attention_mask.cuda()

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, targets)

            loss.backward()
            optimizer.step()

        train_f1, _ = score_model(model, train_df, None)
        print("Epoch: " + str(epoch) + " F1: " + str(train_f1))

In [24]:
def score_all_probe_models(dataloader, train_df, test_df, epochs, max_n):
    model_scores = []
    for n in range(1,max_n + 1):
        print("N: " + str(n))
        model = get_custom_bert_model(n)
        train_model(model, epochs, dataloader, train_df)
        _, test_f1 = score_model(model, None, test_df)
        print("TEST F1: " + str(test_f1))
        model_scores.append([n, test_f1])

    return model_scores

In [102]:
model_scores = score_all_probe_models(dataloader, train_data, test_data, 10, 10)

N: 1


KeyboardInterrupt: 

## Scalar Mixing Weights

In [111]:
import torch.nn as nn
import torch
from transformers import AutoTokenizer, BertModel
import pandas as pd

class ScalarMixingWeightModel(nn.Module):
    def __init__(self, bert_embedding_layer, bert_encoder_layers):
        super(ScalarMixingWeightModel, self).__init__()
        self.bert_embedding_layer = bert_embedding_layer
        self.bert_encoder_layers = bert_encoder_layers
        self.gamma = nn.Parameter(torch.tensor(1.0))
        random_values = torch.rand(len(self.bert_encoder_layers))
        random_values = random_values / torch.sum(random_values)
        print(random_values)
        self.layer_weights = nn.ParameterList([nn.Parameter(torch.tensor(random_values[i])) for i in range(len(self.bert_encoder_layers))])

        assert len(self.bert_encoder_layers) == len(self.layer_weights)

        self.dropout = nn.Dropout(0.5)

        # Freeze the embedding layer
        for param in self.bert_embedding_layer.parameters():
            param.requires_grad = False

        for param in self.bert_encoder_layers.parameters():
            param.requires_grad = False

        ### New layers:
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(768, 256)
        self.linear2 = nn.Linear(256, 1)
        self.linear3 = nn.Linear(512, 1)
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=0)

    def forward(self, ids, mask):
        embedding_output = self.bert_embedding_layer(ids)
        encoder_output = embedding_output
        encoder_outputs = []

        for i, layer_module in enumerate(self.bert_encoder_layers):
          encoder_output = layer_module(encoder_output)[0]
          encoder_outputs.append(encoder_output)

        tensor_list = [param for param in self.layer_weights]
        scalar_mix_weights = torch.stack(tensor_list)
        scalar_mix_weights = self.softmax(scalar_mix_weights)
        print(scalar_mix_weights)

        final_encoder_output = (encoder_outputs[0] * scalar_mix_weights[0])

        for i in range(1, len(encoder_outputs)):
          final_encoder_output +=  (encoder_outputs[i] * scalar_mix_weights[i])

        final_encoder_output *= self.gamma
        print(self.gamma)

        linear1_output = self.dropout(self.linear1(final_encoder_output))
        linear2_output = self.dropout(self.linear2(linear1_output))
        flattened_output = self.flatten(linear2_output)
        linear3_output = self.dropout(self.linear3(flattened_output))
        sigmoid_output = self.sigmoid(linear3_output)
        return sigmoid_output


In [112]:
def get_scalar_mixing_model(num_bert_layers):
  gpu_available = torch.cuda.is_available()
  model = ScalarMixingWeightModel(baseline_model.embeddings, baseline_model.encoder.layer[:num_bert_layers])

  if gpu_available:
    return model.to(torch.device("cuda"))

  return model

In [113]:
test_scalar = get_scalar_mixing_model(3)
summary(test_scalar)

tensor([0.5523, 0.3808, 0.0669])


  self.layer_weights = nn.ParameterList([nn.Parameter(torch.tensor(random_values[i])) for i in range(len(self.bert_encoder_layers))])


Layer (type:depth-idx)                             Param #
ScalarMixingWeightModel                            1
├─BertEmbeddings: 1-1                              --
│    └─Embedding: 2-1                              (23,440,896)
│    └─Embedding: 2-2                              (393,216)
│    └─Embedding: 2-3                              (1,536)
│    └─LayerNorm: 2-4                              (1,536)
│    └─Dropout: 2-5                                --
├─ModuleList: 1-2                                  --
│    └─BertLayer: 2-6                              --
│    │    └─BertAttention: 3-1                     (2,363,904)
│    │    └─BertIntermediate: 3-2                  (2,362,368)
│    │    └─BertOutput: 3-3                        (2,361,600)
│    └─BertLayer: 2-7                              --
│    │    └─BertAttention: 3-4                     (2,363,904)
│    │    └─BertIntermediate: 3-5                  (2,362,368)
│    │    └─BertOutput: 3-6                        (2,361,60

In [114]:
train_model(test_scalar, 3, dataloader, train_data)

tensor([0.4069, 0.3427, 0.2504], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Parameter containing:
tensor(1., device='cuda:0', requires_grad=True)
tensor([0.4071, 0.3423, 0.2506], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Parameter containing:
tensor(0.9990, device='cuda:0', requires_grad=True)
tensor([0.4075, 0.3420, 0.2504], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Parameter containing:
tensor(0.9982, device='cuda:0', requires_grad=True)
tensor([0.4079, 0.3418, 0.2503], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Parameter containing:
tensor(0.9974, device='cuda:0', requires_grad=True)
tensor([0.4083, 0.3416, 0.2501], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Parameter containing:
tensor(0.9965, device='cuda:0', requires_grad=True)
tensor([0.4087, 0.3413, 0.2500], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Parameter containing:
tensor(0.9956, device='cuda:0', requires_grad=True)
tensor([0.4091, 0.3410, 0.2498], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Parameter cont

KeyboardInterrupt: 

In [31]:
print(test_scalar.layer_weights)

[Parameter containing:
tensor(1., requires_grad=True), Parameter containing:
tensor(1., requires_grad=True), Parameter containing:
tensor(1., requires_grad=True)]


## Analysis

In [None]:
summary(baseline_model)

Layer (type:depth-idx)                                  Param #
BertModel                                               --
├─BertEmbeddings: 1-1                                   --
│    └─Embedding: 2-1                                   (23,440,896)
│    └─Embedding: 2-2                                   (393,216)
│    └─Embedding: 2-3                                   (1,536)
│    └─LayerNorm: 2-4                                   (1,536)
│    └─Dropout: 2-5                                     --
├─BertEncoder: 1-2                                      --
│    └─ModuleList: 2-6                                  --
│    │    └─BertLayer: 3-1                              (7,087,872)
│    │    └─BertLayer: 3-2                              (7,087,872)
│    │    └─BertLayer: 3-3                              (7,087,872)
│    │    └─BertLayer: 3-4                              (7,087,872)
│    │    └─BertLayer: 3-5                              (7,087,872)
│    │    └─BertLayer: 3-6            

In [None]:
summary( get_custom_bert_model(10))

Layer (type:depth-idx)                             Param #
CustomBERTModel                                    --
├─BertEmbeddings: 1-1                              --
│    └─Embedding: 2-1                              (23,440,896)
│    └─Embedding: 2-2                              (393,216)
│    └─Embedding: 2-3                              (1,536)
│    └─LayerNorm: 2-4                              (1,536)
│    └─Dropout: 2-5                                --
├─ModuleList: 1-2                                  --
│    └─BertLayer: 2-6                              --
│    │    └─BertAttention: 3-1                     (2,363,904)
│    │    └─BertIntermediate: 3-2                  (2,362,368)
│    │    └─BertOutput: 3-3                        (2,361,600)
│    └─BertLayer: 2-7                              --
│    │    └─BertAttention: 3-4                     (2,363,904)
│    │    └─BertIntermediate: 3-5                  (2,362,368)
│    │    └─BertOutput: 3-6                        (2,361,6