<a href="https://colab.research.google.com/github/bayesmaxxing/gpt-text-classifier/blob/main/gpt_classifier_interpretability.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Purpose
Will use this notebook to investigate and try to understand the models I've trained. I would like to understand what parts of the message are important for classification, what the attention heads are focusing on, and what I can do to improve the model.

In [1]:
# Importing packages
!pip install datasets
!pip install transformers[torch]
!pip install evaluate

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1

In [47]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
from google.colab import userdata, files
import accelerate

In [48]:
# Load model and tokenizer here from local machine
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSequenceClassification.from_pretrained("bayesmaxxer/roberta-llm-classfier")
tokenizer = AutoTokenizer.from_pretrained("bayesmaxxer/roberta-llm-classfier")

model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [4]:
# Load test data to use in interp
fixed_length_test_gpts = pd.read_csv('/content/test_data_gpts.csv')
fixed_length_test_pplx = pd.read_csv('/content/test_data_pplx.csv')

# Add label encoding using Datasets
fixed_length_test_data = pd.concat([fixed_length_test_gpts, fixed_length_test_pplx], ignore_index=True)
fixed_length_test_data['label'] = fixed_length_test_data['model']

# Select relevant columns
fixed_length_test_data = fixed_length_test_data[['message', 'label']]

# Encode labels
label_encoder = LabelEncoder()
fixed_length_test_data['encoded_label'] = label_encoder.fit_transform(fixed_length_test_data['label'])

# Create id2label and label2id dictionaries
id2label = {id: label for id, label in enumerate(label_encoder.classes_)}
label2id = {label: id for id, label in id2label.items()}

In [25]:
from torch.utils.data import Dataset

tokenizer = AutoTokenizer.from_pretrained("bayesmaxxer/roberta-llm-classfier")

class TextClassificationDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['message']
        label = self.data.iloc[idx]['encoded_label']

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Create datasets
test_variable_dataset = TextClassificationDataset(fixed_length_test_data, tokenizer, max_length=512)

In [10]:
# setup compute_metrics here
import evaluate
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [53]:
training_args = TrainingArguments(
    output_dir = './test_results',
    eval_strategy = 'epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=64,
    num_train_epochs=1
)
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    eval_dataset=test_variable_dataset)

In [54]:
trainer.evaluate()

{'eval_loss': 0.5806668400764465,
 'eval_accuracy': 0.9166666666666666,
 'eval_runtime': 1.7464,
 'eval_samples_per_second': 34.357,
 'eval_steps_per_second': 0.573}

In [60]:
output_predictions = trainer.predict(test_variable_dataset)