In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, Trainer, TrainingArguments, DistilBertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import SequenceClassifierOutput

## BERT Classifier (DistilBERT)

#### We chose a BERT Classifier for our classification task because we wanted to experiment with a more robust model using a transformer architecture versus a more basic Logistic Regression or SVM model.  Upon researching BERT, we discovered that DistilBERT is a good option compared to traditional BERT by itself.  DistilBERT uses distillation (learns to approximate from a teacher model - BERT) and can really useful producing powerful results with a smaller/faster model.

In [2]:
data = pd.read_csv('llm_source_attribution_data.csv')
data.head()

Unnamed: 0,Question Number,Question,Notes While Testing,Question Category,LLM,Model Version,Explicit Attribution,Date,Attributed Sources,Number of Sources,Are All Source Links Functional,Ads Included in Response,Multi-modal Response
0,1,"""where does the saying keeping up with the jon...",,General Knowledge,gemini,gemini_1.5_pro,explicit_attribution,11/2/24,"""www.commonlit.org, www.history.howstuffworks....",2.0,all_links_functional,no_ads,text_only
1,2,"""when did day light savings start in the us""",,History,gemini,gemini_1.5_pro,explicit_attribution,11/2/24,"""www.wikipedia.org""",1.0,all_links_functional,no_ads,text_only
2,3,"""what is the doll in the garden about""",,Entertainment,gemini,gemini_1.5_pro,explicit_attribution,11/2/24,"""www.goodreads.com, amazon.com, www.publishers...",3.0,all_links_functional,no_ads,text_only
3,4,"""where is a unitary system of government found""",,Politics,gemini,gemini_1.5_pro,explicit_attribution,11/2/24,"""www.britannica.com, www.guides.skylinecollege...",3.0,all_links_functional,no_ads,text_only
4,5,"""who dies in season 2 of the originals""",,Entertainment,gemini,gemini_1.5_pro,explicit_attribution,11/2/24,"""www.wikipedia.org""",1.0,all_links_functional,no_ads,text_only


In [3]:
bert_data = data.drop(columns=['Question Number', 'Notes While Testing', 'LLM', 'Model Version', 'Explicit Attribution', 'Date', 'Attributed Sources', 'Number of Sources', 'Are All Source Links Functional', 'Ads Included in Response', 'Multi-modal Response'])
bert_data.head()

Unnamed: 0,Question,Question Category
0,"""where does the saying keeping up with the jon...",General Knowledge
1,"""when did day light savings start in the us""",History
2,"""what is the doll in the garden about""",Entertainment
3,"""where is a unitary system of government found""",Politics
4,"""who dies in season 2 of the originals""",Entertainment


In [4]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

X_basic = bert_data['Question'][:99]
y_basic = bert_data['Question Category'][:99]

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_basic)

X_train, X_test, y_train, y_test = train_test_split(X_basic, y_encoded, test_size=0.2, random_state=42)

class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

train_dataset = Dataset.from_dict({"text": list(X_train), "labels": list(y_train)})
test_dataset = Dataset.from_dict({"text": list(X_test), "labels": list(y_test)})
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

encoded_dataset = dataset.map(tokenize_function, batched=True)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted", zero_division=0)
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


class WeightedLossModel(DistilBertForSequenceClassification):
    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask, labels=None)
        logits = outputs.logits

        loss = None
        if labels is not None:
            loss = CrossEntropyLoss(weight=class_weights_tensor)(logits, labels)

        return SequenceClassifierOutput(loss=loss, logits=logits)



model = WeightedLossModel.from_pretrained(model_name, num_labels=len(label_encoder.classes_))
model = model.to(device)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)

new_samples = [
    "What is the capital of France?",
    "Explain the theory of relativity.",
    "Who won the World Cup in 2018?"
]

tokens = tokenizer(new_samples, truncation=True, padding=True, max_length=256, return_tensors="pt")
tokens = {key: val.to(device) for key, val in tokens.items()} 

model.eval()
with torch.no_grad():
    outputs = model(**tokens)
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    predicted_classes = probabilities.argmax(dim=1).cpu().numpy()

predicted_labels = label_encoder.inverse_transform(predicted_classes)

for sample, label in zip(new_samples, predicted_labels):
    print(f"Text: {sample}")
    print(f"Predicted Label: {label}")




Map:   0%|          | 0/79 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Some weights of WeightedLossModel were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 2.3623037338256836, 'eval_accuracy': 0.3, 'eval_precision': 0.275, 'eval_recall': 0.3, 'eval_f1': 0.2857142857142857, 'eval_runtime': 0.3791, 'eval_samples_per_second': 52.759, 'eval_steps_per_second': 13.19, 'epoch': 1.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 2.297442674636841, 'eval_accuracy': 0.4, 'eval_precision': 0.2909090909090909, 'eval_recall': 0.4, 'eval_f1': 0.3333333333333333, 'eval_runtime': 0.3675, 'eval_samples_per_second': 54.427, 'eval_steps_per_second': 13.607, 'epoch': 2.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 2.1945159435272217, 'eval_accuracy': 0.5, 'eval_precision': 0.3166666666666667, 'eval_recall': 0.5, 'eval_f1': 0.38646616541353385, 'eval_runtime': 0.3581, 'eval_samples_per_second': 55.851, 'eval_steps_per_second': 13.963, 'epoch': 3.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 2.103670358657837, 'eval_accuracy': 0.55, 'eval_precision': 0.5352272727272727, 'eval_recall': 0.55, 'eval_f1': 0.4807936507936508, 'eval_runtime': 0.3584, 'eval_samples_per_second': 55.811, 'eval_steps_per_second': 13.953, 'epoch': 4.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 2.02793550491333, 'eval_accuracy': 0.5, 'eval_precision': 0.5225, 'eval_recall': 0.5, 'eval_f1': 0.4556302521008403, 'eval_runtime': 0.3594, 'eval_samples_per_second': 55.655, 'eval_steps_per_second': 13.914, 'epoch': 5.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.9570499658584595, 'eval_accuracy': 0.5, 'eval_precision': 0.5727272727272726, 'eval_recall': 0.5, 'eval_f1': 0.4722222222222222, 'eval_runtime': 0.359, 'eval_samples_per_second': 55.714, 'eval_steps_per_second': 13.929, 'epoch': 6.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.9033527374267578, 'eval_accuracy': 0.6, 'eval_precision': 0.635, 'eval_recall': 0.6, 'eval_f1': 0.5453781512605043, 'eval_runtime': 0.3594, 'eval_samples_per_second': 55.641, 'eval_steps_per_second': 13.91, 'epoch': 7.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.873721718788147, 'eval_accuracy': 0.6, 'eval_precision': 0.6283333333333333, 'eval_recall': 0.6, 'eval_f1': 0.5815686274509804, 'eval_runtime': 0.3589, 'eval_samples_per_second': 55.731, 'eval_steps_per_second': 13.933, 'epoch': 8.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.8453611135482788, 'eval_accuracy': 0.55, 'eval_precision': 0.5977272727272727, 'eval_recall': 0.55, 'eval_f1': 0.5055555555555555, 'eval_runtime': 0.3587, 'eval_samples_per_second': 55.752, 'eval_steps_per_second': 13.938, 'epoch': 9.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.8373782634735107, 'eval_accuracy': 0.55, 'eval_precision': 0.5977272727272727, 'eval_recall': 0.55, 'eval_f1': 0.5055555555555555, 'eval_runtime': 0.3688, 'eval_samples_per_second': 54.228, 'eval_steps_per_second': 13.557, 'epoch': 10.0}
{'train_runtime': 61.9309, 'train_samples_per_second': 12.756, 'train_steps_per_second': 3.229, 'train_loss': 1.8285108947753905, 'epoch': 10.0}


  0%|          | 0/5 [00:00<?, ?it/s]

Evaluation Metrics: {'eval_loss': 1.873721718788147, 'eval_accuracy': 0.6, 'eval_precision': 0.6283333333333333, 'eval_recall': 0.6, 'eval_f1': 0.5815686274509804, 'eval_runtime': 0.3766, 'eval_samples_per_second': 53.102, 'eval_steps_per_second': 13.276, 'epoch': 10.0}
Text: What is the capital of France?
Predicted Label: Geography
Text: Explain the theory of relativity.
Predicted Label: Science & Technology
Text: Who won the World Cup in 2018?
Predicted Label: Sports


#### Testing some additional questions

In [5]:

sample_questions = [
    "What is the longest river in the world?",
    "What is Newton's second law of motion?",
    "Who was the first President of the United States?",
    "Which country hosted the 2016 Summer Olympics?",
    "Who wrote 'Pride and Prejudice'?",
    "What is the meaning of existentialism?",
    "What is the difference between a bull market and a bear market?",
    "Who directed the movie 'Inception'?",
    "How do you make a classic Margherita pizza?",
    "What is the primary role of the United Nations?"
]

tokens = tokenizer(sample_questions, truncation=True, padding=True, max_length=256, return_tensors="pt")
tokens = {key: val.to(device) for key, val in tokens.items()}

model.eval()
with torch.no_grad():
    outputs = model(**tokens)
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    predicted_classes = probabilities.argmax(dim=1).cpu().numpy()  

predicted_labels = label_encoder.inverse_transform(predicted_classes)

for question, label in zip(sample_questions, predicted_labels):
    print(f"Question: {question}")
    print(f"Predicted Label: {label}")

Question: What is the longest river in the world?
Predicted Label: Geography
Question: What is Newton's second law of motion?
Predicted Label: Geography
Question: Who was the first President of the United States?
Predicted Label: History
Question: Which country hosted the 2016 Summer Olympics?
Predicted Label: Geography
Question: Who wrote 'Pride and Prejudice'?
Predicted Label: Entertainment
Question: What is the meaning of existentialism?
Predicted Label: History
Question: What is the difference between a bull market and a bear market?
Predicted Label: Economics & Commerce
Question: Who directed the movie 'Inception'?
Predicted Label: Entertainment
Question: How do you make a classic Margherita pizza?
Predicted Label: Entertainment
Question: What is the primary role of the United Nations?
Predicted Label: History


## Comments:

#### We chose this model because of its contextual understanding of text, pretrained knowledge base, and model customization.  We did test various hyperparameters in our model, including learning rate, batch size, and number of epochs.  Something we saw consistently in all of these tests is the model seemed to plateau around mid way point of the number of epochs, possibly indicating the model was struggling with generalization. Although the classification performance metrics were not very good (slightly above average), we believe with additional hyperparamter tuning and perhaps with a larger dataset, we will see significantly better performance metrics.