In [None]:
import pandas as pd
# Show up to 150 characters per cell so long text values are not cut off early.
pd.options.display.max_colwidth = 150

In [None]:
# Read the spreadsheet (row 0 is the header), keep only its second column,
# and drop the rows where that column is missing.
# NOTE(review): `filename` is not defined in any visible cell -- it has to be
# set before this cell can run on a fresh kernel.
workbook = pd.read_excel(filename, header=0)
second_column = workbook[workbook.columns[1]]
data = second_column[second_column.notna()]
data.head()

In [None]:
# Load the candidate labels, one per line.
# The explicit encoding keeps the result independent of the platform default,
# and blank lines are skipped so an empty string is never offered as a label.
with open("data/labels.txt", encoding="utf-8") as f:
    labels = [line.rstrip() for line in f if line.strip()]

labels

## Use pipeline for simplicity. 

I also tried `answerdotai/ModernBERT-base`, but its performance was very poor.

In [None]:
from transformers import pipeline

# Run the zero-shot classifier over every text, one call per row, passing the
# full label list each time with multi_label=True.
# The result list is positional: entry k belongs to the k-th label in data.index.
pipe = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
results = [
    pipe(data.loc[row_id], labels, multi_label=True)
    for row_id in data.index
]

## Multilabel

If a label's predicted probability is > 0.5, assign that label. If no label exceeds 0.5, assign the single label with the highest predicted probability.

In [None]:
# Attach a "predicted_labels" list to every result dict (mutated in place):
# all labels scoring above 0.5, or -- when none does -- the top-scoring label.
for result in results:
    scores = result["scores"]
    names = result["labels"]
    chosen = [name for name, score in zip(names, scores) if score > 0.5]
    if not chosen:
        # Nothing cleared the threshold: fall back to the first maximum score.
        chosen = [names[scores.index(max(scores))]]
    result["predicted_labels"] = chosen

In [None]:
# One row per classified text; the keys of each result dict become the columns.
results_df = pd.DataFrame(results)
results_df.loc[:, ["sequence", "predicted_labels"]]

In [None]:
# Spot-check the full text of row 429 (single .loc lookup: row label, then column).
results_df.loc[429, "sequence"]

## Add human in the loop: finetune the model to correct its mistakes

In [None]:
# Check up to 10 random rows from the dataset. correct them if wrong
#
# A dedicated, seeded generator makes the same rows come up on every run while
# leaving the global `random` state untouched, and the sample size is capped so
# a frame with fewer than 10 rows does not raise ValueError.
# NOTE(review): the row ids hard-coded in the manual-correction cell below
# presumably came from an earlier unseeded run; they will not match this sample.

import random

review_rng = random.Random(42)
review_ids = review_rng.sample(list(results_df.index), min(10, len(results_df)))

for i in review_ids:
    print(i)
    print(f'Text: {results_df.loc[i, "sequence"]}')
    print(f'Labels: {results_df.loc[i,"predicted_labels"]}')

## This is manual correction, which is clunky; in practice I would build a proper UI for correcting the labels

In [None]:
# Start from the model's predictions, then overwrite the rows that were
# reviewed by hand. Keys are row labels of results_df; values are the
# replacement label lists, applied in the order written.
manual_corrections = {
    124: ["Communications and engagement"],
    282: ["Other"],
    168: ["Communications and engagement"],
    398: ["Funding", "Communications and engagement", "Workforce"],
    79: ["Other"],
    339: ["Other"],
    231: ["Other"],
    160: ["Workforce"],
    433: ["Other"],
    431: ["Governance"],
}

results_df["corrected_labels"] = results_df["predicted_labels"].copy()
for row_id, fixed_labels in manual_corrections.items():
    results_df.at[row_id, "corrected_labels"] = fixed_labels

In [None]:
# Show each text next to its (possibly hand-corrected) labels.
results_df.loc[:, ["sequence", "corrected_labels"]]

## Retrain model

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import tensorflow as tf
from transformers import BartTokenizer, BartForSequenceClassification, Trainer, TrainingArguments, AutoModelForSequenceClassification
from transformers import create_optimizer
import torch
from torch.utils.data import Dataset

In [None]:
# Binarize labels: one 0/1 column per entry of `labels`, in that order
mlb = MultiLabelBinarizer(classes=labels)
binarized_labels = mlb.fit_transform(results_df['corrected_labels'])

# Train/val split (80/20, fixed random_state so the split is repeatable)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    results_df['sequence'].tolist(),
    binarized_labels,
    test_size=0.2,
    random_state=42
)

# Load tokenizer from the same checkpoint name the model is loaded from below,
# so changing `model_name` cannot leave the tokenizer and model mismatched
model_name = "facebook/bart-large-mnli"
tokenizer = BartTokenizer.from_pretrained(model_name)

# Dataset class
class MultiLabelDataset(Dataset):
    """Torch dataset pairing tokenized texts with their label vectors.

    All texts are tokenized once, at construction time, with truncation to
    ``max_length`` and ``padding=True``; items are then served by indexing
    into the resulting tensors.

    Args:
        texts: List of input strings.
        labels: Indexable collection of per-text label vectors; ``labels[i]``
            belongs to ``texts[i]``.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=..., padding=..., max_length=...,
            return_tensors="pt")`` that returns a mapping from field name to
            tensor whose first dimension indexes the texts.
        max_length: Maximum sequence length passed to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding field plus a float ``labels`` tensor."""
        # The encodings already hold tensors (return_tensors="pt"). Wrapping a
        # tensor in torch.tensor(...) emits a copy-construct UserWarning on
        # every item, so copy with clone().detach() instead.
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        """Number of examples (one per label vector)."""
        return len(self.labels)
    
# Wrap each split in a MultiLabelDataset; both share the same tokenizer.
train_dataset = MultiLabelDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = MultiLabelDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

In [None]:
# Load the checkpoint with a classification head sized to our label set.
# ignore_mismatched_sizes=True lets loading proceed even though the requested
# number of labels may differ from the checkpoint's own head.
num_labels = len(labels)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
)

# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best eval_loss when training finishes.
training_args = TrainingArguments(
    # where things are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # schedule
    num_train_epochs=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    # evaluation / checkpoint selection
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Wire model, arguments and both datasets together, then fine-tune.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)
trainer.train()

In [None]:
# Write the fine-tuned model and its tokenizer side by side into one directory.
export_dir = "./bart-multilabel-model"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)

## Make predictions with finetuned model

In [None]:
# Score every text with the fine-tuned model and store the labels whose
# sigmoid probability exceeds 0.5 in a new column.
#
# Inference runs in small batches: tokenizing the whole dataset and pushing it
# through the model in a single forward pass needs memory proportional to the
# dataset size and can easily exhaust GPU/CPU memory.
INFERENCE_BATCH_SIZE = 16

texts = results_df["sequence"].tolist()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Predict
model.eval()
prob_chunks = []
with torch.no_grad():
    for start in range(0, len(texts), INFERENCE_BATCH_SIZE):
        batch = tokenizer(
            texts[start:start + INFERENCE_BATCH_SIZE],
            truncation=True,
            padding=True,
            max_length=512,
            return_tensors="pt",
        )
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # Move each chunk back to the CPU right away so device memory is freed.
        prob_chunks.append(torch.sigmoid(outputs.logits).cpu())

probs = torch.cat(prob_chunks)

preds = (probs > 0.5).int().numpy()

# Map each 0/1 row back to label names using the binarizer fitted earlier.
predicted_labels = mlb.inverse_transform(preds)

results_df["predicted_labels_2"] = predicted_labels

In [None]:
# Preview the first five rows, including the new prediction column.
results_df.head(n=5)

In [None]:
# Persist the results without writing the index as a column.
output_csv = "zero_shot_with_bart-large-mnli.csv"
results_df.to_csv(output_csv, index=False)