In [None]:
import os

if not os.path.exists('/content/police-records-project'):
    !git clone https://github.com/c-goenka/police-records-project.git
    %cd /content/police-records-project
    !pip install -r requirements.txt
else:
    %cd /content/police-records-project

from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from setfit import SetFitModel, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Single seed for the notebook; applied here to NumPy's global RNG.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings(action='ignore')

In [None]:
# Directory on the mounted Drive that holds the processed train/test CSV splits.
data_dir = "/content/drive/MyDrive/police-records-project-data/processed"

train_path = f"{data_dir}/train.csv"
test_path = f"{data_dir}/test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Quick sanity summary: split sizes and the label set seen in training.
n_train, n_test = len(train_df), len(test_df)
print(f"Train: {n_train} documents")
print(f"Test: {n_test} documents\n")

train_labels = train_df['label']
print(f"Classes: {train_labels.nunique()}")
print(f"Labels: {sorted(train_labels.unique())}")

In [None]:
# Assign each training label a contiguous integer id (in sorted order) and
# keep the inverse mapping for turning ids back into label names.
sorted_labels = sorted(train_df['label'].unique())
label_to_id = {label: idx for idx, label in enumerate(sorted_labels)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

# Series.map() yields NaN for any value missing from the mapping. Check the
# test split before storing the ids, so an unseen test label fails loudly here
# instead of silently producing NaN ids that distort the metrics later.
test_label_ids = test_df['label'].map(label_to_id)
unmapped = test_label_ids.isna()
if unmapped.any():
    unknown = test_df.loc[unmapped, 'label'].unique().tolist()
    raise ValueError(f"Test labels not present in the training set: {unknown}")

train_df['label_id'] = train_df['label'].map(label_to_id)
test_df['label_id'] = test_label_ids

# Hugging Face datasets with the cleaned text and the integer label ids.
train_dataset = Dataset.from_dict({
    'text': train_df['text_clean'].tolist(),
    'label': train_df['label_id'].tolist()
})

test_dataset = Dataset.from_dict({
    'text': test_df['text_clean'].tolist(),
    'label': test_df['label_id'].tolist()
})

# id_to_label was filled in ascending id order, so plain iteration is sorted by id.
print("Label mapping:")
for idx, label in id_to_label.items():
    print(f"  {idx}: {label}")

In [None]:
# Sentence-transformers checkpoint that the SetFit model is initialised from.
model_name = "sentence-transformers/all-mpnet-base-v2"

# label_to_id was built in ascending id order, so position i in this list
# is the name of label id i.
class_names = list(label_to_id)
model = SetFitModel.from_pretrained(model_name, labels=class_names)

print(f"Model loaded: {model_name}")

In [None]:
# Training configuration: one epoch at batch size 16, with evaluation and
# checkpoint saving both switched off, seeded with the notebook-wide seed.
training_config = dict(
    seed=RANDOM_SEED,
    num_epochs=1,
    batch_size=16,
    eval_strategy="no",
    save_strategy="no",
)
args = TrainingArguments(**training_config)

# Train on the training dataset only; no evaluation dataset is passed.
trainer = Trainer(model=model, train_dataset=train_dataset, args=args)
trainer.train()

print("SetFit Model Training complete")

In [None]:
# Predict on the cleaned test documents. The predictions are looked up in
# label_to_id, i.e. they are treated as label names and converted to integer
# ids so they are comparable with y_true.
test_texts = test_df['text_clean'].tolist()
predictions = model.predict(test_texts)

y_true = test_df['label_id'].values
y_pred = [label_to_id[name] for name in predictions]

In [None]:
# Per-document results table: cleaned text, gold label name, predicted label
# name, and whether the prediction matched.
results_df = pd.DataFrame({
    'text': test_df['text_clean'],
    'true_label': test_df['label'],
    'pred_label': [id_to_label[i] for i in y_pred],
    'correct': y_true == y_pred
})

# Write next to the input splits. The path is derived from data_dir instead of
# repeating the directory literal, so the two cannot drift apart.
output_path = f"{data_dir}/setfit_results.csv"
results_df.to_csv(output_path, index=False)

print(f"Saved results to: {output_path}")

In [None]:
# Element-wise match between gold and predicted ids, computed once and reused.
is_correct = (y_true == y_pred)
n_correct, n_total = is_correct.sum(), len(y_true)

# Macro F1 averages the per-class F1 scores; accuracy is the share of matches.
macro_f1 = f1_score(y_true, y_pred, average='macro')
accuracy = is_correct.mean()

print(f"Macro F1: {macro_f1:.4f}")
print(f"Accuracy: {accuracy:.4f} ({n_correct}/{n_total})")
print(f"Correct predictions: {n_correct}/{n_total}")

In [None]:
# Confusion matrix over every known class. Passing labels= fixes the row and
# column order and keeps the matrix the same size as the tick-label list even
# when a class appears in neither y_true nor y_pred.
class_ids = sorted(id_to_label.keys())
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
labels = [id_to_label[i] for i in class_ids]

plt.figure(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # integer counts; the default format switches to scientific notation for large cells
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    cbar_kws={'label': 'Count'}
)
plt.title('Confusion Matrix - SetFit', fontsize=14, pad=20)
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('True', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)  # keep the class names on the y axis horizontal
plt.tight_layout()
plt.show()