In [None]:
import os

if not os.path.exists('/content/police-records-project'):
    !git clone https://github.com/c-goenka/police-records-project.git
    %cd /content/police-records-project
    !pip install -r requirements.txt
else:
    %cd /content/police-records-project

from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
from sklearn.metrics import f1_score

In [None]:
# Folder on the mounted Drive that holds the per-method prediction CSVs.
data_dir = "/content/drive/MyDrive/police-records-project-data/processed"

# Read both result files in one pass; the order of the file names below
# fixes which frame is bound to which variable.
setfit_df, embeddings_df = (
    pd.read_csv(f"{data_dir}/{csv_name}")
    for csv_name in ("setfit_results.csv", "embeddings_classifier_results.csv")
)

In [None]:
def compute_metrics(df):
    """Summarise classification quality for one results frame.

    Args:
        df: DataFrame with a 'true_label' column and a 'pred_label' column,
            compared row by row.

    Returns:
        dict with three entries:
          'Macro F1'      - macro-averaged F1 from sklearn's ``f1_score``.
          'Accuracy'      - fraction of rows where the two labels are equal.
          'Correct/Total' - the same information as a "<correct>/<rows>" string.

    Raises:
        KeyError: if either label column is missing from ``df``.
    """
    y_true = df['true_label']
    y_pred = df['pred_label']

    # Build the row-wise match mask once and reuse it for both summaries.
    is_correct = y_true == y_pred

    return {
        # zero_division=0 yields the same score as sklearn's default ("warn")
        # but without an UndefinedMetricWarning for classes that have no
        # predicted samples.
        'Macro F1': f1_score(y_true, y_pred, average='macro', zero_division=0),
        'Accuracy': is_correct.mean(),
        'Correct/Total': f"{is_correct.sum()}/{len(y_true)}"
    }

# Score both result frames with the shared helper (SetFit first, then the
# embeddings classifier).
setfit_metrics, embeddings_metrics = map(compute_metrics, (setfit_df, embeddings_df))

# One column per method, one row per metric.
comparison_df = pd.DataFrame(
    {
        'SetFit': setfit_metrics,
        'Embeddings+Classifier': embeddings_metrics,
    }
)

comparison_df

In [None]:
# Pull the macro-F1 score of each method out of the comparison table.
setfit_f1 = comparison_df.loc['Macro F1', 'SetFit']
embeddings_f1 = comparison_df.loc['Macro F1', 'Embeddings+Classifier']

# Pick the better method. An exact tie is reported as a tie instead of being
# silently credited to one of the two methods.
if setfit_f1 > embeddings_f1:
    winner = "SetFit"
    winner_f1 = setfit_f1
    diff = setfit_f1 - embeddings_f1
elif embeddings_f1 > setfit_f1:
    winner = "Embeddings+Classifier"
    winner_f1 = embeddings_f1
    diff = embeddings_f1 - setfit_f1
else:
    winner = "Tie (SetFit = Embeddings+Classifier)"
    winner_f1 = setfit_f1
    diff = 0.0

print(f"\nBest Method: {winner}")
print(f"Macro F1: {winner_f1:.4f}")
print(f"Margin: {diff:.4f}\n")
# NOTE(review): the dataset figures below are hard-coded, not derived from the
# loaded frames - confirm they still match the data before sharing results.
print("Dataset: 75 train / 23 test samples across 11 classes")
print("Both methods use class balancing to handle imbalanced data")