In [None]:
import os

if not os.path.exists('/content/police-records-project'):
    !git clone https://github.com/c-goenka/police-records-project.git
    %cd /content/police-records-project
    !pip install -r requirements.txt
else:
    %cd /content/police-records-project

from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# One seed value shared by the NumPy global generator and the classifiers below.
RANDOM_SEED = 42
# Seed NumPy's legacy global generator so NumPy-driven randomness repeats across runs.
np.random.seed(seed=RANDOM_SEED)

In [None]:
# Processed train/test splits are read from the mounted Google Drive folder.
data_dir = "/content/drive/MyDrive/police-records-project-data/processed"

train_path = f"{data_dir}/train.csv"
test_path = f"{data_dir}/test.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Quick summary: split sizes and the label set seen in the training split.
print(
    f"Train: {len(train_df)} documents",
    f"Test: {len(test_df)} documents\n",
    f"Classes: {train_df['label'].nunique()}",
    f"Labels: {sorted(train_df['label'].unique())}",
    sep="\n",
)

In [None]:
# Pretrained sentence-embedding checkpoint used to embed the documents.
model_name = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(model_name)

# Report which checkpoint was loaded and the embedding size it reports.
print(
    f"Model loaded: {model_name}",
    f"Model dimensions: {model.get_sentence_embedding_dimension()}",
    sep="\n",
)

In [None]:
def embed_texts(frame):
    """Encode the ``text_clean`` column of ``frame`` with the loaded sentence model.

    Returns whatever ``model.encode`` produces for the list of texts, with the
    progress bar enabled and a batch size of 32.
    """
    texts = frame['text_clean'].tolist()
    return model.encode(texts, show_progress_bar=True, batch_size=32)


# Embeddings for each split, train first and then test.
X_train = embed_texts(train_df)
X_test = embed_texts(test_df)

# Targets come straight from the label column of each split.
y_train = train_df['label'].values
y_test = test_df['label'].values

In [None]:
# Settings shared by every baseline: balanced class weights and the notebook seed.
shared_params = {'class_weight': 'balanced', 'random_state': RANDOM_SEED}

# Candidate classifiers keyed by display name; dict order is the order they are trained in.
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000, **shared_params),
    'Random Forest': RandomForestClassifier(n_estimators=100, **shared_params),
    'SVM': SVC(kernel='rbf', **shared_params),
}

In [None]:
# Per-classifier outcome: fitted model, test predictions and the three F1 averages.
results = {}

for name, clf in classifiers.items():
    print(f"Training {name}...")

    # Fit on the training embeddings, then predict labels for the test embeddings.
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    # Score the same predictions under macro, micro and weighted averaging.
    macro_f1, micro_f1, weighted_f1 = (
        f1_score(y_test, y_pred, average=averaging)
        for averaging in ('macro', 'micro', 'weighted')
    )

    results[name] = {
        'model': clf,
        'predictions': y_pred,
        'macro_f1': macro_f1,
        'micro_f1': micro_f1,
        'weighted_f1': weighted_f1,
    }

    print(
        f"  Macro F1: {macro_f1:.4f}",
        f"  Micro F1: {micro_f1:.4f}",
        f"  Weighted F1: {weighted_f1:.4f}\n",
        sep="\n",
    )