In [None]:
import os

if not os.path.exists('/content/police-records-project'):
    !git clone https://github.com/c-goenka/police-records-project.git
    %cd /content/police-records-project
    !pip install -r requirements.txt
else:
    %cd /content/police-records-project

from google.colab import drive
drive.mount('/content/drive')

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.svm import SVC

# One seed for the whole notebook: it initialises NumPy's global RNG here and
# is passed as `random_state` to each classifier defined further down.
RANDOM_SEED = 42
np.random.seed(seed=RANDOM_SEED)

In [None]:
# Directory on the mounted Drive that holds the pre-split CSV files.
data_dir = "/content/drive/MyDrive/police-records-project-data/processed"

# Read the two splits; later cells use their `text_clean` and `label` columns.
train_path = f"{data_dir}/train.csv"
test_path = f"{data_dir}/test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Quick summary: split sizes, then the label set as seen in the training split.
print(
    f"Train: {len(train_df)} documents",
    f"Test: {len(test_df)} documents\n",
    f"Classes: {train_df['label'].nunique()}",
    f"Labels: {sorted(train_df['label'].unique())}",
    sep="\n",
)

In [None]:
# Sentence-embedding checkpoint used to turn each document into a vector.
model_name = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(model_name)

# Report which checkpoint was loaded and the size of the vectors it emits.
print(
    f"Model loaded: {model_name}",
    f"Model dimensions: {model.get_sentence_embedding_dimension()}",
    sep="\n",
)

In [None]:
# Number of documents sent to the encoder per forward pass.
ENCODE_BATCH_SIZE = 32


def embed_texts(texts):
    """Encode a pandas Series of documents with the notebook's `model`.

    Args:
        texts: pandas Series holding one document per row.

    Returns:
        Whatever `model.encode` returns for the documents, in row order
        (the prints below read its `.shape`).
    """
    # `pd.read_csv` parses empty cells (and tokens such as "NA") as float NaN;
    # replace those with '' and force `str` so the encoder only sees strings.
    documents = texts.fillna('').astype(str).tolist()
    return model.encode(
        documents,
        show_progress_bar=True,
        batch_size=ENCODE_BATCH_SIZE
    )


X_train = embed_texts(train_df['text_clean'])
X_test = embed_texts(test_df['text_clean'])

# Targets as plain arrays, positionally aligned with the embedding rows.
y_train = train_df['label'].values
y_test = test_df['label'].values

print(f"Train embeddings shape: {X_train.shape}")
print(f"Test embeddings shape: {X_test.shape}")

In [None]:
# Settings every candidate shares: reweight classes inversely to their
# frequency and pin the RNG to the notebook-wide seed.
shared_kwargs = {
    'class_weight': 'balanced',
    'random_state': RANDOM_SEED,
}

# Candidate models keyed by display name; the training loop iterates over
# this dict in insertion order.
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000, **shared_kwargs),
    'Random Forest': RandomForestClassifier(n_estimators=100, **shared_kwargs),
    'SVM': SVC(kernel='rbf', **shared_kwargs),
}

In [None]:
# Per-classifier outcomes keyed by display name. Created here, right before it
# is filled, so the cell works on a fresh kernel and re-running it starts from
# a clean slate.
results = {}

for name, clf in classifiers.items():
    print(f"Training {name}...")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Macro F1 averages the per-class F1 scores with equal weight per class.
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    # Boolean mask of correct predictions; its mean is the accuracy.
    hits = (y_test == y_pred)
    accuracy = hits.mean()

    results[name] = {
        'model': clf,
        'predictions': y_pred,
        'macro_f1': macro_f1,
        'accuracy': accuracy
    }

    print(f"  Macro F1: {macro_f1:.4f}")
    # The correct/total count is reported once, on the accuracy line; the
    # trailing newline separates consecutive classifiers in the output.
    print(f"  Accuracy: {accuracy:.4f} ({hits.sum()}/{len(y_test)})\n")

In [None]:
# `results` is filled by the training loop; it must not be reset here, because
# this cell and the confusion-matrix cell both read from it.
if not results:
    raise RuntimeError("No classifier results found; run the training cell before saving.")

# NOTE(review): "best" is taken to mean highest test-set macro F1 -- the
# original definition of `best_pred` is not visible in this notebook; confirm.
best_name = max(results, key=lambda clf_name: results[clf_name]['macro_f1'])
best_pred = results[best_name]['predictions']
print(f"Best model by macro F1: {best_name} ({results[best_name]['macro_f1']:.4f})")

# One row per test document: its text, gold label, predicted label, and
# whether the prediction was right.
results_df = pd.DataFrame({
  'text': test_df['text_clean'],
  'true_label': y_test,
  'pred_label': best_pred,
  'correct': y_test == best_pred
})

output_path = f"{data_dir}/embeddings_classifier_results.csv"
results_df.to_csv(output_path, index=False)

print(f"Saved results to: {output_path}")

In [None]:
# Classifier whose test predictions are plotted; must be a key of `results`.
cm_model_name = 'Random Forest'

# Class order for both axes, taken from the training split.
labels = sorted(train_df['label'].unique())

# Pass `labels` explicitly so the matrix rows/columns follow exactly the same
# order as the tick labels, even if some class never appears in the test split.
# NOTE(review): test labels that are absent from the training split would be
# left out of the matrix -- confirm that cannot happen for this data.
cm = confusion_matrix(
    y_test,
    results[cm_model_name]['predictions'],
    labels=labels
)

plt.figure(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # integer counts; the default format abbreviates values >= 100
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    cbar_kws={'label': 'Count'}
)

plt.title(f'Confusion Matrix - {cm_model_name}', fontsize=14, pad=20)
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('True', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)  # keep the row labels horizontal
plt.tight_layout()
plt.show()