In [None]:
import os

if not os.path.exists('/content/police-records-project'):
    !git clone https://github.com/c-goenka/police-records-project.git
    %cd /content/police-records-project
    !pip install -r requirements.txt
else:
    %cd /content/police-records-project

# Mount Google Drive at /content/drive; the data directory read later in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import plotly.express as px
import umap

# Single seed for the notebook; also passed as random_state to UMAP and KMeans below.
RANDOM_SEED = 42
# Seed NumPy's global random state as well.
np.random.seed(RANDOM_SEED)

In [None]:
# Processed splits are read from the mounted Drive.
data_dir = "/content/drive/MyDrive/police-records-project-data/processed"

# Read both splits, then stack them into one frame with a fresh 0..n-1 index.
train_df, test_df = (
    pd.read_csv(f"{data_dir}/train.csv"),
    pd.read_csv(f"{data_dir}/test.csv"),
)
df_all = pd.concat((train_df, test_df), ignore_index=True)

# Quick summary of the combined corpus and its label set.
print(f"Total documents: {df_all.shape[0]}")
print(f"Classes: {df_all['label'].nunique()}")
print(f"Labels: {sorted(df_all['label'].unique())}")

In [None]:
# Pretrained sentence encoder used to embed every document.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Encode the cleaned text of all documents in batches of 32.
# NOTE(review): assumes 'text_clean' has no missing values — TODO confirm.
# NOTE(review): the encoder presumably truncates very long inputs — confirm
# that is acceptable for these records.
embeddings = model.encode(df_all['text_clean'].tolist(), batch_size=32, show_progress_bar=True)

# Sanity check on the result's dimensions.
print(f"Embeddings Shape: {embeddings.shape}")

In [None]:
# Project the embeddings down to two dimensions; the seed keeps the layout repeatable.
reducer = umap.UMAP(random_state=RANDOM_SEED, n_components=2)
umap_embeddings = reducer.fit_transform(embeddings)

# Store the two projected coordinates alongside each document.
df_all['umap_x'], df_all['umap_y'] = umap_embeddings[:, 0], umap_embeddings[:, 1]

# Scatter plot of the projection, one colour per label.
fig = px.scatter(
    df_all, x='umap_x', y='umap_y', color='label',
    title='Document Embeddings in 2D Space (Colored by Class)',
    width=900, height=700,
)
fig.update_traces(marker={'size': 10, 'opacity': 0.7})
fig.update_layout(
    legend_title='Document Type',
    xaxis_title='UMAP Dimension 1',
    yaxis_title='UMAP Dimension 2',
)

fig.show()

In [None]:
# Use as many clusters as there are distinct labels.
n_clusters = df_all['label'].nunique()

# Cluster the full-dimensional embeddings (not the 2D projection).
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=RANDOM_SEED)
cluster_labels = kmeans.fit_predict(embeddings)

# Compare the cluster assignments against the labels.
ari = adjusted_rand_score(labels_true=df_all['label'], labels_pred=cluster_labels)

print(f"K-means clustering (k={n_clusters})")
print(f"Adjusted Rand Index: {ari:.4f}")