In [1]:
import os

if not os.path.exists('/content/police-records-project'):
    !git clone https://github.com/c-goenka/police-records-project.git
    %cd /content/police-records-project
    !pip install -r requirements.txt
else:
    %cd /content/police-records-project

from google.colab import drive
drive.mount('/content/drive')

Cloning into 'police-records-project'...
remote: Enumerating objects: 120, done.[K
remote: Counting objects: 100% (120/120), done.[K
remote: Compressing objects: 100% (72/72), done.[K
remote: Total 120 (delta 61), reused 99 (delta 40), pack-reused 0 (from 0)[K
Receiving objects: 100% (120/120), 105.71 KiB | 3.92 MiB/s, done.
Resolving deltas: 100% (61/61), done.
/content/police-records-project
Collecting pymupdf (from -r requirements.txt (line 8))
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pdf2image (from -r requirements.txt (line 9))
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract (from -r requirements.txt (line 10))
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting setfit (from -r requirements.txt (line 14))
  Downloading setfit-1.1.3-py3-none-any.whl.metadata (12 kB)
Collecting evaluate>=0.3.0 (from setfit->-r requirements.txt (line 14))
  Downloading evaluate

In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import plotly.express as px
import umap

# Single seed for the notebook: applied to NumPy here and passed as
# `random_state` to UMAP and KMeans in later cells.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)  # seeds NumPy's global random state

In [3]:
# Folder on the mounted Drive that holds the processed CSV splits.
data_dir = "/content/drive/MyDrive/police-records-project-data/processed"

# Read the train and test splits with the same loader, in that order.
train_path = f"{data_dir}/train.csv"
test_path = f"{data_dir}/test.csv"
train_df, test_df = map(pd.read_csv, (train_path, test_path))

# Stack both splits into one frame with a fresh 0..n-1 index.
df_all = pd.concat([train_df, test_df], ignore_index=True)

# Quick summary of corpus size and the label set.
label_col = df_all['label']
print(f"Total documents: {len(df_all)}")
print(f"Classes: {label_col.nunique()}")
print(f"Labels: {sorted(label_col.unique())}")

Total documents: 98
Classes: 11
Labels: ['discovery-package', 'emails-memorandum-correspondence', 'police-commision-agenda', 'press-release', 'reports-coroners', 'reports-criminal', 'reports-death-in-custody', 'reports-incident', 'reports-investigation', 'reports-supplemental', 'reports-use-of-force']


In [4]:
# Load the all-mpnet-base-v2 sentence-transformers checkpoint.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Encode the cleaned text of every document, 32 texts per batch.
texts = df_all['text_clean'].tolist()
embeddings = model.encode(texts, batch_size=32, show_progress_bar=True)

print(f"Embeddings Shape: {embeddings.shape}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Embeddings Shape: (98, 768)


In [5]:
# Reduce the embeddings to two dimensions with a seeded UMAP.
reducer = umap.UMAP(random_state=RANDOM_SEED, n_components=2)
umap_embeddings = reducer.fit_transform(embeddings)

# Keep the two coordinates on the frame so plotly can refer to them by column name.
df_all['umap_x'], df_all['umap_y'] = umap_embeddings[:, 0], umap_embeddings[:, 1]

# Scatter of the 2D projection, one colour per document label.
scatter_kwargs = dict(
    x='umap_x',
    y='umap_y',
    color='label',
    title='Document Embeddings in 2D Space (Colored by Class)',
    width=900,
    height=700,
)
fig = px.scatter(df_all, **scatter_kwargs)

# Marker styling, then axis and legend titles.
fig.update_traces(marker={'size': 10, 'opacity': 0.7})
layout_titles = {
    'xaxis_title': 'UMAP Dimension 1',
    'yaxis_title': 'UMAP Dimension 2',
    'legend_title': 'Document Type',
}
fig.update_layout(**layout_titles)

fig.show()

  warn(


In [6]:
# Use as many clusters as there are distinct labels.
n_clusters = df_all['label'].nunique()

# Fit seeded K-means on the full-dimensional embeddings and read back
# the cluster assigned to each document.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=RANDOM_SEED)
cluster_labels = kmeans.fit(embeddings).labels_

# Compare the cluster assignment with the true labels.
ari = adjusted_rand_score(labels_true=df_all['label'], labels_pred=cluster_labels)

print(f"K-means clustering (k={n_clusters})")
print(f"Adjusted Rand Index: {ari:.4f}")

K-means clustering (k=11)
Adjusted Rand Index: 0.2342
