In [1]:
import os

# Colab setup: make sure the project checkout exists and is the working directory.
# NOTE(review): `git clone` targets the current working directory while the existence
# check uses the absolute path /content/police-records-project -- this assumes the
# notebook starts in /content; confirm.
if not os.path.exists('/content/police-records-project'):
    # No checkout yet: clone the repository, switch into it, and install its requirements.
    !git clone https://github.com/c-goenka/police-records-project.git
    %cd /content/police-records-project
    !pip install -r requirements.txt
else:
    # Checkout already present: only change directory (requirements are not reinstalled on this path).
    %cd /content/police-records-project

# Mount Google Drive at /content/drive; later cells read and write data under that mount.
from google.colab import drive
drive.mount('/content/drive')

Cloning into 'police-records-project'...
remote: Enumerating objects: 116, done.
remote: Counting objects: 100% (116/116), done.
remote: Compressing objects: 100% (70/70), done.
remote: Total 116 (delta 59), reused 95 (delta 38), pack-reused 0 (from 0)
Receiving objects: 100% (116/116), 104.89 KiB | 3.38 MiB/s, done.
Resolving deltas: 100% (59/59), done.
/content/police-records-project
Collecting pymupdf (from -r requirements.txt (line 8))
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pdf2image (from -r requirements.txt (line 9))
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract (from -r requirements.txt (line 10))
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting setfit (from -r requirements.txt (line 14))
  Downloading setfit-1.1.3-py3-none-any.whl.metadata (12 kB)
Collecting evaluate>=0.3.0 (from setfit->-r requirements.txt (line 14))
  Downloading evaluate

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re

# Single seed for the notebook: it seeds NumPy's global RNG here and is also passed
# as random_state to train_test_split in the splitting cell.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [4]:
# NOTE: despite its name, data_dir holds the path of the extracted-data CSV file
# on the mounted Drive, not a directory.
data_dir = "/content/drive/MyDrive/police-records-project-data/processed/extracted_data.csv"
# Load the CSV into df; later cells use its 'text' and 'label' columns.
df = pd.read_csv(data_dir)

In [5]:
def preprocess_text(text):
    """Normalize one document's raw extracted text.

    Two steps are applied:
      1. Remove every character that is not a word character, whitespace, or one of
         ``. , ! ? ; : ( ) - $ % & /`` (this drops, for example, private-use glyphs
         left behind by PDF extraction).
      2. Collapse every run of whitespace, newlines included, into a single space
         with no leading or trailing whitespace.

    Args:
        text: The raw text of one document. Missing values (``None``, ``NaN``,
            ``pd.NA``) -- which ``pd.read_csv`` produces for empty cells -- are
            treated as empty text instead of raising ``TypeError``. Any other
            non-string value is converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned, single-spaced text (``''`` for missing input).
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as float NaN; re.sub would raise TypeError on them.
        if pd.isna(text):
            return ''
        text = str(text)

    text = re.sub(r'[^\w\s.,!?;:()\-$%&/]', '', text)
    # split() with no arguments already discards leading/trailing whitespace,
    # so the joined result needs no extra strip().
    return ' '.join(text.split())

# Store the cleaned text in a new column; the raw 'text' column is left untouched.
df['text_clean'] = df['text'].apply(preprocess_text)

# Spot-check the first document: size before/after cleaning plus a short preview of each.
first_raw = df['text'].iloc[0]
first_clean = df['text_clean'].iloc[0]

print(f"Original length: {len(first_raw)} chars")
print(f"Cleaned length: {len(first_clean)} chars\n")
print(f"First 300 chars (original):")
# repr() makes newlines and non-printable characters in the raw text visible.
print(repr(first_raw[:300]))
print(f"\nFirst 300 chars (cleaned):")
print(first_clean[:300])

Original length: 105929 chars
Cleaned length: 98408 chars

First 300 chars (original):
'\ue010\n\ue011\n\ue00e\n\ue00f\nSH - Sheriff\nDiscovery Package - DIS\nSH-DIS-56980\nCase Number:\n24112642\nRequested/Printed:\n4/3/2024 1:42:30 PM\nCASE DETAIL\nCase Number:\n24112642\nCFS Number: E9144930\nSubject:\n(HOSPITAL ARRAIGNMENT)A/R - 273.5 PC/69 PC (A1) Sunny Estrada\nDate Opened:\n3/30/2024\nAssigned Primary \nDetective/Depu'

First 300 chars (cleaned):
SH - Sheriff Discovery Package - DIS SH-DIS-56980 Case Number: 24112642 Requested/Printed: 4/3/2024 1:42:30 PM CASE DETAIL Case Number: 24112642 CFS Number: E9144930 Subject: (HOSPITAL ARRAIGNMENT)A/R - 273.5 PC/69 PC (A1) Sunny Estrada Date Opened: 3/30/2024 Assigned Primary Detective/Deputy: Appro


In [6]:
# Splitting strategy based on class size:
#   * a class with at most MAX_TRAIN_ONLY_COUNT documents goes entirely to train
#     (too few examples to spare any for testing);
#   * every other class holds out TEST_FRACTION of its documents (rounded down,
#     but never fewer than one) for test.
TEST_FRACTION = 0.25
MAX_TRAIN_ONLY_COUNT = 2

train_indices = []
test_indices = []

# One pass over the frame instead of re-filtering it once per label.
# sort=False keeps labels in first-appearance order (the same order .unique() gives),
# so the row order of the resulting splits is unchanged. Rows with a missing label
# belong to no group and therefore end up in neither split.
for label, label_rows in df.groupby('label', sort=False):
    label_indices = label_rows.index.tolist()
    count = len(label_indices)

    if count <= MAX_TRAIN_ONLY_COUNT:
        train_indices.extend(label_indices)
    else:
        test_size = max(1, int(count * TEST_FRACTION))
        label_train, label_test = train_test_split(
            label_indices,
            test_size=test_size,
            random_state=RANDOM_SEED
        )
        train_indices.extend(label_train)
        test_indices.extend(label_test)

train_df = df.loc[train_indices].reset_index(drop=True)
test_df = df.loc[test_indices].reset_index(drop=True)

print(f"Train set: {len(train_df)} documents")
print(f"Test set: {len(test_df)} documents")
print(f"Total: {len(train_df) + len(test_df)} documents")

# Make the exclusion of unlabeled rows visible instead of silent.
n_unlabeled = int(df['label'].isna().sum())
if n_unlabeled:
    print(f"WARNING: {n_unlabeled} documents have no label and were left out of both splits")

Train set: 75 documents
Test set: 23 documents
Total: 98 documents


In [7]:
# Write both splits as CSV files into the processed-data folder on the mounted Drive.
output_dir = "/content/drive/MyDrive/police-records-project-data/processed"

train_path = f"{output_dir}/train.csv"
test_path = f"{output_dir}/test.csv"

# index=False: the row index is not written to the files.
for split_df, split_path in ((train_df, train_path), (test_df, test_path)):
    split_df.to_csv(split_path, index=False)

print(f"Saved train set: {train_path}")
print(f"Saved test set: {test_path}")

Saved train set: /content/drive/MyDrive/police-records-project-data/processed/train.csv
Saved test set: /content/drive/MyDrive/police-records-project-data/processed/test.csv
