In [None]:
import os

if not os.path.exists('/content/police-records-project'):
    !git clone https://github.com/c-goenka/police-records-project.git
    %cd /content/police-records-project
    !pip install -r requirements.txt
else:
    %cd /content/police-records-project

from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re

# Single seed shared by the stochastic steps so that re-runs are reproducible.
RANDOM_SEED = 42
# Seeds NumPy's legacy global random generator. The per-label train_test_split
# calls later in the notebook are pinned separately via random_state=RANDOM_SEED.
np.random.seed(RANDOM_SEED)

In [None]:
# Location of the extracted-documents CSV on the mounted Google Drive.
# NOTE(review): despite its name, `data_dir` holds the path of a single CSV file,
# not a directory.
data_dir = "/content/drive/MyDrive/police-records-project-data/processed/extracted_data.csv"
# Later cells read the 'text' and 'label' columns of this frame.
df = pd.read_csv(data_dir)

In [None]:
def preprocess_text(text):
    """Strip unwanted symbols from a document and normalise its whitespace.

    Word characters, whitespace and the punctuation ``. , ! ? ; : ( ) - $ % & /``
    are kept; every other character is deleted. Runs of whitespace (including
    newlines and tabs) are then collapsed to a single space and leading/trailing
    whitespace is removed.

    Args:
        text: The raw document text. Missing values (``None`` or NaN, which
            ``pd.read_csv`` produces for empty cells) are treated as empty
            text; any other non-string value is converted with ``str``.

    Returns:
        The cleaned text as a ``str`` (``''`` for missing input).
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on non-strings, so normalise the input first.
        if pd.isna(text):
            return ''
        text = str(text)
    text = re.sub(r'[^\w\s.,!?;:()\-$%&/]', '', text)
    # split()/join already drops leading and trailing whitespace.
    return ' '.join(text.split())

# Clean every document and store the result alongside the raw text.
df['text_clean'] = df['text'].apply(preprocess_text)

# Spot-check the first document before and after cleaning.
first_raw = df['text'].iloc[0]
first_clean = df['text_clean'].iloc[0]

print(f"Original length: {len(first_raw)} chars")
print(f"Cleaned length: {len(first_clean)} chars\n")
print(f"First 300 chars (original):")
# repr() makes newlines and other escapes in the raw text visible.
print(repr(first_raw[:300]))
print(f"\nFirst 300 chars (cleaned):")
print(first_clean[:300])

In [None]:
# Splitting strategy based on class size:
#   * labels with at most MAX_TRAIN_ONLY_COUNT documents go entirely to train;
#   * every other label sends TEST_FRACTION of its documents (rounded down,
#     but at least one) to test and the rest to train.
TEST_FRACTION = 0.25
MAX_TRAIN_ONLY_COUNT = 2

train_indices = []
test_indices = []

# One groupby pass replaces re-filtering the whole frame for every label.
# sort=False keeps labels in first-appearance order (the same order unique()
# gives), so the resulting row order is unchanged.
for label, label_rows in df.groupby('label', sort=False):
    label_indices = label_rows.index.tolist()
    count = len(label_indices)

    if count <= MAX_TRAIN_ONLY_COUNT:
        # Too few documents to spare one for evaluation.
        train_indices.extend(label_indices)
        continue

    test_size = max(1, int(count * TEST_FRACTION))
    label_train, label_test = train_test_split(
        label_indices,
        test_size=test_size,
        random_state=RANDOM_SEED
    )
    train_indices.extend(label_train)
    test_indices.extend(label_test)

train_df = df.loc[train_indices].reset_index(drop=True)
test_df = df.loc[test_indices].reset_index(drop=True)

# Rows with a missing label belong to no group and therefore to neither split.
n_unlabeled = int(df['label'].isna().sum())

# Sanity checks: the splits must not overlap and must cover every labelled row.
assert not set(train_indices) & set(test_indices), "train and test sets overlap"
assert len(train_df) + len(test_df) == len(df) - n_unlabeled, \
    "split sizes do not add up to the number of labelled documents"

print(f"Train set: {len(train_df)} documents")
print(f"Test set: {len(test_df)} documents")
print(f"Total: {len(train_df) + len(test_df)} documents")
if n_unlabeled:
    # Surface the exclusion instead of letting these rows vanish silently.
    print(f"Warning: {n_unlabeled} documents have no label and were left out of both sets")

In [None]:
# Persist both splits as CSV files in the processed-data folder on Drive.
output_dir = "/content/drive/MyDrive/police-records-project-data/processed"
train_path = f"{output_dir}/train.csv"
test_path = f"{output_dir}/test.csv"

# Write both files first so nothing is reported unless every write succeeded.
for split_frame, split_path in ((train_df, train_path), (test_df, test_path)):
    # index=False: the frames carry a fresh positional index that is not worth storing.
    split_frame.to_csv(split_path, index=False)

print(f"Saved train set: {train_path}")
print(f"Saved test set: {test_path}")