In [None]:
import os

if not os.path.exists('/content/police-records-project'):
    !git clone https://github.com/c-goenka/police-records-project.git
    %cd /content/police-records-project
    !pip install -r requirements.txt
else:
    %cd /content/police-records-project

from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import fitz
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from sklearn.model_selection import train_test_split, StratifiedKFold

# Fixed seed value used to seed NumPy's global random state on the next line,
# so NumPy-based random draws repeat across fresh kernel runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [None]:
def extract_text(file_path, min_chars=50, ocr_dpi=300):
    """Extract the text of a PDF, using OCR for pages with little embedded text.

    Every page is first read through its embedded text layer. When the
    stripped text of a page is shorter than ``min_chars`` characters, that
    single page is rasterised and run through Tesseract, and the OCR output
    replaces the embedded text for that page.

    Args:
        file_path: Path to the PDF file.
        min_chars: Minimum number of non-whitespace-trimmed characters a page
            must yield from its text layer to skip OCR. Defaults to 50.
        ocr_dpi: Resolution used when rasterising a page for OCR.
            Defaults to 300.

    Returns:
        A tuple ``(text, word_count, ocr_ratio)`` where ``text`` is the page
        texts concatenated with a newline after each page, ``word_count`` is
        the number of whitespace-separated tokens in ``text``, and
        ``ocr_ratio`` is the fraction of pages that were sent to OCR
        (``0`` for a document with no pages).

    Raises:
        Any exception raised while opening, reading, rasterising or OCR-ing
        the file is propagated; the document handle is closed first.
    """
    page_texts = []
    ocr_pages = 0

    doc = fitz.open(file_path)
    try:
        total_pages = len(doc)

        for page_index, page in enumerate(doc):
            page_text = page.get_text()

            if len(page_text.strip()) < min_chars:
                # Counted as an OCR page even if rasterisation returns nothing.
                ocr_pages += 1
                # enumerate() is 0-based; the page range passed here is 1-based.
                page_number = page_index + 1
                images = convert_from_path(
                    file_path,
                    first_page=page_number,
                    last_page=page_number,
                    dpi=ocr_dpi,
                )
                if images:
                    page_text = pytesseract.image_to_string(images[0])

            page_texts.append(page_text)
    finally:
        # Release the file handle even when a page fails part-way through.
        doc.close()

    # Join once instead of growing a string page by page.
    text = "".join(f"{page_text}\n" for page_text in page_texts)
    word_count = len(text.split())
    ocr_ratio = ocr_pages / total_pages if total_pages > 0 else 0

    return text, word_count, ocr_ratio

In [None]:
# Build the raw corpus: every sub-folder of data_path is treated as a class
# label and every PDF inside it becomes one row of df_raw.
data_path = "/content/drive/MyDrive/police-records-project-data"
records = []
failed_files = []  # (file_path, error) pairs for PDFs that could not be extracted

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of df_raw identical between runs.
for label in sorted(os.listdir(data_path)):
    folder = os.path.join(data_path, label)
    if not os.path.isdir(folder):
        continue
    print(f"Processing: {label}")

    for filename in sorted(os.listdir(folder)):
        # Case-insensitive so files named *.PDF are not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(folder, filename)

        # extract_text raises on unreadable files; record the failure and
        # keep going so one bad PDF does not abort the whole run.
        try:
            text, words, ocr_ratio = extract_text(file_path)
        except Exception as exc:
            failed_files.append((file_path, repr(exc)))
            print(f"  Skipped {filename}: {exc}")
            continue

        records.append({
            "filename": filename,
            "text": text,
            "label": label,
            "word_count": words,
            "ocr_ratio": ocr_ratio
        })

# Explicit columns keep the frame well-formed (and the summary below working)
# even when no documents were found.
df_raw = pd.DataFrame(
    records,
    columns=["filename", "text", "label", "word_count", "ocr_ratio"],
)
print(f"Extracted {len(df_raw)} documents")
print(f"Classes: {df_raw['label'].nunique()}")
if failed_files:
    print(f"Failed to extract {len(failed_files)} file(s); see failed_files")