In [None]:
import os

if not os.path.exists('/content/police-records-project'):
    !git clone https://github.com/c-goenka/police-records-project.git
    %cd /content/police-records-project
    !pip install -r requirements.txt
else:
    %cd /content/police-records-project

from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import fitz
import pytesseract
from tqdm.auto import tqdm
!apt-get install poppler-utils
from pdf2image import convert_from_path
from PIL import Image
from sklearn.model_selection import train_test_split, StratifiedKFold

# Named seed constant so the same value can be reused wherever randomness is
# involved (presumably also as random_state for the scikit-learn splitters
# imported above -- TODO confirm in the downstream cells).
RANDOM_SEED = 42
# Seeds NumPy's global random state (the np.random.* functions).
np.random.seed(RANDOM_SEED)

In [None]:
def extract_text(file_path, min_chars=50, dpi=300):
    """Extract the text of a PDF, falling back to OCR for pages with little text.

    Each page is first read through PyMuPDF (``page.get_text()``). When the
    stripped text of a page is shorter than ``min_chars`` characters, that
    single page is rendered with ``convert_from_path`` and passed to
    ``pytesseract.image_to_string``; the OCR result then replaces the page text.

    Args:
        file_path: Path of the PDF file to read.
        min_chars: Pages whose stripped embedded text is shorter than this are
            sent through OCR. Defaults to 50.
        dpi: Resolution used when rendering a page for OCR. Defaults to 300.

    Returns:
        A tuple ``(text, word_count, ocr_ratio)``:
            text: All page texts, each followed by a newline, concatenated in
                page order.
            word_count: Number of whitespace-separated tokens in ``text``.
            ocr_ratio: Fraction of pages that took the OCR branch
                (0 for a document without pages).

    Raises:
        Any exception raised while opening, reading, rendering or OCR-ing the
        file propagates to the caller; the document handle is closed first.
    """
    page_texts = []
    ocr_pages = 0

    doc = fitz.open(file_path)
    try:
        total_pages = len(doc)

        for page_index, page in enumerate(doc):
            page_text = page.get_text()

            if len(page_text.strip()) < min_chars:
                # The page counts as an OCR page even if rendering yields no image.
                ocr_pages += 1
                # convert_from_path numbers pages from 1, enumerate() from 0.
                page_number = page_index + 1
                images = convert_from_path(
                    file_path,
                    first_page=page_number,
                    last_page=page_number,
                    dpi=dpi,
                )
                if images:
                    page_text = pytesseract.image_to_string(images[0])

            page_texts.append(page_text)
    finally:
        # Release the file handle even when a page fails part-way through.
        doc.close()

    # Join once at the end instead of growing a string page by page.
    text = "".join(f"{page_text}\n" for page_text in page_texts)

    word_count = len(text.split())
    ocr_ratio = ocr_pages / total_pages if total_pages > 0 else 0.0

    return text, word_count, ocr_ratio

In [None]:
def process_folder(folder_path, label, out=None):
    """Extract every PDF in a folder and append one record per file.

    Files are selected by a case-insensitive ``.pdf`` suffix and processed in
    sorted name order, so the resulting row order does not depend on the order
    in which the filesystem lists the directory.

    Args:
        folder_path: Directory whose PDF files are processed (not recursive).
        label: Class label stored with every record; also shown as the
            progress-bar description.
        out: Optional list that receives the records. When omitted, records
            are appended to the notebook-level ``records`` list, which must
            already exist when the function is called.

    Returns:
        None. Each appended record is a dict with the keys ``filename``,
        ``text``, ``label``, ``word_count`` and ``ocr_ratio``.
    """
    # Look up the global only when no explicit destination was supplied.
    target = records if out is None else out

    pdf_files = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith(".pdf")
    )

    for filename in tqdm(pdf_files, desc=f"{label}", unit="file"):
        file_path = os.path.join(folder_path, filename)
        text, words, ocr_ratio = extract_text(file_path)

        # Defensive guard: skip the file if the extractor reports no text object.
        if text is not None:
            target.append({
                "filename": filename,
                "text": text,
                "label": label,
                "word_count": words,
                "ocr_ratio": ocr_ratio
            })

In [None]:
# Root folder on the mounted Drive. Every sub-folder name becomes a class label,
# except "reports", whose own sub-folders become "reports-<name>" labels.
data_path = "/content/drive/MyDrive/police-records-project-data"
# Re-created on every run so re-executing this cell does not duplicate rows;
# process_folder() appends to this list.
records = []

# os.listdir() returns entries in arbitrary order; sorting keeps the label and
# row order identical from run to run.
for item in sorted(os.listdir(data_path)):
    item_path = os.path.join(data_path, item)

    # Loose files directly under the data root carry no label; ignore them.
    if not os.path.isdir(item_path):
        continue

    if item == "reports":
        for sub_label in sorted(os.listdir(item_path)):
            sub_path = os.path.join(item_path, sub_label)
            if os.path.isdir(sub_path):
                label = f"reports-{sub_label}"
                print(f"Processing: {label}")
                process_folder(sub_path, label)
    else:
        print(f"Processing: {item}")
        process_folder(item_path, item)

df_raw = pd.DataFrame(records)

print(f"Extracted {len(df_raw)} documents")
if df_raw.empty:
    # An empty frame has no 'label' column, so the summary below would raise KeyError.
    print(f"No PDF files found under {data_path}; check the Drive mount and folder layout.")
else:
    print(f"Classes: {df_raw['label'].nunique()}")
    print(f"Labels: {sorted(df_raw['label'].unique())}")