In [1]:
import os
import re
import json
import fitz  # PyMuPDF
from striprtf.striprtf import rtf_to_text

# === Paths ===
# Folder the main loop scans for ".pdf" files and their same-named ".rtf" files.
input_folder = "Extract_data_from_rtf/rtfpdffilesfornewbaltimore"
# Root folder under which one sub-folder per pay period is created.
output_base = "Extracted"

# Ensure output folder exists (no error if it is already there)
os.makedirs(output_base, exist_ok=True)

# === Helper Function to Extract Pay Period Info from PDF ===
def extract_pay_period_from_pdf(pdf_path):
    """Read the pay period start, end and pay date from a payroll register PDF.

    The text of every page is concatenated and searched (case-insensitively,
    across line breaks) for a "Payroll Register Report" heading followed by
    "Pay Period From <date> to <date>" and then "Pay Date <date>".

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A tuple ``(pay_start, pay_end, pay_date)`` of strings exactly as they
        appear in the PDF text, or ``(None, None, None)`` when the pattern is
        not found.
    """
    # Use the document as a context manager so the underlying file handle is
    # released even when text extraction raises; the original never closed it.
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text() for page in doc)

    pattern = re.compile(
        r"Payroll\s+Register\s+Report.*?"
        r"Pay\s*Period\s*From\s*(\d{1,2}/\d{1,2}/\d{4})\s*to\s*(\d{1,2}/\d{1,2}/\d{4})"
        r".*?Pay\s*Date[:\s]*([\d/]+)",
        re.IGNORECASE | re.DOTALL
    )

    match = pattern.search(text)
    if match is None:
        return None, None, None
    return match.group(1), match.group(2), match.group(3)

# === Helper Function to Extract Employees from RTF ===
def extract_employees_from_rtf(rtf_path):
    """Split the plain text of an RTF payroll file into per-employee blocks.

    The file is read as UTF-8, converted to plain text with ``rtf_to_text``
    and cut at every "Emp# <digits>" marker. The text following a marker is
    kept up to and including the first line containing "Employee Tot:" (or to
    the end of the piece when no such line exists), then stripped.

    Args:
        rtf_path: Path of the RTF file to read.

    Returns:
        A list of dicts ``{"Emp#": <id string>, "Block": <text>}`` in file
        order; employees whose block is empty after stripping are omitted.
    """
    def _trim_to_total(raw_block):
        # Keep lines up to and including the first "Employee Tot:" line.
        rows = raw_block.splitlines()
        for idx, row in enumerate(rows):
            if "Employee Tot:" in row:
                rows = rows[:idx + 1]
                break
        return "\n".join(rows).strip()

    with open(rtf_path, "r", encoding="utf-8") as rtf_file:
        raw_rtf = rtf_file.read()
    plain_text = rtf_to_text(raw_rtf)

    # Element 0 of the split is whatever precedes the first marker, so the
    # ids line up with the pieces starting at index 1.
    pieces = re.split(r"\bEmp#\s*\d+\b", plain_text)
    ids = re.findall(r"\bEmp#\s*(\d+)\b", plain_text)

    trimmed = ((emp_id, _trim_to_total(piece)) for emp_id, piece in zip(ids, pieces[1:]))
    return [
        {"Emp#": emp_id, "Block": body}
        for emp_id, body in trimmed
        if body
    ]

# === Main Loop ===
# For every PDF in the input folder, pair it with the RTF of the same base
# name, derive the output folder from the PDF's pay period, and dump the
# employee blocks found in the RTF as JSON.
for file in os.listdir(input_folder):
    if not file.endswith(".pdf"):
        continue

    base_name = os.path.splitext(file)[0]
    pdf_path = os.path.join(input_folder, file)
    rtf_path = os.path.join(input_folder, base_name + ".rtf")

    # A PDF without its RTF companion cannot be processed.
    if not os.path.exists(rtf_path):
        print(f"⚠️ RTF file missing for: {base_name}")
        continue

    # Extract Pay Period Info; all three values are needed for the folder name.
    pay_start, pay_end, pay_date = extract_pay_period_from_pdf(pdf_path)
    if not (pay_start and pay_end and pay_date):
        print(f"❌ Could not extract pay period from PDF: {pdf_path}")
        continue

    # Slashes are not valid inside a folder name, so swap them for dashes.
    start_tag, end_tag, date_tag = (
        value.replace('/', '-') for value in (pay_start, pay_end, pay_date)
    )
    folder_name = f"NewBaltimore-{start_tag}_{end_tag}_{date_tag}"
    output_dir = os.path.join(output_base, folder_name)
    os.makedirs(output_dir, exist_ok=True)

    # Extract employee blocks
    employee_data = extract_employees_from_rtf(rtf_path)

    # Save as JSON
    output_json_path = os.path.join(output_dir, "employee_data.json")
    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(employee_data, f, indent=2)

    print(f"✅ Extracted: {base_name} → {output_json_path}")


✅ Extracted: 2025-01-03 1 Payroll Set → Extracted\NewBaltimore-12-14-2024_12-27-2024_01-03-2025\employee_data.json
✅ Extracted: 2025-01-17 3 Payroll Set → Extracted\NewBaltimore-12-28-2024_01-10-2025_01-17-2025\employee_data.json
✅ Extracted: 2025-01-31 4 Payroll Set → Extracted\NewBaltimore-01-11-2025_01-24-2025_01-31-2025\employee_data.json
✅ Extracted: 2025-02-14 5 Payroll Set → Extracted\NewBaltimore-01-25-2025_02-07-2025_02-14-2025\employee_data.json
✅ Extracted: 2025-02-28 6 Payroll Set → Extracted\NewBaltimore-02-08-2025_02-21-2025_02-28-2025\employee_data.json
✅ Extracted: 2025-03-14 7 Payroll Set → Extracted\NewBaltimore-02-22-2025_03-07-2025_03-14-2025\employee_data.json
✅ Extracted: 2025-03-28 8 Payroll Set → Extracted\NewBaltimore-03-08-2025_03-21-2025_03-28-2025\employee_data.json
✅ Extracted: 2025-03-31 2 Payroll Set → Extracted\NewBaltimore-01-01-2025_03-31-2025_03-31-2025\employee_data.json
✅ Extracted: 2025-04-11 9 Payroll Set → Extracted\NewBaltimore-03-22-2025_04-04-

In [3]:
# Count the employee blocks saved across all pay-period folders under "Extracted".
extracted_root = "Extracted"
total_chunks = 0
for folder in os.listdir(extracted_root):
    json_path = os.path.join(extracted_root, folder, "employee_data.json")
    # Skip stray files and folders that hold no extraction result.
    if not os.path.isfile(json_path):
        continue
    # Open with a context manager so each file is closed right after loading,
    # and read with the same encoding the files were written in.
    with open(json_path, encoding="utf-8") as f:
        total_chunks += len(json.load(f))
total_chunks

224