In [19]:
import PyPDF2
import re
import os
import pandas as pd
from tqdm import tqdm

# ===================== 1. Folder Setting ======================
folder_path = "data/PDF"
output_excel = "results/Overview_TeksLengkap_Strict.xlsx"
log_file = "data/Logs/log_tekslengkap_strict.txt"

# Make sure the input, output and log directories exist before anything runs.
for _required_dir in (folder_path, "results", "data/Logs"):
    os.makedirs(_required_dir, exist_ok=True)

# Opening in "w" mode truncates the log, so each run starts with only the header.
with open(log_file, mode="w", encoding="utf-8") as log_handle:
    log_handle.write("=== LOG EKSTRAKSI PUTUSAN — STRICT MODE ===\n\n")


# ===================== 2. EXTRACT + CLEAN TEXT ======================
def extract_text_from_pdf(pdf_path):
    """Extract the text of a court-decision PDF and strip watermark/noise.

    Every page is read with PyPDF2 and the page texts are concatenated. Known
    boilerplate (court name watermark, directory header, disclaimer footer,
    "Halaman X dari Y" page counters) is removed, non-breaking spaces are
    replaced, and all whitespace runs are collapsed to a single space.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The cleaned text as a single stripped string, or "" when reading or
        cleaning raised an exception (the error is appended to ``log_file``).
    """
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            # Guard against a falsy extract_text() result so that page still
            # contributes its separating newline.
            text = "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )

        # Watermark / header / footer boilerplate. The dots of the domain name
        # are escaped so they match a literal "." rather than any character.
        # NOTE(review): with re.S the lazy ".*?" spans may cross line and page
        # boundaries; if an opening marker appears without its closing marker
        # nearby, real content up to the next closing marker is removed too.
        noise_patterns = [
            r"Mahkamah Agung Republik Indonesia",
            r"Direktori Putusan.*?mahkamahagung\.go\.id",
            r"Disclaimer.*?(Email|Telp).*?(\n|$)",
            r"Halaman\s+\d+\s+dari\s+\d+",
        ]
        for p in noise_patterns:
            text = re.sub(p, "", text, flags=re.I | re.S)

        text = text.replace("\u00a0", " ")
        text = re.sub(r"\s+", " ", text)
        return text.strip()

    except Exception as e:
        # Deliberate best effort: one unreadable PDF must not abort the batch,
        # so the failure is logged and an empty string is returned instead.
        with open(log_file, "a", encoding="utf-8") as f:
            f.write(f"[ERROR] PDF gagal dibaca: {pdf_path} | {e}\n")
        return ""


# ===================== 3. SMART SPACING ======================
def smart_spacing(text):
    """Repair missing spaces in text extracted from a PDF.

    A space is inserted where a lowercase letter is directly followed by an
    uppercase letter and where a letter touches a digit (in either order).
    Numbers themselves are never split, and digits next to punctuation such as
    "/" or "." are left alone, so case numbers like ``408/Pid.Sus/2023/PN Kdr``
    stay intact for the later regex extraction. A few known glued or
    abbreviated headings are expanded, then whitespace is collapsed.

    Args:
        text: Raw text to repair.

    Returns:
        The repaired text with single spaces and no leading/trailing blanks.
    """
    # "terdakwaDituntut" -> "terdakwa Dituntut"
    text = re.sub(r"(?<=[a-z])(?=[A-Z])", " ", text)
    # "Nomor408" -> "Nomor 408", "12Januari" -> "12 Januari". Only the
    # letter/digit boundary gets a space; the previous pattern also padded
    # digits after "/" and cut numbers that followed a space ("2023" -> "2 023").
    text = re.sub(r"(?<=[A-Za-z])(?=\d)|(?<=\d)(?=[A-Za-z])", " ", text)
    corrections = {
        "AMARPUTUSAN": "AMAR PUTUSAN",
        "BARANGBUKTI": "BARANG BUKTI",
        "BARBUK": "BARANG BUKTI",
        "MENJATUHKANPIDANA": "MENJATUHKAN PIDANA",
    }
    for k, v in corrections.items():
        text = text.replace(k, v)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


# ===================== 4. FIELD-SPECIFIC EXTRACTION ======================

def get_no_putusan(text):
    """Return the decision (case) number found in ``text``.

    Patterns are tried in priority order, all case-insensitively:
    - 'Nomor: 408/Pid.Sus/...'
    - 'No. 408/Pid.Sus/...'
    - a bare '408/Pid.Sus/.../PN ...'
    - last resort: a looser 'NNN/Pid.Sus/ ... PN xxx' match restricted to the
      first 500 characters of the text.

    Returns the captured number stripped of surrounding whitespace, or the
    string "TIDAK DITEMUKAN" when nothing matches.
    """
    whole_text_patterns = (
        r"Nomor\s*[:\-]?\s*(\d{1,4}/Pid\.Sus/\d{4}/PN\s+[A-Za-z]+)",
        r"No\.?\s*(\d{1,4}/Pid\.Sus/\d{4}/PN\s+[A-Za-z]+)",
        r"(\d{1,4}/Pid\.Sus/\d{4}/PN\s+[A-Za-z]+)",
    )

    # Each attempt pairs a pattern with the slice of text it may look at.
    attempts = [(pattern, text) for pattern in whole_text_patterns]
    attempts.append((r"(\d{1,4}/Pid\.Sus/.*?PN\s*[A-Za-z]+)", text[:500]))

    for pattern, haystack in attempts:
        found = re.search(pattern, haystack, flags=re.IGNORECASE)
        if found is not None:
            return found.group(1).strip()

    return "TIDAK DITEMUKAN"


def get_lembaga(text):
    """Return the court name found in ``text``, abbreviated to 'PN ...'.

    Accepted forms (matched case-insensitively, in this order):
    - 'PN Kediri', 'PN Kab Kediri', 'PN Gpr'
    - 'PENGADILAN NEGERI KABUPATEN KEDIRI' (any casing); the words
      'PENGADILAN NEGERI' are replaced by 'PN' in the result.

    If none of these match, the decision number is looked up and its
    'PN xxx' part is used. Returns "TIDAK DITEMUKAN" when that fails too.
    """
    # NOTE: because matching is case-insensitive, the last two patterns are
    # equivalent; both are kept as in the original pattern list.
    court_patterns = (
        r"\bPN\s+(Kab\s+)?[A-Za-z]+",
        r"PENGADILAN\s+NEGERI\s+[A-Za-z\s]+",
        r"Pengadilan\s+Negeri\s+[A-Za-z\s]+",
    )
    for court_pattern in court_patterns:
        hit = re.search(court_pattern, text, flags=re.IGNORECASE)
        if hit is None:
            continue
        shortened = re.sub(
            r"PENGADILAN\s+NEGERI", "PN", hit.group(0), flags=re.IGNORECASE
        )
        return shortened.strip()

    # Fallback: take the "PN XXX" part of the decision number, if present.
    case_number = get_no_putusan(text)
    from_number = re.search(r"(PN\s+[A-Za-z]+)", case_number)
    return from_number.group(1) if from_number else "TIDAK DITEMUKAN"


def clean_paragraphs(teks):
    """Tidy a text fragment into one item per line for display in Excel.

    Whitespace (including line breaks in the input) is first collapsed to
    single spaces. A line break is then placed before every list dash and
    after every semicolon. Only a dash that stands on its own (preceded by
    whitespace or the start of the text, and followed by whitespace) counts as
    a list marker, so hyphenated words such as "sabu-sabu" are left intact.

    Args:
        teks: Fragment to clean.

    Returns:
        The cleaned text, lines separated by "\\n", without leading or
        trailing whitespace.
    """
    # Normalise whitespace BEFORE inserting line breaks; collapsing afterwards
    # (as was done previously) turned every inserted "\n" back into a space.
    teks = re.sub(r"\s+", " ", teks).strip()
    teks = re.sub(r"(?:^|\s)-\s+", "\n- ", teks)
    teks = re.sub(r";\s*", ";\n", teks)
    # Drop stray spaces around line breaks and merge consecutive breaks.
    teks = re.sub(r" *\n+ *", "\n", teks)
    return teks.strip()


def get_barang_bukti(text):
    """Return the BARANG BUKTI (evidence) section of ``text``, cleaned up.

    The section starts at the first case-insensitive 'barang bukti' and ends
    just before the next closing keyword (e.g. 'dirampas', 'amar putusan',
    'mengadili', 'menjatuhkan'). If that yields nothing, a second pattern
    anchored on 'menetapkan barang bukti' is tried. The captured section is
    passed through ``clean_paragraphs``.

    Returns "TIDAK DITEMUKAN" when neither pattern matches.
    """
    section_patterns = (
        r"(barang bukti[\s\S]*?)(?=(dirampas|untuk dimusnahkan|amar putusan|menetapkan|mengadili|menjatuhkan))",
        r"(menetapkan\s+barang\s+bukti[\s\S]*?)(?=(dirampas|amar putusan|mengadili|menjatuhkan))",
    )
    for section_pattern in section_patterns:
        found = re.search(section_pattern, text, flags=re.IGNORECASE)
        if found:
            return clean_paragraphs(found.group(1))
    return "TIDAK DITEMUKAN"


def get_amar_putusan(text):
    """Return the AMAR PUTUSAN (ruling) section: from its heading to the end.

    Headings are tried in priority order: AMAR PUTUSAN, MENGADILI, MEMUTUSKAN,
    MENETAPKAN. An exact upper-case heading is preferred, so an ordinary
    lower-case word earlier in the narrative (e.g. "... yang mengadili perkara
    pidana ...") no longer causes almost the whole document to be returned.
    Only when no upper-case heading exists is the search repeated
    case-insensitively, which is the previous behaviour.

    Args:
        text: Full decision text.

    Returns:
        The text from the first matching heading to the end, with whitespace
        collapsed and stripped, or "TIDAK DITEMUKAN" when no heading is found.
    """
    headings = ("AMAR PUTUSAN", "MENGADILI", "MEMUTUSKAN", "MENETAPKAN")
    # Pass 1: exact-case headings. Pass 2: case-insensitive fallback.
    for flags in (0, re.I):
        for heading in headings:
            match = re.search(re.escape(heading), text, flags=flags)
            if match:
                hasil = re.sub(r"\s+", " ", text[match.start():])
                return hasil.strip()
    return "TIDAK DITEMUKAN"


# ===================== 5. PROCESS ALL FILES ======================

# One dict per successfully processed PDF; becomes one Excel row each.
results = []
# Sorted so that row order and numbering are reproducible between runs;
# os.listdir() returns entries in arbitrary order.
pdf_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith(".pdf"))

for idx, file in enumerate(tqdm(pdf_files, desc="Ekstraksi STRICT MODE"), start=1):
    full_path = os.path.join(folder_path, file)

    text = extract_text_from_pdf(full_path)
    text = smart_spacing(text)

    if not text:
        # A PDF can yield no text without raising, in which case the extractor
        # logs nothing. Record the skip so the file can be checked manually
        # instead of silently disappearing from the output.
        with open(log_file, "a", encoding="utf-8") as f:
            f.write(f"[PERINGATAN] Teks kosong, file dilewati: {file}\n")
        continue

    no_putusan = get_no_putusan(text)
    lembaga = get_lembaga(text)

    # If either key field is missing, note it in the log for manual review.
    if no_putusan == "TIDAK DITEMUKAN" or lembaga == "TIDAK DITEMUKAN":
        with open(log_file, "a", encoding="utf-8") as f:
            f.write(f"[PERINGATAN] Data penting kosong di file {file}\n")
            f.write(f"No Putusan: {no_putusan}\nLembaga: {lembaga}\n---\n")

    bb = get_barang_bukti(text)
    amar = get_amar_putusan(text)

    # "No" is the file's 1-based position in pdf_files, so skipped files
    # leave gaps in the numbering.
    results.append({
        "No": idx,
        "No Putusan": no_putusan,
        "Lembaga Peradilan": lembaga,
        "Barang Bukti": bb,
        "Amar Putusan": amar,
    })


# ===================== 6. SAVE TO EXCEL ======================
# Build the overview table (one row per processed PDF) and write it to Excel
# without the DataFrame index column.
df = pd.DataFrame(data=results)
df.to_excel(excel_writer=output_excel, index=False)

# Report completion and where the output and log were written.
for _message_parts in (
    ("\n✅ EKSTRAKSI STRICT MODE SELESAI!",),
    ("✅ Output disimpan di:", output_excel),
    ("✅ Log di:", log_file),
):
    print(*_message_parts)


Ekstraksi STRICT MODE: 100%|██████████| 50/50 [00:36<00:00,  1.39it/s]


✅ EKSTRAKSI STRICT MODE SELESAI!
✅ Output disimpan di: results/Overview_TeksLengkap_Strict.xlsx
✅ Log di: data/Logs/log_tekslengkap_strict.txt



