In [None]:
import os
import re
import csv
from pathlib import Path
from datetime import datetime

# === Paths
# Input folder holding the raw .txt files, and the output folder for results.
RAW_DIR = "/content/drive/MyDrive/cbr_banking/data/raw"
OUTPUT_DIR = "/content/drive/MyDrive/cbr_banking/data/processed"

# The CSV is written directly inside OUTPUT_DIR, so creating the CSV's parent
# directory is the same as creating OUTPUT_DIR itself.
CSV_PATH = os.path.join(OUTPUT_DIR, "cases.csv")
os.makedirs(os.path.dirname(CSV_PATH), exist_ok=True)

# === Helper: extract metadata with regular expressions
# Month-name -> month-number lookup (lower-case keys) used to normalise dates.
# Both English and Indonesian spellings are listed so that parsing does not
# depend on the process locale, unlike ``strptime("%B")``.
_MONTH_NUMBERS = {
    "january": 1, "januari": 1,
    "february": 2, "februari": 2,
    "march": 3, "maret": 3,
    "april": 4,
    "may": 5, "mei": 5,
    "june": 6, "juni": 6,
    "july": 7, "juli": 7,
    "august": 8, "agustus": 8,
    "september": 9,
    "october": 10, "oktober": 10,
    "november": 11,
    "december": 12, "desember": 12,
}


def extract_metadata(text):
    """Extract case metadata from the full text of a document using regexes.

    Args:
        text: Full document text as a single string.

    Returns:
        A 5-tuple of strings ``(no_perkara, tanggal, fakta, pasal, pihak)``.
        Any field that cannot be found is returned as ``""``:

        * ``no_perkara`` - text following "[putusan] nomor" or, failing that,
          "no. perkara".
        * ``tanggal`` - first "tanggal/tgl <day> <month> <year>" occurrence,
          normalised to ``YYYY-MM-DD``; ``""`` if the month name is unknown
          or the date is invalid.
        * ``fakta`` - first line mentioning 'barang bukti', 'dakwaan' or
          'didakwa' (case-insensitive), stripped.
        * ``pasal`` - unique, lower-cased "pasal N [ayat (M)]" references,
          sorted and joined with "; ".
        * ``pihak`` - text following "terdakwa".
    """
    # === Case number ===
    # Both patterns expose the number under the same named group, so the
    # lookup below works regardless of which pattern matched. (Reading a fixed
    # group index would fail for the fallback, which has one group fewer.)
    no_perkara = ""
    match = re.search(
        r"(?:putusan\s+)?nomor\s+(?P<nomor>[A-Z0-9/ .\-]+)", text, re.IGNORECASE
    )
    if not match:
        match = re.search(
            r"no\.?\s*perkara\s*[:\-]?\s*(?P<nomor>[A-Z0-9/ .\-]+)",
            text,
            re.IGNORECASE,
        )
    if match:
        no_perkara = match.group("nomor").strip()

    # === Date ===
    tanggal = ""
    match = re.search(
        r"(?:tanggal|tgl)[\s:]+(\d{1,2})\s+(\w+)\s+(\d{4})", text, re.IGNORECASE
    )
    if match:
        day, month_name, year = match.groups()
        month = _MONTH_NUMBERS.get(month_name.lower())
        if month is not None:
            try:
                tanggal = datetime(int(year), month, int(day)).strftime("%Y-%m-%d")
            except ValueError:
                # Impossible calendar date (e.g. day 31 in a 30-day month).
                tanggal = ""

    # === Summary of facts (indictment / evidence) ===
    # Only the first line containing one of the keywords is kept.
    keywords = ("barang bukti", "dakwaan", "didakwa")
    fakta = next(
        (
            line.strip()
            for line in text.split("\n")
            if any(keyword in line.lower() for keyword in keywords)
        ),
        "",
    )

    # === Articles of law ===
    pasals = re.findall(
        r"pasal\s+\d+[a-z]?(?:\s+ayat\s+\(\d+\))?", text, re.IGNORECASE
    )
    pasal = "; ".join(sorted({p.lower() for p in pasals}))

    # === Party (defendant) ===
    # NOTE(review): unlike the patterns above, these two are case-sensitive,
    # so only a lower-case "terdakwa" is recognised - confirm this is intended.
    pihak = ""
    match = re.search(r"terdakwa\s*:\s*([A-Z a-z.']{3,100})", text)
    if not match:
        match = re.search(r"terdakwa\s+([A-Z a-z.']{3,100})", text)
    if match:
        pihak = match.group(1).strip()

    return no_perkara, tanggal, fakta, pasal, pihak

# === Main processing
# Every .txt file in RAW_DIR becomes one case record. The file names are
# sorted so that the 1-based case ids are assigned in a stable order.
files = sorted(name for name in os.listdir(RAW_DIR) if name.endswith(".txt"))

cases = []
for case_id, txt_name in enumerate(files, start=1):
    # Read the whole document, then extract metadata once the file is closed.
    with open(os.path.join(RAW_DIR, txt_name), "r", encoding="utf-8") as handle:
        full_text = handle.read()

    no_perkara, tanggal, fakta, pasal, pihak = extract_metadata(full_text)
    record = {
        "case_id": case_id,
        "no_perkara": no_perkara,
        "tanggal": tanggal,
        "ringkasan_fakta": fakta,
        "pasal": pasal,
        "pihak": pihak,
        "text_full": full_text,
    }
    cases.append(record)

# === Save to CSV
# Column order of the output file; the keys match the dicts stored in `cases`.
fieldnames = [
    "case_id",
    "no_perkara",
    "tanggal",
    "ringkasan_fakta",
    "pasal",
    "pihak",
    "text_full",
]

# newline="" lets the csv module control line endings itself.
with open(CSV_PATH, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for row in cases:
        writer.writerow(row)

# Report where the file was written and how many cases it contains.
print(f"✅ CSV selesai: {CSV_PATH}")
print(f"📄 Jumlah kasus: {len(cases)}")


✅ CSV selesai: /content/drive/MyDrive/cbr_banking/data/processed/cases.csv
📄 Jumlah kasus: 150
