In [None]:
# === SETUP
!apt install poppler-utils -y > /dev/null
!pip install requests beautifulsoup4 lxml > /dev/null

import os, re, time
import requests
from bs4 import BeautifulSoup
from pathlib import Path
from urllib.request import urlretrieve
from google.colab import drive
import subprocess

# === MOUNT DRIVE
# Mount Google Drive at /content/drive; every project path below lives under it.
drive.mount('/content/drive')

# === PATH
# Project folder layout: downloaded PDFs, extracted raw text, and log files.
BASE_DIR = "/content/drive/MyDrive/cbr_banking"
PDF_DIR  = f"{BASE_DIR}/data/pdf"
RAW_DIR  = f"{BASE_DIR}/data/raw"
LOG_DIR  = f"{BASE_DIR}/logs"
LOG_FILE = f"{LOG_DIR}/cleaning.log"

# Create the folders up front (including missing parents); existing ones are fine.
for _project_dir in (PDF_DIR, RAW_DIR, LOG_DIR):
    Path(_project_dir).mkdir(parents=True, exist_ok=True)

# === UTILITAS
def open_page(url, retries=3, delay=3, timeout=15):
    """Fetch *url* and parse the response body with BeautifulSoup (lxml).

    Args:
        url: Address of the page to download.
        retries: Maximum number of download attempts.
        delay: Seconds to wait between failed attempts.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when every attempt
        failed with a ``requests`` error (connection problem, timeout, ...).

    Note:
        The HTTP status code is not checked; an error page is parsed like any
        other response.
    """
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Only network-level failures are retried. Anything else
            # (KeyboardInterrupt, a missing parser, ...) must surface.
            if attempt < retries:
                # No point in sleeping after the final attempt.
                time.sleep(delay)
            continue
        return BeautifulSoup(response.text, "lxml")
    return None

def get_pdf_link(soup):
    """Return the absolute PDF download URL found in a detail page.

    Args:
        soup: Parsed detail page (anything exposing a BeautifulSoup-style
            ``find``), or ``None``.

    Returns:
        The absolute URL of the first ``<a>`` whose ``href`` matches
        ``/direktori/download_file/.../pdf/``, or ``None`` when *soup* is
        ``None`` or no such link exists.
    """
    if soup is None:
        return None

    tag = soup.find("a", href=re.compile(r"/direktori/download_file/.+/pdf/"))
    if tag is None:
        return None

    href = tag.get("href")
    if not href:
        return None

    # Site-relative links need the host prepended; absolute ones pass through.
    if href.startswith("http"):
        return href
    return "https://putusan3.mahkamahagung.go.id" + href

def clean_text(text):
    """Strip boilerplate from text extracted from a court-decision PDF.

    Removes the disclaimer block (from the word "Disclaimer" up to the end of
    the line containing the registrar's e-mail address), the
    "direktori putusan" header, and page markers ("hal. X dari Y hal.",
    "halaman N"). Then normalises non-breaking spaces, collapses runs of blank
    lines into single newlines, and trims leading/trailing whitespace.

    Args:
        text: Raw extracted text.

    Returns:
        A tuple ``(cleaned_text, original_length, cleaned_length)`` with both
        lengths measured in characters.
    """
    original_len = len(text)

    # Remove the long disclaimer block. The dots of the e-mail address are
    # escaped so that only the literal address terminates the block.
    text = re.sub(
        r"Disclaimer[\s\S]+?kepaniteraan@mahkamahagung\.go\.id.*",
        "",
        text,
        flags=re.IGNORECASE,
    )

    # Drop the directory header and page-number markers.
    text = re.sub(r'direktori putusan', '', text, flags=re.IGNORECASE)
    text = re.sub(r'hal\.?\s*\d+\s*dari\s*\d+\s*hal\.?', '', text, flags=re.IGNORECASE)
    text = re.sub(r'halaman\s+\d+', '', text, flags=re.IGNORECASE)

    # Character normalisation.
    text = text.replace("\xa0", " ")        # non-breaking space -> plain space
    text = re.sub(r'\n{2,}', '\n', text)    # collapse repeated newlines
    text = text.strip()

    # Deliberately nothing else is removed: capitalisation, legal symbols and
    # punctuation are kept as they are.
    return text, original_len, len(text)


import shutil

def extract_text_pdftotext(pdf_path):
    """Extract the text of a PDF using the ``pdftotext`` command-line tool.

    The PDF is copied into a private temporary directory, converted there with
    ``pdftotext -layout -enc UTF-8``, and the resulting text is read back. The
    temporary directory (and both scratch files) is removed afterwards, so
    nothing is left behind and no stale output from a previous call can be
    returned.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        The extracted text as a string.

    Raises:
        FileNotFoundError: If *pdf_path* does not exist.
        subprocess.CalledProcessError: If ``pdftotext`` exits with an error.
    """
    import shutil
    import tempfile

    with tempfile.TemporaryDirectory() as workdir:
        temp_pdf = os.path.join(workdir, "temp.pdf")
        temp_txt = os.path.join(workdir, "temp.txt")

        # Work on a local copy, as the original implementation did.
        shutil.copy(pdf_path, temp_pdf)

        # -layout keeps the physical layout so the document structure survives.
        subprocess.run(
            ["pdftotext", "-layout", "-enc", "UTF-8", temp_pdf, temp_txt],
            check=True,
        )

        with open(temp_txt, "r", encoding="utf-8") as f:
            return f.read()



# === TAHAP 1: SCRAPE & DOWNLOAD PDF
def download_pdf_only(pdf_url, case_num, dest_dir="/content", min_size=5000):
    """Download one PDF to a temporary location and sanity-check its size.

    Args:
        pdf_url: URL of the PDF to download.
        case_num: Case number used to build the file name
            (``case_NNN.pdf``, zero-padded to three digits).
        dest_dir: Directory that receives the downloaded file.
        min_size: Minimum acceptable file size in bytes; anything smaller is
            treated as a failed download.

    Returns:
        The path of the downloaded file, or ``None`` if the download failed
        or the file was smaller than *min_size*. Failures are reported on
        stdout and any partial or undersized file is removed.
    """
    pdf_path = os.path.join(dest_dir, f"case_{case_num:03d}.pdf")  # temporary only

    try:
        urlretrieve(pdf_url, pdf_path)

        size = os.path.getsize(pdf_path)
        if size < min_size:
            raise ValueError(f"PDF terlalu kecil ({size} bytes)")
    except Exception as e:
        print(f"❌ Gagal download case_{case_num:03d}: {e}")
        # Do not leave a partial or undersized file behind.
        try:
            os.remove(pdf_path)
        except OSError:
            pass
        return None

    print(f"✅ Downloaded: case_{case_num:03d}.pdf ({size} bytes)")
    return pdf_path


def scrape_pdf_and_extract(max_cases=35, max_pages=19):
    """Scrape search-result pages, download each decision PDF and save its text.

    For every detail link on each search page the PDF link is resolved, the
    PDF is downloaded to a temporary file, converted to text, cleaned, and
    written to ``RAW_DIR/case_NNN.txt``. One line per case is appended to
    ``LOG_FILE``. The temporary PDF is deleted after processing.

    Resuming: when ``case_NNN.txt`` already exists, the case is counted as
    done and the link is skipped. (The original code skipped the link without
    advancing the counter, so after a first run every further link mapped to
    the same existing file and nothing new was ever processed.) This assumes
    the site returns results in the same order as in the earlier run.

    Args:
        max_cases: Stop once this many cases exist (already present or newly
            extracted).
        max_pages: Number of search-result pages to visit at most.

    Returns:
        None.
    """
    site = "https://putusan3.mahkamahagung.go.id"
    base = "https://putusan3.mahkamahagung.go.id/search.html?jenis_doc=putusan&cat=5b3b183b41bc6ab3764cab235b9a3f8a&court=8bb6198cd9528aaac4199a1d5627bbb9"
    detail_pattern = re.compile(r"/direktori/putusan/")

    total = 0
    seen_urls = set()  # detail pages already handled in this run

    for page in range(1, max_pages + 1):
        url = base if page == 1 else f"{base}&page={page}"
        print(f"📄 Halaman {page}")
        soup = open_page(url)
        if not soup:
            continue

        for a in soup.find_all("a", href=detail_pattern):
            detail_url = a["href"]
            if not detail_url.startswith("http"):
                detail_url = site + detail_url

            # The same decision may be linked more than once; handle it once.
            if detail_url in seen_urls:
                continue
            seen_urls.add(detail_url)

            detail_soup = open_page(detail_url)
            if not detail_soup:
                continue

            pdf_link = get_pdf_link(detail_soup)
            if not pdf_link:
                continue

            case_num = total + 1
            case_name = f"case_{case_num:03d}"
            case_txt = f"{RAW_DIR}/{case_name}.txt"

            if os.path.exists(case_txt):
                # Extracted by an earlier run: count it and move on.
                total += 1
            else:
                temp_pdf_path = download_pdf_only(pdf_link, case_num)
                if not temp_pdf_path:
                    continue

                try:
                    raw_text = extract_text_pdftotext(temp_pdf_path)
                    cleaned, before, after = clean_text(raw_text)

                    with open(case_txt, "w", encoding="utf-8") as f:
                        f.write(cleaned)

                    with open(LOG_FILE, "a", encoding="utf-8") as log:
                        log.write(f"{case_name}.txt | {before} chars -> {after} chars\n")

                    print(f"✅ Ekstrak: {case_name}.pdf ({before} → {after})")
                    total += 1
                except Exception as e:
                    # Best effort: report the failure and keep scraping.
                    print(f"❌ Gagal ekstrak {case_name}: {e}")
                finally:
                    # The downloaded PDF is only a temporary artefact.
                    try:
                        os.remove(temp_pdf_path)
                    except OSError:
                        pass

            if total >= max_cases:
                print(f"\n🎉 Selesai ekstrak total {total} file")
                return


# === TAHAP 2: EKSTRAKSI PDF KE TXT
def extract_all_pdf_to_txt():
    """Convert every ``.pdf`` in ``PDF_DIR`` to a cleaned ``.txt`` in ``RAW_DIR``.

    Files are handled in sorted name order. A PDF whose text file already
    exists is skipped. Each successful conversion appends one line to
    ``LOG_FILE``; a failure is printed and the loop moves on to the next file.
    """
    pdf_names = sorted(name for name in os.listdir(PDF_DIR) if name.endswith(".pdf"))

    for pdf_name in pdf_names:
        txt_name = pdf_name.replace(".pdf", ".txt")
        source = os.path.join(PDF_DIR, pdf_name)
        target = os.path.join(RAW_DIR, txt_name)

        if not os.path.exists(target):
            try:
                extracted = extract_text_pdftotext(source)
                cleaned, before, after = clean_text(extracted)

                with open(target, "w", encoding="utf-8") as out:
                    out.write(cleaned)

                with open(LOG_FILE, "a", encoding="utf-8") as log:
                    log.write(f"{txt_name} | {before} chars -> {after} chars\n")

                print(f"✅ Ekstrak: {pdf_name} ({before} → {after})")
            except Exception as err:
                print(f"❌ Gagal ekstrak {pdf_name}: {err}")

# Stage 1: scrape the search results, download each PDF and write its cleaned
# text to RAW_DIR, stopping after 150 cases.
scrape_pdf_and_extract(max_cases=150)
# Stage 2: convert any PDFs found in PDF_DIR that do not yet have a .txt in
# RAW_DIR. NOTE(review): nothing in the visible code writes PDFs into PDF_DIR,
# so this pass presumably only covers files placed there by other means.
extract_all_pdf_to_txt()



Mounted at /content/drive
📄 Halaman 1
✅ Downloaded: case_001.pdf (104859 bytes)
✅ Ekstrak: case_001.pdf (79387 → 69378)
✅ Downloaded: case_002.pdf (633628 bytes)
✅ Ekstrak: case_002.pdf (1433111 → 1266110)
✅ Downloaded: case_003.pdf (358019 bytes)
✅ Ekstrak: case_003.pdf (239700 → 209684)
✅ Downloaded: case_004.pdf (121641 bytes)
✅ Ekstrak: case_004.pdf (211006 → 183512)
✅ Downloaded: case_005.pdf (98252 bytes)
✅ Ekstrak: case_005.pdf (164538 → 143148)
✅ Downloaded: case_006.pdf (213932 bytes)
✅ Ekstrak: case_006.pdf (350256 → 305271)
✅ Downloaded: case_007.pdf (511786 bytes)
✅ Ekstrak: case_007.pdf (572578 → 504306)
✅ Downloaded: case_008.pdf (457734 bytes)
✅ Ekstrak: case_008.pdf (658273 → 578133)
✅ Downloaded: case_009.pdf (55748 bytes)
✅ Ekstrak: case_009.pdf (77769 → 67687)
✅ Downloaded: case_010.pdf (285324 bytes)
✅ Ekstrak: case_010.pdf (449529 → 391653)
✅ Downloaded: case_011.pdf (344321 bytes)
✅ Ekstrak: case_011.pdf (572278 → 504233)
✅ Downloaded: case_012.pdf (304969 bytes