# In [None]:

import os
import re
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import date
from pdfminer.high_level import extract_text
from google.colab import drive

# ================ MOUNT GOOGLE DRIVE ==================
# Mount Google Drive at /content/drive; every file path used below lives
# under this mount point.
drive.mount('/content/drive')

# ================== SETUP =============================
# HTTP headers sent with every request made by open_page() and get_pdf().
HEADERS = {
    "User-Agent": "Mozilla/5.0"  # browser-style User-Agent string
}

# ================= PATH ==============================
def create_path(folder_name, base_dir="/content/drive/MyDrive/Penalaran Komputer"):
    """Create (if needed) and return the directory ``base_dir/folder_name``.

    Args:
        folder_name: Name of the sub-folder to create.
        base_dir: Parent directory. Defaults to the project folder on the
            mounted Google Drive, so existing one-argument calls behave
            exactly as before.

    Returns:
        The full path of the directory as a string.
    """
    path = os.path.join(base_dir, folder_name)
    # exist_ok=True keeps repeated notebook runs from failing.
    os.makedirs(path, exist_ok=True)
    return path

# ================= SCRAPING ===========================
def open_page(link):
    """Fetch ``link`` and return it parsed as a BeautifulSoup document.

    Up to five attempts are made. Only network/HTTP failures
    (``requests.RequestException``) are retried; anything else, including
    KeyboardInterrupt and parser setup errors, propagates so that the run
    can be interrupted and misconfiguration is not hidden.

    Args:
        link: Absolute URL of the page to download.

    Returns:
        The parsed page, or ``None`` when every attempt failed.
    """
    max_attempts = 5
    for attempt in range(1, max_attempts + 1):
        try:
            r = requests.get(link, headers=HEADERS, timeout=30)
            r.raise_for_status()
        except requests.RequestException as err:
            print(f"Request failed ({attempt}/{max_attempts}) for {link}: {err}")
            # No point in waiting after the last attempt.
            if attempt < max_attempts:
                time.sleep(3)
            continue
        # Pause after each successful fetch to throttle the crawl.
        time.sleep(2)
        return BeautifulSoup(r.text, "lxml")
    return None

def normalize_link(link):
    """Return ``link`` as an absolute URL on the Mahkamah Agung decision site.

    Absolute http(s) URLs are returned unchanged. Anything else is resolved
    against the site root with ``urljoin``, so root-relative (``/pdf/x``),
    path-relative (``pdf/x``) and protocol-relative (``//host/x``) hrefs all
    yield a well-formed URL instead of a raw string concatenation.

    Args:
        link: The href value taken from a page.

    Returns:
        The absolute URL as a string.
    """
    from urllib.parse import urljoin

    if link.startswith(("http://", "https://")):
        return link
    return urljoin("https://putusan3.mahkamahagung.go.id", link)

def get_detail(soup, keyword):
    """Return the text of the element that follows a labelled ``<td>``.

    The label cell is the first ``<td>`` whose text contains ``keyword``
    (case-insensitive substring match). The value is the stripped text of
    the next element after that cell.

    Args:
        soup: Parsed document or tag to search in.
        keyword: Label text to look for.

    Returns:
        The stripped text, or ``""`` when no matching cell exists.
    """
    def is_label_cell(tag):
        return tag.name == "td" and keyword.lower() in tag.text.lower()

    label_cell = soup.find(is_label_cell)
    if not label_cell:
        return ""
    value_tag = label_cell.find_next()
    return value_tag.get_text(strip=True)

def get_pdf(url, path_pdf):
    """Download the file at ``url`` into the directory ``path_pdf``.

    The local file name is the last path component of ``url``.

    Args:
        url: Absolute URL of the PDF.
        path_pdf: Existing directory to save the file in.

    Returns:
        ``(file_path, file_name)`` on success, ``(None, None)`` when the
        download or the write failed.
    """
    fname = os.path.basename(url)
    fpath = os.path.join(path_pdf, fname)
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        r = requests.get(url, headers=HEADERS, timeout=60)
        r.raise_for_status()
        with open(fpath, "wb") as f:
            f.write(r.content)
    except (requests.RequestException, OSError) as err:
        # Best effort: report the reason and let the caller skip this record.
        print(f"Failed to download {url}: {err}")
        return None, None
    print(f"Downloaded: {fname}")
    return fpath, fname

def extract_data(link, path_output, path_pdf):
    """Scrape one decision page into a single-row DataFrame.

    The page's metadata table is read field by field, the linked PDF is
    downloaded into ``path_pdf`` and its text is extracted.

    Args:
        link: Absolute or site-relative URL of the decision page.
        path_output: Unused here; kept so existing callers keep working.
        path_pdf: Directory the PDF is saved into.

    Returns:
        ``(DataFrame, True)`` on success, ``(None, False)`` when the page,
        its table, its PDF link, the download or the text extraction failed.
    """
    page_url = normalize_link(link)
    soup = open_page(page_url)
    if not soup:
        return None, False

    table = soup.find("table", {"class": "table"})
    if not table:
        return None, False

    title_tag = table.find("h2")
    judul = title_tag.text.strip() if title_tag else ""

    # Page label -> CSV column, kept as pairs so the two cannot drift apart.
    # NOTE(review): get_detail() matches labels by substring, so "Amar" can
    # also hit an "Amar Lainnya"/"Catatan Amar" cell and "Lembaga Peradilan"
    # a "Jenis Lembaga Peradilan" cell if that cell comes first - confirm
    # against the real page layout.
    field_columns = [
        ("Nomor", "nomor"),
        ("Tingkat Proses", "tingkat_proses"),
        ("Klasifikasi", "klasifikasi"),
        ("Kata Kunci", "kata_kunci"),
        ("Tahun", "tahun"),
        ("Tanggal Register", "tanggal_register"),
        ("Lembaga Peradilan", "lembaga_peradilan"),
        ("Jenis Lembaga Peradilan", "jenis_lembaga_peradilan"),
        ("Hakim Ketua", "hakim_ketua"),
        ("Hakim Anggota", "hakim_anggota"),
        ("Panitera", "panitera"),
        ("Amar", "amar"),
        ("Amar Lainnya", "amar_lainnya"),
        ("Catatan Amar", "catatan_amar"),
        ("Tanggal Musyawarah", "tanggal_musyawarah"),
        ("Tanggal Dibacakan", "tanggal_dibacakan"),
        ("Kaidah", "kaidah"),
        ("Status", "status"),
        ("Abstrak", "abstrak"),
    ]
    values = [get_detail(table, label) for label, _ in field_columns]

    link_pdf_tag = soup.find("a", href=re.compile(r"/pdf/"))
    if not link_pdf_tag:
        return None, False
    pdf_url = normalize_link(link_pdf_tag["href"])
    pdf_path, pdf_name = get_pdf(pdf_url, path_pdf)
    if not pdf_path:
        return None, False

    try:
        text_pdf = extract_text(pdf_path)
    except Exception as err:
        # One unreadable PDF must not abort the whole scraping run; skip the
        # record the same way every other failure in this function does.
        print(f"Failed to extract text from {pdf_name}: {err}")
        return None, False

    columns = (
        ["judul"]
        + [column for _, column in field_columns]
        + ["link", "link_pdf", "file_name_pdf", "text_pdf"]
    )
    data = [judul] + values + [page_url, pdf_url, pdf_name, text_pdf]
    return pd.DataFrame([data], columns=columns), True

def run_scraper(target=50):
    """Scrape decisions of the "fidusia-1" category into a dated CSV file.

    Listing pages are walked in order. Every decision link not seen before
    is passed to extract_data() and each successful row is appended to
    ``CSV4/putusan_fidusia_<YYYY-MM-DD>.csv``; PDFs go to ``PDF4``.

    The crawl stops when ``target`` rows were written, when a listing page
    cannot be fetched, or when a listing page yields no new decision link.
    The last condition prevents an endless loop once the listing is
    exhausted before ``target`` is reached.

    Args:
        target: Number of decisions to collect. Defaults to 50.
    """
    path_out = create_path("CSV4")
    path_pdf = create_path("PDF4")
    today = date.today().strftime("%Y-%m-%d")
    file_csv = os.path.join(path_out, f"putusan_fidusia_{today}.csv")

    base = "https://putusan3.mahkamahagung.go.id/direktori/index/kategori/fidusia-1"
    link_pattern = re.compile("/direktori/putusan")

    count = 0
    seen = set()
    page = 1

    while count < target:
        # The first page has no "/page/N" suffix.
        url = f"{base}/page/{page}.html" if page > 1 else f"{base}.html"
        soup = open_page(url)
        if not soup:
            break

        new_links = 0
        for tag in soup.find_all("a", href=link_pattern):
            href = tag.get("href")
            if not href or href in seen:
                continue
            seen.add(href)
            new_links += 1

            df, ok = extract_data(href, path_out, path_pdf)
            if not ok:
                continue
            # Write the header only when the file is first created.
            df.to_csv(file_csv, mode='a', header=not os.path.exists(file_csv), index=False)
            count += 1
            if count >= target:
                break

        if new_links == 0:
            print(f"No new decision links on page {page}; stopping.")
            break
        page += 1
    print(f"Done. Total: {count} putusan.")

# Run the scraper
run_scraper()

# ===================================
# CLEANING CSV (Stage 2)
# ===================================

# run_scraper() names its output after the day it ran, so prefer today's file
# and fall back to the pinned 2025-06-24 export when no scrape was written
# today.
_csv_dir = "/content/drive/MyDrive/Penalaran Komputer/CSV4"
_today_csv = os.path.join(_csv_dir, f"putusan_fidusia_{date.today():%Y-%m-%d}.csv")
_pinned_csv = os.path.join(_csv_dir, "putusan_fidusia_2025-06-24.csv")
df = pd.read_csv(_today_csv if os.path.exists(_today_csv) else _pinned_csv)

# Clean the text_pdf column

def clean_text_ma(text):
    """Lower-case a decision text and strip recurring boilerplate.

    Removed, in this order: the "direktori putusan ... peradilan." disclaimer,
    e-mail addresses, "telp ..." up to the first run of three or more digits,
    "nip" numbers, everything from "panitera" to the next "hakim", the
    literal "ttd", and "halaman <n>" markers. Whitespace is then collapsed
    to single spaces.

    Args:
        text: Raw text of one PDF. Any non-string value (NaN, None, numbers)
            is treated as missing.

    Returns:
        The cleaned, stripped text, or ``""`` for missing input.
    """
    # isinstance also covers NaN/None and avoids an AttributeError on other
    # non-string cells.
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'direktori putusan.*?transparansi.*?peradilan\.', '', text, flags=re.DOTALL)
    text = re.sub(r'email\s*:\s*\S+@\S+', '', text)
    text = re.sub(r'telp.*?(\d{3,})', '', text)
    text = re.sub(r'nip\.?\s*\d+', '', text)
    # The two patterns below used to end in a lazy `.*?`, which always matches
    # the empty string; dropping it leaves the matches unchanged.
    # NOTE(review): with DOTALL this deletes everything between "panitera" and
    # the next "hakim", however far apart they are - confirm that is intended.
    text = re.sub(r'panitera.*?hakim', '', text, flags=re.DOTALL)
    text = re.sub(r'ttd', '', text)
    text = re.sub(r'halaman\s*\d+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Store the cleaned text in a new column; the raw text_pdf column is kept.
df['text_pdf_cleaned'] = df['text_pdf'].apply(clean_text_ma)

# Save the result
cleaned_path = "/content/drive/MyDrive/Penalaran Komputer/CSV4/putusan_fidusia_cleaned_FINAL.csv"
df.to_csv(cleaned_path, index=False)
print("✅ Cleaning selesai. File disimpan ke:", cleaned_path)


import pandas as pd
import re

# Load the CSV written by the first cleaning pass
file_path = '/content/drive/MyDrive/Penalaran Komputer/CSV4/putusan_fidusia_cleaned_FINAL.csv'
df = pd.read_csv(file_path)

# Target column (not referenced again in the visible code)
text_col = 'text_pdf_cleaned'

# Advanced regex pass that removes watermarks, disclaimers and duplicated text
def clean_advanced(text):
    """Strip court watermarks, signature blocks and page markers from a text.

    All patterns match case-insensitively, because lower-casing only happens
    at the very end. The signature pattern was previously the one
    case-sensitive exception, so upper-case "TTD./" blocks were missed.

    Args:
        text: Decision text; any non-string value is treated as missing.

    Returns:
        The cleaned text, lower-cased, with whitespace collapsed to single
        spaces, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Remove the watermark and the court's identity lines
    text = re.sub(r'(mahkamah agung republik indonesia|putusan\.mahkamahagung\.go\.id)', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'direktori putusan.*?transparansi dan akuntabilitas.*?(?=halaman|\s+)', ' ', text, flags=re.DOTALL|re.IGNORECASE)

    # Remove the judge/clerk signature block: "ttd./" up to the next role or
    # name marker (the marker itself is kept by the lookahead)
    text = re.sub(r'ttd\./.*?(?=panitera|hakim|untuk salinan|dr\.|nip)', ' ', text, flags=re.DOTALL|re.IGNORECASE)
    text = re.sub(r'nip\s*\d{5,}', ' ', text, flags=re.IGNORECASE)

    # Remove "halaman x dari x halaman" and similar page markers
    text = re.sub(r'halaman\s*\d+\s*(dari\s*\d+\s*halaman)?', ' ', text, flags=re.IGNORECASE)

    # Cut everything from the closing formula onwards
    text = re.sub(r'putusan tersebut diucapkan.*', '', text, flags=re.DOTALL|re.IGNORECASE)

    # Normalise whitespace and letter case
    text = re.sub(r'\s+', ' ', text)
    text = text.strip().lower()

    return text

def extract_amar_putusan(text):
    """Return the verdict section of a decision text.

    The text is lower-cased and its whitespace collapsed. The section starts
    at the first start marker that occurs ("m e n g a d i l i" is tried
    first, then the "demi keadilan ..." formula). It is cut at the first end
    phrase, in list order, that occurs in the remaining text.

    Args:
        text: Decision text; any non-string value is treated as missing.

    Returns:
        The stripped verdict section. When no start marker is found, the
        whole normalised text is returned (not stripped). ``""`` for
        non-string input.
    """
    if not isinstance(text, str):
        return ""

    normalized = re.sub(r"\s+", " ", text.lower())

    start_markers = (
        "m e n g a d i l i",
        "demi keadilan berdasarkan ketuhanan yang maha esa",
    )
    begin = next(
        (pos for pos in map(normalized.find, start_markers) if pos != -1),
        None,
    )
    if begin is None:
        # Fallback: no marker present, hand back the normalised text as is.
        return normalized

    verdict = normalized[begin:]

    # Administrative closing phrases; priority follows the order listed here,
    # not the position in the text.
    end_markers = (
        "putusan ini diucapkan", "diputuskan dalam rapat",
        "panitera pengganti", "untuk salinan", "nip.",
    )
    cut = next(
        (pos for pos in map(verdict.find, end_markers) if pos != -1),
        None,
    )
    return verdict[:cut].strip()

def finalize_cleaning(text):
    """Run the final clean-up pass on a decision text.

    The text is lower-cased and, when the "m e n g a d i l i" heading is
    present, trimmed to start at it. Directory watermarks, e-mail and phone
    details, "nip." numbers, "pada hari ..." up to the end of its line and
    everything from the first "panitera" onwards are then removed, and
    whitespace is collapsed.

    Args:
        text: Decision text; non-string or blank values are treated as
            missing.

    Returns:
        The cleaned, stripped text, or ``""`` for missing input.
    """
    if not (isinstance(text, str) and text.strip()):
        return ""

    lowered = text.lower()

    # Start at the verdict heading when there is one.
    heading_pos = lowered.find('m e n g a d i l i')
    body = lowered if heading_pos == -1 else lowered[heading_pos:]

    # (pattern, flags) pairs, applied in this exact order.
    removals = (
        (r'direktori putusan.*?(?=menolak|memperbaiki|pada hari|putusan nomor|demi keadilan)', re.DOTALL),
        (r'email\s*:\s*\S+@\S+|telp\s*:\s*[\d\s\-\(\)]+', 0),
        (r'nip\.\s*\d+|\bpada hari.*', 0),
        (r'panitera.*?$', re.DOTALL),
    )
    for pattern, flags in removals:
        body = re.sub(pattern, '', body, flags=flags)

    # Collapse runs of whitespace.
    return re.sub(r'\s+', ' ', body).strip()

def cari_amar_alternatif(text):
    """Trim a text to start at the first verdict keyword that occurs.

    Keywords are tried in the order listed; the first one found anywhere in
    the lower-cased text decides where the result starts.

    Args:
        text: Decision text; any non-string value (None, NaN) is treated as
            missing, matching the other cleaning helpers.

    Returns:
        The lower-cased text from the keyword onwards, the full lower-cased
        text when no keyword occurs, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    keywords = ('m e n g a d i l i', 'putusan pengadilan', 'menjatuhkan pidana', 'memperbaiki putusan')
    for key in keywords:
        idx = text.find(key)
        if idx != -1:
            return text[idx:]
    return text  # fallback: return the full text




# Apply the four cleaning passes, in this order, to the text_pdf_cleaned column
df['text_pdf_cleaned'] = df['text_pdf_cleaned'].apply(clean_advanced)
df['text_pdf_cleaned'] = df['text_pdf_cleaned'].apply(extract_amar_putusan)
df['text_pdf_cleaned'] = df['text_pdf_cleaned'].apply(finalize_cleaning)
df['text_pdf_cleaned'] = df['text_pdf_cleaned'].apply(cari_amar_alternatif)



# Save the new result
output_path = '/content/drive/MyDrive/Penalaran Komputer/CSV4/putusan_fidusia_cleaned_FINAL_BERSIH_FIX.csv'
df.to_csv(output_path, index=False)

print("✅ Sukses! File dibersihkan dan disimpan ke:")
print(output_path)


# Preview the first five rows
df[['nomor', 'text_pdf_cleaned']].head(5)
