# In [None]:
import os
import re
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pdfminer.high_level import extract_text
from io import BytesIO
import urllib.request
from pathlib import Path
from datetime import datetime

# 1. Inisialisasi Direktori
def initialize_directories():
    """Create the working directories and seed the cleaning log.

    Ensures that ``data/raw``, ``logs`` and ``figures`` exist relative to
    the current working directory, then writes ``logs/cleaning.log`` with a
    creation-timestamp line when that file is not present yet.
    """
    for folder in ('data/raw', 'logs', 'figures'):
        Path(folder).mkdir(parents=True, exist_ok=True)

    # Seed the log only once; an existing log is left untouched.
    log_path = 'logs/cleaning.log'
    if os.path.exists(log_path):
        return
    with open(log_path, 'w', encoding='utf-8') as log_file:
        log_file.write(f"Log file created at {datetime.now()}\n")

# 2. Fungsi Bantuan
def get_detail(soup, keyword):
    """Return the text of the element that follows the ``td`` labelled *keyword*.

    Looks up the first ``td`` tag in *soup* whose text contains *keyword*
    and returns the stripped text of the element found by ``find_next()``
    on that cell.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find``.
        keyword: Label text to search for inside a ``td`` cell.

    Returns:
        The stripped text of the following element, or ``""`` when the label
        cell or its following element is missing.
    """
    def _is_label_cell(tag):
        return tag.name == "td" and keyword in tag.text

    try:
        return soup.find(_is_label_cell).find_next().get_text().strip()
    except (AttributeError, TypeError):
        # AttributeError: a step of the chain returned None (nothing found).
        # TypeError: *keyword* is not a string.
        # Anything else (e.g. KeyboardInterrupt) must propagate instead of
        # being silently turned into an empty field.
        return ""

def clean_text(text):
    """Strip known header/footer boilerplate from *text* and normalise spacing.

    Each boilerplate pattern is removed case-insensitively, in order; then
    every run of whitespace is collapsed to a single space and the result
    is trimmed. A falsy *text* (``None`` or ``""``) yields ``""``.
    """
    if not text:
        return ""

    # Page furniture to drop: court header, disclaimer footer, page counters.
    boilerplate = (
        r"M a h ka m a h A g u n g R e p u blik In d o n esia[\s\S]*?Kepaniteraan Mahkamah Agung RI",
        r"Disclaimer[\s\S]*?ext\.318\)",
        r"Halaman \d+ dari \d+",
        r"Page \d+ of \d+",
    )

    stripped = text
    for regex in boilerplate:
        stripped = re.compile(regex, re.IGNORECASE).sub("", stripped)

    # Collapse whitespace runs, then trim both ends.
    collapsed = re.sub(r'\s+', ' ', stripped)
    return collapsed.strip()

def download_pdf(url, timeout=30):
    """Download a PDF and return the text extracted from it.

    Args:
        url: Location of the PDF to fetch.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot hang the scraper indefinitely.

    Returns:
        The extracted text, or ``None`` when the download or the extraction
        fails (the error is printed).
    """
    try:
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            file_content = response.read()

        return extract_text(BytesIO(file_content))
    except Exception as e:
        # Deliberately broad: network and PDF-parsing failures both mean
        # "no text for this document", which callers handle via None.
        print(f"Error processing PDF {url}: {e}")
        return None

# 3. Fungsi Ekstraksi Dokumen Utama
def extract_document_data(url, max_retries=3, retry_delay=5):
    """Fetch a decision page and return its metadata plus cleaned PDF text.

    Args:
        url: Address of the decision detail page.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to wait between two attempts.

    Returns:
        A dict with the keys ``judul``, ``nomor``, ``tanggal``,
        ``jenis_perkara``, ``pasal``, ``amar``, ``link`` and ``text``
        (``text`` is ``""`` when no PDF link is found or the PDF yields no
        text), or ``None`` when every attempt failed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Look the title up once instead of searching the tree twice.
            title_tag = soup.find('h2')
            metadata = {
                'judul': title_tag.get_text().strip() if title_tag else "",
                'nomor': get_detail(soup, "Nomor"),
                'tanggal': get_detail(soup, "Tanggal Register"),
                'jenis_perkara': get_detail(soup, "Jenis Lembaga Peradilan"),
                'pasal': get_detail(soup, "Kaidah"),
                'amar': get_detail(soup, "Amar"),
                'link': url
            }

            # Follow the first PDF link (if any) and attach its cleaned text.
            pdf_link = soup.find('a', href=re.compile(r'/pdf/'))
            pdf_text = download_pdf(urljoin(url, pdf_link['href'])) if pdf_link else None
            metadata['text'] = clean_text(pdf_text) if pdf_text else ""

            return metadata

        except Exception as e:
            print(f"Attempt {attempt + 1} failed for {url}: {e}")
            # Only wait when another attempt will actually follow; sleeping
            # after the final failure just delays the caller.
            if attempt + 1 < max_retries:
                time.sleep(retry_delay)

    print(f"Max retries reached for {url}")
    return None

# 4. Fungsi Penyimpanan
def save_document(data):
    """Write a decision's text to ``data/raw`` and record it in the log.

    The document is stored as ``data/raw/case_NNN.txt`` where ``NNN`` is one
    more than the highest number already used, so existing files are never
    overwritten even when the numbering has gaps. A line describing the
    document is appended to ``logs/cleaning.log``.

    Args:
        data: Mapping with the optional keys ``text`` (body to store) and
            ``judul`` (title used in the log line).

    Returns:
        ``True`` on success, ``False`` when anything went wrong (the error
        is printed).
    """
    try:
        raw_dir = Path('data/raw')
        log_dir = Path('logs')
        raw_dir.mkdir(parents=True, exist_ok=True)
        log_dir.mkdir(parents=True, exist_ok=True)

        # Collect the numbers already in use; counting files instead would
        # reuse an existing number as soon as one file is missing.
        used_ids = []
        for name in os.listdir(raw_dir):
            match = re.fullmatch(r'case_(\d+)\.txt', name)
            if match:
                used_ids.append(int(match.group(1)))
        doc_id = max(used_ids, default=0) + 1

        # Store the decision text; a missing or None text becomes an empty file.
        with open(raw_dir / f'case_{doc_id:03d}.txt', 'w', encoding='utf-8') as f:
            f.write(data.get('text') or '')

        # Append the processing record.
        with open(log_dir / 'cleaning.log', 'a', encoding='utf-8') as f:
            f.write(f"{datetime.now()} - Processed document {doc_id}: {data.get('judul', '')}\n")

        return True
    except Exception as e:
        print(f"Error saving document: {e}")
        return False

# 5. Fungsi Scraping Utama
def scrape_ma_putusan(keyword="narkotika", max_documents=30, delay=1):
    """Scrape Supreme Court decisions matching *keyword* and save them.

    Walks the search result pages (newest decision first), extracts every
    decision not seen before, stores its text via ``save_document`` and
    finally writes the collected metadata to ``data/metadata_raw.csv``.

    Args:
        keyword: Search term sent to the site.
        max_documents: Stop once this many documents have been saved.
        delay: Seconds to pause after each processed decision.

    Returns:
        A ``pandas.DataFrame`` with one row per saved document; empty when
        nothing could be retrieved.
    """
    # Local import: the module only binds ``urljoin`` from urllib.parse, so
    # do not rely on ``urllib.parse`` being reachable as a side effect.
    from urllib.parse import quote

    initialize_directories()
    base_url = "https://putusan3.mahkamahagung.go.id"
    search_url = f"{base_url}/search.html"
    # Same browser-like User-Agent that the detail-page requests send.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    documents = []
    seen_urls = set()  # decision URLs already handled, across all pages
    page = 1

    print(f"Memulai scraping untuk keyword: '{keyword}'")

    while len(documents) < max_documents:
        try:
            # Build the search URL for the current result page.
            url = f"{search_url}?q={quote(keyword)}&page={page}&obf=TANGGAL_PUTUS&obm=desc"

            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            links = soup.find_all('a', href=re.compile(r'/direktori/putusan/'))

            # A result page may link to the same decision several times;
            # keep each URL once, in page order.
            new_urls = []
            for link in links:
                doc_url = urljoin(base_url, link['href'])
                if doc_url not in seen_urls:
                    seen_urls.add(doc_url)
                    new_urls.append(doc_url)

            # No links at all, or only already-processed ones, means the
            # results are exhausted; stopping here also prevents paging
            # forever when the site keeps serving the same page.
            if not new_urls:
                print("Tidak menemukan hasil lagi.")
                break

            print(f"Memproses halaman {page} - ditemukan {len(new_urls)} putusan")

            for doc_url in new_urls:
                if len(documents) >= max_documents:
                    break

                print(f"Memproses: {doc_url}")

                # Only documents with extracted text are kept.
                doc_data = extract_document_data(doc_url)
                if doc_data and doc_data.get('text'):
                    if save_document(doc_data):
                        documents.append(doc_data)
                        print(f"Berhasil menyimpan dokumen {len(documents)}")
                    else:
                        print("Gagal menyimpan dokumen")

                # Pause between requests to avoid hammering the server.
                time.sleep(delay)

            page += 1

        except requests.exceptions.RequestException as e:
            print(f"Error request: {e}")
            break
        except Exception as e:
            print(f"Unexpected error: {e}")
            break

    # Persist the metadata of everything that was saved.
    if documents:
        df = pd.DataFrame(documents)
        df.to_csv('data/metadata_raw.csv', index=False)
        print(f"Berhasil menyimpan {len(df)} dokumen ke data/metadata_raw.csv")
        return df
    else:
        print("Tidak ada dokumen yang berhasil diambil")
        return pd.DataFrame()

# 6. Eksekusi
if __name__ == "__main__":
    # Example run: change the search keyword and the number of documents
    # to collect as needed.
    keyword, max_docs = "narkotika", 30

    # Time the whole scraping run.
    start_time = time.time()
    df = scrape_ma_putusan(max_documents=max_docs, keyword=keyword)
    elapsed = time.time() - start_time

    print(f"\nSelesai dalam {elapsed:.2f} detik")
    print(f"Total dokumen yang berhasil diambil: {len(df)}")

    # Show a short preview when something was collected.
    if not df.empty:
        preview = df.head()
        print("\nContoh data:")
        print(preview)