In [2]:
import os
import re
import json
import logging
import pandas as pd
from datetime import datetime
import PyPDF2
import glob

# Setup logging
def setup_logging():
    """Configure root logging to a timestamped file plus the console.

    The log file is created in the current working directory and is named
    ``extraction_log_<YYYYmmdd_HHMMSS>.log``.

    Note: ``logging.basicConfig`` leaves the root logger untouched when it
    already has handlers (for example when this is called a second time in
    the same interpreter or notebook kernel).

    Returns:
        logging.Logger: the logger named after this module.
    """
    log_filename = f"extraction_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            # Explicit UTF-8 so messages containing non-ASCII characters
            # (file names, extracted text) do not fail to encode on
            # platforms whose default encoding is not UTF-8.
            logging.FileHandler(log_filename, encoding='utf-8'),
            logging.StreamHandler(),
        ],
    )
    return logging.getLogger(__name__)

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF file.

    Each page's text is followed by a newline. Any failure (missing file,
    unreadable PDF, extraction error) is logged and an empty string is
    returned instead of raising.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The extracted text, or ``""`` when the file could not be read.
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Extraction must happen while the file is still open.
            # A page that yields no text (None or "") is treated as empty so
            # that a single blank page cannot raise TypeError on
            # concatenation and discard the whole document.
            # NOTE(review): whether extract_text() can return None depends on
            # the installed PyPDF2 release - the guard is harmless either way.
            page_texts = [(page.extract_text() or "") for page in pdf_reader.pages]
        return "".join(f"{page_text}\n" for page_text in page_texts)
    except Exception as e:
        logging.error(f"Error reading PDF {pdf_path}: {str(e)}")
        return ""

def clean_barang_bukti_item(item):
    """Normalise one candidate evidence ("barang bukti") string or reject it.

    The candidate is stripped of lab reference numbers and the word
    "berupa", and its whitespace is collapsed. It is rejected (``None``)
    when the cleaned text is longer than 150 characters, is 10 characters or
    shorter, or contains any of the exclusion keywords as a substring
    (case-insensitive).

    Args:
        item: Raw candidate text.

    Returns:
        The cleaned string, or ``None`` when the candidate is rejected.
    """
    # Applied strictly in this order; the last one collapses whitespace runs.
    substitutions = (
        (r'\d+/\d+/[A-Z]+,?\s*-?\s*:', ''),  # lab numbers like "29485/2024/NNF,- :"
        (r'No\.\s*\d+/\d+/[A-Z]+', ''),      # "No. 29485/2024/NNF"
        (r'berupa\s*', ''),                  # the filler word "berupa"
        (r'\s+', ' '),
    )
    cleaned = item
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip()

    # Very long candidates are most likely narrative text, not an item.
    if len(cleaned) > 150:
        return None

    # Words that indicate narrative or measurement text rather than an item.
    rejected_words = (
        'bahwa', 'kesimpulan', 'pemeriksaan', 'laboratorium', 'saksi', 'terdakwa',
        'berat', 'netto', 'gram', 'dengan', 'untuk', 'dari', 'pada', 'tanggal',
        'wib', 'setelah', 'sebelum', 'kemudian', 'selanjutnya', 'berdasarkan',
    )
    lowered = cleaned.lower()
    for word in rejected_words:
        if word in lowered:
            return None

    # Too short to be a meaningful description.
    if len(cleaned) <= 10:
        return None
    return cleaned

def extract_legal_data_from_text(text, filename):
    """Extract decision metadata from the text of a court decision.

    All extraction is regex-based and heuristic. Any exception raised while
    extracting is caught, logged, and reported through the ``status`` and
    ``error_message`` fields instead of propagating.

    Args:
        text: Full text of the document.
        filename: Name stored in the result and used in log messages.

    Returns:
        dict with the keys ``file_name``, ``nomor_putusan`` (decision
        number), ``lembaga_peradilan`` (court), ``barang_bukti`` (list of
        evidence items), ``amar_putusan`` (list of ruling points),
        ``status`` (``"Success"`` or ``"Error"``) and ``error_message``.
    """
    result = {
        "file_name": filename,
        "nomor_putusan": "",
        "lembaga_peradilan": "",
        "barang_bukti": [],
        "amar_putusan": [],
        "status": "Success",
        "error_message": ""
    }

    try:
        # --- Decision number -------------------------------------------------
        # The most specific pattern must come first: the generic patterns
        # match everywhere the specific one does, but they have to end on a
        # digit and therefore drop the court suffix ("/PN Sby"). Listed last,
        # the specific pattern could never be reached.
        nomor_patterns = [
            r'Nomor\s+(\d+\/Pid\.Sus\/\d+\/PN\s*\w+)',
            r'Nomor\s+(\d+[\/\w\.\s\-]+\d+)',
            r'Nomor\s*:\s*(\d+[\/\w\.\s\-]+\d+)',
            r'Putusan\s+Nomor\s+(\d+[\/\w\.\s\-]+\d+)',
        ]

        for pattern in nomor_patterns:
            nomor_match = re.search(pattern, text, re.IGNORECASE)
            if nomor_match:
                result["nomor_putusan"] = nomor_match.group(1).strip()
                break

        # --- Court -----------------------------------------------------------
        lembaga_patterns = [
            r'Pengadilan\s+(Negeri|Tinggi|Agung)\s+([A-Za-z\s]+?)(?=\s|\,|\.|$)',
            r'(Pengadilan\s+Negeri\s+[A-Za-z]+)',
            r'Pengadilan\s+([A-Za-z\s]+)(?=\s+yang\s+memeriksa|$)'
        ]

        for pattern in lembaga_patterns:
            lembaga_match = re.search(pattern, text, re.IGNORECASE)
            if lembaga_match:
                # The whole match (including the word "Pengadilan") is kept.
                result["lembaga_peradilan"] = lembaga_match.group(0).strip()
                break

        # --- Evidence items (barang bukti) -----------------------------------
        barang_bukti_sections = []

        # Collect every text span that follows a "barang bukti" lead-in.
        barang_bukti_patterns = [
            r'barang\s+bukti\s+berupa\s*:([^\.]+?)(?=\.|\n\n|\d+\.|$)',
            r'barang\s+bukti\s+sebagai\s+berikut\s*:([^\.]+?)(?=\.|\n\n|\d+\.|$)',
            r'menetapkan\s+barang\s+bukti\s+berupa\s*:([^\.]+?)(?=\.|\n\n|\d+\.|$)',
            r'barang\s+bukti\s+([^\.]{50,500}?)(?=\.|\n\n|amar|putusan|$)'
        ]

        for pattern in barang_bukti_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)
            for match in matches:
                if match.strip():
                    barang_bukti_sections.append(match.strip())

        # Split each span into individual items.
        for section in barang_bukti_sections:
            # Items written as "<count> (<count in words>) <description>".
            items = re.findall(r'\d+\s*\(\s*\w+\s*\)[^\.\d]+?(?=\d+\s*\(|\n\n|$)', section)
            if not items:
                # Fall back to dash-prefixed bullet items.
                items = re.findall(r'-\s*([^-]+?)(?=-\s*|\n\n|$)', section)

            for item in items:
                cleaned_item = clean_barang_bukti_item(item)
                if cleaned_item:
                    result["barang_bukti"].append(cleaned_item)

        # Last resort: scan the whole text for counted items that mention a
        # few typical objects.
        if not result["barang_bukti"]:
            common_patterns = [
                r'\d+\s*\(\s*\w+\s*\)\s*[^\.\d]{10,100}HP[^\.\d]{0,50}',
                r'\d+\s*\(\s*\w+\s*\)\s*[^\.\d]{10,100}pipet[^\.\d]{0,50}',
                r'\d+\s*\(\s*\w+\s*\)\s*[^\.\d]{10,100}alat hisap[^\.\d]{0,50}',
                r'\d+\s*\(\s*\w+\s*\)\s*[^\.\d]{10,100}tas[^\.\d]{0,50}',
                r'\d+\s*\(\s*\w+\s*\)\s*[^\.\d]{10,100}sekrop[^\.\d]{0,50}'
            ]

            for pattern in common_patterns:
                matches = re.findall(pattern, text, re.IGNORECASE)
                for match in matches:
                    cleaned_item = clean_barang_bukti_item(match)
                    if cleaned_item:
                        result["barang_bukti"].append(cleaned_item)

        # Drop duplicates, keeping the first occurrence of each item.
        result["barang_bukti"] = list(dict.fromkeys(result["barang_bukti"]))

        # --- Ruling (amar putusan) -------------------------------------------
        amar_sections = []

        # Headings may be letter-spaced ("M E N G A D I L I"), hence the \s*
        # between letters. Only the first heading that matches is used.
        amar_patterns = [
            r'M\s*E\s*N\s*G\s*A\s*D\s*I\s*L\s*I\s*:?\s*([\s\S]{500,3000}?)(?=Demikianlah|$|\n\n)',
            r'A\s*M\s*A\s*R\s*\s*P\s*U\s*T\s*U\s*S\s*A\s*N\s*:?\s*([\s\S]{500,3000}?)(?=Demikianlah|$|\n\n)',
            r'MENGA\s*DILI\s*:?\s*([\s\S]{500,3000}?)(?=Demikianlah|$|\n\n)'
        ]

        for pattern in amar_patterns:
            amar_match = re.search(pattern, text, re.IGNORECASE)
            if amar_match:
                amar_text = amar_match.group(1)
                amar_sections.append(amar_text)
                break

        # No heading found: look for numbered points in the last 2000
        # characters of the document.
        if not amar_sections:
            decision_pattern = r'\d+\.\s*([^\.]{50,500}?(?=\d+\.|$|Demikianlah))'
            decisions = re.findall(decision_pattern, text[-2000:])
            if decisions:
                amar_sections.extend(decisions)

        # Keep only numbered points that read like an actual ruling.
        for section in amar_sections:
            amar_points = re.findall(r'\d+\.\s*([^\d\.]+?(?=\d+\.|\Z|$))', section)

            for point in amar_points:
                cleaned_point = re.sub(r'\s+', ' ', point).strip()
                # Must be long enough, must not open like a recital ("bahwa",
                # "pada"), and must contain one of the ruling verbs.
                if (len(cleaned_point) > 30 and
                    not cleaned_point.lower().startswith('bahwa') and
                    not cleaned_point.lower().startswith('pada') and
                    any(keyword in cleaned_point.lower() for keyword in
                        ['menyatakan', 'menjatuhkan', 'menetapkan', 'membebankan', 'memerintahkan'])):
                    result["amar_putusan"].append(cleaned_point)

        # Last resort: take any sentence fragment in the whole text that
        # starts with a ruling verb.
        if not result["amar_putusan"]:
            decision_keywords = [
                r'Menyatakan\s+[^\.]{20,200}',
                r'Menjatuhkan\s+[^\.]{20,200}',
                r'Menetapkan\s+[^\.]{20,200}',
                r'Memerintahkan\s+[^\.]{20,200}',
                r'Membebankan\s+[^\.]{20,200}'
            ]

            for pattern in decision_keywords:
                matches = re.findall(pattern, text, re.IGNORECASE)
                for match in matches:
                    if len(match) > 30:
                        result["amar_putusan"].append(match.strip())

        # Drop duplicates, keeping the first occurrence of each point.
        result["amar_putusan"] = list(dict.fromkeys(result["amar_putusan"]))

    except Exception as e:
        result["status"] = "Error"
        result["error_message"] = str(e)
        logging.error(f"Error processing {filename}: {str(e)}")

    return result

def process_pdf_folder(folder_path):
    """Extract legal data from every ``*.pdf`` file directly inside a folder.

    Logging is configured first via ``setup_logging()``. Files are processed
    in sorted path order so that the order of the returned rows is
    reproducible. A file from which no text could be extracted is logged and
    skipped: it does not appear in the returned list.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        list[dict]: One result dict per processed file, in the shape
        produced by ``extract_legal_data_from_text``. Empty when the folder
        contains no PDF files.
    """
    setup_logging()

    # glob.glob returns matches in arbitrary order; sort for stable output.
    pdf_files = sorted(glob.glob(os.path.join(folder_path, "*.pdf")))

    if not pdf_files:
        logging.warning(f"No PDF files found in {folder_path}")
        return []

    logging.info(f"Found {len(pdf_files)} PDF files to process")

    all_results = []

    for pdf_file in pdf_files:
        filename = os.path.basename(pdf_file)
        logging.info(f"Processing: {filename}")

        try:
            text = extract_text_from_pdf(pdf_file)

            if not text.strip():
                logging.warning(f"No text extracted from {filename}")
                continue

            result = extract_legal_data_from_text(text, filename)
            all_results.append(result)

            logging.info(f"Successfully processed: {filename} - Found {len(result['barang_bukti'])} barang bukti, {len(result['amar_putusan'])} amar")

        except Exception as e:
            # Record the failure as a row so the file still shows up in the
            # outputs with status "Error".
            logging.error(f"Failed to process {filename}: {str(e)}")
            error_result = {
                "file_name": filename,
                "nomor_putusan": "",
                "lembaga_peradilan": "",
                "barang_bukti": [],
                "amar_putusan": [],
                "status": "Error",
                "error_message": str(e)
            }
            all_results.append(error_result)

    return all_results

def save_to_excel(results, output_file="results/overview.xlsx"):
    """Write one overview row per result to an Excel sheet named "Overview".

    List fields are joined with newlines ("Tidak ditemukan" when empty).
    Column widths are fixed and text wrapping is enabled for the evidence
    and ruling columns. The parent directory of ``output_file`` is created
    when it does not exist yet.

    Args:
        results: Iterable of result dicts as produced by the extractor.
        output_file: Target ``.xlsx`` path.

    Returns:
        bool: ``True`` on success, ``False`` when anything failed (the error
        is logged, not raised).
    """
    try:
        # Without this the default target "results/overview.xlsx" fails on a
        # fresh checkout where "results/" has not been created yet.
        if isinstance(output_file, (str, os.PathLike)):
            output_dir = os.path.dirname(output_file)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

        # Flatten each result into a single spreadsheet row.
        excel_data = []

        for result in results:
            row = {
                "File Name": result["file_name"],
                "Nomor Putusan": result["nomor_putusan"],
                "Lembaga Peradilan": result["lembaga_peradilan"],
                "Barang Bukti": "\n".join(result["barang_bukti"]) if result["barang_bukti"] else "Tidak ditemukan",
                "Amar Putusan": "\n".join(result["amar_putusan"]) if result["amar_putusan"] else "Tidak ditemukan",
                "Status": result["status"],
                "Error Message": result.get("error_message", "")
            }
            excel_data.append(row)

        df = pd.DataFrame(excel_data)

        with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
            df.to_excel(writer, sheet_name='Overview', index=False)

            worksheet = writer.sheets['Overview']

            # Fixed widths, keyed by spreadsheet column letter.
            column_widths = {
                'A': 30,  # File Name
                'B': 25,  # Nomor Putusan
                'C': 25,  # Lembaga Peradilan
                'D': 40,  # Barang Bukti
                'E': 60,  # Amar Putusan
                'F': 12,  # Status
                'G': 20   # Error Message
            }

            for col, width in column_widths.items():
                worksheet.column_dimensions[col].width = width

            # Wrap text in columns D and E (the multi-line ones), skipping
            # the header row.
            for row in worksheet.iter_rows(min_row=2, max_row=worksheet.max_row, min_col=4, max_col=5):
                for cell in row:
                    cell.alignment = cell.alignment.copy(wrap_text=True)

        logging.info(f"Results saved to {output_file}")
        return True

    except Exception as e:
        logging.error(f"Error saving to Excel: {str(e)}")
        return False

def save_detailed_json(results, output_file="results/detailed_results.json"):
    """Dump the full result list to a UTF-8 JSON file.

    The parent directory of ``output_file`` is created when it does not
    exist yet. Non-ASCII characters are written as-is
    (``ensure_ascii=False``).

    Args:
        results: JSON-serialisable list of result dicts.
        output_file: Target ``.json`` path.

    Returns:
        bool: ``True`` on success, ``False`` when anything failed (the error
        is logged, not raised) - the same contract as ``save_to_excel``.
    """
    try:
        # Without this the default target "results/detailed_results.json"
        # fails when "results/" has not been created yet.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        logging.info(f"Detailed results saved to {output_file}")
        return True
    except Exception as e:
        logging.error(f"Error saving JSON: {str(e)}")
        return False

def main():
    """Interactive entry point: ask for a folder, process it, save outputs.

    Prompts for the folder containing the PDF files, runs the extraction,
    writes a timestamped Excel overview and JSON dump into the current
    working directory, and prints a summary to stdout.
    """
    print("PDF Legal Document Extractor - Improved Version")
    print("=" * 60)

    folder_path = input("Masukkan path folder yang berisi file PDF: ").strip()

    # isdir rather than exists: a path to a regular file is not a folder.
    if not os.path.isdir(folder_path):
        print("Folder tidak ditemukan!")
        return

    print(f"\nMemproses file PDF di folder: {folder_path}")
    results = process_pdf_folder(folder_path)

    if not results:
        print("Tidak ada data yang berhasil diproses.")
        return

    # One timestamp shared by both output files so they can be paired.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    excel_file = f"overview_{timestamp}.xlsx"
    json_file = f"detailed_results_{timestamp}.json"

    excel_saved = save_to_excel(results, excel_file)
    save_detailed_json(results, json_file)

    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total files processed: {len(results)}")

    successful = [r for r in results if r["status"] == "Success"]
    success_count = len(successful)
    error_count = sum(1 for r in results if r["status"] == "Error")

    # Item totals only count files that were processed successfully.
    total_barang_bukti = sum(len(r["barang_bukti"]) for r in successful)
    total_amar = sum(len(r["amar_putusan"]) for r in successful)

    print(f"Successfully processed: {success_count}")
    print(f"Errors: {error_count}")
    print(f"Total barang bukti items found: {total_barang_bukti}")
    print(f"Total amar putusan items found: {total_amar}")
    print("\nOutput files:")
    if excel_saved:
        print(f"- Excel overview: {excel_file}")
    else:
        # Do not advertise a file that was never written.
        print(f"- Excel overview: FAILED to write {excel_file} (see log)")
    print(f"- Detailed JSON: {json_file}")
    print("- Log file: extraction_log_*.log")

# Start the interactive extractor only when this code is run as the main
# program, not when it is imported as a module.
if __name__ == "__main__":
    main()

PDF Legal Document Extractor - Improved Version


2025-11-10 13:56:28,642 - INFO - Found 50 PDF files to process
2025-11-10 13:56:28,643 - INFO - Processing: putusan_1492_pid.sus_2025_pn_sby_20251110132055.pdf



Memproses file PDF di folder: data/PDF


2025-11-10 13:56:29,356 - INFO - Successfully processed: putusan_1492_pid.sus_2025_pn_sby_20251110132055.pdf - Found 4 barang bukti, 18 amar
2025-11-10 13:56:29,357 - INFO - Processing: putusan_1492_pid.sus_2025_pn_sby_20251110132101.pdf
2025-11-10 13:56:30,029 - INFO - Successfully processed: putusan_1492_pid.sus_2025_pn_sby_20251110132101.pdf - Found 4 barang bukti, 18 amar
2025-11-10 13:56:30,030 - INFO - Processing: putusan_1631_pid.sus_2025_pn_sby_20251110131613.pdf
2025-11-10 13:56:30,588 - INFO - Successfully processed: putusan_1631_pid.sus_2025_pn_sby_20251110131613.pdf - Found 6 barang bukti, 2 amar
2025-11-10 13:56:30,589 - INFO - Processing: putusan_1631_pid.sus_2025_pn_sby_20251110131618.pdf
2025-11-10 13:56:31,221 - INFO - Successfully processed: putusan_1631_pid.sus_2025_pn_sby_20251110131618.pdf - Found 6 barang bukti, 2 amar
2025-11-10 13:56:31,222 - INFO - Processing: putusan_1681_pid.sus_2025_pn_sby_20251110132006.pdf
2025-11-10 13:56:31,520 - INFO - Successfully proc


SUMMARY
Total files processed: 50
Successfully processed: 50
Errors: 0
Total barang bukti items found: 290
Total amar putusan items found: 277

Output files:
- Excel overview: overview_20251110_135653.xlsx
- Detailed JSON: detailed_results_20251110_135653.json
- Log file: extraction_log_*.log
