In [None]:
# PREPROCESSING.ipynb - Modifikasi untuk ekstraksi fitur
import json
import os
import re

import pandas as pd
import spacy
from tqdm import tqdm

# Load a blank Indonesian ('id') spaCy pipeline (no trained components).
nlp = spacy.blank('id')
# Add the sentencizer so that `doc.sents` is available; extract_key_content
# relies on it to split decision texts into sentences.
nlp.add_pipe('sentencizer')

def preprocess_text(text):
    """Normalize the text of a court decision.

    Missing values (anything ``pd.isna`` reports as missing, e.g. ``None``
    or NaN) yield an empty string. Otherwise the text is lower-cased, every
    character that is not an ASCII letter, digit or whitespace is replaced
    by a space, and runs of whitespace are collapsed to a single space with
    leading/trailing whitespace removed.

    Args:
        text: Raw text, or a missing-value marker.

    Returns:
        The normalized string ("" for missing input).
    """
    if pd.isna(text):
        return ""

    lowered = text.lower()
    # Punctuation and any other non-alphanumeric character becomes a space.
    without_punctuation = re.sub(r'[^a-z0-9\s]', ' ', lowered)
    # Collapse whitespace runs, then trim both ends.
    collapsed = re.sub(r'\s+', ' ', without_punctuation)
    return collapsed.strip()

def extract_metadata_features(row):
    """Build the metadata part of a case record from one metadata row.

    Args:
        row: A row exposing ``name`` (its index label) and the fields
            'nomor', 'tanggal', 'jenis_perkara', 'pasal' and 'amar'.
            The label must support ``+ 1`` (presumably an integer
            position from a default index -- confirm against the caller).

    Returns:
        A dict with the keys 'case_id', 'no_perkara', 'tanggal',
        'jenis_perkara', 'pasal' and 'amar', in that order.
    """
    # 1-based identifier derived from the row label, plus the case number.
    record = {'case_id': row.name + 1, 'no_perkara': row['nomor']}

    # Fields copied through unchanged.
    for column in ('tanggal', 'jenis_perkara'):
        record[column] = row[column]

    # Fields that are cleaned before being stored.
    record['pasal'] = extract_pasal(row['pasal'])
    record['amar'] = preprocess_text(row['amar'])
    return record

def extract_pasal(text):
    """Extract the legal article citations found in ``text``.

    A citation is "pasal <number>", optionally followed by "ayat <number>"
    and/or "huruf <letter>", and ending with one of "undang-undang", "uu",
    "kuhp" or "kuh per" (case-insensitive).

    Args:
        text: The string to search. Non-string values -- in particular the
            ``None``/NaN produced by an empty metadata cell -- are treated
            as containing no citations instead of raising ``TypeError``.

    Returns:
        The matched citations, as written in ``text``, joined by "; ";
        "" when nothing matches or the input is not a string.
    """
    # Empty CSV cells arrive as float NaN, which re.findall cannot scan.
    if not isinstance(text, str):
        return ""

    pattern = r'pasal\s+\d+\s+(?:ayat\s+\d+)?\s*(?:huruf\s+[a-z])?\s*(?:undang-undang|uu|kuhp|kuh per)'
    return '; '.join(re.findall(pattern, text, flags=re.IGNORECASE))

def _excerpt_from_marker(sentences, markers, size):
    """Join up to ``size`` sentences starting at the first one containing a marker.

    Matching is done on the lower-cased sentence; returns "" when no
    sentence contains any of ``markers``.
    """
    for position, sentence in enumerate(sentences):
        lowered = sentence.lower()
        if any(marker in lowered for marker in markers):
            return ' '.join(sentences[position:position + size])
    return ""


def extract_key_content(text):
    """Extract key excerpts from the full text of a court decision.

    The text is split into sentences with the module-level ``nlp``
    pipeline, then two excerpts are taken:

    * facts: up to 10 sentences starting at the first sentence that
      contains "m e n g a d i l i" or "menimbang";
    * ruling: up to 5 sentences starting at the first sentence that
      contains "m e m u t u s k a n" or "memutuskan".

    An excerpt whose marker never occurs is empty. Previously a text
    without a facts marker raised ``UnboundLocalError``.

    Args:
        text: Raw decision text.

    Returns:
        A dict with 'ringkasan_fakta' (normalized facts excerpt),
        'argument_hukum' (normalized ruling excerpt) and 'text_length'
        (number of whitespace-separated tokens in ``text``).
    """
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]

    # Facts section: window opening at the first "mengadili"/"menimbang" sentence.
    fakta = _excerpt_from_marker(
        sentences, ('m e n g a d i l i', 'menimbang'), 10
    )

    # Ruling section: window opening at the first "memutuskan" sentence.
    amar = _excerpt_from_marker(
        sentences, ('m e m u t u s k a n', 'memutuskan'), 5
    )

    return {
        'ringkasan_fakta': preprocess_text(fakta),
        'argument_hukum': preprocess_text(amar),
        'text_length': len(text.split())
    }

def process_all_documents(df, raw_dir='data/raw'):
    """Build the processed-case table from metadata rows and raw text files.

    For every row of ``df`` the file ``<raw_dir>/case_<NNN>.txt`` is read,
    where ``NNN`` is the row's index label plus one, zero-padded to three
    digits. Metadata features, key-content features and the normalized
    full text ('text_full') are merged into one record per document.

    Processing is best-effort: if anything fails for a document (missing
    file, decoding error, extraction error) a message is printed and that
    document is left out of the result.

    Args:
        df: Metadata DataFrame. Index labels must be integers, since they
            are used to derive the file numbers.
        raw_dir: Directory containing the raw ``case_<NNN>.txt`` files.
            Defaults to 'data/raw'.

    Returns:
        A DataFrame with one row per successfully processed document.
    """
    results = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        try:
            # Read the decision text that belongs to this metadata row.
            with open(f'{raw_dir}/case_{idx+1:03d}.txt', 'r', encoding='utf-8') as f:
                text = f.read()

            # Metadata features first, then merge in the text-derived ones.
            features = extract_metadata_features(row)
            features.update(extract_key_content(text))
            features['text_full'] = preprocess_text(text)
        except Exception as e:
            # Deliberately broad: one bad document must not abort the batch.
            print(f"Error processing case {idx+1}: {e}")
        else:
            results.append(features)

    return pd.DataFrame(results)

# Load the case metadata (each row is matched to one raw case file).
df = pd.read_csv('data/metadata_raw.csv')

# Process every document listed in the metadata.
processed_df = process_all_documents(df)

# Save the results as CSV and JSON. pandas does not create missing parent
# directories, so make sure the output folder exists before writing;
# otherwise a finished processing run would be lost to an OSError.
os.makedirs('data/processed', exist_ok=True)
processed_df.to_csv('data/processed/cases.csv', index=False)
processed_df.to_json('data/processed/cases.json', orient='records', force_ascii=False)