In [1]:
!pip install transformers torch sentence-transformers Sastrawi scikit-learn faiss-cpu rank_bm25

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import numpy as np
import json
import os
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel
import torch

In [3]:
# === 1. LOAD DATA ===
# The path is relative to the notebook's working directory.
file_path = 'data/preprocessed/cleaned_putusan_hasil.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError as err:
    # Raise instead of calling exit(1): in a notebook, exit() acts on the
    # kernel session rather than reporting a traceback for this cell, and any
    # later cell would then fail on an undefined `df`. The message text is the
    # one the original cell printed.
    raise FileNotFoundError(f"Error: File {file_path} not found.") from err

In [4]:
# === 2. PREPROCESSING ===
# Sastrawi provides both the base stop-word list and the stemmer.
stopword_factory = StopWordRemoverFactory()
stemmer = StemmerFactory().create_stemmer()

# Base Sastrawi stop words followed by extra terms added for this corpus.
stop_words_indonesia = [
    *stopword_factory.get_stop_words(),
    "terdakwa",
    "korban",
    "menyatakan",
    "secara",
    "sah",
    "meyakinkan",
]

# Canonical term -> list of variant phrases that preprocess() rewrites to it.
synonyms = {
    "pengeroyokan": [
        "kekerasan bersama-sama",
        "penganiayaan bersama-sama",
    ],
    "penganiayaan": [
        "kekerasan",
        "penyerangan",
    ],
    "turut serta": [
        "ikut serta",
        "bersama-sama",
    ],
    "luka berat": [
        "cedera parah",
        "luka serius",
    ],
}

def preprocess(text: str) -> str:
    """Normalise a piece of case text for TF-IDF / embedding input.

    Steps, in order:
      1. lowercase;
      2. rewrite every variant phrase in ``synonyms`` to its canonical key;
      3. replace every character outside ``[a-z0-9\\s]`` with a space;
      4. drop stop words (surface forms);
      5. stem with the Sastrawi ``stemmer``;
      6. drop stop words again (stemmed forms).

    Args:
        text: Raw text. Any non-string value (e.g. NaN from pandas) is
            treated as empty.

    Returns:
        The cleaned text as a single space-separated string; ``""`` for
        non-string input.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # Synonyms must be applied BEFORE punctuation stripping and stemming:
    # the cleaning regex below turns hyphens into spaces, so variants such as
    # "bersama-sama" could never match afterwards, and stemming may alter the
    # variant surface forms as well. Doing it here also means the inserted
    # canonical key is stemmed exactly like a native occurrence of that word.
    for key, syn_list in synonyms.items():
        for syn in syn_list:
            text = text.replace(syn, key)

    text = re.sub(r'[^a-z0-9\s]', ' ', text)

    # Set lookup instead of scanning the list for every token. The list itself
    # is left untouched because other code passes it on as a list.
    stop_words = set(stop_words_indonesia)

    # First pass on surface forms: the stop-word list holds unstemmed words,
    # so inflected entries (presumably e.g. "terdakwa", "menyatakan") would
    # likely no longer match once the stemmer has reduced them to their roots.
    tokens = [word for word in text.split() if word not in stop_words]

    stemmed = stemmer.stem(' '.join(tokens))

    # Second pass keeps the original post-stemming filter: a stemmed token may
    # itself be a stop word.
    return ' '.join(word for word in stemmed.split() if word not in stop_words)

In [5]:
# === 3. INDOBERT EMBEDDING (PRE-TRAINED) ===
# Tokenizer and encoder are loaded from the same pre-trained checkpoint name.
_INDOBERT_CHECKPOINT = "indobenchmark/indobert-base-p2"
tokenizer = AutoTokenizer.from_pretrained(_INDOBERT_CHECKPOINT)
model = AutoModel.from_pretrained(_INDOBERT_CHECKPOINT)

def bert_embed(text: str) -> np.ndarray:
    """Embed ``text`` with the module-level IndoBERT ``tokenizer``/``model``.

    The text is tokenised (truncated to at most 512 tokens), run through the
    model without gradient tracking, and the last hidden state is reduced
    with an element-wise maximum over dimension 1 (presumably the token
    axis — TODO confirm). The resulting vector is scaled to unit L2 norm.

    Args:
        text: The text to embed.

    Returns:
        The normalised embedding, or ``np.zeros(768)`` when the vector has
        zero norm or when any exception is raised (the error is printed,
        not re-raised).
    """
    try:
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        # Tensor.max(dim=...) yields (values, indices); only values are used.
        pooled, _ = hidden_states.max(dim=1)
        vector = pooled.squeeze().numpy()
        norm = np.linalg.norm(vector)
        if norm == 0:
            return np.zeros(768)
        return vector / norm
    except Exception as e:
        # Best effort: report the failure and fall back to a zero vector.
        print(f"Error embedding text: {e}")
        return np.zeros(768)

In [6]:
# === 4. METADATA EXTRACTION ===
def extract_metadata(row):
    """Build the metadata part of a case record from one DataFrame row.

    Missing values are normalised: a column that is absent OR whose cell is
    NaN/None (which is how ``pd.read_csv`` represents a blank cell) yields
    ``""`` instead of leaking NaN into the record.

    Args:
        row: A pandas Series for one case, e.g. as yielded by
            ``df.iterrows()``. ``row.name`` must support ``+ 1`` (assumed to
            be a zero-based integer index).

    Returns:
        dict with the keys ``case_id``, ``no_perkara``, ``tanggal``,
        ``pasal``, ``pihak``, ``jenis_perkara``, ``hakim_ketua``,
        ``hakim_anggota``, ``panitera``, ``tahun``, ``lembaga_peradilan``,
        ``jenis_lembaga_peradilan``, ``tanggal_musyawarah``,
        ``tanggal_dibacakan`` and ``pidana_penjara``.
    """
    def field(column, default=""):
        # row.get() only applies its default when the column is absent; a
        # blank cell is present but NaN, so it is checked for explicitly.
        value = row.get(column, default)
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return default
        return value

    kata_kunci = field("kata_kunci")
    catatan_amar = field("catatan_amar")

    # Heuristic party detection: the first two "Capitalised Capitalised"
    # word pairs found in the keyword and verdict-note text.
    text = f"{kata_kunci} {catatan_amar}"
    names = re.findall(r'\b[A-Z][a-z]+\s[A-Z][a-z]+\b', text)
    pihak = " vs ".join(names[:2]) if len(names) >= 2 else "Unknown"

    # Fall back to the verdict note when the keyword column is absent or blank.
    pasal = kata_kunci if kata_kunci != "" else catatan_amar

    return {
        "case_id": row.name + 1,
        "no_perkara": field("nomor"),
        "tanggal": field("tanggal_register"),
        "pasal": pasal,
        "pihak": pihak,
        "jenis_perkara": field("klasifikasi"),
        "hakim_ketua": field("hakim_ketua"),
        "hakim_anggota": field("hakim_anggota"),
        "panitera": field("panitera"),
        "tahun": field("tahun"),
        "lembaga_peradilan": field("lembaga_peradilan"),
        "jenis_lembaga_peradilan": field("jenis_lembaga_peradilan"),
        "tanggal_musyawarah": field("tanggal_musyawarah"),
        "tanggal_dibacakan": field("tanggal_dibacakan"),
        "pidana_penjara": field("pidana_penjara")
    }

In [7]:
# === 5. KEY CONTENT EXTRACTION ===
def extract_fakta_dan_putusan(row):
    """Extract the fact summary, the legal decision and the full text of a case.

    The ``amar`` and ``catatan_amar`` fields are joined with a single space
    to form the full text. The first "barang bukti" (evidence) and "dakwaan"
    (charge) snippets found in that text (case-insensitive, running up to
    the next ``.`` or ``;``) are appended to the fact summary; the literal
    ``None`` is used when no snippet is found.

    Missing values are normalised: an absent column or a NaN/None cell is
    treated as ``""`` so the text never contains the literal string "nan".

    Args:
        row: A pandas Series for one case, e.g. as yielded by
            ``df.iterrows()``.

    Returns:
        dict with the keys ``ringkasan_fakta``, ``putusan_hukum`` and
        ``text_full``.
    """
    def field(column, default=""):
        # row.get() only applies its default when the column is absent; a
        # blank cell is present but NaN, so it is checked for explicitly.
        value = row.get(column, default)
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return default
        return value

    amar = field("amar")
    catatan_amar = field("catatan_amar")

    # Built once and reused both for the regex search and as `text_full`.
    text_full = f"{amar} {catatan_amar}"

    barang_bukti = re.findall(r'barang bukti:?\s*([^\.\;]+)', text_full, re.IGNORECASE)
    dakwaan = re.findall(r'dakwaan:?\s*([^\.\;]+)', text_full, re.IGNORECASE)
    ringkasan_fakta = f"{amar} Barang Bukti: {barang_bukti[0] if barang_bukti else 'None'}. Dakwaan: {dakwaan[0] if dakwaan else 'None'}."

    return {
        "ringkasan_fakta": ringkasan_fakta,
        "putusan_hukum": catatan_amar,
        "text_full": text_full
    }

In [8]:
# === 6. FEATURE ENGINEERING ===
# Fit a TF-IDF model (1- to 3-grams, at most 15000 features, sublinear TF)
# on the preprocessed "amar" + "catatan_amar" text of every row in `df`.
tfidf_vectorizer = TfidfVectorizer(
    stop_words=stop_words_indonesia,
    ngram_range=(1, 3),
    max_features=15000,
    sublinear_tf=True,
)
corpus = [
    preprocess(f"{case_row.get('amar', '')} {case_row.get('catatan_amar', '')}")
    for _index, case_row in df.iterrows()
]
tfidf_vectorizer.fit(corpus)

def compute_features(row):
    """Compute the numeric features and QA pairs for one case.

    The ``amar`` and ``catatan_amar`` fields are joined, run through
    ``preprocess`` and then turned into a TF-IDF vector (using the fitted
    module-level ``tfidf_vectorizer``) and an embedding (``bert_embed``).

    Missing values are normalised: an absent column or a NaN/None cell is
    treated as ``""`` so that neither the text nor the QA answers contain
    NaN / the literal string "nan".

    Args:
        row: A pandas Series for one case, e.g. as yielded by
            ``df.iterrows()``.

    Returns:
        dict with the keys ``length`` (word count of the preprocessed text),
        ``tfidf`` (list of floats), ``indobert_embedding`` (list of floats)
        and ``qa_pair`` (list of question/answer dicts).
    """
    def field(column, default=""):
        # row.get() only applies its default when the column is absent; a
        # blank cell is present but NaN, so it is checked for explicitly.
        value = row.get(column, default)
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return default
        return value

    amar = field("amar")
    catatan_amar = field("catatan_amar")
    kata_kunci = field("kata_kunci")

    text = preprocess(f"{amar} {catatan_amar}")
    words = re.findall(r'\b\w+\b', text)

    # Always go through the fitted vectorizer, even for empty text: it yields
    # an all-zero row whose width equals the fitted vocabulary size. A
    # hard-coded np.zeros(15000) would have a different length whenever the
    # corpus produced fewer than `max_features` terms.
    tfidf_vec = tfidf_vectorizer.transform([text]).toarray()[0]
    embedding = bert_embed(text)

    return {
        "length": len(words),
        "tfidf": tfidf_vec.tolist(),
        "indobert_embedding": embedding.tolist(),
        "qa_pair": [
            {"question": "Apa isi ringkasan fakta?", "answer": amar},
            {"question": "Apa putusan hakim?", "answer": catatan_amar},
            # Fall back to the verdict note when the keyword column is absent or blank.
            {"question": "Pasal apa yang diterapkan?", "answer": kata_kunci if kata_kunci != "" else catatan_amar},
            {"question": "Siapa hakim ketua?", "answer": field("hakim_ketua")},
            {"question": "Berapa lama pidana penjara?", "answer": field("pidana_penjara")}
        ]
    }

In [9]:
# === 7. COMBINE EVERYTHING ===
# One merged record per row: metadata, then content, then features. A row
# whose processing raises is reported and skipped.
case_records = []

for idx, row in df.iterrows():
    try:
        case = {}
        # Later extractors overwrite earlier ones on duplicate keys.
        for extractor in (extract_metadata, extract_fakta_dan_putusan, compute_features):
            case.update(extractor(row))
    except Exception as e:
        print(f"Error processing case {idx + 1}: {e}")
    else:
        case_records.append(case)

In [10]:
# === 8. SAVE CSV AND JSON ===
output_folder = "data/processed"
os.makedirs(output_folder, exist_ok=True)

# Output locations, resolved once.
csv_path = os.path.join(output_folder, "cases.csv")
embeddings_path = os.path.join(output_folder, "embeddings.json")
json_path = os.path.join(output_folder, "cases.json")

# CSV: tabular view without the vector columns.
df_case = pd.DataFrame(case_records)
csv_columns = [
    "case_id",
    "no_perkara",
    "tanggal",
    "ringkasan_fakta",
    "pasal",
    "pihak",
    "text_full",
    "jenis_perkara",
    "hakim_ketua",
    "hakim_anggota",
    "panitera",
    "tahun",
    "lembaga_peradilan",
    "jenis_lembaga_peradilan",
    "tanggal_musyawarah",
    "tanggal_dibacakan",
    "pidana_penjara",
    "length",
    "qa_pair",
]
df_case[csv_columns].to_csv(csv_path, index=False)

# Embeddings JSON: only the vectors, keyed by case_id.
embeddings = [
    {
        "case_id": case["case_id"],
        "tfidf": case["tfidf"],
        "indobert_embedding": case["indobert_embedding"],
    }
    for case in case_records
]
with open(embeddings_path, "w", encoding="utf-8") as f:
    json.dump(embeddings, f, indent=2, ensure_ascii=False)

# Full JSON: every field of every case record.
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(case_records, f, indent=2, ensure_ascii=False)

print(f"✅ Case representation berhasil disimpan ke:\n- CSV: {csv_path}\n- JSON: {json_path}\n- Embeddings: {embeddings_path}")

✅ Case representation berhasil disimpan ke:
- CSV: data/processed\cases.csv
- JSON: data/processed\cases.json
- Embeddings: data/processed\embeddings.json
