In [25]:
!pip install transformers torch sentence-transformers Sastrawi scikit-learn faiss-cpu rank_bm25

Defaulting to user installation because normal site-packages is not writeable


In [26]:
import pandas as pd
import numpy as np
import json
import os
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from transformers import AutoTokenizer, AutoModel
from rank_bm25 import BM25Okapi
import torch
import faiss
from typing import List, Tuple

In [27]:
# === 1. LOAD DATA ===
file_path = 'data/processed/cases.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File {file_path} not found.")
    exit(1)

In [28]:
# === 2. PREPROCESSING ===
stopword_factory = StopWordRemoverFactory()
stemmer = StemmerFactory().create_stemmer()
stop_words_indonesia = stopword_factory.get_stop_words() + ["terdakwa", "korban", "menyatakan", "secara", "sah", "meyakinkan"]

synonyms = {
    "pengeroyokan": ["kekerasan bersama-sama", "penganiayaan bersama-sama"],
    "penganiayaan": ["kekerasan", "penyerangan"],
    "turut serta": ["ikut serta", "bersama-sama"],
    "luka berat": ["cedera parah", "luka serius"]
}

def preprocess(text: str) -> str:
    if not isinstance(text, str):
        text = ""
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = stemmer.stem(text)
    for key, syn_list in synonyms.items():
        for syn in syn_list:
            text = text.replace(syn, key)
    text = ' '.join([word for word in text.split() if word not in stop_words_indonesia])
    return text

corpus = df["text_full"].apply(preprocess)

In [29]:
# === 3. REPRESENTASI VEKTOR ===
# TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words_indonesia, max_features=15000, sublinear_tf=True, ngram_range=(1, 3))
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# BM25
tokenized_corpus = [doc.split() for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

# IndoBERT Embeddings
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p2")
model = AutoModel.from_pretrained("./indobert_finetuned" if os.path.exists("./indobert_finetuned") else "indobenchmark/indobert-base-p2")

def bert_embed(text: str) -> np.ndarray:
    try:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.max(dim=1).values.squeeze().numpy()
        return embeddings / np.linalg.norm(embeddings) if np.linalg.norm(embeddings) != 0 else np.zeros(768)
    except Exception as e:
        print(f"Error embedding text: {e}")
        return np.zeros(768)

# Load or compute IndoBERT embeddings
embeddings_file = 'data/processed/embeddings.json'
if os.path.exists(embeddings_file):
    with open(embeddings_file, 'r', encoding='utf-8') as f:
        embeddings_data = json.load(f)
    bert_embeddings = np.array([data["indobert_embedding"] for data in embeddings_data])
else:
    bert_embeddings = np.array([bert_embed(text) for text in corpus])

In [30]:
# === 4. SPLITTING DATA ===
if "pasal" not in df:
    print("Warning: 'pasal' column not found. Falling back to no stratification and skipping Naive Bayes training.")
    nb_trained = False
    stratify_col = None
    X_train_tfidf, X_test_tfidf, case_id_train, case_id_test = train_test_split(
        tfidf_matrix, df["case_id"], test_size=0.2, random_state=42
    )
    X_train_bert, X_test_bert = train_test_split(
        bert_embeddings, test_size=0.2, random_state=42
    )
else:
    nb_trained = True
    stratify_col = df["pasal"]
    X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
        tfidf_matrix, df["pasal"], test_size=0.2, random_state=42, stratify=stratify_col
    )
    X_train_bert, X_test_bert = train_test_split(
        bert_embeddings, test_size=0.2, random_state=42, stratify=stratify_col
    )
    case_id_train, case_id_test = train_test_split(
        df["case_id"], test_size=0.2, random_state=42, stratify=stratify_col
    )

In [31]:
# === 5. MODEL RETRIEVAL: NAIVE BAYES ===
if nb_trained:
    nb = MultinomialNB()
    nb.fit(X_train_tfidf, y_train)

In [32]:
# === 6. FAISS INDEX FOR INDOBERT RETRIEVAL ===
dimension = bert_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(bert_embeddings.astype(np.float32))

In [33]:
# === 7. FUNGSI RETRIEVAL ===
def retrieve(query: str, k: int = 5, method: str = "bm25") -> List[Tuple[int, float]]:
    query_clean = preprocess(query)
    try:
        if method == "bm25":
            query_tokens = query_clean.split()
            scores = bm25.get_scores(query_tokens)
            top_k_idx = np.argsort(scores)[::-1][:k]
            scores = scores[top_k_idx]
        elif method == "tfidf":
            query_vec = tfidf_vectorizer.transform([query_clean])
            scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
            top_k_idx = scores.argsort()[::-1][:k]
            scores = scores[top_k_idx]
        elif method == "bert":
            query_vec = bert_embed(query_clean).astype(np.float32)
            query_vec = query_vec / np.linalg.norm(query_vec) if np.linalg.norm(query_vec) != 0 else np.zeros(dimension, dtype=np.float32)
            scores, indices = index.search(query_vec.reshape(1, -1), k)
            top_k_idx = indices[0]
            scores = scores[0]
        elif method == "nb":
            if not nb_trained:
                print("Warning: Naive Bayes not trained. Falling back to TF-IDF.")
                query_vec = tfidf_vectorizer.transform([query_clean])
                scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
                top_k_idx = scores.argsort()[::-1][:k]
                scores = scores[top_k_idx]
            else:
                query_vec = tfidf_vectorizer.transform([query_clean])
                predicted_pasal = nb.predict(query_vec)[0]
                candidate_idx = df[df["pasal"] == predicted_pasal].index
                if len(candidate_idx) == 0:
                    print(f"Warning: No cases found for predicted pasal {predicted_pasal}. Falling back to TF-IDF.")
                    scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
                    top_k_idx = scores.argsort()[::-1][:k]
                    scores = scores[top_k_idx]
                else:
                    candidate_matrix = tfidf_matrix[candidate_idx]
                    scores = cosine_similarity(query_vec, candidate_matrix).flatten()
                    top_k_idx = candidate_idx[scores.argsort()[::-1][:min(k, len(scores))]]
                    scores = scores[scores.argsort()[::-1][:min(k, len(scores))]]
        else:
            raise ValueError("Invalid method. Choose 'bm25', 'tfidf', 'bert', or 'nb'.")
        top_case_ids = df.iloc[top_k_idx]["case_id"].tolist()
        return list(zip(top_case_ids, scores))
    except Exception as e:
        print(f"Error in retrieval for query '{query}' with method '{method}': {e}")
        return []

In [34]:
# === 8. PENGUJIAN AWAL ===
query_eval = [
    {"query": "Terdakwa melakukan penganiayaan yang menyebabkan luka", "ground_truth": 7},
    {"query": "Terdakwa bersama-sama melakukan kekerasan terhadap orang di muka umum", "ground_truth": 6},
    {"query": "Terdakwa melakukan penganiayaan berat yang mengakibatkan luka berat", "ground_truth": 14},
    {"query": "Terdakwa turut serta dalam penganiayaan secara bersama-sama", "ground_truth": 5},
    {"query": "Terdakwa melakukan kekerasan terhadap pegawai negeri yang menyebabkan luka", "ground_truth": 1},
    {"query": "Terdakwa melakukan penganiayaan yang mengakibatkan kematian", "ground_truth": 20},
    {"query": "Terdakwa dengan terang-terangan menggunakan kekerasan terhadap orang", "ground_truth": 12},
    {"query": "Terdakwa melakukan penggelapan barang karena hubungan kerja", "ground_truth": 71},
    {"query": "Terdakwa menghasut untuk melakukan perbuatan pidana di muka umum", "ground_truth": 85},
    {"query": "Terdakwa bersama-sama melakukan pengeroyokan yang menyebabkan luka", "ground_truth": 90}
]

# Filter valid queries
valid_query_eval = [q for q in query_eval if q["ground_truth"] in df["case_id"].values]
if len(valid_query_eval) < len(query_eval):
    print(f"Warning: {len(query_eval) - len(valid_query_eval)} queries skipped due to invalid ground_truth case_id.")
if not valid_query_eval:
    print("Error: No valid queries available for evaluation. Exiting.")
    exit(1)

os.makedirs("data/eval", exist_ok=True)
with open("data/eval/queries.json", "w", encoding="utf-8") as f:
    json.dump(valid_query_eval, f, indent=2, ensure_ascii=False)

In [35]:
# === 9. EVALUATE RETRIEVAL ===
def evaluate_retrieval(query_eval, k=5, method="bm25"):
    y_true = []
    y_pred = []
    failure_cases = []

    for q in query_eval:
        results = retrieve(q["query"], k, method)
        if not results:
            y_true.append(1)  # Assume relevant
            y_pred.append(0)  # No cases retrieved
            failure_cases.append({
                "query": q["query"],
                "ground_truth": q["ground_truth"],
                "top_k_case_ids": [],
                "reason": "No cases retrieved"
            })
            continue

        case_ids, _ = zip(*results)
        gt = q["ground_truth"]
        # Consider retrieval correct if ground-truth case_id is in top-k
        is_relevant = 1 if gt in case_ids else 0
        y_true.append(1)  # Query has a relevant case
        y_pred.append(is_relevant)

        if not is_relevant:
            failure_cases.append({
                "query": q["query"],
                "ground_truth": gt,
                "top_k_case_ids": list(case_ids),
                "reason": "Ground-truth case not in top-k"
            })

    # Compute metrics
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred) if y_true else 0.0,
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1_score": f1_score(y_true, y_pred, zero_division=0)
    }

    # Save failure cases
    os.makedirs("data/eval", exist_ok=True)
    with open(f"data/eval/failure_cases_retrieval_{method}.json", "w", encoding="utf-8") as f:
        json.dump(failure_cases, f, indent=2, ensure_ascii=False)

    return metrics

In [36]:
# === 10. EVALUATE ALL METHODS AND DISPLAY METRICS ===
if valid_query_eval:
    methods = ["bm25", "tfidf", "bert", "nb"]
    metrics_list = []

    for method in methods:
        metrics = evaluate_retrieval(valid_query_eval, k=5, method=method)
        metrics_list.append({
            "Model": method.upper(),
            "Accuracy": metrics["accuracy"],
            "Precision": metrics["precision"],
            "Recall": metrics["recall"],
            "F1-Score": metrics["f1_score"]
        })

    # Create metrics table
    metrics_df = pd.DataFrame(metrics_list)

    # Display metrics table
    print("\n=== Retrieval Performance Metrics ===")
    print(metrics_df.to_string(index=False, float_format="{:.4f}".format))

    # Save metrics to CSV
    os.makedirs("data/eval", exist_ok=True)
    metrics_df.to_csv("data/eval/retrieval_metrics.csv", index=False)
    print("\nMetrics saved to data/eval/retrieval_metrics.csv")
else:
    print("No evaluation performed due to empty valid_query_eval.")


=== Retrieval Performance Metrics ===
Model  Accuracy  Precision  Recall  F1-Score
 BM25    0.6000     1.0000  0.6000    0.7500
TFIDF    0.7000     1.0000  0.7000    0.8235
 BERT    0.1000     1.0000  0.1000    0.1818
   NB    0.6000     1.0000  0.6000    0.7500

Metrics saved to data/eval/retrieval_metrics.csv
