In [14]:
!pip install transformers torch sentence-transformers Sastrawi scikit-learn faiss-cpu rank_bm25

Defaulting to user installation because normal site-packages is not writeable


In [15]:
import pandas as pd
import numpy as np
import json
import os
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from typing import List, Tuple
from sklearn.naive_bayes import MultinomialNB
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from transformers import AutoTokenizer, AutoModel
from rank_bm25 import BM25Okapi
import torch
import faiss
from scipy.stats import rankdata

In [16]:
# === 1. LOAD DATA ===
file_path = 'data/processed/cases.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File {file_path} not found.")
    exit(1)

In [17]:
# === 2. PREPROCESSING ===
stopword_factory = StopWordRemoverFactory()
stemmer = StemmerFactory().create_stemmer()
stop_words_indonesia = stopword_factory.get_stop_words() + ["terdakwa", "korban", "menyatakan", "secara", "sah", "meyakinkan"]

synonyms = {
    "pengeroyokan": ["kekerasan bersama-sama", "penganiayaan bersama-sama"],
    "penganiayaan": ["kekerasan", "penyerangan"],
    "turut serta": ["ikut serta", "bersama-sama"],
    "luka berat": ["cedera parah", "luka serius"]
}

def preprocess(text: str) -> str:
    if not isinstance(text, str):
        text = ""
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = stemmer.stem(text)
    for key, syn_list in synonyms.items():
        for syn in syn_list:
            text = text.replace(syn, key)
    text = ' '.join([word for word in text.split() if word not in stop_words_indonesia])
    return text

corpus = df["text_full"].apply(preprocess)

In [18]:
# === 3. REPRESENTASI VEKTOR ===
# TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words_indonesia, max_features=15000, sublinear_tf=True, ngram_range=(1, 3))
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# BM25
tokenized_corpus = [doc.split() for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

# IndoBERT Embeddings
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p2")
model = AutoModel.from_pretrained("./indobert_finetuned" if os.path.exists("./indobert_finetuned") else "indobenchmark/indobert-base-p2")

def bert_embed(text: str) -> np.ndarray:
    try:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.max(dim=1).values.squeeze().numpy()
        return embeddings / np.linalg.norm(embeddings) if np.linalg.norm(embeddings) != 0 else np.zeros(768)
    except Exception as e:
        print(f"Error embedding text: {e}")
        return np.zeros(768)

# Load or compute IndoBERT embeddings
embeddings_file = 'data/processed/embeddings.json'
if os.path.exists(embeddings_file):
    with open(embeddings_file, 'r', encoding='utf-8') as f:
        embeddings_data = json.load(f)
    bert_embeddings = np.array([data["indobert_embedding"] for data in embeddings_data])
else:
    bert_embeddings = np.array([bert_embed(text) for text in corpus])

In [19]:
# === 4. SPLITTING DATA ===
# Use pasal for stratification and as Naive Bayes target
if "pasal" not in df:
    print("Warning: 'pasal' column not found. Falling back to no stratification and skipping Naive Bayes training.")
    nb_trained = False
    stratify_col = None
    X_train_tfidf, X_test_tfidf, case_id_train, case_id_test = train_test_split(
        tfidf_matrix, df["case_id"], test_size=0.2, random_state=42
    )
    X_train_bert, X_test_bert = train_test_split(
        bert_embeddings, test_size=0.2, random_state=42
    )
else:
    nb_trained = True
    stratify_col = df["pasal"]
    X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
        tfidf_matrix, df["pasal"], test_size=0.2, random_state=42, stratify=stratify_col
    )
    X_train_bert, X_test_bert = train_test_split(
        bert_embeddings, test_size=0.2, random_state=42, stratify=stratify_col
    )
    case_id_train, case_id_test = train_test_split(
        df["case_id"], test_size=0.2, random_state=42, stratify=stratify_col
    )

In [20]:
# === 5. MODEL RETRIEVAL: NAIVE BAYES ===
if nb_trained:
    nb = MultinomialNB()
    nb.fit(X_train_tfidf, y_train)

    # Evaluate Naive Bayes
    y_pred = nb.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    prob_scores = nb.predict_proba(X_test_tfidf)
    class_labels = np.unique(y_train)  # Use training set pasal values
    log_losses = []

    for i, y_true in enumerate(y_test):
        if y_true not in class_labels:
            print(f"Warning: Test pasal {y_true} not in training set. Skipping.")
            continue
        true_class_idx = np.where(class_labels == y_true)[0][0]
        true_prob = prob_scores[i, true_class_idx]
        # Compute log loss for the true class probability
        loss = -np.log(max(true_prob, 1e-15))  # Avoid log(0) with small epsilon
        log_losses.append(loss)

    log_loss_value = np.mean(log_losses) if log_losses else 0.0
    print(f"Naive Bayes Accuracy: {accuracy:.4f}")
    print(f"Naive Bayes Log Loss: {log_loss_value:.4f}")
else:
    print("Naive Bayes training skipped due to missing 'pasal' column.")

Naive Bayes Accuracy: 1.0000
Naive Bayes Log Loss: 0.0339


In [21]:
# === 6. FAISS INDEX FOR INDOBERT RETRIEVAL ===
dimension = bert_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(bert_embeddings.astype(np.float32))

In [22]:
# === 7. FUNGSI RETRIEVAL ===
def retrieve(query: str, k: int = 5, method: str = "bm25") -> List[Tuple[int, float]]:
    query_clean = preprocess(query)
    try:
        if method == "bm25":
            query_tokens = query_clean.split()
            scores = bm25.get_scores(query_tokens)
            top_k_idx = np.argsort(scores)[::-1][:k]
            scores = scores[top_k_idx]
        elif method == "tfidf":
            query_vec = tfidf_vectorizer.transform([query_clean])
            scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
            top_k_idx = scores.argsort()[::-1][:k]
            scores = scores[top_k_idx]
        elif method == "bert":
            query_vec = bert_embed(query_clean).astype(np.float32)
            query_vec = query_vec / np.linalg.norm(query_vec) if np.linalg.norm(query_vec) != 0 else np.zeros(dimension, dtype=np.float32)
            scores, indices = index.search(query_vec.reshape(1, -1), k)
            top_k_idx = indices[0]
            scores = scores[0]
        elif method == "nb":
            if not nb_trained:
                print("Warning: Naive Bayes not trained. Falling back to TF-IDF.")
                query_vec = tfidf_vectorizer.transform([query_clean])
                scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
                top_k_idx = scores.argsort()[::-1][:k]
                scores = scores[top_k_idx]
            else:
                query_vec = tfidf_vectorizer.transform([query_clean])
                predicted_pasal = nb.predict(query_vec)[0]
                candidate_idx = df[df["pasal"] == predicted_pasal].index
                if len(candidate_idx) == 0:
                    print(f"Warning: No cases found for predicted pasal {predicted_pasal}. Falling back to TF-IDF.")
                    scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
                    top_k_idx = scores.argsort()[::-1][:k]
                    scores = scores[top_k_idx]
                else:
                    candidate_matrix = tfidf_matrix[candidate_idx]
                    scores = cosine_similarity(query_vec, candidate_matrix).flatten()
                    top_k_idx = candidate_idx[scores.argsort()[::-1][:min(k, len(scores))]]
                    scores = scores[scores.argsort()[::-1][:min(k, len(scores))]]
        else:
            raise ValueError("Invalid method. Choose 'bm25', 'tfidf', 'bert', or 'nb'.")
        top_case_ids = df.iloc[top_k_idx]["case_id"].tolist()
        return list(zip(top_case_ids, scores))
    except Exception as e:
        print(f"Error in retrieval for query '{query}' with method '{method}': {e}")
        return []

In [23]:
# === 8. PENGUJIAN AWAL ===
query_eval = [
    {"query": "Terdakwa melakukan penganiayaan yang menyebabkan luka", "ground_truth": 7},
    {"query": "Terdakwa bersama-sama melakukan kekerasan terhadap orang di muka umum", "ground_truth": 6},
    {"query": "Terdakwa melakukan penganiayaan berat yang mengakibatkan luka berat", "ground_truth": 14},
    {"query": "Terdakwa turut serta dalam penganiayaan secara bersama-sama", "ground_truth": 5},
    {"query": "Terdakwa melakukan kekerasan terhadap pegawai negeri yang menyebabkan luka", "ground_truth": 1},
    {"query": "Terdakwa melakukan penganiayaan yang mengakibatkan kematian", "ground_truth": 20},
    {"query": "Terdakwa dengan terang-terangan menggunakan kekerasan terhadap orang", "ground_truth": 12},
    {"query": "Terdakwa melakukan penggelapan barang karena hubungan kerja", "ground_truth": 71},
    {"query": "Terdakwa menghasut untuk melakukan perbuatan pidana di muka umum", "ground_truth": 85},
    {"query": "Terdakwa bersama-sama melakukan pengeroyokan yang menyebabkan luka", "ground_truth": 90}
]

# Filter valid queries
valid_query_eval = [q for q in query_eval if q["ground_truth"] in df["case_id"].values]
if len(valid_query_eval) < len(query_eval):
    print(f"Warning: {len(query_eval) - len(valid_query_eval)} queries skipped due to invalid ground_truth case_id.")
if not valid_query_eval:
    print("Error: No valid queries available for evaluation. Exiting.")
    exit(1)

os.makedirs("data/eval", exist_ok=True)
with open("data/eval/queries.json", "w", encoding="utf-8") as f:
    json.dump(valid_query_eval, f, indent=2, ensure_ascii=False)

In [24]:
# === 9. EVALUATE RETRIEVAL ===
def evaluate_retrieval(query_eval, k=5, method="bm25"):
    precisions = []
    mrrs = []
    ndcgs = []

    # Open logs.txt in append mode
    with open("logs.txt", "a", encoding="utf-8") as log_file:
        log_file.write(f"\n=== Evaluation for {method.upper()} ===\n")

        for q in query_eval:
            results = retrieve(q["query"], k, method)
            if not results:
                print(f"Skipping query '{q['query']}' due to retrieval error")
                log_file.write(f"Skipping query '{q['query']}' due to retrieval error\n")
                continue
            case_ids = [case_id for case_id, _ in results]
            scores = [score for _, score in results]
            gt = q["ground_truth"]
            precision = 1 if gt in case_ids else 0
            precisions.append(precision)

            rank = [i + 1 for i, case_id in enumerate(case_ids) if case_id == gt]
            mrr = 1 / rank[0] if rank else 0
            mrrs.append(mrr)

            relevance = [1 if case_id == gt else 0 for case_id in case_ids]
            dcg = sum(rel / np.log2(idx + 2) for idx, rel in enumerate(relevance))
            idcg = 1 / np.log2(2)
            ndcg = dcg / idcg if idcg > 0 else 0
            ndcgs.append(ndcg)

            # Print to console
            print(f"Query: {q['query']}\nTop-{k}: {case_ids}\nScores: {scores}\nGT: {gt}\nPrecision@5: {precision}\nMRR: {mrr:.4f}\nNDCG@5: {ndcg:.4f}\n---")

            # Write to logs.txt
            log_file.write(f"Query: {q['query']}\n")
            log_file.write(f"Top-{k}: {case_ids}\n")
            log_file.write(f"Scores: {scores}\n")
            log_file.write(f"GT: {gt}\n")
            log_file.write(f"Precision@5: {precision}\n")
            log_file.write(f"MRR: {mrr:.4f}\n")
            log_file.write(f"NDCG@5: {ndcg:.4f}\n")
            log_file.write("---\n")

        mean_precision = np.mean(precisions) if precisions else 0.0
        mean_mrr = np.mean(mrrs) if mrrs else 0.0
        mean_ndcg = np.mean(ndcgs) if ndcgs else 0.0

        # Print mean metrics to console
        print(f"Mean Precision@5 ({method}): {mean_precision:.4f}")
        print(f"Mean MRR ({method}): {mean_mrr:.4f}")
        print(f"Mean NDCG@5 ({method}): {mean_ndcg:.4f}")

        # Write mean metrics to logs.txt
        log_file.write(f"Mean Precision@5 ({method}): {mean_precision:.4f}\n")
        log_file.write(f"Mean MRR ({method}): {mean_mrr:.4f}\n")
        log_file.write(f"Mean NDCG@5 ({method}): {mean_ndcg:.4f}\n")
        log_file.write("=" * 50 + "\n")

        return mean_precision, mean_mrr, mean_ndcg

In [25]:
# === 10. EVALUATE ALL METHODS ===
if valid_query_eval:
    # Open logs.txt and write header
    with open("logs.txt", "a", encoding="utf-8") as log_file:
        log_file.write(f"\n=== Evaluation Run on {pd.Timestamp.now()} ===\n")

    bm25_precision, bm25_mrr, bm25_ndcg = evaluate_retrieval(valid_query_eval, k=5, method="bm25")
    tfidf_precision, tfidf_mrr, tfidf_ndcg = evaluate_retrieval(valid_query_eval, k=5, method="tfidf")
    bert_precision, bert_mrr, bert_ndcg = evaluate_retrieval(valid_query_eval, k=5, method="bert")
    nb_precision, nb_mrr, nb_ndcg = evaluate_retrieval(valid_query_eval, k=5, method="nb")
else:
    print("No evaluation performed due to empty valid_query_eval.")

Query: Terdakwa melakukan penganiayaan yang menyebabkan luka
Top-5: [7, 79, 51, 28, 87]
Scores: [5.4860177360702025, 5.407549967639458, 5.288512843250853, 5.16279541535525, 4.9849604213534]
GT: 7
Precision@5: 1
MRR: 1.0000
NDCG@5: 1.0000
---
Query: Terdakwa bersama-sama melakukan kekerasan terhadap orang di muka umum
Top-5: [75, 25, 76, 40, 37]
Scores: [9.884378795643297, 9.573269448772384, 9.523747443874148, 9.331816852798939, 9.331816852798939]
GT: 6
Precision@5: 0
MRR: 0.0000
NDCG@5: 0.0000
---
Query: Terdakwa melakukan penganiayaan berat yang mengakibatkan luka berat
Top-5: [28, 87, 82, 14, 77]
Scores: [12.356939605002125, 11.862315484014914, 11.654985426663032, 11.518134894732654, 11.300252216500759]
GT: 14
Precision@5: 1
MRR: 0.2500
NDCG@5: 0.4307
---
Query: Terdakwa turut serta dalam penganiayaan secara bersama-sama
Top-5: [76, 59, 12, 37, 40]
Scores: [6.9036353582849435, 6.7487489911719205, 6.429887300178696, 6.1604426435880715, 6.1604426435880715]
GT: 5
Precision@5: 0
MRR: 0.0