In [114]:
!pip install transformers torch sentence-transformers Sastrawi scikit-learn faiss-cpu rank_bm25

Defaulting to user installation because normal site-packages is not writeable


In [115]:
import pandas as pd
import numpy as np
import json
import os
import re
import sys
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, log_loss
from typing import List, Tuple
from collections import Counter
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from transformers import AutoTokenizer, AutoModel
from rank_bm25 import BM25Okapi
import torch
import faiss
from scipy.stats import rankdata

In [116]:
# === 1. LOAD DATA ===
file_path = 'data/processed/cases.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File {file_path} not found.")
    sys.exit(1)

# Ensure case_id is unique
if df["case_id"].duplicated().any():
    print("Error: Duplicate case_id found in dataset.")
    sys.exit(1)

# Check for required columns
required_columns = ['case_id', 'text_full', 'pidana_penjara']
if not all(col in df for col in required_columns):
    print(f"Error: Dataset missing required columns: {', '.join(set(required_columns) - set(df.columns))}")
    sys.exit(1)

In [117]:
# === 2. EXTRAK SOLUSI ===
def normalize_solution(text: str) -> str:
    if not isinstance(text, str) or not text.strip() or text.lower() in ['none', 'tidak dipidana', 'bebas', 'tidak terbukti', 'lepas']:
        return "None"
    text = text.lower().strip()
    match = re.search(r'(penjara|denda)?\s*(\d+\.?\d*)\s*(tahun|bulan|juta)?(?:\s*(\d+)\s*bulan)?', text)
    if match:
        penalty_type = match.group(1) if match.group(1) else "penjara"
        value = match.group(2)
        unit = match.group(3) if match.group(3) else "tahun" if penalty_type == "penjara" else "juta"
        extra_months = f" {match.group(4)} bulan" if match.group(4) else ""
        return f"{penalty_type} {value} {unit}{extra_months}".strip()
    return "None"

# Create case_solutions dictionary
case_solutions = {
    row['case_id']: normalize_solution(str(row['pidana_penjara']))
    for _, row in df.iterrows()
}
if len(case_solutions) < len(df):
    print(f"Warning: {len(df) - len(case_solutions)} cases with missing or empty pidana_penjara.")

if not case_solutions:
    print("Error: No valid solutions found in dataset.")
    sys.exit(1)

In [118]:
# === 3. PREPROCESSING ===
stopword_factory = StopWordRemoverFactory()
stemmer = StemmerFactory().create_stemmer()
stop_words_indonesia = stopword_factory.get_stop_words() + ["terdakwa", "korban", "menyatakan", "secara", "sah", "meyakinkan"]

synonyms = {
    "pengeroyokan": ["kekerasan bersama-sama", "penganiayaan bersama-sama"],
    "penganiayaan": ["kekerasan", "penyerangan"],
    "turut serta": ["ikut serta", "bersama-sama"],
    "luka berat": ["cedera parah", "luka serius"],
    "penganiayaan ringan": ["kekerasan ringan", "penyerangan ringan"]
}

def preprocess(text: str) -> str:
    """Preprocess text: lowercase, remove punctuation, stem, apply synonyms, remove stopwords."""
    if not isinstance(text, str):
        text = ""
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = stemmer.stem(text)
    for key, syn_list in synonyms.items():
        for syn in syn_list:
            text = text.replace(syn, key)
    text = ' '.join([word for word in text.split() if word not in stop_words_indonesia])
    return text

corpus = df["text_full"].apply(preprocess)

In [119]:
# === 4. REPRESENTASI VEKTOR ===
# TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words_indonesia, max_features=15000, sublinear_tf=True, ngram_range=(1, 3))
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# BM25
tokenized_corpus = [doc.split() for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

# IndoBERT Embeddings
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p2")
model = AutoModel.from_pretrained("./indobert_finetuned" if os.path.exists("./indobert_finetuned") else "indobenchmark/indobert-base-p2")

def bert_embed(text: str) -> np.ndarray:
    """Generate IndoBERT embeddings for text."""
    try:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.max(dim=1).values.squeeze().numpy()
        return embeddings / np.linalg.norm(embeddings) if np.linalg.norm(embeddings) != 0 else np.zeros(768)
    except Exception as e:
        print(f"Error embedding text: {e}")
        return np.zeros(768)

# Load or compute IndoBERT embeddings
embeddings_file = 'data/processed/embeddings.json'
if os.path.exists(embeddings_file):
    with open(embeddings_file, 'r', encoding='utf-8') as f:
        embeddings_data = json.load(f)
    bert_embeddings = np.array([data["indobert_embedding"] for data in embeddings_data])
else:
    bert_embeddings = np.array([bert_embed(text) for text in corpus])

In [120]:
# === 5. SPLITTING DATA ===
if "pasal" not in df:
    print("Warning: 'pasal' column not found. Skipping Naive Bayes training.")
    nb_trained = False
    stratify_col = None
    X_train_tfidf, X_test_tfidf, case_id_train, case_id_test = train_test_split(
        tfidf_matrix, df["case_id"], test_size=0.2, random_state=42
    )
    X_train_bert, X_test_bert = train_test_split(
        bert_embeddings, test_size=0.2, random_state=42
    )
else:
    nb_trained = True
    stratify_col = df["pasal"]
    X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
        tfidf_matrix, df["pasal"], test_size=0.2, random_state=42, stratify=stratify_col
    )
    X_train_bert, X_test_bert = train_test_split(
        bert_embeddings, test_size=0.2, random_state=42, stratify=stratify_col
    )
    case_id_train, case_id_test = train_test_split(
        df["case_id"], test_size=0.2, random_state=42, stratify=stratify_col
    )

In [121]:
# === 6. MODEL RETRIEVAL: NAIVE BAYES ===
if nb_trained:
    nb = MultinomialNB()
    nb.fit(X_train_tfidf, y_train)
    y_pred = nb.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    prob_scores = nb.predict_proba(X_test_tfidf)
    class_labels = np.unique(y_train)
    log_losses = []
    for i, y_true in enumerate(y_test):
        if y_true not in class_labels:
            print(f"Warning: Test pasal {y_true} not in training set. Skipping.")
            continue
        true_class_idx = np.where(class_labels == y_true)[0][0]
        true_prob = prob_scores[i, true_class_idx]
        loss = -np.log(max(true_prob, 1e-15))
        log_losses.append(loss)
    log_loss_value = np.mean(log_losses) if log_losses else 0.0
    print(f"Naive Bayes Accuracy: {accuracy:.4f}")
    print(f"Naive Bayes Log Loss: {log_loss_value:.4f}")
else:
    print("Naive Bayes training skipped due to missing 'pasal' column.")

Naive Bayes Accuracy: 1.0000
Naive Bayes Log Loss: 0.0339


In [122]:
# === 7. FAISS INDEX FOR INDOBERT RETRIEVAL ===
dimension = bert_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(bert_embeddings.astype(np.float32))

In [123]:
# === 8. FUNGSI RETRIEVAL ===
def retrieve(query: str, k: int = 5, method: str = "nb") -> List[Tuple[int, float]]:
    """Retrieve top-k cases for a query using the specified method."""
    query_clean = preprocess(query)
    try:
        if method == "bm25":
            query_tokens = query_clean.split()
            scores = bm25.get_scores(query_tokens)
            # Normalize BM25 scores to [0,1] for consistency
            scores = (scores - np.min(scores)) / (np.max(scores) - np.min(scores) + 1e-10)
            top_k_idx = np.argsort(scores)[::-1][:k]
            scores = scores[top_k_idx]
        elif method == "tfidf":
            query_vec = tfidf_vectorizer.transform([query_clean])
            scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
            top_k_idx = scores.argsort()[::-1][:k]
            scores = scores[top_k_idx]
        elif method == "bert":
            query_vec = bert_embed(query_clean).astype(np.float32)
            query_vec = query_vec / np.linalg.norm(query_vec) if np.linalg.norm(query_vec) != 0 else np.zeros(dimension, dtype=np.float32)
            scores, indices = index.search(query_vec.reshape(1, -1), k)
            top_k_idx = indices[0]
            scores = scores[0]
        elif method == "nb":
            if not nb_trained:
                print("Warning: Naive Bayes not trained. Falling back to TF-IDF.")
                query_vec = tfidf_vectorizer.transform([query_clean])
                scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
                top_k_idx = scores.argsort()[::-1][:k]
                scores = scores[top_k_idx]
            else:
                query_vec = tfidf_vectorizer.transform([query_clean])
                predicted_pasal = nb.predict(query_vec)[0]
                candidate_idx = df[df["pasal"] == predicted_pasal].index
                if len(candidate_idx) == 0:
                    print(f"Warning: No cases found for predicted pasal {predicted_pasal}. Falling back to TF-IDF.")
                    scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
                    top_k_idx = scores.argsort()[::-1][:k]
                    scores = scores[top_k_idx]
                else:
                    candidate_matrix = tfidf_matrix[candidate_idx]
                    scores = cosine_similarity(query_vec, candidate_matrix).flatten()
                    top_k_idx = candidate_idx[scores.argsort()[::-1][:min(k, len(scores))]]
                    scores = scores[scores.argsort()[::-1][:min(k, len(scores))]]
        else:
            raise ValueError("Invalid method. Choose 'bm25', 'tfidf', 'bert', or 'nb'.")
        top_case_ids = df.iloc[top_k_idx]["case_id"].tolist()
        return list(zip(top_case_ids, scores))
    except Exception as e:
        print(f"Error in retrieval for query '{query}' with method '{method}': {e}")
        return []

In [124]:
# === 9. PREDIKSI OUTCOME ===
def predict_outcome(query: str, k: int = 5, method: str = "nb", strategy: str = "weighted", threshold: float = 0.1) -> str:
    """
    Predict the outcome for a query based on top-k similar cases.

    Args:
        query (str): Input query describing the case.
        k (int): Number of top cases to retrieve.
        method (str): Retrieval method ('bm25', 'tfidf', 'bert', 'nb').
        strategy (str): Prediction strategy ('majority', 'weighted').
        threshold (float): Minimum similarity score to consider a case relevant.

    Returns:
        str: Predicted solution or "None" if no penalty is inferred.
    """
    top_k = retrieve(query, k=k, method=method)
    if not top_k:
        return "None"  # No similar cases found

    case_ids, scores = zip(*top_k)
    # Filter cases with similarity scores above threshold
    valid_cases = [(cid, score) for cid, score in top_k if score >= threshold]
    if not valid_cases:
        return "None"  # No cases meet the threshold

    case_ids, scores = zip(*valid_cases)
    solutions = [case_solutions[cid] for cid in case_ids if cid in case_solutions]

    if not solutions:
        return "None"  # No valid solutions found

    # Check if all retrieved cases have "None" as solution
    if all(sol == "None" for sol in solutions):
        return "None"

    # Apply prediction strategy
    if strategy == "majority":
        solution_counts = Counter(solutions)
        predicted_solution = solution_counts.most_common(1)[0][0]
    else:  # Weighted similarity
        solution_scores = {}
        total_score = sum(scores)
        normalized_scores = [score / (total_score + 1e-10) for score in scores]  # Normalize scores
        for sol, score in zip(solutions, normalized_scores):
            if sol != "None":  # Exclude "None" from weighted scoring
                solution_scores[sol] = solution_scores.get(sol, 0.0) + score
        if not solution_scores:
            return "None"  # No non-None solutions
        predicted_solution = max(solution_scores, key=solution_scores.get)

    return predicted_solution

In [125]:
# === 10. DEMO MANUAL ===
# Configuration
RETRIEVAL_METHOD = "tfidf"  # "nb" NaiveBayes, "tfidf" TF-IDF, "bert" IndoBERT, and "bm25" BM25
PREDICTION_STRATEGY = "majority"  # Weighted or majority for better scores
SIMILARITY_THRESHOLD = 0.1  # Minimum: 0.0 (includes all cases, regardless of similarity).Maximum: 1.0 (requires perfect similarity, rarely achieved).

# Define new queries with some known ground truths for evaluation
new_queries = [
    {"query_id": 1, "query": "Terdakwa melakukan penganiayaan ringan tanpa luka di tempat umum", "ground_truth": "penjara 1 tahun"},
    {"query_id": 2, "query": "Terdakwa bersama-sama melakukan pengeroyokan menyebabkan luka berat", "ground_truth": "penjara 3 tahun"},
    {"query_id": 3, "query": "Terdakwa melakukan penggelapan dalam jabatan senilai 10 juta", "ground_truth": "penjara 1 tahun"},
    {"query_id": 4, "query": "Terdakwa menghasut orang untuk melakukan kekerasan", "ground_truth": "penjara 2 tahun"},
    {"query_id": 5, "query": "Terdakwa melakukan kekerasan berat terhadap anak di bawah umur", "ground_truth": "penjara 5 tahun"},
    {"query_id": 6, "query": "Terdakwa membantu tetangga menyeberang jalan", "ground_truth": "None"}
]

# Run predictions and store results
results = []
exact_matches = 0
total_with_ground_truth = 0

with open("logs.txt", "a", encoding="utf-8") as log_file:
    log_file.write(f"\n=== Prediction Run on {pd.Timestamp.now()} ===\n")
    log_file.write(f"Method: {RETRIEVAL_METHOD.upper()}, Strategy: {PREDICTION_STRATEGY}, Threshold: {SIMILARITY_THRESHOLD}\n")

    for q in new_queries:
        query_id = q["query_id"]
        query = q["query"]
        ground_truth = q["ground_truth"]

        # Predict outcome
        predicted_solution = predict_outcome(query, k=5, method=RETRIEVAL_METHOD, strategy=PREDICTION_STRATEGY, threshold=SIMILARITY_THRESHOLD)
        top_k = retrieve(query, k=5, method=RETRIEVAL_METHOD)
        top_case_ids = [cid for cid, _ in top_k]
        top_scores = [score for _, score in top_k]

        # Evaluate if ground truth exists
        is_correct = False
        if ground_truth is not None:
            total_with_ground_truth += 1
            normalized_ground_truth = normalize_solution(ground_truth)
            is_correct = normalized_ground_truth == predicted_solution
            if is_correct:
                exact_matches += 1
        else:
            normalized_ground_truth = predicted_solution  # Use predicted as ground truth if None

        # Log and print results
        print(f"Query ID: {query_id}")
        print(f"Query: {query}")
        print(f"Predicted Solution: {predicted_solution}")
        print(f"Top-5 Case IDs: {top_case_ids}")
        print(f"Scores: {top_scores}")
        print(f"Ground Truth: {'Not available' if ground_truth is None else ground_truth}")
        print(f"Match: {'Yes' if is_correct else 'No'}")
        print("---")

        log_file.write(f"Query ID: {query_id}\n")
        log_file.write(f"Query: {query}\n")
        log_file.write(f"Predicted Solution: {predicted_solution}\n")
        log_file.write(f"Top-5 Case IDs: {top_case_ids}\n")
        log_file.write(f"Scores: {top_scores}\n")
        log_file.write(f"Ground Truth: {'Not available' if ground_truth is None else ground_truth}\n")
        log_file.write(f"Match: {'Yes' if is_correct else 'No'}\n")
        log_file.write("---\n")

        results.append({
            "query_id": query_id,
            "query": query,
            "predicted_solution": predicted_solution,
            "top_5_case_ids": ",".join(map(str, top_case_ids)),
            "scores": ",".join([f"{score:.6f}" for score in top_scores]),
            "ground_truth": ground_truth if ground_truth is not None else predicted_solution
        })

# Calculate and log accuracy
if total_with_ground_truth > 0:
    accuracy = exact_matches / total_with_ground_truth
    print(f"Exact Match Accuracy: {accuracy:.4f} ({exact_matches}/{total_with_ground_truth})")
    with open("logs.txt", "a", encoding="utf-8") as log_file:
        log_file.write(f"Exact Match Accuracy: {accuracy:.4f} ({exact_matches}/{total_with_ground_truth})\n")
else:
    print("No ground-truth solutions available for accuracy calculation.")
    with open("logs.txt", "a", encoding="utf-8") as log_file:
        log_file.write("No ground-truth solutions available for accuracy calculation.\n")

Query ID: 1
Query: Terdakwa melakukan penganiayaan ringan tanpa luka di tempat umum
Predicted Solution: penjara 1 tahun
Top-5 Case IDs: [7, 64, 28, 5, 66]
Scores: [0.11627914708182566, 0.10422558440011195, 0.09124974772771396, 0.09001615884504785, 0.08989948185598731]
Ground Truth: penjara 1 tahun
Match: Yes
---
Query ID: 2
Query: Terdakwa bersama-sama melakukan pengeroyokan menyebabkan luka berat
Predicted Solution: penjara 3 tahun
Top-5 Case IDs: [82, 75, 28, 76, 14]
Scores: [0.11161688265221262, 0.10907609926741638, 0.10408386945942408, 0.10240657708778028, 0.08754543870021377]
Ground Truth: penjara 3 tahun
Match: Yes
---
Query ID: 3
Query: Terdakwa melakukan penggelapan dalam jabatan senilai 10 juta
Predicted Solution: None
Top-5 Case IDs: [6, 8, 13, 36, 21]
Scores: [0.07957304682778768, 0.0777022458661865, 0.05855179152175507, 0.0536419993361455, 0.052036664795012615]
Ground Truth: penjara 1 tahun
Match: No
---
Query ID: 4
Query: Terdakwa menghasut orang untuk melakukan kekerasan


In [126]:
# Save results
os.makedirs("data/results", exist_ok=True)
results_df = pd.DataFrame(results)
results_df.to_csv("data/results/predictions.csv", index=False)
print(f"Results saved to data/results/predictions.csv")

Results saved to data/results/predictions.csv
