In [None]:
# BERT.ipynb - Modified for retrieval
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
import torch
import json
import os

# 1. Data preparation
df = pd.read_csv('data/processed/cases.csv')

# Sample evaluation queries: each entry pairs a free-text query with the
# IDs of the cases considered relevant to it.
queries = dict(
    q1=dict(
        text="penyalahgunaan narkotika jenis sabu seberat 0.5 gram",
        relevant_cases=[5, 12, 18],
    ),
    q2=dict(
        text="pengedar ganja dengan barang bukti 1 kilogram",
        relevant_cases=[3, 7, 22],
    ),
)

# Make sure the output folder is present before writing into it
os.makedirs('data/eval', exist_ok=True)

# Persist the queries so the evaluation step can reload them from disk
with open('data/eval/queries.json', 'w') as f:
    f.write(json.dumps(queries))

# 2. TF-IDF approach
def tfidf_retrieval():
    """Build a TF-IDF retriever over the ``ringkasan_fakta`` column of ``df``.

    The vectorizer is fitted once when this function is called; the returned
    closure reuses the fitted vocabulary and document matrix for every query.

    Returns:
        A function ``retrieve(query, k=5)`` that returns the ``case_id``
        values of the ``k`` documents with the highest cosine similarity to
        ``query``, most similar first.
    """
    # Missing summaries would make the vectorizer raise, so treat them as
    # empty text instead (an empty document simply matches nothing).
    documents = df['ringkasan_fakta'].fillna('').astype(str)

    # Build the document vectors
    tfidf = TfidfVectorizer()
    doc_vectors = tfidf.fit_transform(documents)

    def retrieve(query, k=5):
        """Return the case IDs of the top-``k`` documents for ``query``."""
        # Guard: with k == 0 the slice [-k:] would select *every* document.
        if k <= 0:
            return []
        query_vec = tfidf.transform([query])
        similarities = cosine_similarity(query_vec, doc_vectors)
        # argsort is ascending: keep the last k entries, then reverse so the
        # best match comes first.
        top_k_indices = np.argsort(similarities[0])[-k:][::-1]
        return df.iloc[top_k_indices]['case_id'].tolist()

    return retrieve

# 3. BERT approach
def bert_retrieval():
    """Build a BERT-embedding retriever over the ``ringkasan_fakta`` column of ``df``.

    Every document is embedded once when this function is called; the
    returned closure embeds the query the same way and ranks documents by
    cosine similarity.

    Returns:
        A function ``retrieve(query, k=5)`` that returns the ``case_id``
        values of the ``k`` most similar documents, most similar first.
    """
    # tqdm is used for the progress bar but is not imported at the top of the
    # visible file; import it here and fall back to plain iteration when it
    # is not installed, so encoding never fails on a missing name.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable):
            return iterable

    # Load the model and tokenizer
    tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p2')
    model = BertModel.from_pretrained('indobenchmark/indobert-base-p2')

    def embed(text):
        """Return the first-token ([CLS]) hidden state of ``text`` as a NumPy array."""
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=256)
        # Inference only: no gradients needed
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state[:, 0, :].numpy()

    # Missing summaries cannot be tokenized, so treat them as empty text.
    documents = df['ringkasan_fakta'].fillna('').astype(str)

    # Encode all documents, one row per document
    doc_embeddings = np.vstack([embed(text) for text in progress(documents)])

    def retrieve(query, k=5):
        """Return the case IDs of the top-``k`` documents for ``query``."""
        # Guard: with k == 0 the slice [-k:] would select *every* document.
        if k <= 0:
            return []
        query_embedding = embed(query)
        similarities = cosine_similarity(query_embedding, doc_embeddings)
        # argsort is ascending: keep the last k entries, then reverse so the
        # best match comes first.
        top_k_indices = np.argsort(similarities[0])[-k:][::-1]
        return df.iloc[top_k_indices]['case_id'].tolist()

    return retrieve

# 4. Evaluation
def evaluate_retrieval(retriever, queries, k=5):
    """Score a retriever against a set of labelled queries.

    Args:
        retriever: Callable invoked as ``retriever(text, k=k)`` that returns
            a list of retrieved case IDs.
        queries: Mapping of query ID to a dict with the keys ``'text'`` (the
            query string) and ``'relevant_cases'`` (the relevant case IDs).
        k: Number of results requested from the retriever for each query.

    Returns:
        A dict with the keys ``'accuracy'``, ``'precision'`` and ``'recall'``,
        each averaged over all queries. Here "accuracy" is 1 for a query when
        at least one relevant case was retrieved and 0 otherwise. Every
        metric is 0.0 when ``queries`` is empty.
    """
    metrics = {'accuracy': [], 'precision': [], 'recall': []}

    for query in queries.values():
        retrieved = retriever(query['text'], k=k)
        relevant = query['relevant_cases']

        # Compute the metrics; tp = relevant cases that were actually retrieved
        tp = len(set(retrieved) & set(relevant))
        precision = tp / len(retrieved) if retrieved else 0
        recall = tp / len(relevant) if relevant else 0
        accuracy = 1 if tp > 0 else 0

        metrics['accuracy'].append(accuracy)
        metrics['precision'].append(precision)
        metrics['recall'].append(recall)

    # Average each metric; an empty query set yields 0.0 rather than a
    # division by zero.
    return {
        name: (sum(values) / len(values) if values else 0.0)
        for name, values in metrics.items()
    }

# Run the evaluation: reload the queries that were written to disk earlier
with open('data/eval/queries.json') as f:
    queries = json.loads(f.read())

# Build both retrievers first, then score each against the same queries
tfidf_retriever = tfidf_retrieval()
bert_retriever = bert_retrieval()

tfidf_metrics = evaluate_retrieval(tfidf_retriever, queries)
bert_metrics = evaluate_retrieval(bert_retriever, queries)

# Save the evaluation results: one column per approach, one row per metric
metrics_by_approach = {
    'TF-IDF': tfidf_metrics,
    'BERT': bert_metrics,
}
metrics_df = pd.DataFrame(metrics_by_approach)
metrics_df.to_csv('data/eval/retrieval_metrics.csv')

# Echo both result sets to the console
print("TF-IDF Metrics:", tfidf_metrics)
print("BERT Metrics:", bert_metrics)