In [None]:
!pip install transformers sentence-transformers --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m751.5 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m956.4 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install transformers --quiet

from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from tqdm import tqdm

# Single source of truth for the checkpoint id, so the tokenizer and the
# encoder can never be loaded from two different checkpoints by accident.
MODEL_NAME = "indobenchmark/indobert-base-p1"

# Both objects are module-level globals used by bert_embed().
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def bert_embed(text: str) -> np.ndarray:
    """Embed one text by averaging the encoder's last hidden states.

    The text is tokenized with the module-level ``tokenizer`` (truncated to
    at most 512 tokens) and passed through the module-level ``model`` with
    gradient tracking disabled. ``last_hidden_state`` is then averaged over
    dimension 1 and the result is squeezed before conversion to NumPy.

    Args:
        text: The text to embed.

    Returns:
        The pooled embedding as a NumPy array (1-D for a single input
        string, since ``squeeze()`` drops the size-1 batch axis).
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    # Inference only: skip autograd bookkeeping during the forward pass.
    with torch.no_grad():
        forward_out = model(**encoded)
    pooled = forward_out.last_hidden_state.mean(dim=1)
    return pooled.squeeze().numpy()

In [None]:
import pandas as pd
import pickle
from tqdm import tqdm

CSV_PATH = "/content/drive/MyDrive/cbr_banking/data/processed/cases.csv"
df = pd.read_csv(CSV_PATH)

# Use only the leading part of each full text (first 1500 characters);
# missing values are replaced by empty strings first.
texts = df["text_full"].fillna("").str[:1500].tolist()

# Width of the zero-vector fallback, taken from the loaded encoder instead of
# a hard-coded 768 so it stays correct if the checkpoint is swapped.
EMBED_DIM = model.config.hidden_size

bert_embeddings = []
failed_rows = []  # positions in `texts` whose embedding could not be computed
for row_pos, text in enumerate(tqdm(texts, desc="🔄 Generating BERT embeddings")):
    try:
        emb = bert_embed(text)
    except Exception as exc:
        # Best-effort: keep one row per case so bert_matrix stays aligned with
        # df, but report the failure instead of hiding it. Catching Exception
        # (not a bare except) lets KeyboardInterrupt stop the loop.
        failed_rows.append(row_pos)
        tqdm.write(f"Embedding failed for row {row_pos}: {exc!r}")
        emb = np.zeros(EMBED_DIM)
    bert_embeddings.append(emb)

# One row per entry of `texts`, in the same order as the rows of df.
bert_matrix = np.vstack(bert_embeddings)

if failed_rows:
    print(f"{len(failed_rows)} text(s) fell back to a zero vector: rows {failed_rows}")

# Save the matrix so the embeddings do not have to be recomputed.
np.save("/content/drive/MyDrive/cbr_banking/bert_embeddings_textfull.npy", bert_matrix)


🔄 Generating BERT embeddings: 100%|██████████| 150/150 [00:57<00:00,  2.62it/s]


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def retrieve(query: str, k: int = 5) -> list:
    """Return the case ids of the ``k`` cases most similar to ``query``.

    The query is embedded with ``bert_embed`` and compared by cosine
    similarity against every row of the module-level ``bert_matrix``. The
    best-scoring row positions are looked up positionally in the module-level
    ``df`` and their ``"case_id"`` values are returned.

    Args:
        query: Free-text query to search for.
        k: Maximum number of cases to return. If ``k`` exceeds the number of
            rows, all cases are returned. ``k <= 0`` yields an empty list.

    Returns:
        A list of ``case_id`` values ordered from most to least similar.
    """
    # Guard first: with the old `[-k:]` slice, k == 0 selected the WHOLE array
    # (since -0 == 0) and negative k produced an arbitrary slice.
    if k <= 0:
        return []

    query_vec = bert_embed(query).reshape(1, -1)
    similarities = cosine_similarity(query_vec, bert_matrix).flatten()

    # argsort is ascending; reverse for descending, then keep the first k.
    # For 1 <= k this yields exactly the same order as `[-k:][::-1]`.
    top_k_idx = np.argsort(similarities)[::-1][:k]
    return df.iloc[top_k_idx]["case_id"].tolist()


In [None]:
import json

# Load the evaluation queries; each entry is read below via its "query" and
# "ground_truth" keys.
with open("/content/drive/MyDrive/cbr_banking/data/eval/queries.json", "r", encoding="utf-8") as f:
    queries = json.load(f)

# For every query, fetch the top 50 cases and report whether at least one
# ground-truth case id appears among them.
for q in queries:
    query_text = q["query"]
    expected_ids = q["ground_truth"]
    result = retrieve(query_text, k=50)

    # A "hit" means the retrieved ids and the expected ids share an element.
    hit_label = "NO" if set(result).isdisjoint(expected_ids) else "YES"

    print(f"\n🟢 Query: {query_text}")
    print(f"🔍 Retrieved: {result}")
    print(f"✅ Ground truth: {expected_ids}")
    print(f"🎯 Hit? {hit_label}")



🟢 Query: penipuan dana nasabah oleh teller bank
🔍 Retrieved: [100, 104, 150, 144, 141, 147, 55, 37, 92, 70, 109, 127, 19, 121, 4, 142, 71, 93, 38, 128, 145, 148, 20, 56, 110, 48, 123, 87, 131, 46, 33, 111, 57, 129, 149, 146, 143, 39, 94, 21, 72, 138, 28, 80, 68, 63, 41, 17, 23, 52]
✅ Ground truth: [5]
🎯 Hit? NO

🟢 Query: penggelapan anggaran kredit koperasi
🔍 Retrieved: [100, 131, 138, 123, 29, 48, 104, 79, 70, 109, 19, 92, 37, 55, 144, 141, 150, 127, 147, 121, 142, 145, 56, 38, 20, 71, 128, 148, 110, 93, 63, 4, 33, 149, 21, 129, 143, 146, 72, 111, 57, 39, 94, 41, 87, 46, 42, 80, 28, 12]
✅ Ground truth: [12]
🎯 Hit? YES

🟢 Query: korupsi direktur bank swasta
🔍 Retrieved: [117, 145, 142, 110, 128, 148, 56, 38, 20, 71, 93, 57, 72, 39, 129, 146, 111, 94, 21, 149, 143, 150, 37, 55, 70, 92, 19, 147, 127, 109, 144, 141, 100, 48, 104, 138, 121, 4, 15, 26, 42, 79, 63, 27, 46, 98, 91, 67, 68, 28]
✅ Ground truth: [3]
🎯 Hit? NO

🟢 Query: pemalsuan dokumen rekening
🔍 Retrieved: [42, 121, 79, 100, 