In [1]:
import time
from math import log2
from pathlib import Path

import faiss
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
# ----------------------------------------------------
# 0. Configuration: column names and input files
#    (adapt if needed)
# ----------------------------------------------------
# Column names used by the case and KB frames.
CASE_ID_COL, KB_ID_COL = "case_id", "kb_id"
CASE_EMB_COL, KB_EMB_COL = "embed_summary", "embed_kb"

# Input parquet files: KB articles, training cases, test cases.
infile_kb = "./dataset/finekb_kb.parquet"
infile_train = "./dataset/finekb_cases_train_clustered.parquet"
infile = "./dataset/finekb_cases_test.parquet"

In [3]:
# ----------------------------------------------------
# 1. Load your KB dataframe
# ----------------------------------------------------
# Read and filter in a single expression so re-running the cell always
# rebuilds the frame from disk. Rows without a KB id are dropped; the
# column name comes from the config cell instead of being hard-coded.
df_kb = pd.read_parquet(infile_kb).dropna(subset=[KB_ID_COL])

print("Loaded KB dataset:", df_kb.shape)

Loaded case dataset: (188, 2)


In [4]:
# Preview the first rows of the KB frame to check its columns and embedding values.
df_kb.head()

Unnamed: 0,kb_id,embed_kb
0,1,"[0.03608580306172371, -0.015194863080978394, -..."
1,2,"[0.0020392441656440496, 0.005111693404614925, ..."
2,3,"[-0.0012934046098962426, -0.018688751384615898..."
3,4,"[0.01782716065645218, -0.009911049157381058, 0..."
4,5,"[-0.006877582520246506, -0.009154483675956726,..."


In [5]:
# ----------------------------------------------------
# 1. Load your TRAIN dataframe
# ----------------------------------------------------
# Read and filter in a single expression so re-running the cell always
# rebuilds the frame from disk. Rows without a KB id are dropped; the
# column name comes from the config cell instead of being hard-coded.
df_train = pd.read_parquet(infile_train).dropna(subset=[KB_ID_COL])

print("Loaded case dataset:", df_train.shape)

Loaded case dataset: (11415, 5)


In [6]:
# Preview the first rows of the training cases to check their columns.
df_train.head()

Unnamed: 0,case_id,issue_type,kb_id,embed_summary,cluster_id
0,0,remote_access,11,"[0.005332942120730877, -0.003665205556899309, ...",11_c0
1,1,info,102,"[0.00189633306581527, 0.012942066416144371, 0....",102_c5
2,3,contract,35,"[0.0068122390657663345, 0.0030071106739342213,...",35_c3
3,4,fan,158,"[0.013492287136614323, -0.013519312255084515, ...",158_c1
4,6,contract,63,"[-0.005820220801979303, 0.008195333182811737, ...",63_c1


In [7]:
# ----------------------------------------------------
# 1. Load your TEST dataframe
# ----------------------------------------------------
# Read and filter in a single expression so re-running the cell always
# rebuilds the frame from disk. Rows without a KB id are dropped; the
# column name comes from the config cell instead of being hard-coded.
df_test = pd.read_parquet(infile).dropna(subset=[KB_ID_COL])

print("Loaded case dataset:", df_test.shape)

Loaded case dataset: (2849, 4)


In [8]:
# Preview the first rows of the test cases to check their columns.
df_test.head()

Unnamed: 0,case_id,issue_type,kb_id,embed_summary
0,0,remote_access,53,"[-0.0003725903807207942, -0.008335007354617119..."
1,1,contract,27,"[0.002286486327648163, 0.00011503534915391356,..."
2,2,software,38,"[0.01094728522002697, 0.0060193841345608234, 0..."
3,3,contract,33,"[-0.003893519751727581, -0.001837620628066361,..."
5,5,memory,103,"[-0.007071544881910086, 0.0033897554967552423,..."


In [9]:
# -------------------------------------------------------------------
# 1. Helper: L2-normalization
# -------------------------------------------------------------------
def l2_normalize(mat: np.ndarray) -> np.ndarray:
    """Scale every row of a 2-D array to (approximately) unit Euclidean length.

    A small constant is added to each row norm before dividing, so an
    all-zero row yields zeros instead of a division by zero.

    Args:
        mat: Array with one vector per row.

    Returns:
        Array of the same shape with each row divided by its L2 norm.
    """
    eps = 1e-12
    row_lengths = np.linalg.norm(mat, axis=1, keepdims=True)
    return mat / (row_lengths + eps)


# -------------------------------------------------------------------
# 2. KB index: one stored embedding per KB row, used directly (no averaging)
# -------------------------------------------------------------------
# New frame with the KB id column cast to int.
df_kb = df_kb.astype({KB_ID_COL: int})

# Row i of the matrix (and of the FAISS index) corresponds to kb_ids[i].
kb_ids = df_kb[KB_ID_COL].to_numpy()
kb_emb_list = df_kb[KB_EMB_COL].tolist()
kb_emb_matrix = np.vstack(kb_emb_list).astype("float32")

dim = kb_emb_matrix.shape[-1]
print("KB embedding matrix shape:", kb_emb_matrix.shape)

# Inner product over unit-length rows gives cosine similarity.
kb_emb_matrix = l2_normalize(kb_emb_matrix)
index = faiss.IndexFlatIP(dim)
index.add(kb_emb_matrix)

print("FAISS index size =", index.ntotal)

# Position in the FAISS index → KB id
idx2kb_id = dict(enumerate(kb_ids.tolist()))


# -------------------------------------------------------------------
# 3. TEST lookups keyed by case id
# -------------------------------------------------------------------
# New frame with both id columns cast to int.
df_test = df_test.astype({CASE_ID_COL: int, KB_ID_COL: int})

test_by_case = df_test.groupby(CASE_ID_COL)

# case_id → all gold KB ids listed for that case
case_to_kbs = test_by_case[KB_ID_COL].apply(list).to_dict()

# case_id → single query vector (element-wise mean when a case has several rows)
case_to_emb = (
    test_by_case[CASE_EMB_COL]
    .apply(lambda embs: np.vstack(embs.to_numpy()).mean(axis=0))
    .to_dict()
)

unique_case_ids = list(case_to_kbs)
print("Unique test cases:", len(unique_case_ids))


# -------------------------------------------------------------------
# 4. Evaluation: case_emb → KB_emb
# -------------------------------------------------------------------
def evaluate_case_to_kb_faiss(k: int = 5):
    """Evaluate case → KB retrieval against the FAISS KB index.

    For each id in ``unique_case_ids`` the case embedding from ``case_to_emb``
    is L2-normalised, the top-``k`` KB vectors are fetched from ``index`` and
    the best (smallest) rank at which any gold KB id appears is recorded.
    Every metric is derived from that best rank only:

    * ``Recall@3`` / ``Recall@5``: percentage of cases with at least one gold
      KB in the top 3 / top 5 (one hit is enough, even when a case has
      several gold KBs).
    * ``MRR``: mean of ``1 / best_rank``; a case with no hit contributes 0.
    * ``nDCG@5``: percentage form of the mean of ``1 / log2(best_rank + 1)``
      for hits within the top 5; other cases contribute 0.

    Only the ``index.search`` call is timed; embedding preparation and metric
    bookkeeping are excluded from the latency figures.

    Args:
        k: Number of neighbours requested from FAISS per query. The cut-offs
            3 and 5 are fixed, so a ``k`` below 5 limits what Recall@5 and
            nDCG@5 can observe.

    Returns:
        ``(metrics, latency_stats)``. ``metrics`` has the keys ``Recall@3``,
        ``Recall@5``, ``MRR`` and ``nDCG@5``. ``latency_stats`` has
        ``avg_ms``, ``p50_ms``, ``p95_ms``, ``qps`` and ``n_queries``.
    """
    recall3_list, recall5_list = [], []
    mrr_list, ndcg5_list = [], []
    latencies = []

    for case_id in tqdm(unique_case_ids, desc="Evaluating FAISS KB-index", ncols=150):
        gold_kbs = set(case_to_kbs[case_id])

        # Query embedding as a (1, dim) float32 row, normalised like the KB rows.
        q_emb = np.array(case_to_emb[case_id], dtype="float32").reshape(1, -1)
        q_emb = l2_normalize(q_emb)

        t0 = time.perf_counter()
        _, neighbour_idx = index.search(q_emb, k)
        t1 = time.perf_counter()

        latencies.append(t1 - t0)

        # Best rank = first position whose KB id is a gold id.
        # FAISS fills missing neighbours with -1 (e.g. k > index.ntotal);
        # indexing kb_ids with -1 would silently return the LAST KB id and
        # could count as a false hit, so padded slots are skipped.
        best_rank = None
        for rank, idx in enumerate(neighbour_idx[0], start=1):
            if idx < 0:
                continue
            if int(kb_ids[idx]) in gold_kbs:
                best_rank = rank
                break

        hit = best_rank is not None
        recall3_list.append(1 if hit and best_rank <= 3 else 0)
        recall5_list.append(1 if hit and best_rank <= 5 else 0)
        mrr_list.append(1 / best_rank if hit else 0)
        ndcg5_list.append(1 / log2(best_rank + 1) if hit and best_rank <= 5 else 0)

    latencies = np.array(latencies)
    total_time = latencies.sum()
    n_queries = len(latencies)

    latency_stats = {
        "avg_ms": float(latencies.mean() * 1000),
        "p50_ms": float(np.percentile(latencies, 50) * 1000),
        "p95_ms": float(np.percentile(latencies, 95) * 1000),
        "qps": float(n_queries / total_time),
        "n_queries": n_queries
    }

    metrics = {
        "Recall@3": np.mean(recall3_list) * 100,
        "Recall@5": np.mean(recall5_list) * 100,
        "MRR":      np.mean(mrr_list),
        "nDCG@5":   np.mean(ndcg5_list) * 100,
    }

    return metrics, latency_stats


# -------------------------------------------------------------------
# 5. Run evaluation
# -------------------------------------------------------------------
results, latency = evaluate_case_to_kb_faiss(k=5)

print("\n===== FAISS(case_emb → KB_emb) =====")
print(f"Recall@3: {results['Recall@3']:.2f}")
print(f"Recall@5: {results['Recall@5']:.2f}")
print(f"MRR:      {results['MRR']:.4f}")
print(f"nDCG@5:   {results['nDCG@5']:.2f}")

print("\nLatency:")
print(f"avg_ms:   {latency['avg_ms']:.3f}")
print(f"p50_ms:   {latency['p50_ms']:.3f}")
print(f"p95_ms:   {latency['p95_ms']:.3f}")
print(f"QPS:      {latency['qps']:.2f}")

# -------------------------------------------------------------------
# 6. Save CSV
# -------------------------------------------------------------------
# One row: method label, then the metric and latency dictionaries flattened.
row = {"method": "FAISS_case_to_KB_emb", **results, **latency}
df_out = pd.DataFrame([row])
csv_path = f"./results/{CASE_EMB_COL}_faiss_case_to_kb_results.csv"

# to_csv does not create missing directories; make sure ./results exists so
# the cell also works on a fresh checkout.
Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
df_out.to_csv(csv_path, index=False)

print("\nSaved results to:", csv_path)
print(df_out)


KB embedding matrix shape: (188, 4096)
FAISS index size = 188
Unique test cases: 2201


Evaluating FAISS KB-index:   0%|                                                                              …


===== FAISS(case_emb → KB_emb) =====
Recall@3: 40.03
Recall@5: 50.48
MRR:      0.3073
nDCG@5:   35.64

Latency:
avg_ms:   0.134
p50_ms:   0.131
p95_ms:   0.144
QPS:      7456.95

Saved results to: ./results/embed_summary_faiss_case_to_kb_results.csv
                 method  Recall@3   Recall@5       MRR     nDCG@5    avg_ms  \
0  FAISS_case_to_KB_emb  40.02726  50.477056  0.307338  35.639846  0.134103   

     p50_ms    p95_ms          qps  n_queries  
0  0.130956  0.144377  7456.954202       2201  
