In [1]:
import time
from math import log2
from pathlib import Path

import faiss
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
# ----------------------------------------------------
# 0. Configuration: input files, column names, thresholds
#    (adapt here if the dataset layout changes)
# ----------------------------------------------------

# --- Input parquet files ---
infile_train = "./dataset/finekb_cases_train_clustered.parquet"   # read into df_train
infile = "./dataset/finekb_cases_test.parquet"                    # read into df_test
infile_kb = "./dataset/finekb_kb.parquet"

# --- Identifier columns ---
CASE_ID_COL = "case_id"
KB_ID_COL = "kb_id"
CLUSTER_ID_COL = "cluster_id"

# --- Embedding columns ---
# Column averaged per cluster to build the indexed centroids.
CASE_INDEX_EMB_COL = "embed_summary"
# Column used as the query vector for each test case.
CASE_SEARCH_EMB_COL = "embed_summary"

# --- Thresholds ---
# Clusters with fewer rows than this are excluded from the index.
MIN_CLUSTER_SIZE = 3

In [3]:
# ----------------------------------------------------
# 1. Load your TRAIN dataframe
# ----------------------------------------------------
# Rows with a missing KB id are dropped. The column is taken from the
# KB_ID_COL config constant (instead of a hard-coded "kb_id") so that
# changing the config is enough to adapt this cell.
df_train = pd.read_parquet(infile_train).dropna(subset=[KB_ID_COL])

print("Loaded case dataset:", df_train.shape)

Loaded case dataset: (11415, 5)


In [4]:
# Preview the first rows of the TRAIN frame (displayed as the cell's last expression).
df_train.head()

Unnamed: 0,case_id,issue_type,kb_id,embed_summary,cluster_id
0,0,remote_access,11,"[0.005332942120730877, -0.003665205556899309, ...",11_c0
1,1,info,102,"[0.00189633306581527, 0.012942066416144371, 0....",102_c5
2,3,contract,35,"[0.0068122390657663345, 0.0030071106739342213,...",35_c3
3,4,fan,158,"[0.013492287136614323, -0.013519312255084515, ...",158_c1
4,6,contract,63,"[-0.005820220801979303, 0.008195333182811737, ...",63_c1


In [5]:
# ----------------------------------------------------
# 1b. Load your TEST dataframe
# ----------------------------------------------------
# Rows with a missing KB id are dropped. The column is taken from the
# KB_ID_COL config constant (instead of a hard-coded "kb_id") so that
# changing the config is enough to adapt this cell.
df_test = pd.read_parquet(infile).dropna(subset=[KB_ID_COL])

print("Loaded case dataset:", df_test.shape)

Loaded case dataset: (2849, 4)


In [6]:
# Preview the first rows of the TEST frame (displayed as the cell's last expression).
df_test.head()

Unnamed: 0,case_id,issue_type,kb_id,embed_summary
0,0,remote_access,53,"[-0.0003725903807207942, -0.008335007354617119..."
1,1,contract,27,"[0.002286486327648163, 0.00011503534915391356,..."
2,2,software,38,"[0.01094728522002697, 0.0060193841345608234, 0..."
3,3,contract,33,"[-0.003893519751727581, -0.001837620628066361,..."
5,5,memory,103,"[-0.007071544881910086, 0.0033897554967552423,..."


In [7]:
# -------------------------------------------------------------------
# 1. Helper: L2-normalization
# -------------------------------------------------------------------
def l2_normalize(mat: np.ndarray) -> np.ndarray:
    """Scale every row of a 2-D array by its Euclidean (L2) norm.

    Args:
        mat: Array whose rows are the vectors to normalize; the norm is
            taken along axis 1.

    Returns:
        A new array of the same shape in which each row has been divided by
        its L2 norm plus 1e-12. The small constant keeps an all-zero row
        from triggering a division by zero (such a row stays all-zero).
    """
    row_lengths = np.linalg.norm(mat, axis=1, keepdims=True)
    return mat / (row_lengths + 1e-12)


# -------------------------------------------------------------------
# 2. KB "cluster" index built from TRAIN / historical cases:
#    group by cluster_id, discard small clusters, then average
#    CASE_INDEX_EMB_COL within each remaining cluster
# -------------------------------------------------------------------
# Work on a fresh frame whose KB id column is cast to int.
df_train = df_train.astype({KB_ID_COL: int})

# 2.1 Keep only the clusters that have at least MIN_CLUSTER_SIZE rows
df_train_filtered = df_train.groupby(CLUSTER_ID_COL).filter(
    lambda cluster: cluster.shape[0] >= MIN_CLUSTER_SIZE
)


# 2.2 One centroid per surviving cluster_id
def _cluster_centroid(embeddings):
    """Element-wise mean of the stacked embedding vectors of one cluster."""
    return np.vstack(embeddings.values).mean(axis=0)


_centroid_aggs = {
    CASE_INDEX_EMB_COL: _cluster_centroid,
    # Takes the kb_id of the cluster's first row; this relies on every row
    # of a cluster carrying the same kb_id.
    KB_ID_COL: "first",
}
df_kb_centroids = df_train_filtered.groupby(CLUSTER_ID_COL).agg(_centroid_aggs)
df_kb_centroids = df_kb_centroids.reset_index()
df_kb_centroids = df_kb_centroids.rename(columns={CASE_INDEX_EMB_COL: "avg_emb"})

# Position i in kb_ids is the kb_id of row i of the centroid matrix
kb_ids = df_kb_centroids[KB_ID_COL].to_numpy()              # (num_clusters,)
kb_emb_list = df_kb_centroids["avg_emb"].tolist()
kb_emb_matrix = np.vstack(kb_emb_list).astype("float32")    # (num_clusters, dim)

dim = kb_emb_matrix.shape[1]
print("KB cluster centroid matrix shape:", kb_emb_matrix.shape)

# Unit-length centroids make the inner product equal to cosine similarity
kb_emb_matrix = l2_normalize(kb_emb_matrix)

# Flat inner-product FAISS index over the normalized centroids
index = faiss.IndexFlatIP(dim)
index.add(kb_emb_matrix)

print("FAISS index size (num KB clusters) =", index.ntotal)

# FAISS row position → kb_id (as plain Python ints)
idx2kb_id = dict(enumerate(map(int, kb_ids)))


# -------------------------------------------------------------------
# 3. Build TEST mappings
# -------------------------------------------------------------------
# Work on a fresh frame whose id columns are cast to int.
df_test = df_test.astype({CASE_ID_COL: int, KB_ID_COL: int})

# A case may span several rows, so everything below is grouped by case id.
_test_cases = df_test.groupby(CASE_ID_COL)

# case_id → list of gold KB ids (one entry per row of that case)
case_to_kbs = _test_cases[KB_ID_COL].apply(list).to_dict()


def _mean_case_vector(rows):
    """Element-wise mean of the stacked embedding vectors of one case."""
    return np.mean(np.vstack(rows.values), axis=0)


# case_id → query vector (averaged when the case has multiple rows)
case_to_emb = _test_cases[CASE_SEARCH_EMB_COL].apply(_mean_case_vector).to_dict()

unique_case_ids = list(case_to_kbs)
print("Unique test cases:", len(unique_case_ids))



# -------------------------------------------------------------------
# 4. Evaluation: case_emb → KB_cluster_centroid_emb (distinct KB IDs)
# -------------------------------------------------------------------
def evaluate_case_to_kbcentroid_faiss(k: int = 20):
    """Evaluate case → KB-cluster-centroid retrieval over every test case.

    For each id in ``unique_case_ids`` the case embedding is L2-normalized,
    the ``k`` nearest centroids are fetched from the module-level FAISS
    ``index``, the hits are mapped to KB ids through ``kb_ids`` and
    de-duplicated in rank order. All metrics are then derived from the best
    (lowest) rank at which any gold KB id of the case appears.

    Args:
        k: Number of centroids requested from FAISS per query. Because
            several centroids can map to the same KB id, the de-duplicated
            ranking may be shorter than ``k``.

    Returns:
        ``(metrics, latency_stats)`` where

        * ``metrics`` has keys ``"Recall@3"``, ``"Recall@5"``, ``"nDCG@5"``
          (percentages) and ``"MRR"`` (fraction). ``Recall@n`` is a hit
          rate: 1 when at least one gold KB id is ranked within the top
          ``n`` distinct KB ids. ``nDCG@5`` is ``1 / log2(best_rank + 1)``
          when the best rank is within 5, else 0.
        * ``latency_stats`` has keys ``"avg_ms"``, ``"p50_ms"``,
          ``"p95_ms"``, ``"qps"`` and ``"n_queries"``; only the
          ``index.search`` call is timed.
    """
    recall3_list, recall5_list = [], []
    mrr_list, ndcg5_list = [], []
    latencies = []

    for case_id in tqdm(unique_case_ids, desc="Evaluating FAISS KB-clusters", ncols=150):
        gold_kbs = set(case_to_kbs[case_id])

        # Query embedding: one float32 row, unit length for cosine-via-inner-product
        q_emb = np.array(case_to_emb[case_id], dtype="float32").reshape(1, -1)
        q_emb = l2_normalize(q_emb)

        # Time only the index lookup itself
        t0 = time.perf_counter()
        _, neighbor_idx = index.search(q_emb, k)
        t1 = time.perf_counter()

        latencies.append(t1 - t0)

        # Rank (1-based) of the first occurrence of each distinct KB id.
        # FAISS pads the result with label -1 when fewer than k neighbours
        # exist; kb_ids[-1] would silently resolve to the last cluster's KB,
        # so padded slots must be skipped.
        kb_rank = {}
        for idx in neighbor_idx[0]:
            if idx < 0:
                continue
            kb = int(kb_ids[idx])
            if kb not in kb_rank:
                kb_rank[kb] = len(kb_rank) + 1

        # Best rank reached by any gold KB id (None when none was retrieved)
        gold_ranks = [kb_rank[g] for g in gold_kbs if g in kb_rank]
        best_rank = min(gold_ranks) if gold_ranks else None

        recall3_list.append(1 if best_rank and best_rank <= 3 else 0)
        recall5_list.append(1 if best_rank and best_rank <= 5 else 0)
        mrr_list.append(1 / best_rank if best_rank else 0)
        ndcg5_list.append(1 / log2(best_rank + 1) if best_rank and best_rank <= 5 else 0)

    # Latency stats (seconds → milliseconds where noted)
    latencies = np.array(latencies)
    total_time = latencies.sum()
    n_queries = len(latencies)

    latency_stats = {
        "avg_ms": float(latencies.mean() * 1000),
        "p50_ms": float(np.percentile(latencies, 50) * 1000),
        "p95_ms": float(np.percentile(latencies, 95) * 1000),
        "qps": float(n_queries / total_time),
        "n_queries": int(n_queries),
    }

    metrics = {
        "Recall@3": np.mean(recall3_list) * 100,
        "Recall@5": np.mean(recall5_list) * 100,
        "MRR":      np.mean(mrr_list),
        "nDCG@5":   np.mean(ndcg5_list) * 100,
    }

    return metrics, latency_stats



# -------------------------------------------------------------------
# 5. Run evaluation
# -------------------------------------------------------------------
results, latency = evaluate_case_to_kbcentroid_faiss(k=20)

print("\n===== FAISS(case_emb → KB_cluster_centroid_emb) =====")
print(f"Recall@3: {results['Recall@3']:.2f}")
print(f"Recall@5: {results['Recall@5']:.2f}")
print(f"MRR:      {results['MRR']:.4f}")
print(f"nDCG@5:   {results['nDCG@5']:.2f}")

print("\nLatency:")
print(f"avg_ms:   {latency['avg_ms']:.3f}")
print(f"p50_ms:   {latency['p50_ms']:.3f}")
print(f"p95_ms:   {latency['p95_ms']:.3f}")
print(f"QPS:      {latency['qps']:.2f}")
print(f"n_queries:{latency['n_queries']}")


# -------------------------------------------------------------------
# 6. Save CSV
# -------------------------------------------------------------------
# One row holding the method label plus all retrieval and latency numbers.
row = {"method": "FAISS_case_cluster", **results, **latency}
df_out = pd.DataFrame([row])
csv_path = (
    f"./results/cluster_{CASE_INDEX_EMB_COL}_search_{CASE_SEARCH_EMB_COL}"
    "_faiss_case_cluster_results.csv"
)
# to_csv does not create missing parent directories, so make sure the
# results folder exists before writing (e.g. on a fresh checkout).
Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
df_out.to_csv(csv_path, index=False)

print("\nSaved results to:", csv_path)
print(df_out)


KB cluster centroid matrix shape: (462, 4096)
FAISS index size (num KB clusters) = 462
Unique test cases: 2201


Evaluating FAISS KB-clusters:   0%|                                                                           …


===== FAISS(case_emb → KB_cluster_centroid_emb) =====
Recall@3: 63.11
Recall@5: 73.83
MRR:      0.5222
nDCG@5:   56.22

Latency:
avg_ms:   0.353
p50_ms:   0.346
p95_ms:   0.371
QPS:      2833.09
n_queries:2201

Saved results to: ./results/cluster_embed_summary_search_embed_summary_faiss_case_cluster_results.csv
               method   Recall@3   Recall@5       MRR     nDCG@5    avg_ms  \
0  FAISS_case_cluster  63.107678  73.830077  0.522175  56.220317  0.352971   

     p50_ms    p95_ms          qps  n_queries  
0  0.345883  0.371443  2833.091293       2201  
