In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import pyarrow.parquet as pq
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering

In [2]:
# Input parquet with the cases to cluster, and where the clustered copy is written.
infile = "./dataset/finekb_cases_train.parquet"
outfile = "./dataset/finekb_cases_train_clustered.parquet"

# --- tunable configuration ---
# Column the embeddings are read from, and column the cluster labels are written to.
CASE_EMB_COL, CLUSTER_COL = "embed_summary", "cluster_id"

# Cluster sizing: a kb with at most MIN_CLUSTER_SIZE rows stays in one cluster;
# otherwise roughly one cluster per MIN_CLUSTER_SIZE rows, capped at MAX_CLUSTERS_PER_KB.
MIN_CLUSTER_SIZE = 10
MAX_CLUSTERS_PER_KB = 50

In [3]:
# Read the input parquet into an Arrow table, then materialize it as a
# pandas DataFrame for the per-kb clustering below.
table = pq.read_table(infile)
df_cases = table.to_pandas()


In [4]:
# Preview the first rows of the cases table.
# The parquet file is already loaded into `df_cases` by the preceding load
# cell, so it is not read from disk a second time here.
df_cases.head()

Unnamed: 0,case_id,issue_type,kb_id,embed_summary
0,0,remote_access,11.0,"[0.005332942120730877, -0.003665205556899309, ..."
1,1,info,102.0,"[0.00189633306581527, 0.012942066416144371, 0...."
2,2,memory,,"[0.00967031717300415, 0.013244512490928173, 0...."
3,3,contract,35.0,"[0.0068122390657663345, 0.0030071106739342213,..."
4,4,fan,158.0,"[0.013492287136614323, -0.013519312255084515, ..."


In [5]:
def cluster_one_kb(group: pd.DataFrame) -> pd.DataFrame:
    """
    Assign a cluster id to every row of a single kb_id group.

    Meant to be used via ``DataFrame.groupby(...).apply(cluster_one_kb)``:
    the group key is read from ``group.name`` and used as the prefix of each
    cluster id, giving labels of the form ``"<kb_id>_c<label>"``.

    Parameters
    ----------
    group : pd.DataFrame
        Rows belonging to one kb_id. Column ``CASE_EMB_COL`` must hold one
        sequence of floats per row, all of the same length, so they can be
        stacked into a 2D array.

    Returns
    -------
    pd.DataFrame
        A copy of ``group`` with a ``CLUSTER_COL`` column added (or
        overwritten). The frame passed in is left unmodified.
    """
    kb_id = group.name

    # Work on a copy: writing a new column into the object handed over by
    # groupby mutates the caller's data and can raise SettingWithCopyWarning.
    clustered = group.copy()

    # An empty group has nothing to stack or cluster (np.vstack would raise).
    if len(clustered) == 0:
        clustered[CLUSTER_COL] = pd.Series(index=clustered.index, dtype="object")
        return clustered

    # turn list-of-floats embeddings into a 2D array
    X = np.vstack(clustered[CASE_EMB_COL].to_numpy())  # (n_samples, dim)
    n_samples = X.shape[0]

    # if not enough samples, just put them all into one cluster
    if n_samples <= MIN_CLUSTER_SIZE:
        clustered[CLUSTER_COL] = f"{kb_id}_c0"
        return clustered

    # heuristic: about MIN_CLUSTER_SIZE points per cluster, capped by MAX_CLUSTERS_PER_KB
    n_clusters = max(1, min(MAX_CLUSTERS_PER_KB, n_samples // MIN_CLUSTER_SIZE))

    # Agglomerative clustering with cosine distance
    agglom = AgglomerativeClustering(
        n_clusters=n_clusters,
        metric="cosine",      # for sklearn >= 1.2; use affinity="cosine" on older versions
        linkage="average",
    )
    labels = agglom.fit_predict(X)

    # build cluster ids like "12345_c0", "12345_c1", ...
    clustered[CLUSTER_COL] = [f"{kb_id}_c{int(lbl)}" for lbl in labels]

    return clustered

In [6]:
# ---- main call ----

# Drop rows with a missing kb_id (they cannot be grouped per kb) and cast
# kb_id to int so it is a simple type for building 'kb_id_cX' strings.
# .assign() produces a new frame instead of writing a column into the
# dropna() result.
df_cases = (
    df_cases
    .dropna(subset=["kb_id"])
    .assign(kb_id=lambda frame: frame["kb_id"].astype(int))
)

# apply clustering per kb_id
# The columns (including the grouping column) are selected explicitly after
# groupby: pandas >= 2.2 deprecates apply() operating on grouping columns
# implicitly, and an explicit selection keeps kb_id in the result without
# triggering that warning.
df_cases = (
    df_cases
    .groupby("kb_id", group_keys=False)[df_cases.columns.tolist()]
    .apply(cluster_one_kb)
)

print(df_cases[["case_id", "kb_id", CLUSTER_COL]].head())


   case_id  kb_id cluster_id
0        0     11      11_c0
1        1    102     102_c5
3        3     35      35_c3
4        4    158     158_c1
6        6     63      63_c1


  .apply(cluster_one_kb)


In [7]:
# ----------------------------------------------------
# Write the clustered cases to parquet
# ----------------------------------------------------

# Snappy-compressed parquet via pyarrow; the DataFrame index is not stored.
df_cases.to_parquet(outfile, engine="pyarrow", compression="snappy", index=False)

print("Saved case clusters to:", outfile)

Saved case clusters to: ./dataset/finekb_cases_train_clustered.parquet
