In [10]:
from configs.config import Config 
from src.datasets.course_dataset import CourseDataset
import json 
import torch
import torch.nn.functional as F



In [None]:
# Instantiate the project configuration, then build the course dataset from
# its `data` section. Both names are notebook globals used by later cells.
cfg = Config()
ds = CourseDataset(cfg=cfg.data)

In [9]:
# Map each sample's "control_number" to its "embedding". Samples are read by
# index in order, so if a control number repeats, the last sample wins.
ccn_to_emb = {
    record["control_number"]: record["embedding"]
    for record in (ds[idx] for idx in range(len(ds)))
}
print(len(ccn_to_emb))

132214


In [None]:

def cosine_sim_matrix(x, y):
    """Pairwise cosine similarity between the rows of ``x`` and the rows of ``y``.

    Both inputs are normalised along dim 1 with ``F.normalize`` and then
    multiplied as ``x_unit @ y_unit.T``, so entry ``[i, j]`` of the result
    compares row ``i`` of ``x`` with row ``j`` of ``y``.
    """
    x_unit = F.normalize(x, dim=1)
    y_unit = F.normalize(y, dim=1)
    return torch.matmul(x_unit, y_unit.T)

def intra_cluster_similarity(embs: torch.Tensor) -> float:
    """Mean cosine similarity over all pairs of distinct rows of ``embs``.

    The diagonal (each row compared with itself) is masked out before
    averaging. When ``embs`` has fewer than two rows there are no pairs to
    compare and 1.0 is returned.
    """
    n_rows = len(embs)
    if n_rows < 2:
        return 1.0
    pairwise = cosine_sim_matrix(embs, embs)
    # Boolean mask that is True everywhere except on the diagonal.
    off_diagonal = torch.eye(n_rows, dtype=torch.bool, device=embs.device).logical_not()
    return pairwise[off_diagonal].mean().item()

def inter_cluster_similarity(cluster_embs: list[torch.Tensor]) -> float:
    """Average cross-cluster cosine similarity over every unordered pair of clusters.

    For each pair ``(i, j)`` with ``i < j`` the full similarity matrix between
    the two clusters is reduced to its mean; the return value is the plain
    (unweighted) average of those per-pair means. Returns 0.0 when there are
    fewer than two clusters.
    """
    n_clusters = len(cluster_embs)
    pair_means = [
        cosine_sim_matrix(cluster_embs[a], cluster_embs[b]).mean().item()
        for a in range(n_clusters)
        for b in range(a + 1, n_clusters)
    ]
    if not pair_means:
        return 0.0
    return sum(pair_means) / len(pair_means)


In [15]:
def collect_level_clusters(node, level, clusters_by_level):
    """Bucket every node of the tree rooted at ``node`` by depth.

    ``clusters_by_level`` is mutated in place: ``clusters_by_level[d]`` receives
    the nodes found at depth ``d``, where ``node`` itself sits at depth
    ``level``. Nodes are visited in pre-order, so each bucket lists its nodes
    left to right. Children are read from the optional ``"children"`` key.
    Nothing is returned.
    """
    pending = [(node, level)]
    while pending:
        current, depth = pending.pop()
        if depth >= len(clusters_by_level):
            clusters_by_level.append([])
        clusters_by_level[depth].append(current)

        # Push children right-to-left so the leftmost child is popped (and
        # therefore visited) first, matching a recursive pre-order walk.
        kids = list(current.get("children", []))
        pending.extend((kid, depth + 1) for kid in reversed(kids))

def gather_courses(node):
    """Return a new list of the course IDs found under ``node``.

    If ``node`` has a non-empty ``"courses"`` entry, a copy of that entry is
    returned and the node's children are NOT visited. Otherwise the results
    for each child under ``"children"`` are concatenated in order. A node with
    neither yields an empty list.

    The result is always a fresh list, so callers may mutate it without
    altering the taxonomy structure.
    """
    own_courses = node.get("courses")
    if own_courses:
        # Copy so the caller never holds a reference to the node's own list.
        return list(own_courses)
    courses = []
    for child in node.get("children", []):
        courses.extend(gather_courses(child))
    return courses

def compute_similarity_metrics(taxonomy, ccn_to_emb):
    """Compute intra- and inter-cluster cosine similarity for each taxonomy level.

    Every node at a given depth is treated as one cluster whose members are
    the course IDs returned by ``gather_courses`` for that node.

    Args:
        taxonomy: Root node of the taxonomy tree (a dict with optional
            ``"children"`` / ``"courses"`` keys).
        ccn_to_emb: Mapping from course ID to its embedding. Course IDs
            missing from this mapping are skipped.

    Returns:
        A list of dicts, one per reported level, with keys ``level``,
        ``intra_cluster_similarity``, ``inter_cluster_similarity`` and
        ``num_clusters``. Clusters with no known embeddings are dropped, and a
        level is omitted entirely when fewer than two clusters remain.
        Single-member clusters are excluded from the intra-cluster average;
        that average is 0.0 if no cluster has two or more members.
    """
    nodes_per_level = []
    collect_level_clusters(taxonomy, 0, nodes_per_level)

    results = []
    for depth, level_nodes in enumerate(nodes_per_level):
        stacked_clusters = []
        for cluster_node in level_nodes:
            vectors = [
                torch.as_tensor(ccn_to_emb[course_id]).float()
                for course_id in gather_courses(cluster_node)
                if course_id in ccn_to_emb
            ]
            if vectors:
                stacked_clusters.append(torch.stack(vectors))

        # Inter-cluster similarity needs at least two clusters at this level.
        if len(stacked_clusters) >= 2:
            intra_scores = [
                intra_cluster_similarity(members)
                for members in stacked_clusters
                if len(members) > 1
            ]
            if intra_scores:
                intra_mean = sum(intra_scores) / len(intra_scores)
            else:
                intra_mean = 0.0
            inter_mean = inter_cluster_similarity(stacked_clusters)

            results.append({
                "level": depth,
                "intra_cluster_similarity": intra_mean,
                "inter_cluster_similarity": inter_mean,
                "num_clusters": len(stacked_clusters),
            })
    return results


In [21]:
# Evaluate the "gpt-o4_1k" taxonomy: load its JSON tree and print one metrics
# dict per level.
TAXONOMY_NAME = "gpt-o4_1k"
taxonomy_json_path = cfg.taxonomy.save_dir / TAXONOMY_NAME / "taxonomy.json"
# Pin the encoding so the file is decoded the same way on every platform
# instead of relying on the locale default.
with open(taxonomy_json_path, "r", encoding="utf-8") as f:
    taxonomy = json.load(f)

metrics = compute_similarity_metrics(taxonomy, ccn_to_emb)
for m in metrics:
    print(m)


{'level': 1, 'intra_cluster_similarity': 0.8164014756679535, 'inter_cluster_similarity': 0.788631178273095, 'num_clusters': 10}
{'level': 2, 'intra_cluster_similarity': 0.8390237980959366, 'inter_cluster_similarity': 0.7884384385420352, 'num_clusters': 50}
{'level': 3, 'intra_cluster_similarity': 0.8583605510633696, 'inter_cluster_similarity': 0.7900004687584485, 'num_clusters': 230}


In [22]:
# Evaluate the "o1-mini_1k" taxonomy: load its JSON tree and print one metrics
# dict per level.
TAXONOMY_NAME = "o1-mini_1k"
taxonomy_json_path = cfg.taxonomy.save_dir / TAXONOMY_NAME / "taxonomy.json"
# Pin the encoding so the file is decoded the same way on every platform
# instead of relying on the locale default.
with open(taxonomy_json_path, "r", encoding="utf-8") as f:
    taxonomy = json.load(f)

metrics = compute_similarity_metrics(taxonomy, ccn_to_emb)
for m in metrics:
    print(m)


{'level': 1, 'intra_cluster_similarity': 0.8122285544872284, 'inter_cluster_similarity': 0.7883186909887526, 'num_clusters': 10}
{'level': 2, 'intra_cluster_similarity': 0.8333031705447606, 'inter_cluster_similarity': 0.7881935075128159, 'num_clusters': 49}
{'level': 3, 'intra_cluster_similarity': 0.8520084263442399, 'inter_cluster_similarity': 0.7892967590925286, 'num_clusters': 228}


In [23]:
# Evaluate the "o3-mini_1k" taxonomy: load its JSON tree and print one metrics
# dict per level.
TAXONOMY_NAME = "o3-mini_1k"
taxonomy_json_path = cfg.taxonomy.save_dir / TAXONOMY_NAME / "taxonomy.json"
# Pin the encoding so the file is decoded the same way on every platform
# instead of relying on the locale default.
with open(taxonomy_json_path, "r", encoding="utf-8") as f:
    taxonomy = json.load(f)

metrics = compute_similarity_metrics(taxonomy, ccn_to_emb)
for m in metrics:
    print(m)


{'level': 1, 'intra_cluster_similarity': 0.8105119109153748, 'inter_cluster_similarity': 0.7897831930054559, 'num_clusters': 10}
{'level': 2, 'intra_cluster_similarity': 0.8324166974242853, 'inter_cluster_similarity': 0.7899584523998961, 'num_clusters': 50}
{'level': 3, 'intra_cluster_similarity': 0.8481085191170374, 'inter_cluster_similarity': 0.7899300019069991, 'num_clusters': 241}


In [24]:
# Evaluate the "o4-mini_1k" taxonomy: load its JSON tree and print one metrics
# dict per level.
TAXONOMY_NAME = "o4-mini_1k"
taxonomy_json_path = cfg.taxonomy.save_dir / TAXONOMY_NAME / "taxonomy.json"
# Pin the encoding so the file is decoded the same way on every platform
# instead of relying on the locale default.
with open(taxonomy_json_path, "r", encoding="utf-8") as f:
    taxonomy = json.load(f)

metrics = compute_similarity_metrics(taxonomy, ccn_to_emb)
for m in metrics:
    print(m)


{'level': 1, 'intra_cluster_similarity': 0.8112664997577668, 'inter_cluster_similarity': 0.7873230788442823, 'num_clusters': 10}
{'level': 2, 'intra_cluster_similarity': 0.8352768141396192, 'inter_cluster_similarity': 0.7878689366943982, 'num_clusters': 50}
{'level': 3, 'intra_cluster_similarity': 0.8560936718844296, 'inter_cluster_similarity': 0.7898402758625234, 'num_clusters': 237}
