In [1]:
# !pip install ipywidgets

# Colab-specific setup: switch on Colab's custom widget manager.
# NOTE(review): presumably required for the ipywidgets controls built later in
# this notebook to render inside Colab - confirm. This import only works when
# the notebook runs on Google Colab.
from google.colab import output
output.enable_custom_widget_manager()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from concurrent.futures import ThreadPoolExecutor


In [2]:
import requests
def get_df(url, name="method_df.npz", df_func=None, timeout=60):
    """Download a NumPy archive and build a DataFrame from selected arrays.

    Parameters
    ----------
    url : str
        Address fetched with an HTTP GET.
    name : str, optional
        Local path the downloaded bytes are written to. Calls that keep the
        default overwrite the same file.
    df_func : callable, optional
        Receives the object returned by ``np.load`` and returns the mapping
        handed to ``pd.DataFrame``. The default picks ``preferredLabel`` as is
        and wraps ``esco`` and ``score`` in ``list(...)``.
    timeout : float, optional
        Seconds ``requests`` waits on the server before raising.

    Returns
    -------
    pandas.DataFrame
        Frame built from the mapping produced by ``df_func``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    def default_columns(archive):
        return {
            "preferredLabel": archive["preferredLabel"],
            "esco": list(archive["esco"]),
            "score": list(archive["score"]),
        }

    df_func = df_func or default_columns

    response = requests.get(url, timeout=timeout)
    # Fail here on an HTTP error instead of saving the error page to disk and
    # getting an unrelated parsing error from np.load further down.
    response.raise_for_status()
    with open(name, "wb") as file:
        file.write(response.content)

    # SECURITY: allow_pickle=True lets the file run arbitrary code while it is
    # unpickled - only point this function at URLs you trust.
    data = np.load(name, allow_pickle=True)
    try:
        # Build the frame before closing so every array is read while the
        # archive is still open.
        return pd.DataFrame(df_func(data))
    finally:
        # An .npz archive keeps its file handle open until closed; a plain
        # array (from a .npy file) has no close() and needs nothing.
        close = getattr(data, "close", None)
        if close is not None:
            close()

In [3]:
# esco_df = get_df("https://huggingface.co/datasets/abd1987/esco-embeddings-numpy/resolve/main/abd1987_esco_context_skill_extraction.npz", df_func=lambda x: {
#     "label": x["label"],
#     "embedding": list(x["label_embedding"]),
# })


def _broader_columns(archive):
    """Pick the label / uri / broader arrays out of the broader-list archive."""
    return {
        "label": archive["label"],
        "uri": archive["uri"],
        "broader": archive["broader"],
    }


broader_df = get_df(
    "https://raw.githubusercontent.com/abood1987/recommender/master/data/methods_results/broader_list.npz",
    df_func=_broader_columns,
)

# label -> (uri, broader) lookup, plus the sets of all uri / broader values.
broader_dict = (
    broader_df
    .set_index("label")[["uri", "broader"]]
    .apply(tuple, axis=1)
    .to_dict()
)
uris_set = set(broader_df["uri"])
broader_set = set(broader_df["broader"])

# One result frame per extraction method.
simple_df = get_df(
    "https://raw.githubusercontent.com/abood1987/recommender/master/data/methods_results/abd1987_esco_context_simple.npz"
)
ner_df = get_df(
    "https://raw.githubusercontent.com/abood1987/recommender/master/data/methods_results/abd1987_esco_context_ner.npz"
)
flan_t5_large_df = get_df(
    "https://raw.githubusercontent.com/abood1987/recommender/master/data/methods_results/abd1987_esco_context_flan_t5_large.npz"
)
# gpt2_large_df =     get_df("https://raw.githubusercontent.com/abood1987/recommender/master/data/methods_results/abd1987_esco_context_gpt2_large.npz")
flan_t5_ft_df = get_df(
    "https://raw.githubusercontent.com/abood1987/recommender/master/data/methods_results/abd1987_esco_context_flan_t5_ft.npz"
)
split_df = get_df(
    "https://raw.githubusercontent.com/abood1987/recommender/master/data/methods_results/abd1987_esco_context_split.npz"
)

# Display name -> frame; insertion order is the row order of the results table.
dfs = {
    "Simple": simple_df,
    "NER": ner_df,
    "Flan-t5": flan_t5_large_df,
    # "gpt": gpt2_large_df,
    "Flan-t5-ft": flan_t5_ft_df,
    "Split": split_df,
}

In [4]:
import heapq
from collections import Counter

def broader_similarity_ratio(skills, lookup=None, broader_uris=None):
    """Fraction of ``skills`` that share the most common broader concept.

    Each skill is looked up to get its ``(uri, broader)`` pair. A skill whose
    own uri is itself one of the broader concepts counts under that uri;
    otherwise it counts under its ``broader`` value when that is truthy.
    Skills that are unknown, or have neither, count towards the denominator
    only.

    Parameters
    ----------
    skills : sequence
        Skill labels to group.
    lookup : mapping, optional
        ``label -> (uri, broader)``. Defaults to the notebook-level
        ``broader_dict``.
    broader_uris : container, optional
        Values treated as broader concepts. Defaults to the notebook-level
        ``broader_set``.

    Returns
    -------
    float
        ``largest group size / len(skills)``; ``0.0`` when ``skills`` is
        empty or no skill could be assigned to a group.
    """
    if not skills:
        return 0.0

    # Resolve the defaults only after the empty check and with ``is None`` so
    # an explicitly passed empty table is respected.
    if lookup is None:
        lookup = broader_dict
    if broader_uris is None:
        broader_uris = broader_set

    group_sizes = Counter()
    for skill in skills:
        entry = lookup.get(skill)
        if not entry:
            continue

        uri, broader = entry
        if uri in broader_uris:
            group_sizes[uri] += 1  # already a broader concept: counts as itself
        elif broader:
            group_sizes[broader] += 1  # roll up to its broader concept

    if not group_sizes:
        return 0.0

    return max(group_sizes.values()) / len(skills)

def compute_metrics_manual(y_true, y_pred):
    """Pooled precision, recall and F1 over paired label collections.

    Every pair from ``zip(y_true, y_pred)`` is compared as sets; the hit,
    extra-prediction and miss counts are summed over all pairs before the
    ratios are taken. Any ratio with a zero denominator is reported as 0.0.

    Returns
    -------
    tuple
        ``(precision, recall, f1)``, each rounded to two decimals.
    """
    hits = extras = misses = 0

    for expected, predicted in zip(y_true, y_pred):
        expected_labels = set(expected)
        predicted_labels = set(predicted)
        overlap = len(expected_labels & predicted_labels)

        hits += overlap
        extras += len(predicted_labels) - overlap
        misses += len(expected_labels) - overlap

    n_predicted = hits + extras
    n_expected = hits + misses

    precision = 0.0
    if n_predicted > 0:
        precision = hits / n_predicted

    recall = 0.0
    if n_expected > 0:
        recall = hits / n_expected

    f1 = 0.0
    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)

    return tuple(round(value, 2) for value in (precision, recall, f1))

def compute_metrics(df, method, threshold=0.6, top_k=5):
    """Filter each row's predictions and score them against ``preferredLabel``.

    For every row, predictions with a score below ``threshold`` are dropped
    and at most ``top_k`` of the highest-scoring remaining ones are kept.
    Precision, recall and F1 are then computed with scikit-learn using
    ``average="samples"``, treating each row's ``preferredLabel`` as its only
    true label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``preferredLabel``, ``esco`` and ``score``;
        ``esco`` and ``score`` hold parallel sequences per row.
    method : str
        Name passed through unchanged into the result.
    threshold : float, optional
        Minimum score (inclusive) a prediction needs to be kept.
    top_k : int, optional
        Maximum number of predictions kept per row.

    Returns
    -------
    tuple
        ``(filtered_df, method, precision, recall, f1, n_predictions)`` where
        ``filtered_df`` is a copy of ``df`` with filtered ``esco`` / ``score``
        and a ``broader_similarity`` column, and ``n_predictions`` is the
        total number of distinct predictions kept across rows. A frame with
        no rows yields zeros for all metrics.
    """
    def filter_top_k(esco, score):
        filtered = ((s, sc) for s, sc in zip(esco, score) if sc >= threshold)
        top_filtered = heapq.nlargest(top_k, filtered, key=lambda x: x[1])
        if not top_filtered:
            return [], []
        skills, scores = zip(*top_filtered)
        return list(skills), list(scores)

    filtered_results = [
        filter_top_k(esco, score) for esco, score in zip(df["esco"], df["score"])
    ]
    # Split the pairs with comprehensions: zip(*pairs) yields nothing for an
    # empty frame and cannot be unpacked into two names.
    filtered_esco = [skills for skills, _ in filtered_results]
    filtered_score = [scores for _, scores in filtered_results]

    filtered_df = df.copy()
    filtered_df["esco"] = filtered_esco
    filtered_df["score"] = filtered_score
    filtered_df["broader_similarity"] = [broader_similarity_ratio(x) for x in filtered_esco]

    if not filtered_results:
        # Nothing to score; report zeros, matching zero_division=0 below.
        return (filtered_df, method, 0.0, 0.0, 0.0, 0)

    y_true = [[x] for x in df["preferredLabel"]]
    y_pred = [list(set(pred)) for pred in filtered_esco]

    all_labels = list(set(label for row in y_true + y_pred for label in row))
    # Sparse indicator matrices: each row has only a handful of labels, so a
    # dense (rows x labels) array would be almost entirely zeros.
    binarizer = MultiLabelBinarizer(classes=all_labels, sparse_output=True)

    y_true_bin = binarizer.fit_transform(y_true)
    y_pred_bin = binarizer.transform(y_pred)

    precision = precision_score(y_true_bin, y_pred_bin, average="samples", zero_division=0)
    recall = recall_score(y_true_bin, y_pred_bin, average="samples", zero_division=0)
    f1 = f1_score(y_true_bin, y_pred_bin, average="samples", zero_division=0)

    # precision, recall, f1 = compute_metrics_manual(y_true, y_pred)
    return (filtered_df, method, precision, recall, f1, sum(map(len, y_pred)))

def create_hybrid_df(primary_df, fallback_df):
    """Return a copy of ``primary_df`` with empty predictions filled in.

    Wherever a row's ``esco`` value in the primary frame is empty, its
    ``esco`` and ``score`` cells are taken from the row with the same index
    label in ``fallback_df``. ``primary_df`` itself is left unchanged.
    """
    merged = primary_df.copy()

    for row_label, skills in primary_df["esco"].items():
        if skills:
            continue  # the primary method produced something for this row
        for column in ("esco", "score"):
            merged.at[row_label, column] = fallback_df.at[row_label, column]

    return merged

In [5]:
from ipywidgets import interact, FloatSlider, IntSlider, Dropdown, Layout, VBox, HTML, interactive_output
from IPython.display import display


def get_cols(df, method, p, r, f1, res_count):
    """Build one row of the results table for a scored method.

    ``df`` must carry a ``broader_similarity`` column; its mean, rounded to
    two decimals, is reported as ``Broader-sim``. The remaining arguments are
    stored unchanged under their display names.
    """
    headers = ("Method", "Precision", "Recall", "F1-score", "Count")
    row = dict(zip(headers, (method, p, r, f1, res_count)))

    mean_broader_similarity = df["broader_similarity"].mean()
    row["Broader-sim"] = round(mean_broader_similarity, 2)
    return row

def interactive_heatmap(threshold=0.6, top_k=5, primary_method=None, fallback_method=None):
    """Score every method at the given cut-offs and show a table and heatmap.

    Each frame in ``dfs`` is handed to ``compute_metrics`` on a thread pool.
    When both ``primary_method`` and ``fallback_method`` are given, one more
    row named "<primary> + <fallback>" is scored from the frame returned by
    ``create_hybrid_df``. The metrics table is displayed first, then a
    heatmap of its Precision / Recall / F1-score columns.
    """
    scored_frames = {}
    summary_rows = []

    with ThreadPoolExecutor(max_workers=6) as pool:
        pending = [
            pool.submit(compute_metrics, frame, label, threshold, top_k)
            for label, frame in dfs.items()
        ]
        # Read results in submission order so rows follow the order of ``dfs``.
        for job in pending:
            outcome = job.result()
            scored_frame, label = outcome[0], outcome[1]
            scored_frames[label] = scored_frame
            summary_rows.append(get_cols(*outcome))

        hybrid_requested = primary_method is not None and fallback_method is not None
        if hybrid_requested:
            combined = create_hybrid_df(
                scored_frames[primary_method],
                scored_frames[fallback_method],
            )
            hybrid_outcome = compute_metrics(
                combined, f"{primary_method} + {fallback_method}", threshold, top_k
            )
            summary_rows.append(get_cols(*hybrid_outcome))

    metrics_df = pd.DataFrame(summary_rows).set_index("Method")

    display(metrics_df)
    print("")

    score_columns = ["Precision", "Recall", "F1-score"]
    plt.figure(figsize=(10, 6))
    sns.heatmap(metrics_df[score_columns], annot=True, fmt=".2f", cmap="Blues")
    plt.title(f"Skill Extraction Performance (Threshold: {threshold}, Top-K: {top_k})")
    plt.show()
    print("")


def _quarter_width():
    """Return a new layout that makes a control a quarter of the row wide."""
    return Layout(width='25%')


def _bottom_spacer():
    """Return a new empty HTML block that adds 20px of bottom padding."""
    return HTML("<div style='padding-bottom: 20px;'></div>")


method_choices = list(dfs)

# Controls for the two cut-offs applied to every method.
threshold_slider = FloatSlider(
    value=0.6,
    min=0.0,
    max=1.0,
    step=0.05,
    description="Threshold:",
    layout=_quarter_width(),
)
topk_slider = IntSlider(
    value=5,
    min=1,
    max=15,
    step=1,
    description="Top-K:",
    layout=_quarter_width(),
)

# Optional hybrid: both dropdowns start without a selection.
primary_dropdown = Dropdown(
    options=method_choices,
    value=None,
    description="Primary",
    layout=_quarter_width(),
)
fallback_dropdown = Dropdown(
    options=method_choices,
    value=None,
    description="Fallback",
    layout=_quarter_width(),
)

# Map each keyword argument of interactive_heatmap to the widget driving it.
controls = {
    'threshold': threshold_slider,
    'top_k': topk_slider,
    'primary_method': primary_dropdown,
    'fallback_method': fallback_dropdown,
}
out = interactive_output(interactive_heatmap, controls)

ui = VBox([
    _bottom_spacer(),
    threshold_slider,
    topk_slider,
    HTML("<div style='padding-top: 10px;'>Choose a hybrid method</div>"),
    primary_dropdown,
    fallback_dropdown,
    _bottom_spacer(),
])

display(ui, out)

VBox(children=(HTML(value="<div style='padding-bottom: 20px;'></div>"), FloatSlider(value=0.6, description='Th…

Output()