In [1]:
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import ipywidgets as widgets
from concurrent.futures import ThreadPoolExecutor

In [2]:
def get_df(url, name="method_df.npz", df_func=None, timeout=30):
    """Download an ``.npz`` archive, save it locally and build a DataFrame from it.

    Args:
        url: Address of the ``.npz`` file to download.
        name: Local path the downloaded bytes are written to (overwritten on
            every call).
        df_func: Callable that receives the opened archive and returns the
            data passed to ``pd.DataFrame``. When omitted, every array stored
            in the archive becomes one column (this assumes the arrays are
            one-dimensional and of equal length).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The DataFrame built from ``df_func(archive)``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on 4xx/5xx instead of writing an error page to disk and
    # getting an obscure np.load failure afterwards.
    response.raise_for_status()

    with open(name, "wb") as file:
        file.write(response.content)

    # SECURITY: allow_pickle=True unpickles object arrays, which can execute
    # arbitrary code. Only point this helper at archives from a trusted source.
    # The context manager closes the archive's file handle once the frame is built.
    with np.load(name, allow_pickle=True) as data:
        if df_func is None:
            columns = {key: list(data[key]) for key in data.files}
        else:
            columns = df_func(data)
        return pd.DataFrame(columns)

In [3]:
# Test users: "id" is kept as the stored array, every other field is turned
# into a plain Python list so each DataFrame cell holds one entry.
users_df = get_df(
    "https://raw.githubusercontent.com/abood1987/recommender/master/data/external_data/preprocessed_test_users_list.npz",
    df_func=lambda npz: {
        "id": npz["id"],
        **{
            column: list(npz[column])
            for column in (
                "skills",
                "esco",
                "score",
                "binary",
                "overlap",
                "tfidf",
                "fuzzy",
                "emb",
                "expanded",
                "binary_expanded",
                "overlap_expanded",
                "tfidf_expanded",
                "fuzzy_expanded",
                "emb_expanded",
            )
        },
    },
)

# Test jobs: "id" and "title" are kept as stored arrays, the rest become lists.
jobs_df = get_df(
    "https://raw.githubusercontent.com/abood1987/recommender/master/data/external_data/preprocessed_test_jobs_list.npz",
    df_func=lambda npz: {
        "id": npz["id"],
        "title": npz["title"],
        **{
            column: list(npz[column])
            for column in ("requirements", "esco", "score")
        },
    },
)

In [4]:
# Row counts of the loaded datasets: users on the first line, jobs on the second.
print(len(users_df), len(jobs_df), sep="\n")

210
60


In [5]:
def get_df(label, threshold, top_k, fuzzy_threshold=None, is_expanded=False):
    """Return a copy of ``users_df`` with the best matching jobs per user.

    NOTE: this definition reuses the name ``get_df`` of the download helper
    defined earlier in the notebook and replaces it once this cell has run,
    so the data-loading cell cannot be re-run afterwards without re-running
    the helper's cell first.

    Args:
        label: Name of the ``users_df`` column holding the scores; each cell
            is iterated as ``(title, score)`` pairs.
        threshold: Minimum score a job needs to count as a match.
        top_k: Maximum number of matches kept per user.
        fuzzy_threshold: Accepted but unused, so this function can be called
            with the same arguments as ``get_fuzzy_df``.
        is_expanded: When true, read the ``"<label>_expanded"`` column instead.

    Returns:
        A new DataFrame with an added ``"matched_jobs"`` column containing up
        to ``top_k`` job titles per user, highest score first.
    """
    column = f"{label}_expanded" if is_expanded else label

    def _top_matches(scores):
        # Keep the score next to the title until after sorting; dropping it
        # first would leave nothing to rank by and top_k would just keep the
        # first k matches in input order.
        kept = [(title, score) for title, score in scores if score >= threshold]
        # list.sort is stable, so equally scored jobs keep their input order.
        kept.sort(key=lambda pair: pair[1], reverse=True)
        return [title for title, _ in kept[:top_k]]

    df = users_df.copy()
    df["matched_jobs"] = df[column].apply(_top_matches)
    return df

def get_fuzzy_df(label, threshold, top_k, fuzzy_threshold=60, is_expanded=False):
    """Return a copy of ``users_df`` with jobs matched via fuzzy similarity.

    Each cell of the selected column is iterated as ``(title, sim_matrix)``
    pairs. A job matches when the fraction of matrix rows containing at least
    one value ``>= fuzzy_threshold`` reaches ``threshold``. An empty matrix
    yields a fraction of 0.

    Args:
        label: Base name of the ``users_df`` column to read.
        threshold: Minimum fraction of matched rows for a job to be kept.
        top_k: Maximum number of matches kept per user (in input order).
        fuzzy_threshold: Minimum similarity for a single matrix entry to count.
        is_expanded: When true, read the ``"<label>_expanded"`` column instead.

    Returns:
        A new DataFrame with an added ``"matched_jobs"`` column.
    """
    column = f"{label}_expanded" if is_expanded else label

    def _select_titles(pairs):
        kept = []
        for job_title, similarity in pairs:
            hits = similarity >= fuzzy_threshold
            row_count = len(similarity)
            # Share of rows with at least one sufficiently similar entry.
            coverage = hits.any(axis=1).sum() / row_count if row_count else 0
            if coverage >= threshold:
                kept.append(job_title)
        return kept[:top_k]

    result = users_df.copy()
    result["matched_jobs"] = result[column].apply(_select_titles)
    return result

In [6]:
def compute_metrics(name, df, unique_jobs, unique_users):
    """Summarise one method's matching result as a metrics record.

    Args:
        name: Label stored under ``"Method"``.
        df: Frame with a ``"matched_jobs"`` column holding one list per user.
        unique_jobs: Collection whose size is reported as
            ``"Unique Jobs per method"``.
        unique_users: Collection whose size is reported as
            ``"Unique Users per method"``.

    Returns:
        A dict with the method name, the number of users with at least one
        match, the total and mean number of matches, and the distinct-job and
        uniqueness counts.
    """
    matches_per_user = df["matched_jobs"].apply(len)

    distinct_jobs = set()
    for job_list in df["matched_jobs"]:
        distinct_jobs.update(job_list)

    summary = {"Method": name}
    summary["Matched Users"] = (matches_per_user > 0).sum()
    summary["Matched Jobs"] = matches_per_user.sum()
    summary["Avg Matches/User"] = matches_per_user.mean()
    summary["Unique Jobs"] = len(distinct_jobs)
    summary["Unique Jobs per method"] = len(unique_jobs)
    summary["Unique Users per method"] = len(unique_users)
    return summary

In [7]:
# Display name -> (score column prefix in users_df, function building the matched-jobs frame).
methods = {
    display_name: (score_column, matcher)
    for display_name, score_column, matcher in (
        ("Binary vector", "binary", get_df),
        ("Overlap", "overlap", get_df),
        ("TF-IDF", "tfidf", get_df),
        ("Fuzzy", "fuzzy", get_fuzzy_df),
        ("Embedding", "emb", get_df),
    )
}

# One "Hide" and one "Broader" checkbox per method, both initially unchecked.
method_ui = {}
for method_name in methods:
    method_ui[method_name] = {
        "hide": widgets.Checkbox(value=False, description="Hide"),
        "broader": widgets.Checkbox(value=False, description="Broader"),
    }

# One row per method: its label followed by the two checkboxes.
method_controls = widgets.VBox([
    widgets.HBox([
        widgets.Label(title, layout={'width': '30%'}),
        *checkboxes.values(),
    ])
    for title, checkboxes in method_ui.items()
])

In [8]:
def get_active_methods():
    """Translate the checkbox state into the set of method variants to run.

    For every entry of ``methods`` the base variant is included unless its
    "hide" checkbox is ticked, and an additional ``"<name> + Broader"``
    variant is included when its "broader" checkbox is ticked (independently
    of "hide").

    Returns:
        Dict mapping variant name to ``(label, func, is_expanded)``.
    """
    selected = {}
    for display_name, (column, matcher) in methods.items():
        toggles = method_ui[display_name]
        is_hidden = toggles["hide"].value
        wants_broader = toggles["broader"].value

        if not is_hidden:
            selected[display_name] = (column, matcher, False)
        if wants_broader:
            selected[f"{display_name} + Broader"] = (column, matcher, True)
    return selected

In [9]:
def plot_distribution_per_user(dfs):
    """Draw a box plot of the number of matched jobs per user for each method.

    Args:
        dfs: Mapping of method name to a frame with a ``"matched_jobs"``
            column holding one list per user.

    Returns:
        The ``widgets.Output`` the figure was rendered into.
    """
    panel = widgets.Output()
    with panel:
        # Long format: one record per (method, user) with that user's match count.
        records = [
            {"Method": method_name, "Jobs Per User": len(job_list)}
            for method_name, frame in dfs.items()
            for job_list in frame["matched_jobs"]
        ]
        counts = pd.DataFrame(records)

        plt.figure(figsize=(6, 6))
        sns.boxplot(data=counts, x="Method", y="Jobs Per User")
        plt.title("Distribution of Jobs per User (per Method)")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
    return panel

In [10]:
def plot_method_comparison(metrics_df):
    """Draw a bar chart of the "Matched Users" count of every method.

    Args:
        metrics_df: Frame with at least the columns ``"Method"`` and
            ``"Matched Users"``.

    Returns:
        The ``widgets.Output`` the figure was rendered into.
    """
    panel = widgets.Output()
    with panel:
        matched_users = metrics_df[["Method", "Matched Users"]]
        matched_users.plot(
            kind="bar",
            x="Method",
            title="Method Comparison",
            figsize=(6, 6),
        )
        plt.xticks(rotation=45)
        plt.ylabel("Count")
        plt.tight_layout()
        plt.show()
    return panel

In [11]:
def jaccard_for_users(df1, df2):
    """Mean per-user Jaccard similarity between two methods' matched jobs.

    The ``"matched_jobs"`` entries of both frames are paired positionally.
    A pair in which both users have no matches counts as a similarity of 1.0.

    Args:
        df1: First frame with a ``"matched_jobs"`` column.
        df2: Second frame with a ``"matched_jobs"`` column.

    Returns:
        The mean of the per-pair similarities, as computed by ``np.mean``.
    """
    def _pair_similarity(jobs_left, jobs_right):
        left, right = set(jobs_left), set(jobs_right)
        combined = left | right
        # An empty union means neither side matched anything.
        return len(left & right) / len(combined) if combined else 1.0

    similarities = [
        _pair_similarity(jobs_left, jobs_right)
        for jobs_left, jobs_right in zip(df1["matched_jobs"], df2["matched_jobs"])
    ]
    return np.mean(similarities)

# def plot_jaccard_heatmap(processed_dfs):
#     out = widgets.Output()
#     with out:
#       names = list(processed_dfs.keys())
#       jaccard_matrix = np.zeros((len(names), len(names)))

#       for i in range(len(names)):
#           for j in range(len(names)):
#               jaccard_matrix[i][j] = jaccard_for_users(processed_dfs[names[i]], processed_dfs[names[j]])

#       plt.figure(figsize=(9, 6))
#       sns.heatmap(jaccard_matrix, xticklabels=names, yticklabels=names, annot=True, cmap="YlGnBu")
#       plt.title("Jaccard Similarity Between Methods")
#       plt.tight_layout()
#       plt.show()
#     return out
def plot_jaccard_heatmap(processed_dfs):
    """Draw a heatmap of the pairwise Jaccard similarity between methods.

    Args:
        processed_dfs: Mapping of method name to its frame; every pair of
            frames is compared with ``jaccard_for_users``.

    Returns:
        The ``widgets.Output`` the figure was rendered into.
    """
    panel = widgets.Output()
    with panel:
        names = list(processed_dfs)
        size = len(names)
        jaccard_matrix = np.zeros((size, size))

        for row, row_name in enumerate(names):
            for col, col_name in enumerate(names):
                jaccard_matrix[row, col] = jaccard_for_users(
                    processed_dfs[row_name], processed_dfs[col_name]
                )

        # Hide the cells strictly below the diagonal; the diagonal and the
        # upper triangle stay visible.
        hidden_cells = np.tril(np.ones_like(jaccard_matrix, dtype=bool), k=-1)

        plt.figure(figsize=(9, 6))
        sns.heatmap(
            jaccard_matrix,
            mask=hidden_cells,
            cmap="YlGnBu",
            annot=True,
            xticklabels=names,
            yticklabels=names,
        )
        plt.title("Jaccard Similarity Between Methods")
        plt.tight_layout()
        plt.show()
    return panel

In [12]:
# Dropdowns choosing the two methods compared in the Venn diagrams; their
# options are assigned inside interactive_analysis.
method_selector_1, method_selector_2 = (
    widgets.Dropdown(description=caption)
    for caption in ("Method A:", "Method B:")
)

def plot_venn_overlap(methods_data, method_a, method_b, is_jobs_overlap=False):
    """Draw a two-set Venn diagram comparing the results of two methods.

    Args:
        methods_data: Mapping of method name to a frame with ``"id"`` and
            ``"matched_jobs"`` columns.
        method_a: Name of the first method.
        method_b: Name of the second method.
        is_jobs_overlap: When true, compare the sets of matched jobs;
            otherwise compare the ids of users having at least one match.

    Returns:
        The ``widgets.Output`` holding the diagram. If either method is not
        present in ``methods_data`` the returned output is empty.
    """
    out = widgets.Output()
    df_a = methods_data.get(method_a)
    df_b = methods_data.get(method_b)
    if df_a is None or df_b is None:
        # Always hand back a widget: the result is placed in a widget layout
        # by the caller, where None is not a valid child.
        return out

    def _compared_items(df):
        if is_jobs_overlap:
            return set(job for job_list in df["matched_jobs"] for job in job_list)
        return set(df[df["matched_jobs"].apply(len) > 0]["id"].to_list())

    data_a = _compared_items(df_a)
    data_b = _compared_items(df_b)

    with out:
        plt.figure(figsize=(6, 6))
        venn2([data_a, data_b], set_labels=(method_a, method_b))
        plt.title(f"{'Unique jobs' if is_jobs_overlap else 'Users'} overlap: {method_a} vs {method_b}")
        plt.show()
    return out

In [13]:
def compute_matched_users_jobs(df):
    """Collect the matched users and matched jobs of one method.

    Args:
        df: Frame with ``"id"`` and ``"matched_jobs"`` columns.

    Returns:
        A ``(users, jobs)`` tuple: the set of ids whose ``"matched_jobs"``
        entry is non-empty, and the set of all jobs appearing in any entry.
    """
    has_match = df["matched_jobs"].apply(len) > 0
    matched_user_ids = set(df.loc[has_match, "id"])

    matched_jobs = set()
    for job_list in df["matched_jobs"]:
        matched_jobs.update(job_list)

    return matched_user_ids, matched_jobs

def interactive_analysis(threshold=0.5, top_k=50, fuzzy_threshold=60):
    """Run every active matching method and build the comparison artefacts.

    Args:
        threshold: Minimum score passed to every matching function.
        top_k: Maximum number of matches per user passed to every function.
        fuzzy_threshold: Per-entry similarity cut-off passed to every function.

    Returns:
        A 6-tuple ``(metrics_df, comparison_plot, distribution_plot,
        heatmap_plot, users_overlap_plot, jobs_overlap_plot)``.

    Raises:
        ValueError: If no method variant is active.
    """
    active_methods = get_active_methods()
    if not active_methods:
        # Without this guard the function fails later with a bare IndexError
        # when picking the default selection.
        raise ValueError(
            "No active methods: un-hide at least one method or enable a 'Broader' variant."
        )

    processed_dfs = {}
    all_users = {}
    all_jobs = {}

    with ThreadPoolExecutor(max_workers=6) as executor:
        futures = {
            name: executor.submit(func, label, threshold, top_k, fuzzy_threshold, is_expanded)
            for name, (label, func, is_expanded) in active_methods.items()
        }

        for name, future in futures.items():
            df = future.result()
            processed_dfs[name] = df

            users, jobs = compute_matched_users_jobs(df)
            all_users[name] = users
            all_jobs[name] = jobs

    metrics = []
    for name, df in processed_dfs.items():
        # Users/jobs found by any other method; subtracting them leaves what
        # only this method found.
        other_users = set().union(*(v for k, v in all_users.items() if k != name))
        other_jobs = set().union(*(v for k, v in all_jobs.items() if k != name))
        metrics.append(
            compute_metrics(
                name,
                df,
                all_jobs[name] - other_jobs,
                all_users[name] - other_users,
            )
        )

    metrics_df = pd.DataFrame(metrics)

    method_names = list(processed_dfs)

    # Remember the current choices before touching `options`: assigning new
    # options can reset a selection widget's value, which would otherwise
    # discard what the user picked.
    previous_a = method_selector_1.value
    previous_b = method_selector_2.value
    method_selector_1.options = method_names
    method_selector_2.options = method_names

    # Restore a still-valid choice; otherwise default to the first method for
    # A and the second for B (falling back to the first when only one exists).
    default_b = method_names[min(1, len(method_names) - 1)]
    method_selector_1.value = previous_a if previous_a in method_names else method_names[0]
    method_selector_2.value = previous_b if previous_b in method_names else default_b

    comparison_plot = plot_method_comparison(metrics_df)
    distribution_plot = plot_distribution_per_user(processed_dfs)
    heatmap_plot = plot_jaccard_heatmap(processed_dfs)
    users_overlap_plot = plot_venn_overlap(processed_dfs, method_selector_1.value, method_selector_2.value)
    jobs_overlap_plot = plot_venn_overlap(processed_dfs, method_selector_1.value, method_selector_2.value, is_jobs_overlap=True)

    return metrics_df, comparison_plot, distribution_plot, heatmap_plot, users_overlap_plot, jobs_overlap_plot

In [14]:
# Minimum score a job needs to count as a match.
threshold_slider = widgets.FloatSlider(
    min=0.0,
    max=1.0,
    step=0.05,
    value=0.5,
    description="Threshold:",
    layout=dict(width="500px"),
)

# Maximum number of matched jobs kept per user.
topk_slider = widgets.IntSlider(
    min=1,
    max=50,
    step=1,
    value=50,
    description="Top-K:",
    layout=dict(width="500px"),
)

# Per-entry similarity cut-off forwarded as `fuzzy_threshold`.
fuzzy_thresh_slider = widgets.IntSlider(
    min=0,
    max=100,
    step=5,
    value=60,
    description="Fuzzy Score:",
    layout=dict(width="500px"),
)

# Button that triggers a recomputation, and the area its results are shown in.
refresh_button = widgets.Button(description="Compute", button_style="success")
refresh_output = widgets.Output()
def on_refresh_clicked(b):
    """Recompute the analysis with the current slider values and show it.

    Clears ``refresh_output``, runs ``interactive_analysis`` and displays the
    metrics table followed by two rows of plots inside ``refresh_output``.

    Args:
        b: The clicked button (unused).
    """
    def _vertical_gap():
        return widgets.HTML("<div style='height: 10px'></div>")

    def _horizontal_gap():
        return widgets.HTML("<div style='width: 2px'></div>")

    with refresh_output:
        refresh_output.clear_output()
        results = interactive_analysis(
            threshold=threshold_slider.value,
            top_k=topk_slider.value,
            fuzzy_threshold=fuzzy_thresh_slider.value,
        )
        (
            metrics_df,
            comparison_plot,
            distribution_plot,
            heatmap_plot,
            users_overlap_plot,
            jobs_overlap_plot,
        ) = results

        # Summary table first, then the plots.
        display(widgets.HTML("<h4>Method Comparison (Interactive)</h4>"))
        display(metrics_df)

        method_row = widgets.HBox([
            comparison_plot,
            _horizontal_gap(),
            distribution_plot,
            _horizontal_gap(),
            heatmap_plot,
        ])
        overlap_row = widgets.HBox([
            users_overlap_plot,
            _horizontal_gap(),
            jobs_overlap_plot,
        ])
        display(
            widgets.VBox([
                _vertical_gap(),
                method_row,
                _vertical_gap(),
                overlap_row,
            ])
        )

# Recompute and redraw whenever the "Compute" button is pressed.
refresh_button.on_click(on_refresh_clicked)

# Framed box around the per-method Hide/Broader checkboxes.
styled_method_box = widgets.Box(
    [method_controls],
    layout=dict(
        border="1px solid black",
        padding="10px",
        width="500px",
    ),
)

# Full control panel, top to bottom: method toggles, sliders, method pickers
# for the overlap diagrams, the trigger button and the result area.
ui = widgets.VBox([
    styled_method_box,
    threshold_slider,
    topk_slider,
    fuzzy_thresh_slider,
    widgets.HTML("<h4>User Overlap Between Methods</h4>"),
    widgets.HBox([method_selector_1, method_selector_2]),
    refresh_button,
    refresh_output,
])

display(ui)


VBox(children=(Box(children=(VBox(children=(HBox(children=(Label(value='Binary vector', layout=Layout(width='3…