In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter
from itertools import chain
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from mlxtend.frequent_patterns import apriori, association_rules
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
rules_sorted= pd.read_csv('../data/association_rules.csv')

In [3]:
# @title mapping
def plot_skill_rule_graph(user_skills,
                          rules_df,
                          top_n=12,
                          min_confidence=0.5,
                          min_lift=1.0,
                          figsize=(8, 6)):
    """
    Draw a small directed graph of association rules that involve the user's skills.
    - Nodes  = skills
    - Edges  = rules A -> B
    - Edge width ~ confidence
    - User skills are colored differently
    """
    user_set = {s.strip().lower() for s in user_skills}

    rules = rules_df.copy()

    # ensure lengths exist
    if "antecedent_len" not in rules.columns:
        rules["antecedent_len"] = rules["antecedents"].apply(len)
    if "consequent_len" not in rules.columns:
        rules["consequent_len"] = rules["consequents"].apply(len)

    # keep simple 1-skill -> 1-skill rules
    rules = rules[
        (rules["antecedent_len"] == 1) &
        (rules["consequent_len"] == 1)
    ]

    # threshold by confidence / lift
    rules = rules[
        (rules["confidence"] >= min_confidence) &
        (rules["lift"] >= min_lift)
    ]

    # unpack frozensets into strings
    rules = rules.assign(
        ant = rules["antecedents"].apply(lambda s: list(s)[0]),
        con = rules["consequents"].apply(lambda s: list(s)[0]),
    )

    # keep rules where user skills appear on either side
    mask = (
        rules["ant"].str.lower().isin(user_set) |
        rules["con"].str.lower().isin(user_set)
    )
    rules_sub = (
        rules[mask]
        .sort_values(["lift", "confidence"], ascending=False)
        .head(top_n)
    )

    if rules_sub.empty:
        print("No rules involving these skills with current thresholds.")
        return

    # build graph
    G = nx.DiGraph()
    for _, row in rules_sub.iterrows():
        G.add_edge(row["ant"], row["con"],
                   confidence=row["confidence"],
                   lift=row["lift"])

    # color user skills differently
    node_colors = [
        "tab:red" if n.lower() in user_set else "tab:blue"
        for n in G.nodes()
    ]

    # edge width based on confidence
    edges = list(G.edges())
    widths = [1.0 + 3.0 * G[u][v]["confidence"] for u, v in edges]

    pos = nx.spring_layout(G, k=0.7, seed=0)

    plt.figure(figsize=figsize)
    nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=900)
    nx.draw_networkx_edges(G, pos, arrows=True, width=widths, alpha=0.7)
    nx.draw_networkx_labels(G, pos, font_size=8)

    plt.title("Skill rule graph around: " + ", ".join(user_skills))
    plt.axis("off")
    plt.show()

    # return the rules used in the graph for inspection if you want
    return rules_sub[["ant", "con", "support", "confidence", "lift"]]


In [4]:
plot_skill_rule_graph(
    ["cybersecurity", "network security", "linux"],
    rules_sorted,
    top_n=15,
    min_confidence=0.5,
    min_lift=1.0
)

plot_skill_rule_graph(
    ["python", "sql", "data analysis"],
    rules_sorted
)

plot_skill_rule_graph(
    ["cybersecurity"],
    rules_sorted
)

plot_skill_rule_graph(
    ["robotics"],
    rules_sorted
)

No rules involving these skills with current thresholds.
No rules involving these skills with current thresholds.
No rules involving these skills with current thresholds.
No rules involving these skills with current thresholds.


Bar-chart of “next skills” based only on rules


In [5]:
def plot_skill_rule_bars(user_skills,
                         rules_df,
                         top_n=10,
                         min_confidence=0.5,
                         min_lift=1.0,
                         figsize=(8, 4)):
    """
    Bar chart of top associated skills for the given user skills.
    Uses 1-skill -> 1-skill rules where the user's skills appear
    either in antecedent OR consequent, and suggests the *other* skill.
    Excludes skills the user already has.
    """
    import matplotlib.pyplot as plt
    import numpy as np

    user_set = {s.strip().lower() for s in user_skills}

    rules = rules_df.copy()

    # ensure lengths
    if "antecedent_len" not in rules.columns:
        rules["antecedent_len"] = rules["antecedents"].apply(len)
    if "consequent_len" not in rules.columns:
        rules["consequent_len"] = rules["consequents"].apply(len)

    # 1-skill -> 1-skill rules + thresholds
    rules = rules[
        (rules["antecedent_len"] == 1) &
        (rules["consequent_len"] == 1) &
        (rules["confidence"] >= min_confidence) &
        (rules["lift"] >= min_lift)
    ].copy()

    # unpack
    rules["ant"] = rules["antecedents"].apply(lambda s: list(s)[0])
    rules["con"] = rules["consequents"].apply(lambda s: list(s)[0])

    # keep rules where user skills appear on either side
    mask_any = (
        rules["ant"].str.lower().isin(user_set) |
        rules["con"].str.lower().isin(user_set)
    )
    rules_sub = rules[mask_any].copy()

    if rules_sub.empty:
        print("No matching rules for these skills; try lowering thresholds.")
        return

    # figure out what to suggest for each rule:
    # if user skill is ant -> suggest con, else suggest ant
    def get_target(row):
        if row["ant"].lower() in user_set:
            return row["con"]
        else:
            return row["ant"]

    rules_sub["suggested"] = rules_sub.apply(get_target, axis=1)

    # drop suggestions the user already has
    rules_sub = rules_sub[~rules_sub["suggested"].str.lower().isin(user_set)]

    if rules_sub.empty:
        print("No *additional* skills found from rules (all suggestions already in user set).")
        return

    # score = confidence × lift
    rules_sub["score"] = rules_sub["confidence"] * rules_sub["lift"]

    # keep best rule per suggested skill
    recs = (
        rules_sub
        .sort_values(["score", "support"], ascending=False)
        .drop_duplicates(subset=["suggested"])
        .head(top_n)
    )

    # plot
    plt.figure(figsize=figsize)
    plt.barh(recs["suggested"], recs["score"])
    plt.gca().invert_yaxis()
    plt.xlabel("score (confidence × lift)")
    plt.title("Top associated skills for: " + ", ".join(user_skills))
    plt.show()

    # return details if you want to inspect
    return recs[["ant", "con", "suggested", "support", "confidence", "lift", "score"]]


In [6]:
plot_skill_rule_bars(
    ["python", "sql", "data analysis"],
    rules_sorted,
    top_n=8,
    min_confidence=0.5,
    min_lift=1.0
)

#plot_skill_rule_bars(
  #  ["html", "css", "javascript"],
  #  rules_sorted,
   # top_n=8
#)


No matching rules for these skills; try lowering thresholds.
