In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter
from itertools import chain
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from mlxtend.frequent_patterns import apriori, association_rules
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
df_ap= pd.read_csv('../data/skills_only_data.csv')

In [3]:
# @title One-time precomputation of co-occurrence

#df: job_title + binary skill columns
skill_names = df_ap.columns.to_list()

# map lowercase name -> actual column name (for user input)
skill_name_map = {s.lower(): s for s in skill_names}

# matrix for co-occurrence fallback
X = df_ap.values.astype(np.uint8)

# co-occurrence counts for all skill pairs
co_counts = X.T @ X              # shape (n_skills, n_skills)

# per-skill counts and supports
skill_counts = co_counts.diagonal()
n_jobs = len(df_ap)
skill_support = skill_counts / n_jobs

# index for quick lookup
skill_index = {s: i for i, s in enumerate(skill_names)}

print("Precomputed co-occurrence matrix for", len(skill_names), "skills.")


Precomputed co-occurrence matrix for 2347 skills.


In [4]:
n_jobs = len(df_ap)

print("Apriori input shape:", df_ap.shape)

# =====================================================
# 1. EXPLORE DIFFERENT SUPPORT LEVELS (for your report)
# =====================================================
support_values = [0.02, 0.015, 0.01, 0.0075, 0.005]  # 2%, 1.5%, 1%, 0.75%, 0.5%

for ms in support_values:
    fi = apriori(
        df_ap,
        min_support=ms,
        use_colnames=True,
        max_len=2,        # singletons + pairs
        verbose=0
    )
    fi["itemset_len"] = fi["itemsets"].apply(len)

    rules_tmp = association_rules(fi, metric="confidence", min_threshold=0.4)

    print(f"\n=== min_support = {ms:.4f} (~{int(ms*n_jobs)} jobs) ===")
    print("  1-itemsets:", (fi["itemset_len"] == 1).sum())
    print("  2-itemsets:", (fi["itemset_len"] == 2).sum())
    print("  rules (conf >= 0.4):", len(rules_tmp))

# =====================================================
# 2. CHOOSE A FINAL CONFIG (more generous than before)
# =====================================================

# Example: require at least 50 jobs (~1% support)
min_support_count = 50
min_support = min_support_count / n_jobs

print(f"\nUsing FINAL min_support = {min_support:.4f} (~{min_support_count} jobs)")

frequent_itemsets = apriori(
    df_ap,
    min_support=min_support,
    use_colnames=True,
    max_len=2,        # pairs only for now
    verbose=1
)

frequent_itemsets["itemset_len"] = frequent_itemsets["itemsets"].apply(len)

print("Itemsets by length:")
print(frequent_itemsets["itemset_len"].value_counts())
display(
    frequent_itemsets
        .sort_values("support", ascending=False)
        .head(20)
)

pairs = frequent_itemsets[frequent_itemsets["itemset_len"] == 2]
print("Number of frequent pairs:", len(pairs))

# ---------- association rules ----------
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.4   # a bit looser here
)

# add lengths
rules["antecedent_len"] = rules["antecedents"].apply(len)
rules["consequent_len"] = rules["consequents"].apply(len)

# keep A -> B rules (antecedent ≥1 skill, consequent = 1 skill)
rules = rules[
    (rules["antecedent_len"] >= 1) &
    (rules["consequent_len"] == 1)
]

# OPTIONAL: tighten again by confidence / lift
rules = rules[
    (rules["confidence"] >= 0.5) &    # final filter
    (rules["lift"] >= 1.05)
]

rules_sorted = rules.sort_values(["lift", "confidence"], ascending=False)

print("FINAL number of rules:", len(rules_sorted))
display(rules_sorted.head(20))


Apriori input shape: (2112, 2347)





=== min_support = 0.0200 (~42 jobs) ===
  1-itemsets: 191
  2-itemsets: 470
  rules (conf >= 0.4): 223





=== min_support = 0.0150 (~31 jobs) ===
  1-itemsets: 254
  2-itemsets: 845
  rules (conf >= 0.4): 339





=== min_support = 0.0100 (~21 jobs) ===
  1-itemsets: 363
  2-itemsets: 1563
  rules (conf >= 0.4): 572





=== min_support = 0.0075 (~15 jobs) ===
  1-itemsets: 467
  2-itemsets: 2702
  rules (conf >= 0.4): 913





=== min_support = 0.0050 (~10 jobs) ===
  1-itemsets: 651
  2-itemsets: 4834
  rules (conf >= 0.4): 1426

Using FINAL min_support = 0.0237 (~50 jobs)
Processing 25760 combinations | Sampling itemset size 2
Itemsets by length:
itemset_len
2    350
1    161
Name: count, dtype: int64




Unnamed: 0,support,itemsets,itemset_len
31,0.424716,(communication),1
40,0.290246,(data analysis),1
109,0.265152,(problem solving),1
138,0.237689,(sql),1
114,0.235795,(python),1
301,0.19839,"(communication, problem solving)",2
113,0.193182,(project management),1
134,0.178977,(software development),1
22,0.178977,(business unit),1
142,0.178504,(systems engineering),1


Number of frequent pairs: 350
FINAL number of rules: 109


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski,antecedent_len,consequent_len
134,(docker),(kubernetes),0.048769,0.061553,0.033617,0.68932,11.198805,1.0,0.030616,3.020626,0.957396,0.438272,0.668943,0.617737,1,1
135,(kubernetes),(docker),0.061553,0.048769,0.033617,0.546154,11.198805,1.0,0.030616,2.095933,0.970438,0.438272,0.522885,0.617737,1,1
80,(css),(javascript),0.026042,0.090436,0.023674,0.909091,10.052356,1.0,0.021319,10.005208,0.924599,0.255102,0.900052,0.585436,1,1
6,(ai),(machine learning),0.048769,0.100852,0.043087,0.883495,8.76029,1.0,0.038169,7.717685,0.931265,0.404444,0.870427,0.655363,1,1
120,(r),(data science),0.057292,0.069129,0.029356,0.512397,7.412204,1.0,0.025396,1.909075,0.917662,0.302439,0.476186,0.468527,1,1
36,(cisco),(network engineering),0.036458,0.098485,0.026042,0.714286,7.252747,1.0,0.022451,3.155303,0.894742,0.23913,0.683073,0.489354,1,1
83,(vulnerability management),(cybersecurity),0.033144,0.109375,0.026042,0.785714,7.183673,1.0,0.022417,4.15625,0.890304,0.223577,0.759398,0.511905,1,1
174,(r),(tableau),0.057292,0.076705,0.030777,0.53719,7.003367,1.0,0.026382,1.994978,0.909307,0.298165,0.498741,0.469212,1,1
118,(data science),(machine learning),0.069129,0.100852,0.042614,0.616438,6.11229,1.0,0.035642,2.344207,0.898508,0.334572,0.573416,0.519487,1,1
107,(data mining),(data visualization),0.055871,0.118371,0.038826,0.694915,5.870644,1.0,0.032212,2.889783,0.878758,0.286713,0.653953,0.511458,1,1


## Recommendation

Even if association rules don't match completely, recommendation can be given based on co-occurrence

In [5]:
# @title function for rules & correlations
def map_user_skills_to_columns(user_skills):
    """Map user input skills (strings) to actual df_ap columns."""
    mapped = []
    unknown = []
    for s in user_skills:
        key = s.strip().lower()
        if key in skill_name_map:
            mapped.append(skill_name_map[key])
        else:
            unknown.append(s)
    return mapped, unknown


def recommend_skills_for_user(
    user_skills,
    rules_df,
    top_n=10,
    min_confidence=0.4,
    min_lift=1.0,
):
    """
    Always returns some recommended skills, even if association rules don't match.

    Output columns:
    ['suggested_skill', 'score', 'source', 'support', 'confidence', 'lift']
    """

    # --- normalize & map user skills ---
    mapped_user_skills, unknown = map_user_skills_to_columns(user_skills)
    user_set = set(mapped_user_skills)

    print("User skills mapped to columns:", mapped_user_skills)
    if unknown:
        print("Unknown skills (not in data):", unknown)


    # 1) RULE-BASED RECOMMENDATIONS (FORWARD + REVERSE)

    if rules_df is not None and not rules_df.empty and user_set:
        rules_filt = rules_df.copy()

        # basic thresholds
        rules_filt = rules_filt[
            (rules_filt["confidence"] >= min_confidence)
            & (rules_filt["lift"] >= min_lift)
        ].copy()

        def has_overlap(skill_set):
            return len(set(skill_set) & user_set) > 0

        # forward: user has antecedent(s), suggest consequent
        mask_fwd = rules_filt["antecedents"].apply(has_overlap)
        # reverse: user has consequent(s), suggest antecedent
        mask_rev = rules_filt["consequents"].apply(has_overlap)

        rules_filt["direction"] = np.where(mask_fwd, "forward",
                                   np.where(mask_rev, "reverse", "none"))
        rules_filt = rules_filt[rules_filt["direction"] != "none"]

        if not rules_filt.empty:
            # choose what to suggest based on direction
            def get_suggested(row):
                if row["direction"] == "forward":
                    return list(row["consequents"])[0]
                else:
                    return list(row["antecedents"])[0]

            rules_filt["suggested_skill"] = rules_filt.apply(get_suggested, axis=1)

            # drop skills user already has
            rules_filt = rules_filt[~rules_filt["suggested_skill"].isin(user_set)]

            # score = confidence * lift
            rules_filt["score"] = rules_filt["confidence"] * rules_filt["lift"]
            rules_filt["source"] = "rules"

            rules_recs = (
                rules_filt
                .sort_values(["score", "support"], ascending=False)
                .drop_duplicates(subset=["suggested_skill"])
                [["suggested_skill", "score", "source", "support", "confidence", "lift"]]
                .head(top_n)
            )

            if not rules_recs.empty:
                return rules_recs


    # 2) CO-OCCURRENCE FALLBACK (P(candidate | user_skill))

    if user_set:
        candidate_skills = [s for s in skill_names if s not in user_set]
        cand_scores = {}

        for cand in candidate_skills:
            j = skill_index[cand]
            scores = []
            for u in user_set:
                i = skill_index[u]
                if skill_counts[i] > 0:
                    # P(cand | u) = count(u & cand) / count(u)
                    scores.append(co_counts[i, j] / skill_counts[i])
            if scores:
                cand_scores[cand] = float(np.mean(scores))

        if cand_scores:
            cooc_recs = (
                pd.DataFrame(
                    [(skill, score) for skill, score in cand_scores.items()],
                    columns=["suggested_skill", "score"],
                )
                .sort_values("score", ascending=False)
                .head(top_n)
            )

            cooc_recs["source"] = "cooccurrence"
            cooc_recs["support"] = cooc_recs["suggested_skill"].map(
                dict(zip(skill_names, skill_support))
            )
            cooc_recs["confidence"] = np.nan
            cooc_recs["lift"] = np.nan

            return cooc_recs


    # 3) GLOBAL FREQUENCY FALLBACK (if user skills don't map)

    global_recs = (
        pd.DataFrame(
            {"suggested_skill": skill_names, "score": skill_support}
        )
        .sort_values("score", ascending=False)
        .head(top_n)
    )

    global_recs["source"] = "global_frequency"
    global_recs["support"] = global_recs["score"]
    global_recs["confidence"] = np.nan
    global_recs["lift"] = np.nan

    return global_recs


In [6]:
# @title test cases

test_cases = [
    ("Unknown",
      ["tableau"]),

    ("Unknown",
     ["python", "sql", "data analysis"]),

    ("Classic data-analysis stack",
     ["python", "sql", "data analysis"]),

    ("Junior data analyst (Excel-heavy)",
     ["excel", "sql", "data visualization"]),

    ("BI / dashboard person",
     ["tableau", "power bi", "data visualization"]),

    ("Web dev – front end",
     ["html", "css", "javascript"]),

    ("Web dev – full stack JS",
     ["javascript", "node", "react", "sql"]),

    ("DevOps / cloud",
     ["linux", "docker", "kubernetes", "aws"]),

    ("Backend Java engineer",
     ["java", "spring", "sql"]),

    ("Cybersecurity / infosec",
     ["cybersecurity", "network security", "linux"]),

    ("QA / testing",
     ["software testing", "selenium", "automation"]),

    ("Totally random + one real skill",
     ["knitting", "underwater basket weaving", "python"]),
]


In [7]:
for label, user_skills in test_cases:
    print("=" * 80)
    print(label)
    print("User skills:", user_skills)

    recs = recommend_skills_for_user(
        user_skills=user_skills,
        rules_df=rules_sorted,
        top_n=8,
        min_confidence=0.5,
        min_lift=1.05,
    )

    display(recs)


Unknown
User skills: ['tableau']
User skills mapped to columns: ['tableau']


Unnamed: 0,suggested_skill,score,source,support,confidence,lift
174,r,3.762139,rules,0.030777,0.53719,7.003367
128,data visualization,3.481694,rules,0.049242,0.641975,5.423407
177,sql,2.545079,rules,0.059659,0.777778,3.272244
103,data analysis,2.322238,rules,0.062973,0.820988,2.82859
32,business unit,1.801971,rules,0.043561,0.567901,3.173035
171,python,1.397656,rules,0.044034,0.574074,2.434627


Unknown
User skills: ['python', 'sql', 'data analysis']
User skills mapped to columns: ['python', 'sql', 'data analysis']


Unnamed: 0,suggested_skill,score,source,support,confidence,lift
166,r,3.764467,rules,0.053977,0.942149,3.995619
167,spark,3.045794,rules,0.023674,0.847458,3.594037
177,tableau,2.545079,rules,0.059659,0.777778,3.272244
89,data visualization,2.454247,rules,0.099905,0.844,2.907876
112,data mining,2.447435,rules,0.042614,0.762712,3.208859
119,data science,2.363806,rules,0.05161,0.746575,3.166199
130,data warehouse,2.347564,rules,0.029356,0.746988,3.142706
96,power bi,2.084225,rules,0.059659,0.777778,2.679717


Classic data-analysis stack
User skills: ['python', 'sql', 'data analysis']
User skills mapped to columns: ['python', 'sql', 'data analysis']


Unnamed: 0,suggested_skill,score,source,support,confidence,lift
166,r,3.764467,rules,0.053977,0.942149,3.995619
167,spark,3.045794,rules,0.023674,0.847458,3.594037
177,tableau,2.545079,rules,0.059659,0.777778,3.272244
89,data visualization,2.454247,rules,0.099905,0.844,2.907876
112,data mining,2.447435,rules,0.042614,0.762712,3.208859
119,data science,2.363806,rules,0.05161,0.746575,3.166199
130,data warehouse,2.347564,rules,0.029356,0.746988,3.142706
96,power bi,2.084225,rules,0.059659,0.777778,2.679717


Junior data analyst (Excel-heavy)
User skills: ['excel', 'sql', 'data visualization']
User skills mapped to columns: ['excel', 'sql', 'data visualization']


Unnamed: 0,suggested_skill,score,source,support,confidence,lift
107,data mining,4.0796,rules,0.038826,0.694915,5.870644
128,tableau,3.481694,rules,0.049242,0.641975,5.423407
125,r,3.074885,rules,0.034564,0.603306,5.096727
123,power bi,2.665672,rules,0.043087,0.561728,4.745481
89,data analysis,2.454247,rules,0.099905,0.844,2.907876
130,data warehouse,2.347564,rules,0.029356,0.746988,3.142706
127,statistics,2.153209,rules,0.024621,0.504854,4.26501
115,data modeling,1.624698,rules,0.041193,0.621429,2.614456


BI / dashboard person
User skills: ['tableau', 'power bi', 'data visualization']
User skills mapped to columns: ['tableau', 'power bi', 'data visualization']


Unnamed: 0,suggested_skill,score,source,support,confidence,lift
107,data mining,4.0796,rules,0.038826,0.694915,5.870644
174,r,3.762139,rules,0.030777,0.53719,7.003367
177,sql,2.545079,rules,0.059659,0.777778,3.272244
89,data analysis,2.454247,rules,0.099905,0.844,2.907876
127,statistics,2.153209,rules,0.024621,0.504854,4.26501
32,business unit,1.801971,rules,0.043561,0.567901,3.173035
171,python,1.397656,rules,0.044034,0.574074,2.434627
58,communication,0.633038,rules,0.039773,0.518519,1.22086


Web dev – front end
User skills: ['html', 'css', 'javascript']
User skills mapped to columns: ['html', 'css', 'javascript']


Unnamed: 0,suggested_skill,score,source,support,confidence,lift
1909,software development,0.423797,cooccurrence,0.057765,,
1967,sql,0.408993,cooccurrence,0.116477,,
1031,java,0.388937,cooccurrence,0.116004,,
1582,python,0.368786,cooccurrence,0.114583,,
290,communication,0.310952,cooccurrence,0.06108,,
0,.net,0.30643,cooccurrence,0.113636,,
207,c#,0.296991,cooccurrence,0.049716,,
788,git,0.267159,cooccurrence,0.065814,,


Web dev – full stack JS
User skills: ['javascript', 'node', 'react', 'sql']
User skills mapped to columns: ['javascript', 'react', 'sql']
Unknown skills (not in data): ['node']


Unnamed: 0,suggested_skill,score,source,support,confidence,lift
80,css,9.138505,rules,0.023674,0.909091,10.052356
172,r,2.816371,rules,0.046875,0.818182,3.442231
177,tableau,2.545079,rules,0.059659,0.777778,3.272244
112,data mining,2.447435,rules,0.042614,0.762712,3.208859
130,data warehouse,2.347564,rules,0.029356,0.746988,3.142706
159,power bi,1.801241,rules,0.050189,0.654321,2.75284
176,statistics,1.675492,rules,0.030777,0.631068,2.655011
115,data modeling,1.624698,rules,0.041193,0.621429,2.614456


DevOps / cloud
User skills: ['linux', 'docker', 'kubernetes', 'aws']
User skills mapped to columns: ['linux', 'docker', 'kubernetes', 'aws']


Unnamed: 0,suggested_skill,score,source,support,confidence,lift
147,windows,2.946364,rules,0.036458,0.616,4.783059
22,python,1.282892,rules,0.052083,0.55,2.33253


Backend Java engineer
User skills: ['java', 'spring', 'sql']
User skills mapped to columns: ['java', 'spring', 'sql']


Unnamed: 0,suggested_skill,score,source,support,confidence,lift
172,r,2.816371,rules,0.046875,0.818182,3.442231
177,tableau,2.545079,rules,0.059659,0.777778,3.272244
112,data mining,2.447435,rules,0.042614,0.762712,3.208859
130,data warehouse,2.347564,rules,0.029356,0.746988,3.142706
159,power bi,1.801241,rules,0.050189,0.654321,2.75284
176,statistics,1.675492,rules,0.030777,0.631068,2.655011
115,data modeling,1.624698,rules,0.041193,0.621429,2.614456
126,data visualization,1.534843,rules,0.071496,0.604,2.541131


Cybersecurity / infosec
User skills: ['cybersecurity', 'network security', 'linux']
User skills mapped to columns: ['cybersecurity', 'network security', 'linux']


Unnamed: 0,suggested_skill,score,source,support,confidence,lift
83,vulnerability management,5.644315,rules,0.026042,0.785714,7.183673
147,windows,2.946364,rules,0.036458,0.616,4.783059
156,network engineering,2.910127,rules,0.025095,0.535354,5.435897


QA / testing
User skills: ['software testing', 'selenium', 'automation']
User skills mapped to columns: ['software testing', 'selenium', 'automation']


Unnamed: 0,suggested_skill,score,source,support,confidence,lift
1584,python,0.359107,cooccurrence,0.114583,,
1910,software development,0.31465,cooccurrence,0.057765,,
289,communication,0.304534,cooccurrence,0.06108,,
1032,java,0.251938,cooccurrence,0.116004,,
1967,sql,0.237883,cooccurrence,0.116477,,
311,computer science,0.226389,cooccurrence,0.091856,,
628,engineering,0.225542,cooccurrence,0.099432,,
2134,testing,0.217468,cooccurrence,0.106061,,


Totally random + one real skill
User skills: ['knitting', 'underwater basket weaving', 'python']
User skills mapped to columns: ['python']
Unknown skills (not in data): ['knitting', 'underwater basket weaving']


Unnamed: 0,suggested_skill,score,source,support,confidence,lift
166,r,3.764467,rules,0.053977,0.942149,3.995619
167,spark,3.045794,rules,0.023674,0.847458,3.594037
119,data science,2.363806,rules,0.05161,0.746575,3.166199
170,statistics,1.958782,rules,0.033144,0.679612,2.882208
148,machine learning,1.938342,rules,0.068182,0.676056,2.86713
8,ai,1.848451,rules,0.032197,0.660194,2.79986
109,data mining,1.713259,rules,0.035511,0.635593,2.695528
104,data engineering,1.51507,rules,0.024621,0.597701,2.534829


In [8]:
rules_sorted.to_csv("../data/association_rules.csv", index=False)

# Job role recommendation

In [9]:
df1= pd.read_csv('../data/skills_with_jobtitle_data.csv')

In [10]:
# @title job role recommendation based on user skills
# cached once
skill_cols = df1.columns.drop("job_title")
skill_name_map_for_roles = {s.lower(): s for s in skill_cols}
skill_index_for_roles = {s: i for i, s in enumerate(skill_cols)}
X_jobs = df1[skill_cols].values.astype(np.uint8)   # job-skill matrix


def suggest_job_roles(user_skills,
                      df_jobs=df1,
                      top_n=15,
                      min_overlap=1):
    """
    Given a list of user skills (strings), return job titles that best match.
    Uses simple set overlap + Jaccard similarity on the binary skill matrix.
    """
    # --- map user skills to existing columns ---
    mapped = []
    unknown = []
    for s in user_skills:
        key = s.strip().lower()
        if key in skill_name_map_for_roles:
            mapped.append(skill_name_map_for_roles[key])
        else:
            unknown.append(s)

    print("User skills mapped to columns:", mapped)
    if unknown:
        print("Unknown (not in data):", unknown)

    if not mapped:
        print("No user skills matched any columns.")
        return pd.DataFrame()

    # user skill vector
    user_vec = np.zeros(len(skill_cols), dtype=np.uint8)
    for sk in mapped:
        user_vec[skill_index_for_roles[sk]] = 1

    # --- overlap & similarity ---
    # overlap = number of shared skills with each job
    overlap = X_jobs @ user_vec

    # counts
    job_skill_counts = X_jobs.sum(axis=1)
    user_skill_count = user_vec.sum()

    # avoid division by zero
    union = job_skill_counts + user_skill_count - overlap
    union[union == 0] = 1

    # Jaccard similarity
    jaccard = overlap / union

    # coverage metrics
    coverage_user = overlap / user_skill_count          # how much of user set covered
    coverage_job = np.divide(
        overlap,
        job_skill_counts,
        out=np.zeros_like(overlap, dtype=float),
        where=job_skill_counts != 0,
    )

    # --- build results per job posting ---
    results = df_jobs[["job_title"]].copy()
    results["overlap"] = overlap
    results["jaccard"] = jaccard
    results["coverage_user"] = coverage_user
    results["coverage_job"] = coverage_job

    # keep only jobs with at least some overlap
    results = results[results["overlap"] >= min_overlap]

    if results.empty:
        print("No jobs share any skills with this skill set.")
        return results

    # --- aggregate by job_title (so repeated postings collapse) ---
    agg = (
        results.groupby("job_title")
        .agg(
            avg_overlap=("overlap", "mean"),
            max_overlap=("overlap", "max"),
            avg_jaccard=("jaccard", "mean"),
            max_jaccard=("jaccard", "max"),
            avg_coverage_user=("coverage_user", "mean"),
            avg_coverage_job=("coverage_job", "mean"),
            n_postings=("overlap", "count"),
        )
        .reset_index()
    )

    # rank by max_overlap then avg_jaccard
    agg = agg.sort_values(
        ["max_overlap", "avg_jaccard"],
        ascending=False
    ).head(top_n)

    return agg

In [11]:
user_skills = ["python", "sql", "data analysis"]

top_roles = suggest_job_roles(user_skills, top_n=15, min_overlap=2)
display(top_roles)


User skills mapped to columns: ['python', 'sql', 'data analysis']


Unnamed: 0,job_title,avg_overlap,max_overlap,avg_jaccard,max_jaccard,avg_coverage_user,avg_coverage_job,n_postings
15,BI Business Analyst,3.0,3,0.375,0.375,1.0,0.375,1
5,Accountant - Data Analyst,3.0,3,0.25,0.25,1.0,0.25,1
202,Technical Data Analyst,2.5,3,0.234848,0.333333,0.833333,0.268182,2
18,"BUSINESS DATA ANALYST- Austin , TX",3.0,3,0.214286,0.214286,1.0,0.214286,1
132,Junior Data Analyst,2.5,3,0.204861,0.222222,0.833333,0.21875,2
17,"BI Data Analyst (ThoughtSpot, Tableau, SQL) - ...",3.0,3,0.2,0.2,1.0,0.2,1
72,Data Analyst with Healthcare- REMOTE,3.0,3,0.2,0.2,1.0,0.2,1
97,Data Scientist (USA),3.0,3,0.2,0.2,1.0,0.2,1
77,"Data Analyst-SQL, Python, Visualization tool",3.0,3,0.1875,0.1875,1.0,0.1875,1
98,Data Scientist - Advana,3.0,3,0.1875,0.1875,1.0,0.1875,1


In [12]:
user_skills = ["cybersecurity"]

top_roles = suggest_job_roles(user_skills, top_n=15, min_overlap=1)
display(top_roles)

User skills mapped to columns: ['cybersecurity']


Unnamed: 0,job_title,avg_overlap,max_overlap,avg_jaccard,max_jaccard,avg_coverage_user,avg_coverage_job,n_postings
151,VPN Network Systems Engineer (Secret Clearance...,1.0,1,0.2,0.2,1.0,0.2,1
61,IT Security Analyst,1.0,1,0.173611,0.333333,1.0,0.173611,3
37,Cyber Systems Security Analyst,1.0,1,0.142857,0.142857,1.0,0.142857,1
50,Fire and Security Engineer (Static),1.0,1,0.142857,0.142857,1.0,0.142857,1
78,"Information security analysts - Malver, PA - L...",1.0,1,0.111111,0.111111,1.0,0.111111,1
108,Secret Cleared Network Engineer,1.0,1,0.111111,0.111111,1.0,0.111111,1
15,Business Analyst- NEED LOCAL CANDIDATES,1.0,1,0.1,0.1,1.0,0.1,1
148,Technical Writer/Business Analyst (Onsite),1.0,1,0.1,0.1,1.0,0.1,1
5,Application Security Analyst,1.0,1,0.090909,0.090909,1.0,0.090909,1
39,Cybersecurity Analyst (RMF),1.0,1,0.090909,0.090909,1.0,0.090909,1
