![job image](job.jpg "Job image")

# Role2Skills Project
This project builds a system that extracts skills from job descriptions and uses machine learning to cluster and classify job roles, revealing which skill combinations define different professions and which competencies are most in demand on the job market.

In [None]:
# Imports
import re
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Pin every figure colour explicitly: white backgrounds (figure, axes and
# saved files), black text, labels, ticks and axes edges, light-grey grid.
plt.rcParams.update({
    "figure.facecolor": "white",
    "axes.facecolor": "white",
    "savefig.facecolor": "white",
    "axes.edgecolor": "black",
    "text.color": "black",
    "axes.labelcolor": "black",
    "xtick.color": "black",
    "ytick.color": "black",
    "grid.color": "0.85",
})

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA

# Dataset imports

The vacancy dataset was taken from Kaggle (https://www.kaggle.com/code/mpwolke/yandex-jobs) and translated into English by me.

In [None]:
# Load data
# The path is relative, so it is resolved against the current working directory.
CSV_PATH = "data/yandex vacancies eng.csv"
df = pd.read_csv(CSV_PATH)

# Preview the first rows of the vacancy table.
df.head()


A separate skills dataset is used to extend the skill dictionary with additional skills. It was taken from Kaggle (https://www.kaggle.com/datasets/zamamahmed211/skills) and converted to CSV.

In [None]:
# Dataset full of different skills (May be useful)
# pandas and re are already imported at the top of the file; the imports are repeated here.
import pandas as pd
import re

# The path is relative, so it is resolved against the current working directory.
SKILLS_PATH = "data/skills_dataset.csv"
skills_df = pd.read_csv(SKILLS_PATH)

# Preview the first rows of the skills table.
skills_df.head()

Turning vacancy text fields into a single text block for skill extraction and model training

In [None]:
# Basic cleaning (safe columns)
# After this loop every text field exists and holds plain strings: a column
# that is absent from the CSV is created empty, and missing values in an
# existing column become "".

for col in ("Requirements", "Description", "Pluses", "Hashtags"):
    if col not in df.columns:
        df[col] = ""
        continue
    df[col] = df[col].fillna("").astype(str)

# Combine text fields for skill extraction
# Duplicated text blocks are dropped, keeping the first occurrence in field order

def combine_unique_text(row):
    """Merge a vacancy's text fields into one lower-cased block.

    Reads "Requirements", "Description", "Pluses" and "Hashtags" from *row*
    (anything indexable by those keys whose values are strings), strips and
    lower-cases each value, drops empty values and exact duplicates, and
    joins what is left with newlines.

    The parts are always emitted in the field order listed above. A plain
    ``set`` would also remove duplicates, but its iteration order for
    strings changes between interpreter runs (string hashes are randomized
    per process), so the combined text would not be reproducible.

    Returns:
        The combined text, or "" when every field is empty.
    """
    fields = ("Requirements", "Description", "Pluses", "Hashtags")
    cleaned = (row[name].strip().lower() for name in fields)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique_parts = dict.fromkeys(part for part in cleaned if part)
    return "\n".join(unique_parts)

# Build one combined text per vacancy (row-wise apply).
df["text"] = df.apply(combine_unique_text, axis=1)

Regex-based skill extraction to identify skills mentioned in job descriptions

In [None]:
# Key = canonical name, value = regex that matches common variants.
# Patterns are written in lower case and are meant to be searched with
# re.IGNORECASE.
#
# NOTE: `\b` only matches between a word character and a non-word character.
# Skill names that END in punctuation ("c++", "c#") must therefore not be
# followed by `\b`: in "c++ developer" there is no boundary between "+" and
# the space, so a trailing `\b` would silently prevent the match.
skills_regex = {
    # Languages
    "python": r"\bpython\b",
    "java": r"\bjava\b",
    "kotlin": r"\bkotlin\b",
    "swift": r"\bswift\b",
    "go": r"\bgolang\b|\bgo\b",
    "ruby": r"\bruby\b",
    "php": r"\bphp\b",
    "scala": r"\bscala\b",
    "c": r"\bc\b(?!\+|\#)",               # tries to avoid catching c++ / c#
    "c++": r"\bc\+\+|\bcpp\b",            # no trailing \b after "+" (see note above)
    "c#": r"\bc\#|c\s*sharp",             # no trailing \b after "#" (see note above)
    ".net": r"\.net\b|dotnet",
    "javascript": r"\bjavascript\b|\bjs\b",
    "typescript": r"\btypescript\b|\bts\b",

    # Web / frontend
    "html": r"\bhtml\b",
    "css": r"\bcss\b",
    "sass": r"\bsass\b|\bscss\b",
    "react": r"\breact\b",
    "next.js": r"\bnext\.?js\b",
    "vue": r"\bvue\b|\bvue\.?js\b",
    "angular": r"\bangular\b",
    "redux": r"\bredux\b",

    # Backend / frameworks
    "node.js": r"\bnode\.?js\b|\bnodejs\b",
    "express": r"\bexpress\b",
    "nestjs": r"\bnest\.?js\b|\bnestjs\b",
    "spring": r"\bspring\b",
    "spring boot": r"\bspring\s*boot\b",
    "django": r"\bdjango\b",
    "flask": r"\bflask\b",
    "fastapi": r"\bfastapi\b",
    "laravel": r"\blaravel\b",
    "rails": r"\brails\b|ruby on rails",

    # Databases
    "sql": r"\bsql\b",
    "postgresql": r"\bpostgres(?:ql)?\b",
    "mysql": r"\bmysql\b",
    "mongodb": r"\bmongo(?:db)?\b",
    "redis": r"\bredis\b",
    "elasticsearch": r"\belasticsearch\b|\belk\b",

    # DevOps / cloud
    "linux": r"\blinux\b",
    "git": r"\bgit\b",
    "docker": r"\bdocker\b",
    "kubernetes": r"\bkubernetes\b|\bk8s\b",
    "terraform": r"\bterraform\b",
    "ansible": r"\bansible\b",
    "ci/cd": r"\bci\/cd\b|\bcicd\b|\bcontinuous integration\b",
    "aws": r"\baws\b|amazon web services",
    "gcp": r"\bgcp\b|google cloud",
    "azure": r"\bazure\b|microsoft azure",

    # APIs / messaging
    "rest": r"\brest\b|\brestful\b",
    "graphql": r"\bgraphql\b",
    "grpc": r"\bgrpc\b",
    "kafka": r"\bkafka\b",
    "rabbitmq": r"\brabbitmq\b",

    # Data / ML
    "pandas": r"\bpandas\b",
    "numpy": r"\bnumpy\b",
    "scikit-learn": r"scikit[-\s]?learn|\bsklearn\b",
    "pytorch": r"\bpytorch\b",
    "tensorflow": r"\btensorflow\b",
    "spark": r"\bspark\b|\bpyspark\b",
    "airflow": r"\bairflow\b",
}

# IMPORTANT: keep a clean copy of manual dict
# (a shallow copy, so later additions to skills_regex do not leak into it)
skills_regex_manual = dict(skills_regex)

print("Manual skills_regex size:", len(skills_regex_manual))

Normalization of extracted skills to canonical forms to handle synonyms and variations

In [None]:
# --- Skill normalization: config & helpers ---

# Alias -> canonical skill name. normalize_skills() strips and lower-cases a
# raw skill before looking it up here, so every key is written in lower case.
# A skill that has no entry is kept unchanged (the lookup falls back to the
# skill itself), which means the identity entries below are documentation
# rather than a requirement.
SKILL_CANONICAL_MAP = {
    # JS ecosystem
    "node.js": "nodejs",
    "nodejs": "nodejs",
    "node_js": "nodejs",
    "express": "express",
    "expressjs": "express",

    # CI/CD variants
    "ci/cd": "ci_cd",
    "cicd": "ci_cd",
    "ci-cd": "ci_cd",
    "ci cd": "ci_cd",

    # Spark variants
    "pyspark": "spark",
    "apache spark": "spark",
    "spark": "spark",

    # DB variants
    "postgres": "postgresql",
    "postgresql": "postgresql",
    "postgre": "postgresql",

    # ML libs
    "scikit-learn": "sklearn",
    "scikit learn": "sklearn",
    "sklearn": "sklearn",

    # C-family
    "c": "c",
    "c++": "cpp",
    "cpp": "cpp",
    "c#": "csharp",
    "csharp": "csharp",

    # Go
    "golang": "go",
    "go": "go",
    "go lang": "go",

    # Common skills
    "python": "python",
    "java": "java",
    "kotlin": "kotlin",
    "javascript": "javascript",
    "js": "javascript",
    "typescript": "typescript",
    "ts": "typescript",
    "react": "react",
    "reactjs": "react",
    "redux": "redux",
    "html": "html",
    "css": "css",
    "git": "git",
    "linux": "linux",
    "docker": "docker",
    "sql": "sql",
    "mysql": "mysql",
    "tensorflow": "tensorflow",
    "tf": "tensorflow",
    "pytorch": "pytorch",
    "torch": "pytorch",
}

# Canonical names that are only accepted when the surrounding text contains
# extra evidence; the check itself lives in _valid_dangerous().
DANGEROUS_CANONICAL = {"c", "go"}

def _valid_dangerous(canonical: str, text: str) -> bool:
    t = (text or "").lower()

    if canonical == "go":
        return ("golang" in t) or ("go lang" in t)

    if canonical == "c":
        return (
            "ansi c" in t
            or "c language" in t
            or "embedded c" in t
            or "iso c" in t
        )

    return True

def normalize_skills(raw_skills: list[str], text: str) -> list[str]:
    """Map raw skill names to canonical names, de-duplicated and sorted.

    Each raw skill is stripped, lower-cased and translated through
    SKILL_CANONICAL_MAP; names without an entry are kept as they are.
    Canonical names listed in DANGEROUS_CANONICAL are dropped unless
    _valid_dangerous() finds supporting evidence in *text*. Finally "c" is
    removed whenever "cpp" or "csharp" is present.

    Returns:
        The sorted list of unique canonical names.
    """
    canonical_names = set()
    for raw in raw_skills:
        key = str(raw).strip().lower()
        name = SKILL_CANONICAL_MAP.get(key, key)

        needs_context = name in DANGEROUS_CANONICAL
        if needs_context and not _valid_dangerous(name, text):
            continue

        canonical_names.add(name)

    # "c" is discarded whenever a C-family sibling was found as well.
    if not canonical_names.isdisjoint({"cpp", "csharp"}):
        canonical_names.discard("c")

    return sorted(canonical_names)

print("Normalization ready.")


In [None]:
# --- Process Kaggle skills dataset ---

# Text clean-up of the "Skills" column: cast to str, remove double quotes,
# trim surrounding whitespace, lower-case.
raw = (
    skills_df["Skills"]
    .astype(str)
    .str.replace('"', '', regex=False)
    .str.strip()
    .str.lower()
)

# NOTE(review): after astype(str) missing values are presumably already the
# string "nan" (removed on the next line), so this mask likely keeps every
# row - confirm for the pandas version in use.
raw = raw[raw.notna()]
# Drop empty entries and textual placeholders for "missing".
raw = raw[~raw.isin(["", "nan", "none", "null"])]
# Drop single-character entries (this also removes one-letter skill names).
raw = raw[raw.str.len() >= 2]

# Noise filtering (non-capturing group to avoid pandas warning)
BAD_PATTERNS = [
    r"[^\x00-\x7F]",  # any non-ASCII character
    r"\d",            # digits
    r"\b(?:pay|salary|equity|insurance|travel|student|retention|payback|bullet)\b",
]
# One combined pattern: an entry is rejected if ANY of the alternatives matches.
bad_re = re.compile("|".join(BAD_PATTERNS))

raw = raw[~raw.str.contains(bad_re, regex=True)]

raw_unique = sorted(set(raw.tolist()))

# Normalize Kaggle skills using normalization
# Each skill is normalized on its own, with the skill string itself passed as
# the context text. normalize_skills() may return an empty list (e.g. for an
# ambiguous name without supporting evidence); such skills are skipped.
norm_skills = []
for s in raw_unique:
    out = normalize_skills([s], text=s)  # context = itself
    if out:
        norm_skills.extend(out)

norm_unique = sorted(set(norm_skills))

def _safe_regex(skill: str) -> str:
    return rf"(?<!\w){re.escape(skill)}(?!\w)"

# --- FILTER by vacancy dataset frequency (document frequency) ---
# A Kaggle skill is kept only if it occurs in at least this many vacancy texts.
MIN_DOC_FREQ = 5  # tune: 3, 5, 10

# Compile every candidate pattern a single time; each one is reused for all texts.
cand_compiled = {
    skill: re.compile(_safe_regex(skill), flags=re.IGNORECASE)
    for skill in norm_unique
}

texts = df["text"].astype(str).tolist()

# Document frequency: number of vacancy texts in which the candidate occurs.
doc_freq = {
    skill: sum(1 for doc in texts if pattern.search(doc))
    for skill, pattern in cand_compiled.items()
}

kaggle_kept = sorted(
    skill for skill, n_docs in doc_freq.items() if n_docs >= MIN_DOC_FREQ
)

# RESET to manual dict, then add only kept Kaggle skills
# (hand-written entries always win over generated ones with the same key)
skills_regex = dict(skills_regex_manual)

new_skills = [skill for skill in kaggle_kept if skill not in skills_regex]
skills_regex.update({skill: _safe_regex(skill) for skill in new_skills})
added = len(new_skills)

print(f"[skills_dataset] Raw unique: {len(raw_unique)}")
print(f"[skills_dataset] Normalized unique: {len(norm_unique)}")
print(f"[skills_dataset] Kept by df >= {MIN_DOC_FREQ}: {len(kaggle_kept)}")
print(f"[skills_dataset] Added to skills_regex: {added}")
print(f"[skills_dataset] Final skills_regex size: {len(skills_regex)}")


In [None]:
# Skill extraction (regex)
def extract_skills(text: str) -> list[str]:
    """Return the skill names whose pattern occurs in *text*.

    Every entry of the module-level ``skills_regex`` dictionary is searched
    case-insensitively. Matching names are returned in dictionary order,
    once per matching key.
    """
    return [
        name
        for name, regex in skills_regex.items()
        if re.search(regex, text, flags=re.IGNORECASE)
    ]

# 1) raw extraction
# Dictionary keys matched in each vacancy text (not yet canonical).
df["skills_raw"] = df["text"].apply(extract_skills)

# 2) canonical normalization + dedup per vacancy
# The vacancy text is passed along so ambiguous skills can be checked against it.
df["skills"] = df.apply(lambda r: normalize_skills(r["skills_raw"], r["text"]), axis=1)

# Number of distinct canonical skills per vacancy.
df["skills_count"] = df["skills"].apply(len)

# Preview the result.
df[["skills", "skills_count"]].head()

# Plots and analysis of extracted skills

In [None]:
# Plot helpers
def plot_hist(series, title, xlabel, ylabel, bins=30):
    """Show a histogram of *series* in a new figure.

    Missing values are dropped before plotting. *title*, *xlabel* and
    *ylabel* label the figure; *bins* is passed on to ``plt.hist``.
    """
    plt.figure()
    observed = series.dropna()
    plt.hist(observed, bins=bins)

    # Apply the three text labels.
    for apply_label, label in (
        (plt.title, title),
        (plt.xlabel, xlabel),
        (plt.ylabel, ylabel),
    ):
        apply_label(label)

    plt.show()


def plot_bar(series, title, xlabel, ylabel="Count"):
    """Show a bar chart of how often each value of *series* occurs.

    The bars are ordered by value (sorted index of the value counts), one
    bar per distinct value, and drawn in a new figure.
    """
    occurrences = series.value_counts()
    occurrences = occurrences.sort_index()

    plt.figure()
    plt.bar(occurrences.index, occurrences.values)

    # Apply the three text labels.
    for apply_label, label in (
        (plt.title, title),
        (plt.xlabel, xlabel),
        (plt.ylabel, ylabel),
    ):
        apply_label(label)

    plt.show()

In [None]:
# Skill frequency
# Flatten the per-vacancy skill lists and count how often each skill occurs.
all_skills = [skill for vacancy_skills in df["skills"] for skill in vacancy_skills]
skill_freq = Counter(all_skills)

# The 15 most frequent skills, split into parallel label / value lists.
top = skill_freq.most_common(15)
labels = [name for name, _ in top]
values = [count for _, count in top]

plt.figure()
plt.bar(labels, values)
plt.title("Top skills")
plt.ylabel("Count")
# Slanted, right-aligned tick labels.
plt.xticks(rotation=45, ha="right")
plt.show()


Some skills like Python and SQL appear much more often than others. This shows skill imbalance in the data and why normalization is needed.

In [None]:
# Skills per job
# One bar per distinct skills_count value; bar height = number of vacancies with that count.
plot_bar(
    df["skills_count"],
    title="Number of skills per job vacancy",
    xlabel="Number of skills mentioned in a job vacancy",
    ylabel="Number of job vacancies"
)


# How to read the chart, for example: 150 vacancies require 2 skills, 60 vacancies require 4 skills, etc.

This distribution shows that most job vacancies mention only a small number of skills.
This suggests that job descriptions are usually focused, which makes a dictionary-based skill extraction approach suitable.

In [None]:
from collections import Counter

# Count how often every skill occurs across all vacancies.
all_skills = [skill for vacancy_skills in df["skills"] for skill in vacancy_skills]
skill_freq = Counter(all_skills)

# One value per distinct skill: the number of times it was counted.
skill_freq_series = pd.Series(skill_freq.values())

plot_hist(
    skill_freq_series,
    "Distribution of skill frequencies",
    "Number of vacancies mentioning a skill",
    "Number of skills",
    40,
)

# How to read the histogram, for example: 25 skills are mentioned in ~7 vacancies, 1 skill is mentioned in ~280 vacancies, etc.
# For example sql or python are mentioned in ~280 vacancies, while some rare skills (around 28 of them) are mentioned in only several vacancies.

This distribution shows that a few skills (for example, Python or SQL) appear very often, while most skills are mentioned only in a small number of vacancies.
This indicates an imbalanced skill distribution and explains why skill normalization is needed.

In [None]:
# Filter skills by document frequency
N = len(df)

# set(row) makes a skill count at most once per vacancy.
skill_doc_freq = Counter(s for row in df["skills"] for s in set(row))

# Bounds on the number of vacancies a skill may appear in.
min_df = 5 # previously was max(2, int(0.01 * N)); set to 5 because skipped a lot of useful skills
max_df = int(0.7 * N)

# Keep skills that are neither too rare nor too common.
kept_skills = [s for s, c in skill_doc_freq.items() if not (c < min_df or c > max_df)]

kept_skills


In [None]:
# Sorted so that the columns of X have a stable, alphabetical order.
kept_skills = sorted(kept_skills)

# Set copy of kept_skills, built once: membership tests against a list scan
# the whole list for every skill of every vacancy.
_kept_skills_lookup = frozenset(kept_skills)

def filter_skills(skills):
    """Return the skills from *skills* that are in kept_skills, order preserved."""
    return [s for s in skills if s in _kept_skills_lookup]

df["skills_filtered"] = df["skills"].apply(filter_skills)

# Multi-hot encoding: one row per vacancy, one column per kept skill
# (column order = kept_skills, fixed through `classes`).
mlb = MultiLabelBinarizer(classes=kept_skills)
X = mlb.fit_transform(df["skills_filtered"])

print("X shape:", X.shape)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# compute co-occurrence matrix
# Entry (i, j) of X.T @ X counts the vacancies that list both skill i and
# skill j; the diagonal (a skill with itself) is zeroed out.
co = (X.T @ X).astype(int)
np.fill_diagonal(co, 0)

# choose top skills by frequency
skill_freq = Counter(s for row in df["skills_filtered"] for s in row)

TOP_N = 20
top_skills = [name for name, _ in skill_freq.most_common(TOP_N)]

# indices of these skills in kept_skills / X
idx = [kept_skills.index(name) for name in top_skills]

# Sub-matrix restricted to the top skills (rows and columns in top_skills order).
co_top = co[np.ix_(idx, idx)]

# plot
tick_positions = range(len(top_skills))

plt.figure(figsize=(9, 7))
plt.imshow(co_top, aspect="auto")
plt.xticks(tick_positions, top_skills, rotation=90)
plt.yticks(tick_positions, top_skills)
plt.title("Skill co-occurrence (top skills by frequency)")
plt.colorbar(label="Number of vacancies")
plt.tight_layout()
plt.show()

The analysis shows that most job vacancies list only a few skills, while some list many, creating a long-tail distribution. Common skills appear much more often and frequently occur together, which reflects typical technology stacks in job roles.

# Searching for data clusters
HDBSCAN was used to cluster job vacancies based on their skill vectors in order to identify groups of similar job roles. The algorithm automatically finds dense clusters of vacancies with similar skill requirements and assigns unclear or rare cases to noise. For each cluster, the most common skills were extracted to interpret and describe the corresponding job role.

In [None]:
import hdbscan

# Cluster the vacancies on their multi-hot skill vectors (X).
# NOTE(review): X comes from MultiLabelBinarizer, so it presumably holds only
# 0/1 values; Euclidean distance is used here - confirm that this, rather
# than a set-overlap metric, is the intended notion of similarity.
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=10,      # role minimal size
    min_samples=5,
    metric="euclidean",
    cluster_selection_method="eom"
)

# One cluster label per row of X.
labels = clusterer.fit_predict(X)

# Store the label next to each vacancy.
df["cluster"] = labels


In [None]:
# Number of vacancies per cluster label, ordered by label.
df["cluster"].value_counts().sort_index()

In [None]:
from collections import Counter

# One summary record per cluster: its label, its size and its most common skills.
cluster_skills = []

for cl in sorted(df["cluster"].unique()):
    # Label -1 is the noise group, not a cluster; it is left out of the summary.
    if cl == -1:
        continue

    subset = df[df["cluster"] == cl]
    freq = Counter(s for row in subset["skills_filtered"] for s in row)

    cluster_skills.append({
        "cluster": cl,
        "size": len(subset),
        "top_skills": ", ".join([s for s, _ in freq.most_common(10)])
    })

# Explicit columns keep the table well-formed when no cluster was found:
# a DataFrame built from an empty list has no "size" column, and sorting by
# it would raise KeyError.
pd.DataFrame(
    cluster_skills, columns=["cluster", "size", "top_skills"]
).sort_values("size", ascending=False)

HDBSCAN labels dense groups of similar vacancies as clusters, while assigning less common or ambiguous vacancies to noise (cluster -1).
A minimum cluster size of 10 was used, meaning that only job role types with at least 10 similar vacancies were considered stable clusters.

In [None]:
# Preview up to five vacancies assigned to cluster 0 (title column and extracted skills).
# NOTE(review): assumes the dataset has a "Header" column and that a cluster labelled 0 exists - confirm.
df[df["cluster"] == 0][["Header", "skills"]].head(5)

Using HDBSCAN, similar job vacancies were grouped into clusters based on shared skills. The clusters represent common job roles such as frontend, backend, data, and DevOps, while less clear vacancies were labeled as noise. Overall, the results show that job roles can be identified using skill-based representations.

# Classifier model
A multi-label classification model was trained to predict required skills from job vacancy text. TF-IDF features were extracted from the vacancy text and a One-Vs-Rest logistic regression classifier was used to predict multiple skills for each vacancy. The trained model can take a job description as input and return the most likely required skills.

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Train only on vacancies that have at least one kept skill.
df_train = df[df["skills_filtered"].apply(len) > 0].copy()
X_text = df_train["text"].fillna("").astype(str)
y_lists = df_train["skills_filtered"]

# Multi-hot label matrix. This re-binds `mlb`, replacing the binarizer that
# was fitted earlier for clustering.
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_lists)

# NOTE(review): 20% of the rows are split off, but both held-out parts are
# discarded (`_`), so nothing in this cell evaluates the model on them.
X_train, _, Y_train, _ = train_test_split(X_text, Y, test_size=0.2, random_state=42)

# TF-IDF over word uni- and bigrams, followed by one logistic regression per skill.
clf = Pipeline([
    ("tfidf", TfidfVectorizer(
        lowercase=True,
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.9,
        sublinear_tf=True,
        max_features=200_000
    )),
    ("ovr", OneVsRestClassifier(
        LogisticRegression(
            solver="liblinear",
            max_iter=2000,
            class_weight="balanced"
        )
    ))
])

clf.fit(X_train, Y_train)

def predict_skills(text: str, top_k: int = 10):
    """Return the *top_k* most probable skills for a vacancy text.

    The text is converted to ``str`` and scored by the fitted ``clf``
    pipeline; skill names are taken from ``mlb.classes_`` and returned in
    order of decreasing predicted probability.
    """
    probabilities = clf.predict_proba([str(text)])[0]
    # argsort is ascending, so reverse it to put the most probable first.
    ranked = np.argsort(probabilities)[::-1]
    best = ranked[:top_k]
    return [mlb.classes_[i] for i in best]


# Let's check the model in action

In [None]:
predict_skills("Flutter developer", top_k=10)

The trained model can predict relevant skills for new job descriptions by analyzing the text and outputting the most probable skills based on learned patterns from the training data.