---
title: "Skill Gap Analysis"
subtitle: ""

bibliography: references.bib
csl: csl/econometrica.csl
format: 
  html:
    toc: true
    number-sections: true
    df-print: paged
    code: false
    code-tools: true
    section-divs: true
---

In [None]:
#| echo: false
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as pyo
import plotly.io as pio
from collections import Counter
import os

In [None]:
#| echo: false
# Load the job-postings CSV that the rest of the analysis reads from.
data = pd.read_csv(filepath_or_buffer="files/cleaned_job_postings.csv")

In [None]:
#| echo: false
# Directory that receives the exported figure files.
figures_folder = "figures"
# exist_ok=True makes the creation idempotent and removes the
# check-then-create race of calling os.path.exists() before os.makedirs().
os.makedirs(figures_folder, exist_ok=True)

In [None]:
# Skill ratings per team member. Each list is ordered like the "Name" entry,
# so position 0 belongs to Yixuan, 1 to Arohit and 2 to Chengjie.
skills_data = {
    "Name":               ["Yixuan", "Arohit", "Chengjie"],
    "Python":             [5, 3, 4],
    "SQL":                [4, 2, 5],
    "Machine Learning":   [3, 1, 4],
    "Cloud Computing":    [2, 2, 3],
    "Data Visualization": [4, 3, 5],
    "Statistics":         [5, 2, 4],
    "Project Management": [3, 4, 3],
    "Communication":      [2, 5, 4],
    "Problem-Solving":    [4, 4, 5],
    "Teamwork":           [5, 5, 5],
    "Excel":              [4, 4, 4],
    "Adaptability":       [4, 5, 3],
    "Data Analysis":      [4, 3, 4],
    "Leadership":         [3, 4, 2],
    "R":                  [3, 5, 4],
}

# One row per team member (indexed by name), one column per skill.
df_skills = pd.DataFrame(skills_data).set_index("Name")
df_skills

In [None]:
#| echo: false
# Path of the exported interactive chart inside the figures directory.
heatmap_file = os.path.join(figures_folder, "team_skill_levels_heatmap.html")

# Draw the skill matrix as a heatmap (one row per team member, one column per
# skill) and apply the page layout in the same chained expression;
# update_layout() hands back the figure it was called on.
fig = px.imshow(
    df_skills,
    x=df_skills.columns,
    y=df_skills.index,
    labels={"x": "Skills", "y": "Team Members", "color": "Skill Level"},
    title="Team Skill Levels Heatmap",
    color_continuous_scale="RdBu",
    text_auto=True,
).update_layout(
    width=800,
    height=600,
    margin={"l": 100, "r": 100, "t": 100, "b": 100},
    xaxis_title="Skills",
    yaxis_title="Team Members",
    coloraxis_colorbar_title="Skill Level",
)

fig.write_html(heatmap_file)

<iframe src="figures/team_skill_levels_heatmap.html" width="100%" height="500"></iframe>


In [None]:
# Vocabulary of skills searched for in the job-posting text.
# The order here is the order in which matches are reported.
skill_keywords = [
    "Python",
    "R",
    "SQL",
    "Data Analysis",
    "Machine Learning",
    "Statistics",
    "Data Visualization",
    "Excel",
    "Tableau",
    "Power BI",
    "Java",
    "C++",
    "JavaScript",
    "HTML/CSS",
    "Cloud Computing",
    "Cybersecurity",
    "Network Administration",
    "Database Management",
    "Communication",
    "Problem-Solving",
    "Teamwork",
    "Project Management",
    "Leadership",
    "Time Management",
    "Adaptability",
    "Financial Analysis",
    "Marketing Strategy",
    "Customer Relationship Management",
    "Supply Chain Management",
    "Regulatory Compliance",
]

In [None]:
def extract_skills(body_text, keywords=None):
    """Return the skill keywords mentioned in a job-posting body.

    Matching is case-insensitive and requires the keyword to appear as a
    whole token: it must not be directly preceded or followed by a letter,
    digit or underscore. Plain substring matching would report "R" for any
    text containing the letter r, "Java" for "JavaScript" and "Excel" for
    "excellent".

    Args:
        body_text: Text of the posting. Missing values (None, NaN, pd.NA),
            empty strings and non-string values yield an empty result.
        keywords: Optional iterable of skill names to look for. Defaults to
            the module-level ``skill_keywords`` list.

    Returns:
        A list of the matching keywords, spelled as given and in the order
        they appear in ``keywords``.
    """
    # Function-scope import keeps this cell runnable on its own.
    import re

    if keywords is None:
        keywords = skill_keywords
    # Covers missing values as well as stray numeric cells, which have no
    # .lower() and would otherwise raise.
    if not isinstance(body_text, str) or not body_text:
        return []

    text = body_text.lower()
    found = []
    for skill in keywords:
        # re.escape keeps names such as "C++" or "HTML/CSS" literal; the
        # look-arounds enforce the whole-token rule without consuming text.
        pattern = r"(?<!\w)" + re.escape(skill.lower()) + r"(?!\w)"
        if re.search(pattern, text):
            found.append(skill)
    return found

In [None]:
# Run the keyword matcher over each posting's BODY text and keep the per-row
# result (a list of matched skills) in a new column.
data["extracted_skills"] = data["BODY"].apply(func=extract_skills)

In [None]:
# Union of every skill detected in any posting.
all_skills = {
    skill
    for row_skills in data['extracted_skills']
    for skill in row_skills
}
print(all_skills)

In [None]:
# Give the team matrix a column for every skill found in the postings; a skill
# that has no rating column yet is filled with level 0 for every member.
# all_skills is a set of strings, whose iteration order can differ between
# interpreter runs, so iterate in sorted order to keep the added columns in
# a reproducible order.
for skill in sorted(all_skills):
    if skill not in df_skills.columns:
        df_skills[skill] = 0

df_skills