# What technologies should you learn to land one of the Top-3 in-demand roles?
- Likelihood of Technologies Requested for Top-3 Data Roles
- Number of skills requested for each of Top-3 Data Roles

In [9]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import ast

from collections import Counter

df = pd.read_csv('/Users/kolesnikevgenia/Documents/Python_Projects/Job_Skills/Raw_Data/df_Final.csv')

In [2]:
df_technology = df['job_type_skills'].copy()

df_technology = df_technology.drop_duplicates()
df_technology = df_technology.dropna()

#combine all dictionaries into one
technology_dict = {}
for row in df_technology:
    row_dict = ast.literal_eval(row)
    for key, value in row_dict.items():
        if key in technology_dict:
            technology_dict[key] += value
        else:
            technology_dict[key] = value

#remove duplicates by converting values to set then back to list
for key, value in technology_dict.items():
    technology_dict[key] = list(set(value))

df_technology = pd.DataFrame(list(technology_dict.items()), columns=['technology', 'skills'])

df_technology = df_technology.explode('skills')
df_technology.head(5)

Unnamed: 0,technology,skills
0,analyst_tools,spss
0,analyst_tools,power bi
0,analyst_tools,spreadsheet
0,analyst_tools,sap
0,analyst_tools,dax


In [14]:
top_roles = ['Data Analyst', 'Data Scientist', 'Data Engineer']
df_top = df[df['job_title_short'].isin(top_roles)]

df_dropna = df_top.dropna(subset=['job_skills']).copy()
df_dropna['job_skills'] = df_dropna['job_skills'].apply(ast.literal_eval)
df_exploded = df_dropna.explode('job_skills')

df_plot = df_exploded.merge(df_technology, left_on='job_skills', right_on='skills').copy()
df_plot.head(3)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,...,job_skills,job_type_skills,salary_year_avg_eur,salary_month_avg_eur,country,is_eu,region_group,skills_count,technology,skills
0,Data Scientist,CRM Data Specialist,"San José Province, San José, Costa Rica",via Ai-Jobs.net,Full-time,False,Costa Rica,2023-08-01 13:37:57,False,False,...,gdpr,"{'analyst_tools': ['excel'], 'libraries': ['gd...",96447.6,8037.3,,False,Other,2,libraries,gdpr
1,Data Scientist,CRM Data Specialist,"San José Province, San José, Costa Rica",via Ai-Jobs.net,Full-time,False,Costa Rica,2023-08-01 13:37:57,False,False,...,excel,"{'analyst_tools': ['excel'], 'libraries': ['gd...",96447.6,8037.3,,False,Other,2,analyst_tools,excel
2,Data Engineer,Data Engineer,"Arlington, VA",via LinkedIn,Full-time,False,Sudan,2023-06-26 14:22:54,False,False,...,mongodb,"{'analyst_tools': ['tableau'], 'cloud': ['orac...",123312.0,10276.0,,False,Other,11,databases,mongodb


### Likelihood of Technologies Requested for Top-3 Data Roles

In [20]:
# Explode technologies into individual rows
df_exploded = (
    df_plot
    .assign(technology=df_plot['technology'].str.split(', '))
    .explode('technology')
)

# Calculate top 5 technologies per role (normalized)
top_tech_list = []
for role in top_roles:
    tech_counts = (
        df_exploded[df_exploded['job_title_short'] == role]['technology']
        .value_counts(normalize=True)
        .head(5)
        .reset_index()
    )
    tech_counts.columns = ['technology', 'percentage']
    tech_counts['job_title_short'] = role
    tech_counts['percentage_label'] = (tech_counts['percentage'] * 100).round(0).astype(int).astype(str) + '%'
    top_tech_list.append(tech_counts)

df_top_tech = pd.concat(top_tech_list, ignore_index=True)

# Prepare figure with subplots for each role
fig = make_subplots(
    rows=len(top_roles), cols=1,
    shared_xaxes=True,
    vertical_spacing=0.05,
    subplot_titles=top_roles
)

base_colors = px.colors.sequential.Blues

for i, role in enumerate(top_roles, start=1):
    df_role = df_top_tech[df_top_tech['job_title_short'] == role].copy()
    # Sort descending so biggest bars on top
    df_role = df_role.sort_values(by='percentage', ascending=False)

    # Colors: darker for biggest bar, lighter for others
    colors = [base_colors[-2]] + [base_colors[3]] * (len(df_role) - 1)

    fig.add_trace(
        go.Bar(
            x=df_role['percentage'],
            y=df_role['technology'],
            orientation='h',
            text=df_role['percentage_label'],
            textposition='outside',
            marker_color=colors,
            showlegend=False
        ),
        row=i, col=1
    )

# Reverse y-axis so biggest bar is on top visually
for i in range(1, len(top_roles) + 1):
    fig.update_yaxes(autorange="reversed", row=i, col=1)

fig.update_layout(
    height=600,
    title='Likelihood of Technologies Requested for Top-3 Data Roles',
    margin=dict(l=60, r=20, t=80, b=60)
)

fig.show()

### Number of skills requested for each of Top-3 Data Roles

In [27]:
# Count skills
def count_skills(skills_str):
    if pd.isna(skills_str) or skills_str.strip() == '':
        return 0
    return len([s for s in skills_str.split(',') if s.strip() != ''])

df_top = df_top.copy()
df_top['skills_count'] = df_top['job_skills'].apply(count_skills)

#Average number of skills per role
avg_skills_per_role = (
    df_top.groupby('job_title_short')['skills_count']
    .mean()
    .reset_index()
    .rename(columns={'skills_count': 'avg_skills'})
    .sort_values(by='avg_skills', ascending=False)
)

avg_skills_per_role['avg_skills'] = avg_skills_per_role['avg_skills'].round(0).astype(int)

print(avg_skills_per_role)

  job_title_short  avg_skills
1   Data Engineer           7
2  Data Scientist           5
0    Data Analyst           4
