# Which data-related roles are the most popular in Europe and worldwide?

- The Most Popular Data Roles by Number of Job Postings
- Job Postings Trend for Top-3 Data Roles
- Top-3 Data Roles in EU, US and Other Countries

In [2]:
import pandas as pd

#%pip install plotly
import plotly.express as px
import matplotlib.pyplot as plt

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Absolute, machine-specific location of the job-postings CSV.
# NOTE(review): adjust this path (or make it relative to the project) when running elsewhere.
DATA_PATH = '/Users/kolesnikevgenia/Documents/Python_Projects/Job_Skills/Raw_Data/df_Final.csv'

# Load the dataset once; the chart cells below all read from `df`.
df = pd.read_csv(DATA_PATH)

### The Most Popular Data Roles by Number of Job Postings

In [3]:
# Count postings per short job title, keep the ten most frequent, and order
# them ascending so the largest bar ends up at the top of the horizontal chart.
title_counts = df['job_title_short'].value_counts()
top_titles = (
    title_counts.iloc[:10]
    .rename_axis('job_title')
    .reset_index(name='count')
    .sort_values('count')
)

# Horizontal bar chart: one bar per job title, bar length = number of postings.
fig = px.bar(
    top_titles,
    x='count',
    y='job_title',
    orientation='h',
    title='The Most Popular Data Roles by Number of Job Postings',
    labels={'count': 'Number of Job Postings', 'job_title': ''},
)

# Print the count next to each bar and use a single bar color.
fig.update_traces(
    marker_color='steelblue',
    textposition='outside',
    text=top_titles['count'],
)
# Extra left margin leaves room for the longer job titles.
fig.update_layout(margin={'t': 60, 'l': 150})
fig.show()

### Job Postings Trend for Top-3 Data Roles

In [4]:
# The three roles tracked in the trend chart, each with a fixed line color.
top3_roles = ['Data Analyst', 'Data Scientist', 'Data Engineer']
job_colors = dict(zip(top3_roles, ['#4c78a8', '#6baed6', '#9ecae1']))

# Keep only postings for those roles, parse the posting date and derive a
# 'YYYY-MM' month label from it.
is_top3 = df['job_title_short'].isin(top3_roles)
df_top3 = df.loc[is_top3].assign(
    job_posted_date=lambda d: pd.to_datetime(d['job_posted_date']),
    month=lambda d: d['job_posted_date'].dt.to_period('M').astype(str),
)

# Number of postings per (month, role) pair, in long format.
monthly_counts = (
    df_top3.groupby(['month', 'job_title_short'])
    .size()
    .rename('count')
    .reset_index()
)

# Wide format for plotting: one row per month, one column per role;
# month/role combinations without postings become 0.
df_pivot = (
    monthly_counts
    .pivot(index='month', columns='job_title_short', values='count')
    .fillna(0)
    .sort_index()
)

# One line (with markers) per role, drawn in the order of top3_roles.
fig = go.Figure(
    data=[
        go.Scatter(
            x=df_pivot.index,
            y=df_pivot[role],
            mode='lines+markers',
            name=role,
            line={'color': job_colors[role], 'width': 4},
            marker={'size': 9},
        )
        for role in top3_roles
    ]
)

# Titles, size and theme of the figure.
fig.update_layout(
    title='Job Postings Trend for Top-3 Data Roles',
    xaxis_title='',
    yaxis_title='Number of Job Postings',
    height=400,
    width=950,
    template='plotly_white',
    margin={'t': 60, 'b': 40, 'l': 60, 'r': 40},
    legend_title='Job Title',
)

fig.show()

### Top-3 Data Roles in EU, US and Other Countries

In [5]:
# Fixed colors for the three roles highlighted throughout the notebook.
job_colors = {
    'Data Analyst': '#4c78a8',
    'Data Scientist': '#6baed6',
    'Data Engineer': '#9ecae1'
}
# Spare colors for roles that make the data-derived top 3 but have no entry
# in job_colors (one per possible unmapped role).
fallback_colors = ['#c6dbef', '#3182bd', '#08519c']

regions = ['EU', 'US', 'Other']
# The three most frequent job titles across the whole dataset.
top3_global = df['job_title_short'].value_counts().nlargest(3).index.tolist()

# top3_global comes from the data while job_colors is keyed by fixed names,
# so give every unmapped role a fallback color instead of failing with a
# KeyError during the color lookup below.
role_colors = dict(job_colors)
unmapped_roles = [job for job in top3_global if job not in role_colors]
role_colors.update(zip(unmapped_roles, fallback_colors))

# One row with three pie ("domain") subplots, one per region.
fig = make_subplots(
    rows=1,
    cols=3,
    specs=[[{'type': 'domain'}, {'type': 'domain'}, {'type': 'domain'}]],
    column_widths=[0.3, 0.3, 0.3],
    horizontal_spacing=0.1
)

# Hand-tuned x positions of the region labels placed above each pie.
title_positions = [0.12, 0.5, 0.9]

for col, (region, x_pos) in enumerate(zip(regions, title_positions), start=1):
    region_data = df.loc[df['region_group'] == region, 'job_title_short']

    # Postings per top-3 role within this region; roles with no postings
    # in the region are left out of the pie.
    counts = region_data.value_counts().reindex(top3_global, fill_value=0)
    counts = counts[counts > 0]
    labels = counts.index.tolist()
    values = counts.tolist()
    colors = [role_colors[job] for job in labels]

    # Donut chart for the region; only the first pie contributes legend entries.
    fig.add_trace(
        go.Pie(
            labels=labels,
            values=values,
            hole=0.5,
            marker=dict(colors=colors, line=dict(color='white', width=2)),
            textinfo='percent',
            hoverinfo='none',
            showlegend=(col == 1)
        ),
        row=1, col=col
    )

    # Region name as a caption above the pie.
    fig.add_annotation(
        text=region,
        x=x_pos,
        y=1.03,
        showarrow=False,
        font_size=14
    )

# Layout
fig.update_layout(
    title_text='Top-3 Data Roles by Region',
    height=340,
    width=950,
    margin=dict(t=50, b=30, r=40, l=40),
)

fig.show()