# How well are the most popular roles paid?
- Global Salary Distribution
- Salaries of Top-3 in-Demand Data Roles in Europe
- EU - US Salary Gap

In [4]:
import pandas as pd
import plotly.express as px

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.graph_objects as go

# Location of the input CSV that every later cell reads through `df`.
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this exact location exists.
DATA_PATH = '/Users/kolesnikevgenia/Documents/Python_Projects/Job_Skills/Raw_Data/df_Final.csv'

df = pd.read_csv(DATA_PATH)

### Global Salary Distribution

In [5]:
# Trim outliers: keep only rows whose monthly salary lies inside the
# 5th–95th percentile band (both bounds inclusive). Rows with a missing
# salary fail the range test and are therefore excluded as well.
salary_eur = df['salary_month_avg_eur']
q_low, q_high = salary_eur.quantile(0.05), salary_eur.quantile(0.95)
df_valid = df[salary_eur.between(q_low, q_high)]

In [6]:
# Median monthly salary per job title, computed on the outlier-trimmed data
# and sorted ascending.
median_salary = (
    df_valid
    .groupby('job_title_short')['salary_month_avg_eur']
    .median()
    .sort_values()
)

# Plot from a tidy frame with named columns so the `labels` mapping is keyed
# by real column names and applies to both axes and the hover text, instead of
# depending on how plotly express auto-names raw array arguments.
median_salary_df = median_salary.reset_index()

fig1 = px.bar(
    median_salary_df,
    x='salary_month_avg_eur',
    y='job_title_short',
    orientation='h',
    title='Global Median Salary by Job Title',
    # The plotted column is `salary_month_avg_eur`, so the axis is labelled
    # in EUR (the previous label said USD).
    labels={
        'salary_month_avg_eur': 'Median Monthly Salary (EUR)',
        'job_title_short': 'Job Title'
    },
    color_discrete_sequence=['#5DADE2']
)

fig1.update_layout(showlegend=False)
fig1.show()

### Salaries of Top-3 in-Demand Data Roles in Europe

In [7]:
# Roles treated as the top-3 in-demand data roles in the cells below.
top_roles = ['Data Analyst', 'Data Scientist', 'Data Engineer']

# EU rows for those roles. This filters the full `df`, not the
# percentile-trimmed `df_valid`.
is_top_role = df['job_title_short'].isin(top_roles)
is_eu = df['region_group'] == 'EU'
df_eu_top = df[is_top_role & is_eu]

In [8]:
# Box plot of monthly salary per role for the EU subset.
box_axis_labels = {
    'salary_month_avg_eur': 'Monthly Salary (EUR)',
    'job_title_short': 'Job Title'
}
fig2 = px.box(
    df_eu_top,
    x='job_title_short',
    y='salary_month_avg_eur',
    title='Salary Distribution in EU for Top-3 Roles',
    labels=box_axis_labels
)

# Use one colour for both the outlier markers and the box outlines.
fig2.update_traces(marker_color='#5DADE2', line_color='#5DADE2')

# Annotate each box with its median (rounded to whole euros), nudged 10 px
# upwards so the text sits just above the median line.
medians = (
    df_eu_top
    .groupby('job_title_short')['salary_month_avg_eur']
    .median()
    .round(0)
)
for role_name, median_eur in medians.items():
    fig2.add_annotation(
        x=role_name,
        y=median_eur,
        text=f"Median: €{median_eur:,.0f}",
        showarrow=False,
        font={'size': 12},
        yshift=10
    )

fig2.update_layout(showlegend=False)
fig2.show()

### EU - US Salary Gap

In [9]:
# Rows for the top roles located in either the EU or the US.
in_eu_or_us = df['region_group'].isin(['EU', 'US'])
in_top_roles = df['job_title_short'].isin(top_roles)
df_top = df[in_eu_or_us & in_top_roles]

# Median salary per role with one column per region, plus the US-minus-EU gap
# in absolute terms and as a percentage of the EU median.
salary_medians = (
    df_top
    .groupby(['job_title_short', 'region_group'])['salary_month_avg_eur']
    .median()
    .unstack()
    .assign(
        gap_abs=lambda t: t['US'] - t['EU'],
        gap_pct=lambda t: 100 * (t['gap_abs'] / t['EU']),
    )
)

print(salary_medians[['EU', 'US', 'gap_abs', 'gap_pct']].round(2))

region_group          EU      US  gap_abs  gap_pct
job_title_short                                   
Data Analyst     6539.94  6606.0    66.06     1.01
Data Engineer    8544.97  9175.0   630.03     7.37
Data Scientist   7812.66  9542.0  1729.34    22.14


In [10]:
# Hand-maintained lookup from `job_country` names to ISO-3 codes, the
# location format used by the choropleth below.
# NOTE(review): keys must match the `job_country` spelling exactly; a country
# spelled differently in the data (e.g. 'Czechia') would get no code and be
# dropped below — confirm against the dataset.
country_iso_map = {
    'Germany': 'DEU',
    'France': 'FRA',
    'Spain': 'ESP',
    'Italy': 'ITA',
    'Netherlands': 'NLD',
    'Poland': 'POL',
    'Sweden': 'SWE',
    'Ireland': 'IRL',
    'Belgium': 'BEL',
    'Austria': 'AUT',
    'Finland': 'FIN',
    'Portugal': 'PRT',
    'Denmark': 'DNK',
    'Czech Republic': 'CZE',
    'Hungary': 'HUN',
    'Romania': 'ROU',
    'Greece': 'GRC',
    'Croatia': 'HRV',
    'Slovakia': 'SVK',
    'Slovenia': 'SVN',
    'Bulgaria': 'BGR',
    'Lithuania': 'LTU',
    'Latvia': 'LVA',
    'Estonia': 'EST',
    'Luxembourg': 'LUX',
    'Malta': 'MLT',
    'Cyprus': 'CYP',
}

# Median salary per (country, role), tagged with its ISO-3 code; rows whose
# country has no entry in the lookup are discarded.
df_map = (
    df_eu_top
    .groupby(['job_country', 'job_title_short'])['salary_month_avg_eur']
    .median()
    .reset_index()
    .assign(iso_alpha=lambda t: t['job_country'].map(country_iso_map))
    .dropna(subset=['iso_alpha'])
)

In [11]:
# Choropleth of median monthly salary per EU country, with one trace per role
# and a row of buttons that switches which role's trace is visible.

# 1. Prepare traces for each role (Data Analyst, Scientist, Engineer)
fig = go.Figure()

job_titles = df_map['job_title_short'].unique()

# Shared colour range across all roles so colours stay comparable when
# toggling; computed once because it does not depend on the role.
z_lo = df_map['salary_month_avg_eur'].min()
z_hi = df_map['salary_month_avg_eur'].max()

for idx, role in enumerate(job_titles):
    df_role = df_map[df_map['job_title_short'] == role]
    fig.add_trace(go.Choropleth(
        locations=df_role['iso_alpha'],
        z=df_role['salary_month_avg_eur'],
        text=df_role['job_country'],
        colorscale='Blues',
        colorbar_title='EUR',
        zmin=z_lo,
        zmax=z_hi,
        visible=(idx == 0),  # show only first role initially
        name=role,
        locationmode='ISO-3'
    ))

# 2. Side-by-side toggle buttons
# Each button shows exactly one trace and updates only `title.text`.
# Passing a bare string as `title` would replace the whole title object
# (dropping the x / y / xanchor set in the layout below) and relies on the
# deprecated string form of `title`.
buttons = [
    {
        'label': role,
        'method': 'update',
        'args': [
            {'visible': [r == role for r in job_titles]},
            {'title.text': f'Median Monthly Salary in EU — {role}'}
        ]
    }
    for role in job_titles
]

# 3. Layout with buttons above the plot (not overlapping)
fig.update_layout(
    title=dict(
        text=f'Median Monthly Salary in EU — {job_titles[0]}',
        y=0.92,  # move title up to make space
        x=0.5,
        xanchor='center'
    ),
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection_type='natural earth',
        scope='europe'
    ),
    height=600,
    updatemenus=[{
        'type': 'buttons',
        'buttons': buttons,
        'direction': 'right',
        'pad': {'r': 10, 't': 10},
        'x': 0.5,
        'xanchor': 'center',
        'y': 1.05,
        'yanchor': 'top'
    }]
)

fig.show()