# How can a Data Analyst maximize their salary when searching for a job in Europe?
- Must-Have Skills for Data Analysts and How They Affect Salary
- Top-Paying Analytical Skills and Their Impact on Salary
- Impact of Job Type (Remote vs On-site) and Degree Requirements on Salary
- Best Time to Search: Salary Trends Over Time

In [74]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.linear_model import LinearRegression
import ast

# Load the job-postings CSV into df (absolute path on the author's machine).
raw_data_path = '/Users/kolesnikevgenia/Documents/Python_Projects/Job_Skills/Raw_Data/df_Final.csv'
df = pd.read_csv(raw_data_path)

### Remove Salary Outliers

In [75]:
# Remove salary outliers with the 1.5 * IQR rule, applied separately to each
# (job_country, job_title_short) group. Groups with fewer than MIN_GROUP_SIZE
# rows use bounds computed on the whole dataset instead.

# Global IQR bounds: the fallback for groups too small for their own quartiles
Q1_global = df['salary_month_avg_eur'].quantile(0.25)
Q3_global = df['salary_month_avg_eur'].quantile(0.75)
IQR_global = Q3_global - Q1_global
lower_bound_global = Q1_global - 1.5 * IQR_global
upper_bound_global = Q3_global + 1.5 * IQR_global

# Smallest group size for which group-specific bounds are computed
MIN_GROUP_SIZE = 5

# Collect the surviving rows of every group
filtered_groups = []

# dropna=False keeps rows whose country or title is missing; with the default
# (dropna=True) groupby would discard them before any salary check is made.
for _, group in df.groupby(['job_country', 'job_title_short'], dropna=False):
    if len(group) >= MIN_GROUP_SIZE:
        Q1 = group['salary_month_avg_eur'].quantile(0.25)
        Q3 = group['salary_month_avg_eur'].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
    else:
        lower = lower_bound_global
        upper = upper_bound_global

    # between() is inclusive on both ends; rows with a missing salary
    # evaluate to False and are therefore dropped here.
    filtered = group[group['salary_month_avg_eur'].between(lower, upper)]
    filtered_groups.append(filtered)

# Combine all groups back into one frame with a fresh 0..n-1 index
df_filtered = pd.concat(filtered_groups, ignore_index=True)

print(f"Original dataset size: {len(df)}")
print(f"Filtered dataset size: {len(df_filtered)}")

Original dataset size: 22003
Filtered dataset size: 21310


### Filter Data Analyst Roles in EU/US
Although our main focus is the European job market, analyzing US salary trends provides valuable foresight since the US often leads in technology adoption and skill demand. This helps us benchmark Europe’s position, anticipate emerging trends, and make more informed decisions regarding talent strategy and compensation.

In [76]:
# Keep only Data Analyst postings whose region_group is EU or US.
eu_us = ['EU', 'US']

in_target_region = df_filtered['region_group'].isin(eu_us)
is_data_analyst = df_filtered['job_title_short'] == 'Data Analyst'

df_da_eu_us = df_filtered[in_target_region & is_data_analyst]

### Must-Have Skills for Data Analysts and How They Affect Salary

In [77]:
# Must-have skills: take the 10 analyst-tool skills requested most often, rank
# them by median salary, and compare postings with vs without the 3 best-paid.

# Build the set of skills listed under 'analyst_tools'. Each job_type_skills
# value is parsed into a dict; only unique strings are parsed.
analyst_skills = set()
for row in df_da_eu_us['job_type_skills'].dropna().unique():
    row_dict = ast.literal_eval(row)
    analyst_skills.update(row_dict.get('analyst_tools', []))

df_da = df_da_eu_us.dropna(subset=['job_skills', 'salary_month_avg_eur']).copy()
df_da['job_skills'] = df_da['job_skills'].apply(ast.literal_eval)
# Explicit posting key: explode() repeats it on every (posting, skill) row,
# which lets the rows be folded back to one row per posting further down.
df_da['posting_id'] = np.arange(len(df_da))
df_da = df_da.explode('job_skills')
df_plot = df_da[df_da['job_skills'].isin(analyst_skills)].copy()

# Find top 10 skills by number of postings (frequency)
skill_counts = df_plot['job_skills'].value_counts()
top10_by_count = skill_counts.head(10).index.tolist()

df_top10_count = df_plot[df_plot['job_skills'].isin(top10_by_count)]

# Median salary of each of those 10 skills, highest first
top10_salary_skills = (
    df_top10_count.groupby('job_skills')['salary_month_avg_eur']
    .median()
    .sort_values(ascending=False)
    .reset_index()
)

# Number of (posting, skill) rows per skill, looked up instead of re-scanned
top10_salary_skills['job_postings'] = top10_salary_skills['job_skills'].map(skill_counts)

top10_salary_skills.columns = ['skill', 'median_salary_eur', 'job_postings']

# Top 3 highest-paid skills, read while the table is still sorted highest-first.
# The same set drives the bold axis labels below, so the two cannot disagree.
top3_skills_set = set(top10_salary_skills['skill'].head(3))

# Ascending order for plotting
top10_salary_skills = top10_salary_skills.sort_values('median_salary_eur')

top_skills_set = set(top10_salary_skills['skill'])

# Row-level flag: this (posting, skill) row is one of the top 3 skills
df_plot['has_top_skill'] = df_plot['job_skills'].isin(top3_skills_set)

# Fold back to one row per posting. Averaging the exploded rows would put a
# posting that lists both a top-3 skill and another analyst tool into both
# groups; here it counts once, as "with". The population is unchanged:
# postings that name at least one analyst-tool skill.
df_valid = (
    df_plot.groupby('posting_id')
    .agg(
        salary_month_avg_eur=('salary_month_avg_eur', 'first'),
        has_top_skill=('has_top_skill', 'any'),
    )
    .reset_index()
)

avg_salary = (
    df_valid.groupby('has_top_skill')['salary_month_avg_eur']
    .mean()
    .reset_index()
)

avg_salary['group'] = avg_salary['has_top_skill'].map({
    True: 'With Top 3 Skills',
    False: 'Without Top 3 Skills'
})

# Sort avg_salary to control bar order on plot
avg_salary['group'] = pd.Categorical(
    avg_salary['group'],
    categories=['Without Top 3 Skills', 'With Top 3 Skills'],
    ordered=True
)
avg_salary = avg_salary.sort_values('group')

with_salary = avg_salary.loc[avg_salary['group'] == 'With Top 3 Skills', 'salary_month_avg_eur'].values[0]
without_salary = avg_salary.loc[avg_salary['group'] == 'Without Top 3 Skills', 'salary_month_avg_eur'].values[0]
pct_diff = (with_salary - without_salary) / without_salary * 100

# Plotting
fig = make_subplots(
    rows=1, cols=2,
    column_widths=[0.6, 0.4],
    # The left panel shows the most frequently requested skills, not the best-paid ones
    subplot_titles=("Top 10 Most Requested Analyst Tools by Median Salary", "Avg Salary: With vs Without Top 3 Skills"),
    horizontal_spacing=0.1
)

# Left: horizontal bar for median salary of top 10 skills
fig.add_trace(go.Bar(
    x=top10_salary_skills['median_salary_eur'],
    y=top10_salary_skills['skill'],
    orientation='h',
    marker=dict(color=top10_salary_skills['median_salary_eur'], colorscale='Blues'),
    text=top10_salary_skills['median_salary_eur'].round(0).astype(int),
    textposition='auto',
    name='Top Skills'
), row=1, col=1)

# Bold the top 3 skills on left y-axis
ticktext = [f"<b>{s}</b>" if s in top3_skills_set else s for s in top10_salary_skills['skill']]

fig.update_yaxes(
    tickmode='array',
    tickvals=top10_salary_skills['skill'],
    ticktext=ticktext,
    row=1, col=1
)

# Right: bar for avg salary with/without top 3 skills
fig.add_trace(go.Bar(
    x=avg_salary['group'],
    y=avg_salary['salary_month_avg_eur'],
    marker_color=['#9ecae1' if g == 'Without Top 3 Skills' else '#084594' for g in avg_salary['group']],
    text=avg_salary['salary_month_avg_eur'].round(0).astype(int),
    textposition='auto',
    name='Group Comparison'
), row=1, col=2)

# NOTE(review): x=1.5 on this two-category axis looks like the right edge of
# the subplot rather than a bar centre - confirm the placement is intended.
fig.add_annotation(
    x=1.5,
    y=avg_salary['salary_month_avg_eur'].max() * 1.05,
    # '+' flag prints the real sign, so a negative difference is not shown as "+-x%"
    text=f'{pct_diff:+.1f}%',
    showarrow=False,
    font=dict(size=14, color='gray'),
    xref='x2',
    yref='y2'
)

fig.update_layout(
    height=500,
    width=1000,
    template='plotly_white',
    showlegend=False,
    margin=dict(t=60, b=40, l=60, r=40),
)

fig.update_xaxes(title_text='Median Salary, EUR', row=1, col=1)
fig.update_xaxes(showticklabels=False, row=1, col=1)
fig.update_yaxes(title_text='', row=1, col=1)
fig.update_xaxes(title_text='', row=1, col=2)
fig.update_yaxes(title_text='', showticklabels=False, row=1, col=2)

fig.show()

### Top-Paying Analytical Skills and Their Impact on Salary

In [78]:
# Top-paying skills: rank all analyst-tool skills by median salary, keep the
# 10 best-paid, and compare postings with vs without the 3 best-paid.

# Build the set of skills listed under 'analyst_tools'. Each job_type_skills
# value is parsed into a dict; only unique strings are parsed.
analyst_skills = set()
for row in df_da_eu_us['job_type_skills'].dropna().unique():
    row_dict = ast.literal_eval(row)
    analyst_skills.update(row_dict.get('analyst_tools', []))

df_da = df_da_eu_us.dropna(subset=['job_skills', 'salary_month_avg_eur']).copy()
df_da['job_skills'] = df_da['job_skills'].apply(ast.literal_eval)
# Explicit posting key: explode() repeats it on every (posting, skill) row,
# which lets the rows be folded back to one row per posting further down.
df_da['posting_id'] = np.arange(len(df_da))
df_da = df_da.explode('job_skills')
df_plot = df_da[df_da['job_skills'].isin(analyst_skills)].copy()

# Top 10 highest paid skills in analyst_tools, highest first.
# NOTE(review): no minimum posting count is applied, so a skill seen in a
# single posting can enter this ranking - confirm this is intended.
top10_salary_skills = (
    df_plot.groupby('job_skills')['salary_month_avg_eur']
    .median()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)

# Add the number of (posting, skill) rows for these skills
skill_counts = df_plot['job_skills'].value_counts()
top10_salary_skills['job_postings'] = top10_salary_skills['job_skills'].map(skill_counts)

top10_salary_skills.columns = ['skill', 'median_salary_eur', 'job_postings']

# Top 3 highest-paid skills, read while the table is still sorted highest-first
top3_skills_set = set(top10_salary_skills['skill'].head(3))

# Ascending order for plotting
top10_salary_skills = top10_salary_skills.sort_values('median_salary_eur')
top_skills_set = set(top10_salary_skills['skill'])

# Row-level flag: this (posting, skill) row is one of the top 3 skills
df_plot['has_top_skill'] = df_plot['job_skills'].isin(top3_skills_set)

# Fold back to one row per posting. Averaging the exploded rows would put a
# posting that lists both a top-3 skill and another analyst tool into both
# groups; here it counts once, as "with". The population is unchanged:
# postings that name at least one analyst-tool skill.
df_valid = (
    df_plot.groupby('posting_id')
    .agg(
        salary_month_avg_eur=('salary_month_avg_eur', 'first'),
        has_top_skill=('has_top_skill', 'any'),
    )
    .reset_index()
)

# groupby sorts the boolean key, so the rows come out as False (without), True (with)
avg_salary = (
    df_valid.groupby('has_top_skill')['salary_month_avg_eur']
    .mean()
    .reset_index()
)

avg_salary['group'] = avg_salary['has_top_skill'].map({
    True: 'With Top 3 Skills',
    False: 'Without Top 3 Skills'
})

with_salary = avg_salary.loc[avg_salary['group'] == 'With Top 3 Skills', 'salary_month_avg_eur'].values[0]
without_salary = avg_salary.loc[avg_salary['group'] == 'Without Top 3 Skills', 'salary_month_avg_eur'].values[0]
pct_diff = (with_salary - without_salary) / without_salary * 100

# Plotting
fig = make_subplots(
    rows=1, cols=2,
    column_widths=[0.6, 0.4],
    subplot_titles=("Top 10 Highest Paid Analyst Tools Skills", "Avg Salary: With vs Without Top 3 Skills"),
    horizontal_spacing=0.1
)

# Left: horizontal bar for median salary of the 10 best-paid skills
fig.add_trace(go.Bar(
    x=top10_salary_skills['median_salary_eur'],
    y=top10_salary_skills['skill'],
    orientation='h',
    marker=dict(color=top10_salary_skills['median_salary_eur'], colorscale='Blues'),
    text=top10_salary_skills['median_salary_eur'].round(0).astype(int),
    textposition='auto',
    name='Top Skills'
), row=1, col=1)

# Right: bar for avg salary with/without top 3 skills
fig.add_trace(go.Bar(
    x=avg_salary['group'],
    y=avg_salary['salary_month_avg_eur'],
    marker_color=['#9ecae1' if g == 'Without Top 3 Skills' else '#084594' for g in avg_salary['group']],
    text=avg_salary['salary_month_avg_eur'].round(0).astype(int),
    textposition='auto',
    name='Group Comparison'
), row=1, col=2)

# NOTE(review): x=1.5 on this two-category axis looks like the right edge of
# the subplot rather than a bar centre - confirm the placement is intended.
fig.add_annotation(
    x=1.5,
    y=avg_salary['salary_month_avg_eur'].max() * 1.05,
    # '+' flag prints the real sign, so a negative difference is not shown as "+-x%"
    text=f'{pct_diff:+.1f}%',
    showarrow=False,
    font=dict(size=14, color='gray'),
    xref='x2',
    yref='y2'
)

fig.update_layout(
    height=500,
    width=1000,
    template='plotly_white',
    showlegend=False,
    margin=dict(t=60, b=40, l=60, r=40),
)

fig.update_xaxes(title_text='Median Salary, EUR', row=1, col=1)
fig.update_xaxes(showticklabels=False, row=1, col=1)
fig.update_yaxes(title_text='', row=1, col=1)
fig.update_xaxes(title_text='', row=1, col=2)
fig.update_yaxes(title_text='', showticklabels=False, row=1, col=2)

fig.show()

- With Top 3 Skills: Job posting has at least one of the top 3 skills.
- Without Top 3 Skills: Job posting has none of the top 3 skills.

### Impact of Job Type (Remote vs On-site) and Degree Requirements on Salary

In [79]:
# Mean salary per remote-work flag and per degree-mention flag.
# NOTE(review): both averages are taken over all of df_filtered, not only the
# Data Analyst EU/US subset used by the skill charts - confirm this is intended.

# Bar colours and the left-to-right order of the bars in each subplot
colors = ['#9ecae1', '#084594']  # light blue, dark blue
remote_order = ['On-site', 'Remote']
degree_order = ['No Degree Mentioned', 'Degree Mentioned']

# Readable labels for the two boolean flags
remote_labels = {True: 'Remote', False: 'On-site'}
degree_labels = {True: 'No Degree Mentioned', False: 'Degree Mentioned'}

remote_salary_avg = (
    df_filtered.groupby('job_work_from_home')['salary_month_avg_eur']
    .mean()
    .reset_index()
)
remote_salary_avg['Remote Work'] = remote_salary_avg['job_work_from_home'].map(remote_labels)

degree_salary_avg = (
    df_filtered.groupby('job_no_degree_mention')['salary_month_avg_eur']
    .mean()
    .reset_index()
)
degree_salary_avg['Degree Mention'] = degree_salary_avg['job_no_degree_mention'].map(degree_labels)

# Put the rows in display order; .loc raises KeyError if a label is absent
remote_salary_avg = (
    remote_salary_avg.set_index('Remote Work')
    .loc[remote_order]
    .reset_index()
)
degree_salary_avg = (
    degree_salary_avg.set_index('Degree Mention')
    .loc[degree_order]
    .reset_index()
)

# Calculate % differences for annotations
def calc_pct_diff(group_df):
    """Return the percent change from the first row's salary to the second's.

    Reads the 'salary_month_avg_eur' column by position: row 0 is the
    baseline and row 1 is compared against it. The result is negative when
    the second row is lower than the first.
    """
    salaries = group_df['salary_month_avg_eur']
    baseline = salaries.iloc[0]
    comparison = salaries.iloc[1]
    relative_change = (comparison - baseline) / baseline
    return relative_change * 100

# Percent change from the first to the second bar of each subplot
remote_pct_diff = calc_pct_diff(remote_salary_avg)
degree_pct_diff = calc_pct_diff(degree_salary_avg)

# Create subplot figure with 2 columns
fig = make_subplots(rows=1, cols=2, subplot_titles=('Avg Salary by Remote Work', 'Avg Salary by Degree Mention'))

# Remote Work Bar
fig.add_trace(
    go.Bar(
        x=remote_salary_avg['Remote Work'],
        y=remote_salary_avg['salary_month_avg_eur'],
        marker_color=colors,
        showlegend=False,
        text=remote_salary_avg['salary_month_avg_eur'].round(0).astype(int),
        textposition='auto'
    ),
    row=1,
    col=1
)

# Degree Mention Bar
fig.add_trace(
    go.Bar(
        x=degree_salary_avg['Degree Mention'],
        y=degree_salary_avg['salary_month_avg_eur'],
        marker_color=colors,
        showlegend=False,
        text=degree_salary_avg['salary_month_avg_eur'].round(0).astype(int),
        textposition='auto'
    ),
    row=1,
    col=2
)

# Percent-difference labels: horizontally centred in each subplot, placed
# 5% above the taller bar. The '+' format flag prints the real sign, so a
# negative difference is not shown as "+-x%".
fig.add_annotation(
    x=0.5,
    y=remote_salary_avg['salary_month_avg_eur'].max() * 1.05,
    xref='x domain',  # first subplot x axis
    yref='y',         # first subplot y axis
    text=f'{remote_pct_diff:+.1f}%',
    showarrow=False,
    font=dict(color='grey', size=14)
)

fig.add_annotation(
    x=0.5,
    y=degree_salary_avg['salary_month_avg_eur'].max() * 1.05,
    xref='x2 domain', # second subplot x axis
    yref='y2',        # second subplot y axis
    text=f'{degree_pct_diff:+.1f}%',
    showarrow=False,
    font=dict(color='grey', size=14)
)

# Remove y-axis tick labels, grid lines and zero line on both subplots
fig.update_yaxes(
    showticklabels=False,
    showgrid=False,
    zeroline=False
)

# Lift the subplot titles slightly. Only paper-referenced annotations are
# moved; the percent labels above are positioned in salary units, where
# adding 0.05 would be a meaningless shift.
for ann in fig['layout']['annotations']:
    if ann['yref'] == 'paper':
        ann['y'] += 0.05

fig.update_layout(
    height=400,
    width=800,
    yaxis_title='',
    template='plotly_white',
)

fig.show()

### Best Time to Search: Salary Trends Over Time

In [80]:
# Monthly average salary of EU Data Analyst postings, with a linear trend line
df_eu_da = df_filtered[
    (df_filtered['region_group'] == 'EU') &
    (df_filtered['job_title_short'] == 'Data Analyst')
].copy()

# Convert posting date to datetime and extract the month as a 'YYYY-MM' string
df_eu_da['job_posted_date'] = pd.to_datetime(df_eu_da['job_posted_date'])
df_eu_da['month'] = df_eu_da['job_posted_date'].dt.to_period('M').astype(str)

# Group by month and calculate average salary
monthly_salary = (
    df_eu_da.groupby(['month', 'job_title_short'])['salary_month_avg_eur']
    .mean()
    .reset_index()
)

# Pivot to wide format. A month without salary data is dropped rather than
# filled with 0: a zero would be plotted as a real salary of 0 EUR and would
# pull both the trend line and the average down.
df_salary_pivot = (
    monthly_salary
    .pivot(index='month', columns='job_title_short', values='salary_month_avg_eur')
    .dropna(subset=['Data Analyst'])
    .sort_index()
)

# Prepare regression input: x is the month's position (0, 1, 2, ...), y its average salary
x = np.arange(len(df_salary_pivot.index)).reshape(-1, 1)
y = df_salary_pivot['Data Analyst'].values
model = LinearRegression().fit(x, y)
trend = model.predict(x)

# Plot actual salaries
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_salary_pivot.index,
    y=y,
    mode='lines+markers',
    name='Data Analyst',
    line=dict(color='#4c78a8', width=4),
    marker=dict(size=9)
))

# Plot trend line
fig.add_trace(go.Scatter(
    x=df_salary_pivot.index,
    y=trend,
    mode='lines',
    name='Trend Line (Linear)',
    line=dict(color='gray', dash='dash', width=3)
))

# Layout
fig.update_layout(
    title='Average Salary for Data Analyst Roles in the EU',
    xaxis_title='',
    yaxis_title='Avg Monthly Salary, EUR',
    height=450,
    width=950,
    template='plotly_white',
    margin=dict(t=60, b=40, l=60, r=40),
    legend_title='Legend'
)

# Peak month versus the mean of the monthly averages (each month weighs the
# same, regardless of how many postings it has)
peak_salary = y.max()
avg_salary = y.mean()
pct_diff = ((peak_salary - avg_salary) / avg_salary) * 100

# Create annotation text
annotation_text = (
    f"📈 Peak salary: €{peak_salary:,.0f}<br>"
    f"📊 Average salary: €{avg_salary:,.0f}<br>"
    f"🔺 Peak is {pct_diff:.1f}% higher than average"
)

# Add annotation, positioned in paper coordinates to the right of the plot area
fig.add_annotation(
    xref="paper", yref="paper",
    x=1.25, y=0.05,
    text=annotation_text,
    showarrow=False,
    align="right",
    font=dict(size=13, color="black"),
    bgcolor="rgba(255,255,255,0.8)",
    bordercolor="lightgrey",
    borderwidth=1
)

fig.show()