# How can you maximize your salary as a Data Analyst in Europe? 
- Which skills are most strongly correlated with higher pay?
- Which type of job is better to search for: remote or in-office?
- Do you need a special degree to get a higher salary?
- When is the best time to search for a job to get a higher salary?

In [80]:
import ast
import calendar
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots


# Location of the prepared job-postings CSV. The default is the original
# author's local copy; set the JOB_SKILLS_CSV environment variable to point the
# notebook at the file on another machine without editing this cell.
DATA_CSV_PATH = os.environ.get(
    'JOB_SKILLS_CSV',
    '/Users/kolesnikevgenia/Documents/Python_Projects/Job_Skills/Raw_Data/df_Final.csv',
)

df = pd.read_csv(DATA_CSV_PATH)

### Remove Salary Outliers

In [81]:
# Tunables for the outlier filter.
SALARY_COL = 'salary_month_avg_eur'
IQR_MULTIPLIER = 1.5   # width of the fences, in IQRs, on each side of Q1/Q3
MIN_GROUP_SIZE = 5     # fewer salary observations than this -> use the global fences


def iqr_bounds(salaries):
    """Return the ``(lower, upper)`` IQR fences of a salary Series.

    The fences are ``Q1 - IQR_MULTIPLIER * IQR`` and ``Q3 + IQR_MULTIPLIER * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of ``salaries``.
    """
    q1 = salaries.quantile(0.25)
    q3 = salaries.quantile(0.75)
    spread = q3 - q1
    return q1 - IQR_MULTIPLIER * spread, q3 + IQR_MULTIPLIER * spread


# Keep Data Analyst postings only.
# NOTE(review): the region filter below is disabled, so nothing in this cell
# restricts the data to the EU - confirm that `df` is already EU-only.
df_eu_ds = df[
    #(df['region_group'] == 'EU') &
    (df['job_title_short'] == 'Data Analyst')
]

# Fences over all selected postings; used as a fallback for small groups.
lower_bound_global, upper_bound_global = iqr_bounds(df_eu_ds[SALARY_COL])

# Filter each (country, title) group with its own fences when it has enough
# salary observations, otherwise with the global ones.
# NOTE(review): groupby() drops rows whose job_country is NaN by default, so
# such postings never reach df_filtered - confirm this is intended.
filtered_groups = []
for group_key, group in df_eu_ds.groupby(['job_country', 'job_title_short']):
    salaries = group[SALARY_COL]
    # count() ignores missing salaries, so the threshold reflects the number of
    # values the quantiles are actually computed from.
    if salaries.count() >= MIN_GROUP_SIZE:
        lower, upper = iqr_bounds(salaries)
    else:
        lower, upper = lower_bound_global, upper_bound_global

    # between() is inclusive on both ends; rows with a missing salary are dropped.
    filtered_groups.append(group[salaries.between(lower, upper)])

# Combine all groups back
df_filtered = pd.concat(filtered_groups, ignore_index=True)

print(f"Original dataset size: {len(df_eu_ds)}")
print(f"Filtered dataset size: {len(df_filtered)}")

Original dataset size: 5451
Filtered dataset size: 5324


### Merge technology column

In [82]:
# Every non-null `job_type_skills` cell is parsed with ast.literal_eval into a
# dict of {technology: [skills]}; parsing each distinct cell once is enough.
type_skill_strings = df_filtered['job_type_skills'].dropna().drop_duplicates()

# Union the skills seen for each technology across all postings.
skills_by_technology = {}
for raw_mapping in type_skill_strings:
    for technology, skills in ast.literal_eval(raw_mapping).items():
        skills_by_technology.setdefault(technology, set()).update(skills)

# Sorted lists (rather than list(set(...))) keep the order stable between runs.
technology_dict = {
    technology: sorted(skills)
    for technology, skills in skills_by_technology.items()
}

# Long format: one row per (technology, skill) pair.
df_technology = pd.DataFrame(
    list(technology_dict.items()), columns=['technology', 'skills']
).explode('skills')

# Postings that list skills, with the stringified list parsed and then
# exploded to one row per (posting, skill).
df_dropna = df_filtered.dropna(subset=['job_skills']).copy()
df_dropna['job_skills'] = df_dropna['job_skills'].apply(ast.literal_eval)
df_exploded = df_dropna.explode('job_skills')

# Inner join: skills without a technology are dropped, and a skill that appears
# under several technologies yields one row per technology.
df_plot = df_exploded.merge(df_technology, left_on='job_skills', right_on='skills')

### Top-10 Highest-Paid Skills in Top-2 In-Demand Technologies

In [83]:
# Median monthly salary per skill, then the ten skills with the highest median.
median_by_skill = df_plot.groupby('job_skills')['salary_month_avg_eur'].median()
ten_best_paid = median_by_skill.sort_values(ascending=False).head(10)

# Two-column frame, ordered ascending by salary for the horizontal bar chart.
top_skills_combined = (
    ten_best_paid
    .reset_index()
    .rename(columns={'job_skills': 'skill', 'salary_month_avg_eur': 'median_salary_eur'})
    .sort_values('median_salary_eur')
)

# One horizontal bar per skill, shaded and labelled by its median salary.
median_salaries = top_skills_combined['median_salary_eur']
salary_bars = go.Bar(
    x=median_salaries,
    y=top_skills_combined['skill'],
    orientation='h',
    marker={'color': median_salaries, 'colorscale': 'Blues'},
    text=median_salaries.round(0).astype(int),
    textposition='auto',
)

fig = go.Figure(salary_bars)
fig.update_layout(
    title='Top 10 Highest Paid Skills in Analyst Tools & Programming',
    xaxis_title='Median Salary (EUR)',
    yaxis_title='',
    template='plotly_white',
    height=500,
    width=900,
)

fig.show()

In [86]:
top_skills_set = set(top_skills_combined['skill'])

# Row-level flag on the exploded frame: each df_plot row holds a single skill,
# so this is a plain membership test. It is kept so that anything reading
# df_plot['has_top_skill'] keeps working; the comparison below does not use it.
df_plot['has_top_skill'] = df_plot['job_skills'].isin(top_skills_set)

# Posting-level comparison: one row per posting, flagged when ANY of its skills
# is a top-10 skill. Using the exploded df_plot here would count a posting's
# other skill rows in the "without" group and weight postings by skill count.
# Assumes each df_dropna['job_skills'] value is a list of skill names.
# Postings with a missing salary are dropped.
df_valid = df_dropna.dropna(subset=['salary_month_avg_eur']).copy()
df_valid['has_top_skill'] = df_valid['job_skills'].apply(
    lambda skills: not top_skills_set.isdisjoint(skills)
)

# Calculate average salary for each group
avg_salary = df_valid.groupby('has_top_skill')['salary_month_avg_eur'].mean().reset_index()

# Add readable group labels
avg_salary['group'] = avg_salary['has_top_skill'].map({
    True: 'With Top 10 Skills',
    False: 'Without Top 10 Skills'
})

# Colour by group rather than by position so a bar can never get the other
# group's colour. Light blue = Without, dark blue = With.
bar_colors = avg_salary['has_top_skill'].map({False: '#9ecae1', True: '#084594'}).tolist()

# Calculate % difference of "with" relative to "without"
with_salary = avg_salary.loc[avg_salary['group'] == 'With Top 10 Skills', 'salary_month_avg_eur'].values[0]
without_salary = avg_salary.loc[avg_salary['group'] == 'Without Top 10 Skills', 'salary_month_avg_eur'].values[0]
pct_diff = (with_salary - without_salary) / without_salary * 100

# Plot
fig = go.Figure()

fig.add_trace(go.Bar(
    x=avg_salary['group'],
    y=avg_salary['salary_month_avg_eur'],
    marker_color=bar_colors,
    text=avg_salary['salary_month_avg_eur'].round(0).astype(int),
    textposition='auto'
))

# Annotate % difference; the '+' format flag prints the real sign, so a
# negative difference is shown as '-x%' instead of '+-x%'.
fig.add_annotation(
    x=0.5,
    y=max(avg_salary['salary_month_avg_eur']) * 1.05,
    text=f'{pct_diff:+.1f}%',
    showarrow=False,
    font=dict(size=14, color='grey')
)

fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False)

fig.update_layout(
    title='Average Salary: With vs Without Top 10 Skills, EUR',
    yaxis_title='',
    xaxis_title='',
    template='plotly_white',
    height=400,
    width=600,
)

fig.show()

In [None]:
# Left-to-right bar order for each chart, and the colours applied in that order.
remote_order = ['On-site', 'Remote']
degree_order = ['No Degree Mentioned', 'Degree Mentioned']
colors = ['#9ecae1', '#084594']  # light blue, dark blue

# Human-readable labels for the two boolean flags.
remote_labels = {True: 'Remote', False: 'On-site'}
degree_labels = {True: 'No Degree Mentioned', False: 'Degree Mentioned'}

# Mean salary per remote flag, labelled and arranged in `remote_order`.
remote_salary_avg = (
    df_filtered.groupby('job_work_from_home')['salary_month_avg_eur']
    .mean()
    .reset_index()
    .assign(**{'Remote Work': lambda means: means['job_work_from_home'].map(remote_labels)})
    .set_index('Remote Work')
    .loc[remote_order]
    .reset_index()
)

# Mean salary per degree-mention flag, labelled and arranged in `degree_order`.
degree_salary_avg = (
    df_filtered.groupby('job_no_degree_mention')['salary_month_avg_eur']
    .mean()
    .reset_index()
    .assign(**{'Degree Mention': lambda means: means['job_no_degree_mention'].map(degree_labels)})
    .set_index('Degree Mention')
    .loc[degree_order]
    .reset_index()
)
# Calculate % differences for annotations
def calc_pct_diff(group_df):
    """Return the percentage change from the first row's salary to the second's.

    The 'salary_month_avg_eur' column of ``group_df`` is read by position:
    row 0 is the baseline and row 1 is the value compared against it.
    """
    salaries = group_df['salary_month_avg_eur']
    baseline = salaries.iloc[0]
    comparison = salaries.iloc[1]
    relative_change = (comparison - baseline) / baseline
    return relative_change * 100

# % change of the second bar relative to the first bar in each chart
remote_pct_diff = calc_pct_diff(remote_salary_avg)
degree_pct_diff = calc_pct_diff(degree_salary_avg)

# Create subplot figure with 2 columns
fig = make_subplots(rows=1, cols=2, subplot_titles=('Avg Salary by Remote Work', 'Avg Salary by Degree Mention'))

# Nudge the annotations that exist so far (the subplot titles) upwards, before
# the percentage labels are added. Those labels are positioned in salary units,
# where a +0.05 shift would be meaningless.
for ann in fig['layout']['annotations']: ann['y'] += 0.05

# One entry per panel:
# (subplot column, averages frame, label column, % difference, x ref, y ref)
panels = [
    (1, remote_salary_avg, 'Remote Work', remote_pct_diff, 'x domain', 'y'),
    (2, degree_salary_avg, 'Degree Mention', degree_pct_diff, 'x2 domain', 'y2'),
]

for panel_col, panel_df, label_col, panel_pct, panel_xref, panel_yref in panels:
    panel_salaries = panel_df['salary_month_avg_eur']

    # Bars, labelled with the rounded average salary
    fig.add_trace(
        go.Bar(
            x=panel_df[label_col],
            y=panel_salaries,
            marker_color=colors,
            showlegend=False,
            text=panel_salaries.round(0).astype(int),
            textposition='auto'
        ),
        row=1,
        col=panel_col
    )

    # % difference, centred horizontally in the panel and just above the
    # tallest bar. The '+' format flag prints the real sign, so a negative
    # difference is shown as '-x%' instead of '+-x%'.
    fig.add_annotation(
        x=0.5,
        y=max(panel_salaries) * 1.05,
        xref=panel_xref,
        yref=panel_yref,
        text=f'{panel_pct:+.1f}%',
        showarrow=False,
        font=dict(color='grey', size=14)
    )

# Update layout: remove y-axis ticks and labels
fig.update_yaxes(
    showticklabels=False,
    showgrid=False,
    zeroline=False
)

fig.update_layout(
    height=400,
    width=800,
    yaxis_title='',
    template='plotly_white',
)

fig.show()

In [88]:
# Parse posting dates (unparseable values become NaT) and derive the month number.
df_filtered['job_posted_date'] = pd.to_datetime(df_filtered['job_posted_date'], errors='coerce')
df_filtered['month_num'] = df_filtered['job_posted_date'].dt.month

# Mean salary per calendar month, January first.
monthly_avg = (
    df_filtered.groupby('month_num')['salary_month_avg_eur']
    .mean()
    .reset_index()
    .sort_values('month_num')
)

# Add month names. If any date was coerced to NaT, month_num is a float column,
# and calendar.month_name cannot be indexed with a float - hence the int() cast.
monthly_avg['month'] = monthly_avg['month_num'].map(
    lambda month_no: calendar.month_name[int(month_no)]
)

# Calculate overall average salary
overall_avg = df_filtered['salary_month_avg_eur'].mean()

# Calculate % difference from overall average
monthly_avg['pct_diff'] = (monthly_avg['salary_month_avg_eur'] - overall_avg) / overall_avg * 100

# Create bar chart
fig = go.Figure()

# Bars for monthly averages, labelled with the signed % difference
fig.add_trace(go.Bar(
    x=monthly_avg['month'],
    y=monthly_avg['salary_month_avg_eur'],
    marker_color='#4292c6',
    text=monthly_avg['pct_diff'].apply(lambda x: f"{x:+.1f}%"),
    textposition='outside',
    name='Monthly Avg Salary'
))

# Add horizontal line for overall average
fig.add_trace(go.Scatter(
    x=monthly_avg['month'],
    y=[overall_avg] * len(monthly_avg),
    mode='lines',
    name='Overall Average',
    line=dict(color='grey', dash='dash')
))

# Layout
fig.update_layout(
    title='Average Monthly Salary and % Difference from Overall Avg',
    yaxis_title='Average Salary (EUR)',
    xaxis_title='Month',
    template='plotly_white',
    height=450
)

fig.show()