# Importing Libraries and Dependencies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px 
import plotly.graph_objects as go
import plotly.figure_factory as ff
import country_converter as coco

%matplotlib inline

# Loading Salary Data

In [None]:
# Read the salary CSV, discard four columns, and keep only the rows whose
# employment type is 'FT' (full-time).
salary_df = (
    pd.read_csv('ds_salaries.csv')
    .drop(columns=['Unnamed: 0', 'salary_currency', 'salary', 'remote_ratio'])
    .loc[lambda frame: frame['employment_type'] == 'FT']
    .copy()
)

# Remove any spaces from the remaining column names. This runs after the drop
# because the 'Unnamed: 0' label itself contains a space.
salary_df.columns = salary_df.columns.str.replace(' ', '')
salary_df

# The dataset contains 50 different job titles
# 22 rows were dropped so that only full-time (FT) employees are considered

In [None]:
# Expand the abbreviated category codes into readable labels so the tables
# and charts are easier to interpret.
exp_level = {'SE': 'Senior', 'MI': 'Mid', 'EN': 'Entry', 'EX': 'Executive'}
comp_size = {'L': 'Large', 'M': 'Medium', 'S': 'Small'}

# Values that are not a known code are passed through unchanged instead of
# becoming NaN. That includes labels already expanded by an earlier run of
# this cell, so re-running the cell no longer wipes out both columns.
salary_df['company_size'] = (
    salary_df['company_size']
    .astype(str)
    .map(lambda code: comp_size.get(code, code))
)
salary_df['experience_level'] = (
    salary_df['experience_level']
    .astype(str)
    .map(lambda code: exp_level.get(code, code))
)
salary_df


In [None]:
# Convert the employee-residence and company-location values to ISO3 country
# codes, which are easier to read and plot.
country_conversion = coco.convert(to = "ISO3", names = salary_df['employee_residence'])
country_conversion1 = coco.convert(to = "ISO3", names = salary_df['company_location'])

# Write the converted values back over the original columns.
salary_df['employee_residence'] = country_conversion
salary_df['company_location'] = country_conversion1
salary_df

# Distribution of Work Years 
- Over half of the data is from 2022, followed by 2021; 2020 accounts for the smallest share.

In [None]:

# Number of records per work year.
work_year = salary_df['work_year'].value_counts()

fig = px.pie(
    names = work_year.index,
    values = work_year.values,
    title = 'Work Year Distribution',
    template = 'plotly_dark',
    color_discrete_sequence = px.colors.sequential.matter_r,
)

# Label every slice with its name, percentage and count, and outline the
# slices with a black border of width 2.
fig.update_traces(
    textinfo = 'label+percent+value',
    textfont_size = 17,
    marker = dict(line = dict(color = 'black', width = 2)),
)
fig.update_layout(font = dict(size = 18, family = "Franklin Gothic"))

fig.show()

<img src="images/workyeardistribution.png">

# Distribution of Experience Level
- Each slice shows both the total count and its percentage of the whole workforce.

In [None]:
# Number of records per experience level.
experience_level = salary_df['experience_level'].value_counts()

fig = px.pie(
    names = experience_level.index,
    values = experience_level.values,
    title = 'Work Level Distribution',
    template = 'plotly_dark',
    color_discrete_sequence = px.colors.sequential.matter_r,
)

# Label every slice with its name, percentage and count, and outline the
# slices with a black border of width 2.
fig.update_traces(
    textinfo = 'label+percent+value',
    textfont_size = 17,
    marker = dict(line = dict(color = 'black', width = 2)),
)
fig.update_layout(font = dict(size = 18, family = "Franklin Gothic"))

fig.show()

<img src="images/workleveldistribution.png">

In [None]:
# Average USD salary per company location, keeping the ten highest averages
# in descending order.
df_mean = (
    salary_df
    .groupby('company_location')[['salary_in_usd']]
    .mean()
    .sort_values('salary_in_usd', ascending = False)
    .head(10)
)

fig = px.bar(
    df_mean,
    x = df_mean.index,
    y = df_mean['salary_in_usd'],
    template = 'plotly_dark',
    title = "Top 10 Salaried Locations and Avg Salary",
)
fig.update_layout(
    font = dict(size = 15, family = "Franklin Gothic"),
    xaxis_title = "Company Location",
    yaxis_title = "Salary",
)
fig.show()

<img src="images/top10salariedlocations.png">

In [None]:
# Distribution plot of the USD salaries; show_hist = False leaves out the
# histogram bars.
group_labels = ['salary_in_usd']
hist_data = [salary_df['salary_in_usd']]

fig = ff.create_distplot(hist_data, group_labels, show_hist = False)
fig.update_layout(
    template = 'plotly_dark',
    title = 'Salary in USD (DistPlot)',
    font = dict(size = 17, family = "Franklin Gothic"),
)
fig.show()

# As the dist plot shows, most of the salaries fall between 75k and 150k USD

<img src="images/salarydistplot.png">

In [None]:
# Mean USD salary for each company size, ordered from lowest to highest.
# Takeaways from the chart: medium and large companies pay similar averages,
# so there is little salary trade-off between them, while small companies
# average noticeably lower salaries, which makes sense.
grouped_companies = (
    salary_df
    .groupby('company_size')[['salary_in_usd']]
    .mean()
    .sort_values('salary_in_usd')
)

axis_labels = {
    "salary_in_usd": "Salary ($USD)",
    "company_size": "Company Size",
}
fig = px.bar(
    grouped_companies,
    x = grouped_companies.index,
    y = grouped_companies['salary_in_usd'],
    title = 'Avg Salary vs Different Sized Companies',
    labels = axis_labels,
)
# Narrow the bars.
fig.update_traces(width = 0.3)
fig.show()

<img src="images/salaryvscompanysize.png">

In [None]:
# Some of the most lucrative careers: the ten job titles with the highest
# average USD salary, ordered from highest to lowest.
job_grouped = (
    salary_df
    .groupby('job_title')[['salary_in_usd']]
    .mean()
    .sort_values('salary_in_usd', ascending = False)
    .head(10)
)

# job_grouped is already sorted in descending order, so take a copy instead
# of sorting a second time.
highest_paid = job_grouped.copy()

# Display the result; previously the cell computed it but showed nothing.
highest_paid

In [None]:
# The ten most frequent job titles in the dataset (value_counts orders them
# from most to least common).
top_10_jobs = salary_df['job_title'].value_counts().head(10)

fig = px.bar(
    x = top_10_jobs.index,
    y = top_10_jobs.values,
    text = top_10_jobs.values,
    color = top_10_jobs.index,
    template = 'plotly_dark',
    title = 'Top 10 Job Titles',
)
fig.update_layout(
    font = dict(size = 15, family = "Franklin Gothic"),
    xaxis_title = 'Job Titles',
    yaxis_title = 'Count',
)

fig.show()

<img src="images/top10jobs.png">

In [None]:
# World map coloured by the number of employees residing in each country.
residence = salary_df['employee_residence'].value_counts()

fig = px.choropleth(
    locations = residence.index,
    color = residence.values,
    title = 'Employee Location Distribution Map',
    template = 'plotly_dark',
    color_continuous_scale = px.colors.sequential.YlGnBu,
)
fig.update_layout(font = dict(size = 15, family = "Franklin Gothic"))

fig.show()

<img src="images/Emplocationdist.png">

# Top 10 Employee Locations
- Depicts the density of the workforce based on employee residence.
- The vast majority of the workforce in the dataset is concentrated in the USA.

In [None]:
# The ten most common employee residences, taken from the `residence` counts
# computed for the choropleth map.
top_10_location = residence.head(10)

fig = px.bar(
    x = top_10_location.index,
    y = top_10_location.values,
    color = top_10_location.index,
    title = 'Top 10 Employee Locations',
    template = 'plotly_dark',
    color_discrete_sequence = px.colors.sequential.deep,
)
fig.update_layout(
    font = dict(size = 15, family = "Franklin Gothic"),
    xaxis_title = "Employee Location",
    yaxis_title = "Count",
)

fig.show()

<img src="images/top10emplocations.png">