# Data Salaries
* How much money do software engineers make?
* What about Data Scientists?
* Which companies are paying the most?
* What are the trends looking like?

In [1]:
import pandas as pd
import requests
import numpy as np
import math

#data visualization
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

In [44]:
# Download the raw salary submissions (a JSON array of records) from levels.fyi.
# A timeout keeps the notebook from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors directly instead of letting an error
# page fail later as a confusing JSON decoding problem.
response = requests.get('https://www.levels.fyi/js/salaryData.json', timeout=30)
response.raise_for_status()
data = response.json()

# One row per submission
df = pd.DataFrame(data)

## Clean Data

In [45]:
# Drop identifier columns that are not used in the analysis, and treat empty
# strings as missing values
df = df.drop(columns=['cityid','rowNumber','dmaid']).replace("", np.nan)

# Convert the numeric fields to numeric dtypes
num_cols = ['yearsofexperience','basesalary','bonus','stockgrantvalue',
            'totalyearlycompensation','yearsatcompany']
df[num_cols] = df[num_cols].apply(pd.to_numeric)

# Discard records that have no location
df = df[df.location.notnull()]

# Round partial years up to the next whole year (e.g. 0.25 -> 1)
for year_col in ('yearsofexperience', 'yearsatcompany'):
    df[year_col] = np.ceil(df[year_col])

# Keep only records whose total yearly compensation lies between the 5th and
# 95th percentiles (inclusive). This trims implausible submissions such as
# $5 million a year or next to nothing.
total_comp = df['totalyearlycompensation']
comp_floor = total_comp.quantile(.05)
comp_ceiling = total_comp.quantile(.95)
df = df[total_comp.between(comp_floor, comp_ceiling)]

# Keep only locations containing exactly one comma; locations with two or
# more commas are treated as being outside the US
comma_count = df['location'].str.count(',')
df = df[comma_count == 1]

In [47]:
# Sanity check: (rows, columns) of the frame after the cleaning steps above
df.shape

(15501, 14)

#### Clean up the submission date data

In [48]:
# Parse the submission timestamps once, then store the parsed values both in
# the original column and in a working 'SubmitDate' column
submitted_at = pd.to_datetime(df['timestamp'])
df['timestamp'] = submitted_at
df['SubmitDate'] = submitted_at

In [49]:
# Get Date Function
def get_date_int(df, column):
    """Split a datetime column into its year, month and day components.

    Parameters
    ----------
    df : DataFrame holding the column.
    column : label of a datetime-like column in ``df``.

    Returns
    -------
    tuple of three Series ``(year, month, day)`` aligned with ``df``'s index.
    """
    parts = df[column].dt
    return parts.year, parts.month, parts.day

In [50]:
# Only the year and month are needed. Slicing off the day (instead of unpacking
# it into `_`) avoids rebinding `_`, which IPython/Jupyter otherwise uses to
# hold the most recent cell output.
submit_year, submit_month = get_date_int(df, 'SubmitDate')[:2]

In [51]:
# Normalize every submission date to the first day of its month.
df['SubmitYear'], df['SubmitMonth'] = submit_year, submit_month

# Encode year/month plus a fixed day of 1 as a YYYYMMDD integer, then parse it
first_of_month = df['SubmitYear'] * 10000 + df['SubmitMonth'] * 100 + 1
df['SubmitDate'] = pd.to_datetime(first_of_month, format='%Y%m%d')

## Observe the flow of submissions over time

In [52]:
# Number of submissions per (month-normalized) submission date, in
# chronological order, with columns 'Date' and 'Count'
df_time = (
    df['SubmitDate']
    .value_counts()
    .rename_axis('Date')
    .reset_index(name='Count')
    .sort_values(by='Date')
)

In [53]:
# Line chart of monthly submission counts
submissions_trace = go.Scatter(
    x=df_time['Date'],
    y=df_time['Count'],
    mode='lines+markers',
    name='Submission Count',
)

fig = go.Figure()
fig.add_trace(submissions_trace)
fig.update_layout(title='Count of Submissions by Month')

fig.show()

## Look at the 50 companies with the most submissions

In [56]:
# Names of the 50 companies with the most submissions (value_counts sorts
# from most to least frequent by default)
company_counts = df['company'].value_counts()
top_50 = company_counts.index[:50].tolist()

# Submissions belonging to those companies only
df_50 = df[df['company'].isin(top_50)]

In [57]:
# Top 50 companies - compensation summary per company, restricted to
# submissions with at most 5 years of experience
early_career = df_50.loc[df_50.yearsofexperience <= 5]

# A plain list of aggregations is used on the selected column: passing a
# {column: [funcs]} dict to a single selected column is a "nested renamer",
# which pandas >= 1.0 rejects with SpecificationError.
df_50_5 = (
    early_career
    .groupby('company')['totalyearlycompensation']
    .agg(['count', 'mean', 'median', 'max'])
    .reset_index()
)
# Presentation-friendly column names: the group key first, then the statistics
df_50_5.columns = ['Company','Count','Mean','Median','Max']
# Round the summary statistics (Mean, Median, Max) to 1 decimal place
df_50_5.iloc[:,-3:] = df_50_5.iloc[:,-3:].round(1)
df_50_5.sample(5)

Unnamed: 0,Company,Count,Mean,Median,Max
45,Workday,48,161.8,152.5,318.0
32,Salesforce,129,198.9,185.0,350.0
47,Yelp,69,188.3,180.0,281.0
17,IBM,118,124.7,120.0,209.0
10,Dell,17,157.5,130.0,416.0


In [58]:
# Distribution of years of experience among top-50-company submissions with
# at most 5 years of experience
df_50.loc[df_50['yearsofexperience'] <= 5, 'yearsofexperience'].value_counts()

0.0    1231
3.0    1148
2.0    1087
5.0    1042
4.0    1029
1.0     889
Name: yearsofexperience, dtype: int64

In [59]:
# The 10 least frequent locations among the top-50-company submissions
location_counts = df_50['location'].value_counts()
location_counts.tail(10)

Southbury, CT         1
Canada, KY            1
St. Petersburg, FL    1
Tyson’s Corner, VA    1
College Park, MD      1
Arlington, TX         1
Irving, TX            1
Renton, WA            1
Bedford, MA           1
Waltham, MA           1
Name: location, dtype: int64

## How has pay by years of experience changed over time?

In [78]:
# Compensation summary per (submission month, years of experience) for
# submissions with at most 5 years of experience.
# A plain list of aggregations is used on the selected column: passing a
# {column: [funcs]} dict to a single selected column is a "nested renamer",
# which pandas >= 1.0 rejects with SpecificationError.
df_experience = (
    df[df.yearsofexperience <= 5]
    .groupby(['SubmitDate','yearsofexperience'])['totalyearlycompensation']
    .agg(['count', 'mean', 'median', 'max'])
    .reset_index()
)
df_experience.columns = ['SubmitDate','YearsExp','Count','Mean','Median','Max']
# Start in June 2018 because of sample sizes (SubmitDate is always the first
# of a month, so "> 2018-05-01" means June 2018 onwards)
df_experience = df_experience[df_experience.SubmitDate > '2018-05-01']
df_experience.sample(5)

Unnamed: 0,SubmitDate,YearsExp,Count,Mean,Median,Max
191,2020-02-01,5.0,18,220.833333,192.5,480.0
190,2020-02-01,4.0,21,210.761905,230.0,321.0
120,2019-03-01,0.0,62,162.83871,154.5,300.0
135,2019-05-01,3.0,68,207.367647,197.0,382.0
153,2019-08-01,3.0,74,189.121622,176.0,500.0


In [80]:
fig = go.Figure()

# One line per experience level (0 through 5 years), each tracking the mean
# total yearly compensation by submission month. A loop replaces six
# copy-pasted add_trace calls so the levels cannot drift out of sync.
for years in range(6):
    level = df_experience[df_experience['YearsExp'] == years]
    # Legend labels: '0 Years', '1 Year', '2 Years', ...
    label = '1 Year' if years == 1 else '{} Years'.format(years)
    fig.add_trace(go.Scatter(x=level['SubmitDate'],
                             y=level['Mean'],
                             mode='lines+markers',
                             name=label))

fig.update_layout(title='Average Annual Compensation by Years of Experience')

fig.show()

## What about Data Science?

In [62]:
# Submissions whose title is exactly 'Data Scientist'
is_data_scientist = df['title'] == 'Data Scientist'
df_ds = df[is_data_scientist]

In [63]:
def q25(x):
    """Return the 0.25 quantile (first quartile) of ``x``.

    ``x`` must provide a ``quantile`` method; the aggregation below passes
    one group of values at a time.
    """
    return x.quantile(q=0.25)
def q75(x):
    """Return the 0.75 quantile (third quartile) of ``x``.

    ``x`` must provide a ``quantile`` method; the aggregation below passes
    one group of values at a time.
    """
    return x.quantile(q=0.75)

In [64]:
# Compensation statistics for data scientists, per year of experience
comp_stats = ['count','mean','max','min','median',q25, q75]
df_ds_exp = (
    df_ds
    .groupby(by=['yearsofexperience'], as_index=False)
    .agg({'totalyearlycompensation': comp_stats})
)
# Flatten the two-level column index into readable names
df_ds_exp.columns = ['Years of Experience','Count','Mean','Max','Min','Median','bottom25%','top25%']
# Round every statistic after Count to 1 decimal place
df_ds_exp.iloc[:,2:] = df_ds_exp.iloc[:,2:].round(1)
df_ds_exp.head(6)

Unnamed: 0,Years of Experience,Count,Mean,Max,Min,Median,bottom25%,top25%
0,0.0,53,187.2,368.0,100.0,189.0,150.0,215.0
1,1.0,53,186.9,332.0,99.0,168.0,140.0,220.0
2,2.0,68,184.4,420.0,95.0,174.5,140.0,220.8
3,3.0,67,197.6,423.0,100.0,200.0,145.5,232.5
4,4.0,56,226.7,500.0,104.0,211.0,176.2,270.0
5,5.0,60,216.8,360.0,118.0,214.5,178.8,250.0


In [65]:
# The 10 most frequently submitted job titles, most common first
title_counts = df['title'].value_counts()
top_titles = title_counts.index[:10].tolist()
top_titles

['Software Engineer',
 'Product Manager',
 'Software Engineering Manager',
 'Data Scientist',
 'Hardware Engineer',
 'Product Designer',
 'Solution Architect',
 'Program Manager',
 'Investment Banker',
 'Security Engineer']

In [66]:
# Submissions for the top-10 titles with at most 5 years of experience.
# The explicit .copy() makes df_titles an independent frame, so adding a
# column below does not trigger SettingWithCopyWarning.
df_titles = df[df.title.isin(top_titles) & (df.yearsofexperience <= 5)].copy()

# Indicator column: 1 for data scientists, 0 for every other title
df_titles['DataScientist'] = (df_titles.title == 'Data Scientist').astype(int)
df_titles.DataScientist.value_counts()

0    6853
1     357
Name: DataScientist, dtype: int64

In [67]:
# Submissions per title within the filtered top-10-title frame
df_titles['title'].value_counts()

Software Engineer               6092
Data Scientist                   357
Product Manager                  345
Hardware Engineer                176
Product Designer                 131
Software Engineering Manager      45
Solution Architect                24
Investment Banker                 22
Security Engineer                 10
Program Manager                    8
Name: title, dtype: int64

In [68]:
fig = go.Figure()

# Two box traces: data scientists versus every other top-10 title.
# Each entry is (DataScientist flag value, legend name, marker color).
box_groups = (
    (1, 'Data Scientist', "#002B5C"),
    (0, 'Other', "#00471B"),
)
for flag, legend_name, color in box_groups:
    group = df_titles[df_titles.DataScientist == flag]
    fig.add_trace(go.Box(
        x=group['title'],
        y=group['totalyearlycompensation'],
        name=legend_name,
        marker_color=color))

label_font = dict(family="Roboto", size=14, color="#696969")
fig.update_layout(
    title="Compensation for Top 10 Titles (<6 yrs exp)",
    yaxis_title="Total Yearly Compensation (thousands)",
    font=label_font,
)

fig.show()