# Data Science Salaries 2023 

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from scipy.stats import f_oneway

In [2]:
dt = pd.read_csv('ds_salaries.csv')

In [3]:
dt.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [4]:
dt.shape

(3755, 11)

In [5]:
dt.describe()

Unnamed: 0,work_year,salary,salary_in_usd,remote_ratio
count,3755.0,3755.0,3755.0,3755.0
mean,2022.373635,190695.6,137570.38988,46.271638
std,0.691448,671676.5,63055.625278,48.58905
min,2020.0,6000.0,5132.0,0.0
25%,2022.0,100000.0,95000.0,0.0
50%,2022.0,138000.0,135000.0,0.0
75%,2023.0,180000.0,175000.0,100.0
max,2023.0,30400000.0,450000.0,100.0


## Unique Job Titles 

In [6]:
#Calculate unique job titles: 
unique_job_titles = np.unique(dt['job_title'])
total_unique_job_titles = len(unique_job_titles)
print("Total number of unique job titles:", total_unique_job_titles)
column_values = dt['job_title']
unique_values = np.unique(column_values)
print(unique_values)

Total number of unique job titles: 93
['3D Computer Vision Researcher' 'AI Developer' 'AI Programmer'
 'AI Scientist' 'Analytics Engineer' 'Applied Data Scientist'
 'Applied Machine Learning Engineer' 'Applied Machine Learning Scientist'
 'Applied Scientist' 'Autonomous Vehicle Technician' 'Azure Data Engineer'
 'BI Analyst' 'BI Data Analyst' 'BI Data Engineer' 'BI Developer'
 'Big Data Architect' 'Big Data Engineer' 'Business Data Analyst'
 'Business Intelligence Engineer' 'Cloud Data Architect'
 'Cloud Data Engineer' 'Cloud Database Engineer' 'Compliance Data Analyst'
 'Computer Vision Engineer' 'Computer Vision Software Engineer'
 'Data Analyst' 'Data Analytics Consultant' 'Data Analytics Engineer'
 'Data Analytics Lead' 'Data Analytics Manager'
 'Data Analytics Specialist' 'Data Architect' 'Data DevOps Engineer'
 'Data Engineer' 'Data Infrastructure Engineer' 'Data Lead'
 'Data Management Specialist' 'Data Manager' 'Data Modeler'
 'Data Operations Analyst' 'Data Operations Engineer

## Experience Levels

In [7]:
#Calculate # & list types of experience levels 
unique_experience_levels = np.unique(dt['experience_level'])
total_unique_experience_levels = len(unique_experience_levels)
print("Total number of experience levels:", total_unique_experience_levels)
column_values = dt['experience_level']
unique_values = np.unique(column_values)
print(unique_values)

Total number of experience levels: 4
['EN' 'EX' 'MI' 'SE']


## Employee Residences

In [8]:
#Calculate # of employee residences & list employee residences
unique_employee_residences = np.unique(dt['employee_residence'])
total_unique_employee_residences = len(unique_employee_residences)
print("Total number of employee residences:", total_unique_employee_residences)
column_values = dt['employee_residence']
unique_values = np.unique(column_values)
print(unique_values)

Total number of employee residences: 78
['AE' 'AM' 'AR' 'AS' 'AT' 'AU' 'BA' 'BE' 'BG' 'BO' 'BR' 'CA' 'CF' 'CH'
 'CL' 'CN' 'CO' 'CR' 'CY' 'CZ' 'DE' 'DK' 'DO' 'DZ' 'EE' 'EG' 'ES' 'FI'
 'FR' 'GB' 'GH' 'GR' 'HK' 'HN' 'HR' 'HU' 'ID' 'IE' 'IL' 'IN' 'IQ' 'IR'
 'IT' 'JE' 'JP' 'KE' 'KW' 'LT' 'LU' 'LV' 'MA' 'MD' 'MK' 'MT' 'MX' 'MY'
 'NG' 'NL' 'NZ' 'PH' 'PK' 'PL' 'PR' 'PT' 'RO' 'RS' 'RU' 'SE' 'SG' 'SI'
 'SK' 'TH' 'TN' 'TR' 'UA' 'US' 'UZ' 'VN']


## Company Locations 

In [9]:
#Calculate # of company locations & list company locations
unique_company_locations = np.unique(dt['company_location'])
total_unique_company_locations = len(unique_company_locations)
print("Total number of company locations:", total_unique_company_locations)
column_values = dt['company_location']
unique_values = np.unique(column_values)
print(unique_values)

Total number of company locations: 72
['AE' 'AL' 'AM' 'AR' 'AS' 'AT' 'AU' 'BA' 'BE' 'BO' 'BR' 'BS' 'CA' 'CF'
 'CH' 'CL' 'CN' 'CO' 'CR' 'CZ' 'DE' 'DK' 'DZ' 'EE' 'EG' 'ES' 'FI' 'FR'
 'GB' 'GH' 'GR' 'HK' 'HN' 'HR' 'HU' 'ID' 'IE' 'IL' 'IN' 'IQ' 'IR' 'IT'
 'JP' 'KE' 'LT' 'LU' 'LV' 'MA' 'MD' 'MK' 'MT' 'MX' 'MY' 'NG' 'NL' 'NZ'
 'PH' 'PK' 'PL' 'PR' 'PT' 'RO' 'RU' 'SE' 'SG' 'SI' 'SK' 'TH' 'TR' 'UA'
 'US' 'VN']


## Average Salary

In [10]:
#Calculate average salary
average_salary = np.mean(dt["salary"])
print(f"Average salary: ${average_salary:.2f}")

Average salary: $190695.57


## Average Salary based on Experience Level

In [11]:
#Calculate average salary based on experience level
average_salary_e = dt.groupby('experience_level')['salary'].mean()
average_salary_ex = average_salary_e.reset_index()
print(average_salary_ex)

  experience_level         salary
0               EN  188381.178125
1               EX  246802.201754
2               MI  248200.306832
3               SE  170048.965421


## Average Salary based on Employment Type

In [12]:
#Calculate average salary based on employment type
average_salary_et = dt.groupby('employment_type')['salary'].mean()
average_salary_emp = average_salary_et.reset_index()
print(average_salary_emp)

  employment_type         salary
0              CT  113300.000000
1              FL  288755.500000
2              FT  191146.539268
3              PT   79910.823529


## Average Salary based on Job Title 

In [13]:
#Calculate average salary based on job title 
average_salary_j = dt.groupby('job_title')['salary'].mean()
average_salary_jt = average_salary_j.reset_index()
print(average_salary_jt)

                        job_title         salary
0   3D Computer Vision Researcher  120000.000000
1                    AI Developer  137181.818182
2                   AI Programmer   55000.000000
3                    AI Scientist  275312.500000
4              Analytics Engineer  151352.621359
..                            ...            ...
88              Research Engineer  162752.810811
89             Research Scientist  160768.890244
90         Software Data Engineer   75000.000000
91             Staff Data Analyst   15000.000000
92           Staff Data Scientist  105000.000000

[93 rows x 2 columns]


## Average Salary based on Employee Residence

In [17]:
#Calculate average salary based on employee residence 
average_salary_er = dt.groupby('employee_residence')['salary'].mean()
average_salary_emres = average_salary_er.reset_index()
print(average_salary_emres)

   employee_residence         salary
0                  AE  100000.000000
1                  AM   50000.000000
2                  AR   35500.000000
3                  AS   32777.500000
4                  AT   65833.333333
..                ...            ...
73                 TR  114600.000000
74                 UA   57850.000000
75                 US  152822.011651
76                 UZ  103000.000000
77                 VN   33466.666667

[78 rows x 2 columns]


## Average Salary based on Company Location

In [18]:
#Calculate average salary based on company location 
average_salary_c = dt.groupby('company_location')['salary'].mean()
average_salary_cl = average_salary_c.reset_index()
print(average_salary_cl)

   company_location         salary
0                AE  100000.000000
1                AL   10000.000000
2                AM   50000.000000
3                AR   25000.000000
4                AS  468333.333333
..              ...            ...
67               TH  618333.333333
68               TR  114600.000000
69               UA   57850.000000
70               US  157665.357566
71               VN   12000.000000

[72 rows x 2 columns]


## Average Salary based on Company Size

In [19]:
#Calculate average salary based on company size 
average_salary_cs = dt.groupby('company_size')['salary'].mean()
average_salary_csize = average_salary_cs.reset_index()
print(average_salary_csize)

  company_size         salary
0            L  438794.372247
1            M  150712.836029
2            S  281430.101351


## Highest Salaries In USD 

In [20]:
top_company_locations = dt.sort_values("salary_in_usd", ascending = False)
top_company_locations.head(5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
3522,2020,MI,FT,Research Scientist,450000,USD,450000,US,0,US,M
2011,2022,MI,FT,Data Analyst,350000,GBP,430967,GB,0,GB,M
528,2023,SE,FT,AI Scientist,1500000,ILS,423834,IL,0,IL,L
3747,2021,MI,FT,Applied Machine Learning Scientist,423000,USD,423000,US,50,US,L
3675,2021,EX,CT,Principal Data Scientist,416000,USD,416000,US,100,US,S


## Lowest Salaries in USD

In [21]:
bottom_locations = top_company_locations.tail()
bottom_locations = bottom_locations.sort_values('salary_in_usd')
bottom_locations.head(5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
2684,2022,MI,FT,NLP Engineer,120000,CZK,5132,CZ,100,CZ,M
3537,2021,MI,PT,3D Computer Vision Researcher,400000,INR,5409,IN,50,IN,M
2578,2021,EN,FT,Power BI Developer,400000,INR,5409,IN,50,IN,L
3667,2021,MI,FT,Data Scientist,420000,INR,5679,IN,100,US,S
3685,2020,EN,FT,Data Science Consultant,423000,INR,5707,IN,50,IN,M


## Summary Statistics

In [22]:
mean = dt['salary'].groupby(dt['experience_level']).mean()
median = dt['salary'].groupby(dt['experience_level']).median()
var = dt['salary'].groupby(dt['experience_level']).var()
std = dt['salary'].groupby(dt['experience_level']).std()
sem = dt['salary'].groupby(dt['experience_level']).sem()

summary_stats = pd.DataFrame({"Mean Salary by Experience Level":mean, 
                            "Median Salary by Experience Level":median, 
                           "Salary by Experience Variance":var, 
                           "Salary by Experience Std. Dev.":std, 
                           "Salary by Experience Std. Err.":sem})
# Display the Summary statistics table grouped by 'Drug Regimen' column
summary_stats

Unnamed: 0_level_0,Mean Salary by Experience Level,Median Salary by Experience Level,Salary by Experience Variance,Salary by Experience Std. Dev.,Salary by Experience Std. Err.
experience_level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
EN,188381.178125,80000.0,279899900000.0,529055.7,29575.110797
EX,246802.201754,200000.0,300512000000.0,548189.7,51342.674099
MI,248200.306832,100000.0,1696947000000.0,1302669.0,45913.058357
SE,170048.965421,148000.0,80289470000.0,283354.0,5649.031206


In [23]:
mean = dt['salary'].groupby(dt['employment_type']).mean()
median = dt['salary'].groupby(dt['employment_type']).median()
var = dt['salary'].groupby(dt['employment_type']).var()
std = dt['salary'].groupby(dt['employment_type']).std()
sem = dt['salary'].groupby(dt['employment_type']).sem()

summary_stats = pd.DataFrame({"Mean Salary by Employment Type":mean, 
                            "Median Salary by Employment Type":median, 
                           "Salary by Employment Type Variance":var, 
                           "Salary by Employment Type Std. Dev.":std, 
                           "Salary by Employment Type Std. Err.":sem})
# Display the Summary statistics table grouped by 'Drug Regimen' column
summary_stats

Unnamed: 0_level_0,Mean Salary by Employment Type,Median Salary by Employment Type,Salary by Employment Type Variance,Salary by Employment Type Std. Dev.,Salary by Employment Type Std. Err.
employment_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CT,113300.0,75000.0,16973290000.0,130281.575401,41198.651542
FL,288755.5,50000.0,551102400000.0,742362.734803,234755.709201
FT,191146.539268,138900.0,454125800000.0,673888.538356,11051.809163
PT,79910.823529,50000.0,9434574000.0,97131.736582,23557.906443


In [24]:
mean = dt['salary'].groupby(dt['employee_residence']).mean()
median = dt['salary'].groupby(dt['employee_residence']).median()
var = dt['salary'].groupby(dt['employee_residence']).var()
std = dt['salary'].groupby(dt['employee_residence']).std()
sem = dt['salary'].groupby(dt['employee_residence']).sem()

summary_stats = pd.DataFrame({"Mean Salary by Employee Residence":mean, 
                            "Median Salary by Employee Residence":median, 
                           "Salary by Employee Residence Variance":var, 
                           "Salary by Employee Residence Std. Dev.":std, 
                           "Salary by Employee Residence Std. Err.":sem})
# Display the Summary statistics table grouped by 'Drug Regimen' column
summary_stats

Unnamed: 0_level_0,Mean Salary by Employee Residence,Median Salary by Employee Residence,Salary by Employee Residence Variance,Salary by Employee Residence Std. Dev.,Salary by Employee Residence Std. Err.
employee_residence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AE,100000.000000,115000.0,9.250000e+08,30413.812651,17559.422921
AM,50000.000000,50000.0,,,
AR,35500.000000,39000.0,4.111000e+08,20275.601101,8277.479488
AS,32777.500000,32777.5,3.265290e+08,18070.113793,12777.500000
AT,65833.333333,62000.0,1.925667e+08,13876.839217,5665.195888
...,...,...,...,...,...
TR,114600.000000,108000.0,1.040580e+10,102008.823148,45619.732573
UA,57850.000000,55000.0,1.519023e+09,38974.649881,19487.324940
US,152822.011651,145000.0,3.064662e+09,55359.391222,1010.046438
UZ,103000.000000,103000.0,1.800000e+07,4242.640687,3000.000000


In [25]:
mean = dt['salary'].groupby(dt['company_location']).mean()
median = dt['salary'].groupby(dt['company_location']).median()
var = dt['salary'].groupby(dt['company_location']).var()
std = dt['salary'].groupby(dt['company_location']).std()
sem = dt['salary'].groupby(dt['company_location']).sem()

summary_stats = pd.DataFrame({"Mean Salary by Company Location":mean, 
                            "Median Salary by Company Location":median, 
                           "Salary by Company Location Variance":var, 
                           "Salary by Company Location Std. Dev.":std, 
                           "Salary by Company Location Std. Err.":sem})
# Display the Summary statistics table grouped by 'Drug Regimen' column
summary_stats

Unnamed: 0_level_0,Mean Salary by Company Location,Median Salary by Company Location,Salary by Company Location Variance,Salary by Company Location Std. Dev.,Salary by Company Location Std. Err.
company_location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AE,100000.000000,115000.0,9.250000e+08,30413.812651,17559.422921
AL,10000.000000,10000.0,,,
AM,50000.000000,50000.0,,,
AR,25000.000000,13000.0,4.690000e+08,21656.407828,12503.332889
AS,468333.333333,50000.0,5.635583e+11,750705.223995,433419.863156
...,...,...,...,...,...
TH,618333.333333,840000.0,2.794083e+11,528590.894107,305182.095004
TR,114600.000000,108000.0,1.040580e+10,102008.823148,45619.732573
UA,57850.000000,55000.0,1.519023e+09,38974.649881,19487.324940
US,157665.357566,145000.0,4.852127e+10,220275.430650,3995.114835


In [26]:
mean = dt['salary'].groupby(dt['company_size']).mean()
median = dt['salary'].groupby(dt['company_size']).median()
var = dt['salary'].groupby(dt['company_size']).var()
std = dt['salary'].groupby(dt['company_size']).std()
sem = dt['salary'].groupby(dt['company_size']).sem()

summary_stats = pd.DataFrame({"Mean Salary by Company Size":mean, 
                            "Median Salary by Company Size":median, 
                           "Salary by Company Size Variance":var, 
                           "Salary by Company Size Std. Dev.":std, 
                           "Salary by Company Size Std. Err.":sem})
# Display the Summary statistics table grouped by 'Drug Regimen' column
summary_stats

Unnamed: 0_level_0,Mean Salary by Company Size,Median Salary by Company Size,Salary by Company Size Variance,Salary by Company Size Std. Dev.,Salary by Company Size Std. Err.
company_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
L,438794.372247,131300.0,3167709000000.0,1779806.0,83530.42651
M,150712.836029,140000.0,25383910000.0,159323.3,2837.378673
S,281430.101351,73000.0,982506200000.0,991214.5,81477.330395


## ANOVA Test

In [27]:
average_salary_ex["experience_level"] = average_salary_ex["experience_level"].map({"EN": 1, "MI": 2, "SE": 3, "EX": 4})
stat, p = f_oneway(average_salary_ex.salary, average_salary_ex.experience_level)

In [28]:
print(p)

4.0787388268269865e-05


In [29]:
if p > 0.05:
	print('Probably the same distribution')
else:
	print('Probably different distributions')

Probably different distributions


In [30]:
average_salary_emp["employment_type"] = average_salary_emp["employment_type"].map({"PT": 1, "CT": 2, "FT": 3, "FL": 4})
stat, p = f_oneway(average_salary_emp.salary, average_salary_emp.employment_type)

In [31]:
print(p)

0.011040526386662409


In [32]:
if p > 0.05:
	print('Probably the same distribution')
else:
	print('Probably different distributions')

Probably different distributions


In [33]:
average_salary_csize["company_size"] = average_salary_csize["company_size"].map({"S": 1, "M": 2, "L": 3})
stat, p = f_oneway(average_salary_csize.salary, average_salary_csize.company_size)

In [34]:
print(p)

0.025215667495040266


In [35]:
if p > 0.05:
	print('Probably the same distribution')
else:
	print('Probably different distributions')

Probably different distributions


## Analysis 

## Based on our findings, we can conclude that employees with Middle to Executive experience levels earn the highest salaries, as proven by average salary per experience level.
## Employment type also effects average salary, with freelance (FL) and full-time (FT) employees leading with the highest salaries as opposed to part-time (PT) and contract (CT) employees.
## Average salary varies significantly in relationship to job title, as is visualized on scatter plots 1, 2, 3. 
## BI Data Analyst (see "Average Salary Based on Job Title 1") & Data Analytics Lead are the most profitable job positions on average, sitting much higher than the cluster of average salary points aggregated towards the centre of the plot. 
## Further outliers can be noted on the second scatter plot (see "Average Salary Based on Job Title 2"), such as Data Science Manager, Data Science Tech Lead & Head of Data Science.
## "Average Salary Based on Job Title 3" likewise presents several outliers which defy the general pattern of aggregation, such as Lead Data Analyst and ML Engineer, as well as less drastic outliers, i.e., Power BI Developer and Product Data Analyst.
## Both Employee Residence & Company Location have a significant impact on overall salary. Highest Salaries measured in US dollars are reported in the Unites States of America (US), Great Britain (GB) and Israel (IL), with 3/5 highest reported salaries coming from US-based companies. 
## Lowest salaries are reported in Czechia (CZ) and India (IN), with one exception of a US-based company but in relation to an employee residence in India, suggesting lower salary paid out, likely due to outsourcing labour to cut costs.
## Company Size is another factor that displays an interesting and not entirely proportional relationship to average salary, as we can see that large and small companies, on average, pay their employees higher wages than mid-sized companies. This makes sense considering larger companies and corporations employ greater numbers of professionals & operate on international levels, incurring larger costs. Smaller companies have fewer employees to pay out and retain, and are forced to distribute income more competitively so as not lose employees to bigger, more widely-known companies.
## The ANOVA Test allows us to reject our hypothesis and accept the null because the p-score is lower than 0.05. However, as proven by Summary Statistics & our data visualizations, we can conclude that while the p-score does not reveal a clear or strong enough relationship between salary and experience/employment type/company size for us to explain salary growth as depending on the aforementioned factors alone, statistically and visually we can still summarize that each category impacts the salary significantly.
## We can conclude that the best-paying companies are US, GB and IL, with the exception of outsorcing in such instances when the employee residence differs from company location. Your odds at a high salary are also dependent on company size, and in certain instances, experience level (though salary is not always limited by hierarchical standards). 
## Some limitations of this dataset are its acceptance of a null hypothesis, lack of company names to relate to real-world scenarios & entice potential clients during the job-search process, as well as a timing constraint of one year's worth of data. While we can see varying degrees of impact on salary based on a number of factors, we are limited to finding an absolute positive relationship that proves our initial hypothesis using the ANOVA Test, despite other statistical findings. 