In [1]:
import pandas as pd
import numpy as np
import gender_guesser.detector as gender
# Load the raw salary table. low_memory=False makes pandas infer each column's dtype
# from the whole file in one pass instead of chunk by chunk, which avoids the
# mixed-type DtypeWarning pandas can emit on import.
data = pd.read_csv("Data/Salaries.csv", low_memory=False)


  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
# Remove the columns this analysis does not use, and the rows whose employee
# name was not provided.
data = data.drop(columns=['Notes', 'Agency', 'Status', 'Id', 'Benefits', 'Year'])
data = data[data['EmployeeName'] != 'Not provided']

# Keep only rows that have a BasePay value, cast BasePay to float, then discard
# rows with negative base pay. Each step returns a new frame (no inplace edits or
# column assignment on a filtered slice, which would raise SettingWithCopyWarning).
data = (
    data
    .dropna(subset=['BasePay'])
    .astype({'BasePay': float})
)
# .copy() yields an independent frame so later cells can add columns to it cleanly.
data = data[data['BasePay'] >= 0].copy()

In [3]:
# Report, for every column, the share of null entries as a whole-number percentage.
# NOTE: because of the rounding, a share under half a percent is printed as 0%.
missing_share = data.isnull().mean()
for col, pct_missing in missing_share.items():
    print('{} - {}%'.format(col, round(pct_missing * 100)))

EmployeeName - 0%
JobTitle - 0%
BasePay - 0%
OvertimePay - 0%
OtherPay - 0%
TotalPay - 0%
TotalPayBenefits - 0%


In [4]:
# Keep only the first whitespace-separated token of EmployeeName as 'Name'.
# Taking token 0 directly works regardless of how many tokens the longest name has,
# so no fixed list of extra split columns needs to be dropped.
first_name = data['EmployeeName'].str.split().str[0]
data = data.drop(columns=['EmployeeName'])

# Place 'Name' in the first column position (values are aligned on the row index).
data.insert(0, 'Name', first_name)


In [5]:
# Guess a gender label for every first name.
# The detector is constructed with case_sensitive=False.
d = gender.Detector(case_sensitive=False)
gender_list = []


def guess_gender(name):
    """Look up `name` with the detector `d` and append the label to `gender_list`.

    Returns nothing; the result is only recorded in the module-level list.
    """
    label = d.get_gender(name)
    gender_list.append(label)


# One lookup per row, in row order, so gender_list lines up with the rows of `data`.
name_list = data['Name'].tolist()
for given_name in name_list:
    guess_gender(given_name)

# Store the guessed labels as the new 'Gender' column.
data['Gender'] = gender_list


In [6]:
# Keep only rows whose guessed gender is not one of the uncertain labels.
ambiguous_labels = ['unknown', 'mostly_female', 'mostly_male', 'andy']
data = data[~data['Gender'].isin(ambiguous_labels)]


In [7]:
# Enforce (rather than only eyeball) that just the 'female' and 'male' labels remain,
# then show how many rows carry each label.
unexpected_labels = set(data["Gender"].unique()) - {'female', 'male'}
assert not unexpected_labels, 'unexpected Gender labels: {}'.format(unexpected_labels)
data["Gender"].value_counts()

male      69552
female    50236
Name: Gender, dtype: int64

In [8]:
# Drop job titles that occur fewer than MIN_TITLE_COUNT times.
# NOTE(review): titles are matched by exact string, and the counts contain both
# upper-case and mixed-case titles, so differently-cased spellings of one job are
# counted as separate titles; confirm this is intended.
MIN_TITLE_COUNT = 10
counts = data['JobTitle'].value_counts()

data = data[~data['JobTitle'].isin(counts[counts < MIN_TITLE_COUNT].index)]

data['JobTitle'].value_counts()

Transit Operator                      5327
Special Nurse                         3305
Registered Nurse                      2763
Police Officer 3                      2198
Firefighter                           2103
                                      ... 
ELECTRICAL LINE WORKER                  10
Senior Museum Registrar                 10
ASSISTANT STOREKEEPER                   10
APPRENTICE GARDENER                     10
PRINCIPAL ENVIRONMENTAL SPECIALIST      10
Name: JobTitle, Length: 1131, dtype: int64

In [9]:
# Split the rows by guessed gender and report each group's mean TotalPay.
is_female = data['Gender'].eq('female')
is_male = data['Gender'].eq('male')
female_salary = data.loc[is_female]
male_salary = data.loc[is_male]

print('the average total pay for women is: $', female_salary['TotalPay'].mean())

print('the average total pay for men is: $',male_salary['TotalPay'].mean())



the average total pay for women is: $ 66767.29559030477
the average total pay for men is: $ 84176.39938818566


In [10]:
# Mean BasePay and TotalPay per job title, computed separately for each gender.
# The pay columns are selected explicitly before aggregating, so the result does not
# depend on pandas discarding non-numeric columns during mean() (newer pandas
# versions raise a TypeError instead of dropping them).
PAY_COLUMNS = ['BasePay', 'TotalPay']

f_job_salaries = female_salary.groupby('JobTitle')[PAY_COLUMNS].mean()
m_job_salaries = male_salary.groupby('JobTitle')[PAY_COLUMNS].mean()

# Only the last expression of a cell is displayed; preview the female averages.
f_job_salaries.head()

Unnamed: 0_level_0,BasePay,TotalPay
JobTitle,Unnamed: 1_level_1,Unnamed: 2_level_1
ACCOUNT CLERK,42556.832391,43441.577391
ACCOUNTANT INTERN,22107.143333,22424.294167
ADMINISTRATIVE ANALYST,64312.000513,64543.497949
ADMINISTRATIVE ENGINEER,129755.206667,129755.206667
AIRPORT COMMUNICATIONS OPERATOR,68245.566,82763.087333


In [11]:

# Combine the per-title averages of both genders into one frame. pd.merge defaults to
# an inner join, so only titles present for both genders are kept. Columns are
# prefixed by gender before merging, so every label comes from its source frame
# instead of being assigned by position afterwards.
salary_by_gender = pd.merge(
    f_job_salaries.add_prefix('Female'),
    m_job_salaries.add_prefix('Male'),
    on='JobTitle',
)
salary_by_gender.head()

Unnamed: 0_level_0,FemaleBasePay,FemaleTotalPay,MaleBasePay,MaleTotalPay
JobTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ACCOUNT CLERK,42556.832391,43441.577391,46213.987333,46650.336667
ACCOUNTANT INTERN,22107.143333,22424.294167,30117.481667,30486.101111
ADMINISTRATIVE ANALYST,64312.000513,64543.497949,64920.955294,65811.11
ADMINISTRATIVE ENGINEER,129755.206667,129755.206667,117123.712857,127192.668571
AIRPORT COMMUNICATIONS OPERATOR,68245.566,82763.087333,64496.3725,72506.05375


In [12]:
# Per-title pay gap for the same job, computed as male average minus female average
# (a positive value means men average more in that job title).
salary_by_gender['BasePayDiff'] = salary_by_gender['MaleBasePay'].sub(salary_by_gender['FemaleBasePay'])
salary_by_gender['TotalPayDiff'] = salary_by_gender['MaleTotalPay'].sub(salary_by_gender['FemaleTotalPay'])
salary_by_gender.head()



Unnamed: 0_level_0,FemaleBasePay,FemaleTotalPay,MaleBasePay,MaleTotalPay,BasePayDiff,TotalPayDiff
JobTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ACCOUNT CLERK,42556.832391,43441.577391,46213.987333,46650.336667,3657.154942,3208.759275
ACCOUNTANT INTERN,22107.143333,22424.294167,30117.481667,30486.101111,8010.338333,8061.806944
ADMINISTRATIVE ANALYST,64312.000513,64543.497949,64920.955294,65811.11,608.954781,1267.612051
ADMINISTRATIVE ENGINEER,129755.206667,129755.206667,117123.712857,127192.668571,-12631.49381,-2562.538095
AIRPORT COMMUNICATIONS OPERATOR,68245.566,82763.087333,64496.3725,72506.05375,-3749.1935,-10257.033583


In [13]:
# Optional CSV exports of the result frames, left disabled; uncomment to write the files.
# salary_by_gender.to_csv('salary_by_gender.csv')
# female_salary.to_csv('female_salary.csv')
# male_salary.to_csv('male_salary.csv')