In [38]:
import pandas as pd
import numpy as np
import gender_guesser.detector as gender
# Load the salary records from the CSV file into a DataFrame.
data = pd.read_csv("Data/Salaries.csv")

# Preview the first 10 rows of the raw table.
data.head(10)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
0,1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411.18,0.0,400184.25,,567595.43,567595.43,2011,,San Francisco,
1,2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966.02,245131.88,137811.38,,538909.28,538909.28,2011,,San Francisco,
2,3,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739.13,106088.18,16452.6,,335279.91,335279.91,2011,,San Francisco,
3,4,CHRISTOPHER CHONG,WIRE ROPE CABLE MAINTENANCE MECHANIC,77916.0,56120.71,198306.9,,332343.61,332343.61,2011,,San Francisco,
4,5,PATRICK GARDNER,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",134401.6,9737.0,182234.59,,326373.19,326373.19,2011,,San Francisco,
5,6,DAVID SULLIVAN,ASSISTANT DEPUTY CHIEF II,118602.0,8601.0,189082.74,,316285.74,316285.74,2011,,San Francisco,
6,7,ALSON LEE,"BATTALION CHIEF, (FIRE DEPARTMENT)",92492.01,89062.9,134426.14,,315981.05,315981.05,2011,,San Francisco,
7,8,DAVID KUSHNER,DEPUTY DIRECTOR OF INVESTMENTS,256576.96,0.0,51322.5,,307899.46,307899.46,2011,,San Francisco,
8,9,MICHAEL MORRIS,"BATTALION CHIEF, (FIRE DEPARTMENT)",176932.64,86362.68,40132.23,,303427.55,303427.55,2011,,San Francisco,
9,10,JOANNE HAYES-WHITE,"CHIEF OF DEPARTMENT, (FIRE DEPARTMENT)",285262.0,0.0,17115.73,,302377.73,302377.73,2011,,San Francisco,


In [39]:
# Cleaning pipeline, applied in order:
#   1. remove the columns that carry no usable information for this analysis
#   2. discard rows whose employee name is the placeholder 'Not provided'
#   3. discard rows with a missing BasePay, cast BasePay to float,
#      and keep only rows whose BasePay is not negative
data = (
    data
    .drop(columns=['Notes', 'Agency', 'Status', 'Id', 'Benefits', 'Year'])
    .loc[lambda frame: frame['EmployeeName'] != 'Not provided']
    .dropna(subset=['BasePay'])
    .assign(BasePay=lambda frame: frame['BasePay'].astype(float))
    .loc[lambda frame: frame['BasePay'] >= 0]
)

In [40]:
# Print, for every column, the percentage of missing entries (rounded).
for col in data.columns:
    missing_fraction = data[col].isnull().mean()
    missing_pct = round(missing_fraction * 100)
    print('{} - {}%'.format(col, missing_pct))

EmployeeName - 0%
JobTitle - 0%
BasePay - 0%
OvertimePay - 0%
OtherPay - 0%
TotalPay - 0%
TotalPayBenefits - 0%


In [41]:
# Keep only the first whitespace-separated token of EmployeeName as 'Name'.
# The token is taken directly with .str[0] rather than expanding every token
# into its own column and dropping columns 1-5 by position: that approach only
# works when the longest name has exactly six tokens (fewer -> KeyError on the
# drop, more -> a multi-column frame that cannot be assigned to one column).
# Note: first_name is now a Series of first tokens (previously a one-column
# DataFrame); the values stored in data['Name'] are the same.
first_name = data['EmployeeName'].str.split().str[0]
data['Name'] = first_name
data = data.drop(['EmployeeName'], axis=1)

# moving column 'Name' to front of dataframe
data = data[['Name'] + [col for col in data.columns if col != 'Name']]


In [42]:
# Guess a gender label for every first name.
# The detector is created with case_sensitive=False.
d = gender.Detector(case_sensitive=False)
gender_list = []


def guess_gender(name):
    """Look up ``name`` with the detector and append its label to ``gender_list``.

    Returns nothing; the result is recorded only through the append.
    """
    label = d.get_gender(name)
    gender_list.append(label)


# Walk the names in row order so gender_list lines up with the rows of `data`.
name_list = data['Name'].to_list()
for given_name in name_list:
    guess_gender(given_name)

# Attach the collected labels as the new 'Gender' column.
data['Gender'] = gender_list

    

In [43]:
# Remove every row whose gender label is one of the uncertain categories,
# so that only confidently labelled rows remain.
ambiguous_labels = ['unknown', 'mostly_female', 'mostly_male', 'andy']
data = data[~data['Gender'].isin(ambiguous_labels)]


In [44]:
# Sanity check: count the rows per remaining gender label; after the filter
# above only 'male' and 'female' are expected to appear.
data["Gender"].value_counts()

male      69552
female    50236
Name: Gender, dtype: int64

In [45]:
# Drop job titles that occur fewer than 50 times.
# NOTE(review): titles are counted exactly as written; the output shows both
# upper-case and title-case spellings (e.g. 'HEALTH WORKER I', 'Transit Operator')
# -- confirm whether case variants of one job should be merged before counting.
counts = data['JobTitle'].value_counts()
rare_titles = counts[counts.lt(50)].index
has_common_title = ~data['JobTitle'].isin(rare_titles)
data = data[has_common_title]

# Show the occurrence counts of the titles that remain.
data['JobTitle'].value_counts()

Transit Operator            5327
Special Nurse               3305
Registered Nurse            2763
Police Officer 3            2198
Firefighter                 2103
                            ... 
HEALTH WORKER I               50
TESTING TECHNICIAN            50
Social Work Supervisor        50
Utility Analyst               50
IS Prg Analyst-Principal      50
Name: JobTitle, Length: 415, dtype: int64

In [46]:
# Split the table by gender and report the mean TotalPay of each group.
female_salary = data.loc[data['Gender'].eq('female')]
male_salary = data.loc[data['Gender'].eq('male')]

avg_total_pay_women = female_salary['TotalPay'].mean()
avg_total_pay_men = male_salary['TotalPay'].mean()

print('the average total pay for women is: $', avg_total_pay_women)
print('the average total pay for men is: $', avg_total_pay_men)



the average total pay for women is: $ 65671.19191140805
the average total pay for men is: $ 83809.87672083464


In [47]:
# Per-gender tables of the average BasePay and TotalPay for each job title.
# The two pay columns are selected explicitly before averaging. Calling
# .mean() on the whole grouped frame relied on pandas silently discarding the
# non-numeric columns (e.g. Name, Gender); pandas 2.x raises a TypeError
# instead. Selecting the columns also guarantees each table has exactly the
# two columns that the merge/rename step in the next cell expects.
f_job_salaries = female_salary.groupby('JobTitle')[['BasePay', 'TotalPay']].mean()
m_job_salaries = male_salary.groupby('JobTitle')[['BasePay', 'TotalPay']].mean()

# Only the last expression of a cell is rendered, so the female table is the
# one displayed below.
m_job_salaries.head()
f_job_salaries.head()

Unnamed: 0_level_0,BasePay,TotalPay
JobTitle,Unnamed: 1_level_1,Unnamed: 2_level_1
ACCOUNT CLERK,42556.832391,43441.577391
ADMINISTRATIVE ANALYST,64312.000513,64543.497949
AIRPORT POLICE SERVICES AIDE,49780.150345,57998.895
ASR Senior Office Specialist,60577.726512,62563.65093
ASSISTANT ENGINEER,80079.541111,80689.304815


In [48]:

# Join the female and male per-title averages on JobTitle (titles present in
# both tables), then relabel the four pay columns by gender -- female columns
# come first because the female table is the left side of the merge.
salary_by_gender = f_job_salaries.merge(m_job_salaries, on='JobTitle')
salary_by_gender.columns = [
    'FemaleBasePay',
    'FemaleTotalPay',
    'MaleBasePay',
    'MaleTotalPay',
]
salary_by_gender.head()

Unnamed: 0_level_0,FemaleBasePay,FemaleTotalPay,MaleBasePay,MaleTotalPay
JobTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ACCOUNT CLERK,42556.832391,43441.577391,46213.987333,46650.336667
ADMINISTRATIVE ANALYST,64312.000513,64543.497949,64920.955294,65811.11
AIRPORT POLICE SERVICES AIDE,49780.150345,57998.895,52180.554167,61626.364688
ASR Senior Office Specialist,60577.726512,62563.65093,61041.301111,62685.674444
ASSISTANT ENGINEER,80079.541111,80689.304815,83160.205227,84712.798864


In [49]:
# Compare pay between genders for the same job: express the female average as
# a percentage of the male average (100 means equal pay), for base pay and
# for total pay.
salary_by_gender['BasePayProp'] = (
    salary_by_gender['FemaleBasePay'].mul(100).div(salary_by_gender['MaleBasePay'])
)
salary_by_gender['TotalPayProp'] = (
    salary_by_gender['FemaleTotalPay'].mul(100).div(salary_by_gender['MaleTotalPay'])
)
salary_by_gender.head()



Unnamed: 0_level_0,FemaleBasePay,FemaleTotalPay,MaleBasePay,MaleTotalPay,BasePayProp,TotalPayProp
JobTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ACCOUNT CLERK,42556.832391,43441.577391,46213.987333,46650.336667,92.086476,93.12168
ADMINISTRATIVE ANALYST,64312.000513,64543.497949,64920.955294,65811.11,99.062006,98.073863
AIRPORT POLICE SERVICES AIDE,49780.150345,57998.895,52180.554167,61626.364688,95.399812,94.11377
ASR Senior Office Specialist,60577.726512,62563.65093,61041.301111,62685.674444,99.240556,99.805341
ASSISTANT ENGINEER,80079.541111,80689.304815,83160.205227,84712.798864,96.295507,95.25043


In [50]:
# (disabled) encode the Gender column as a binary variable
# from sklearn.preprocessing import LabelEncoder
# labelencoder = LabelEncoder()

# data['Gender'] = labelencoder.fit_transform(data['Gender'])


In [64]:
# Optional CSV exports of the intermediate tables (currently disabled).
# salary_by_gender.to_csv('salary_by_gender.csv')
# female_salary.to_csv('female_salary.csv')
# male_salary.to_csv('male_salary.csv')
# data.to_csv('full_table.csv')
# Display the first 250 rows of the filtered table.
data.head(250)

Unnamed: 0,Name,JobTitle,BasePay,OvertimePay,OtherPay,TotalPay,TotalPayBenefits,Gender
22,GEORGE,"CAPTAIN, FIRE SUPPRESSION",140546.88,93200.58,39955.25,273702.71,273702.71,male
24,JOSEPH,"CAPTAIN, FIRE SUPPRESSION",140546.86,97868.77,31909.28,270324.91,270324.91,male
26,JOHN,"CAPTAIN, FIRE SUPPRESSION",92080.80,40008.0,133695.76,265784.56,265784.56,male
33,JOHN,"INSPECTOR III, (POLICE DEPARTMENT)",104861.39,50227.61,103499.39,258588.39,258588.39,male
37,JAMES,"INSPECTOR III, (POLICE DEPARTMENT)",110661.20,31162.04,111446.2,253269.44,253269.44,male
...,...,...,...,...,...,...,...,...
563,MAGALY,FIREFIGHTER,105934.68,58392.63,19409.94,183737.25,183737.25,female
564,ELISA,NURSE MANAGER,170596.03,0.0,13060.81,183656.84,183656.84,female
566,JACK,FIREFIGHTER,105934.66,59761.06,17944.28,183640.00,183640.00,male
568,SUSAN,SENIOR PHYSICIAN SPECIALIST,159624.81,0.0,23943.72,183568.53,183568.53,female
