In [10]:
import os
import sys
import warnings

import numpy as np
import pandas as pd

from mizani.formatters import percent_format
from plotnine import *
from datetime import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import norm
from IPython.core.display import HTML
from stargazer.stargazer import Stargazer
import statsmodels.nonparametric.kernel_regression as loess

from mizani.transforms import log_trans
from mizani.formatters import percent_format
from mizani.formatters import log_format
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

In [11]:
# Load the raw dataset straight from its OSF download link into a DataFrame.
main_df = pd.read_csv(filepath_or_buffer='https://osf.io/4ay9x/download')

In [12]:
# Exploratory check before filtering the data: summary statistics (rounded to
# two decimals) for weekly earnings, weekly work hours and age.

main_df.loc[:, ['earnwke', 'uhours', 'age']].describe().round(2)

# Observations from the summary table:
#   - earnwke (weekly earnings) has a minimum of 0.01
#   - uhours (weekly hours) has a minimum of 1.00
#   - age has a minimum of 16

Unnamed: 0,earnwke,uhours,age
count,149316.0,149316.0,149316.0
mean,888.83,38.94,40.71
std,643.74,10.26,12.78
min,0.01,1.0,16.0
25%,430.0,40.0,30.0
50%,719.6,40.0,41.0
75%,1153.84,40.0,52.0
max,2884.61,99.0,64.0


In [13]:
# The analysis looks at the gender wage gap among Chief Executives.
# Per the dataset documentation, occ2012 == 10 is the Chief Executives code.
# The sample is further restricted to people who work at least 20 hours per
# week (uhours >= 20) and are at least 18 years old (age >= 18).

df = main_df.query('occ2012 == 10 and uhours >= 20 and age >= 18')
df.shape

(1266, 23)

In [14]:
# Narrow the sample to education levels (grade92) from
# 41 (Associate degree - Vocational/occupational)
# up to and including 46 (Doctorate degree).
# `between` is inclusive on both ends by default.

df = df.loc[df['grade92'].between(41, 46)]
df.shape

(1044, 23)

In [15]:
# Derive the analysis variables for the filtered sample:
#   female - 1 when sex == 2, otherwise 0
#   w      - hourly wage: weekly earnings divided by weekly hours
#   lnw    - natural log of the hourly wage
#
# `df` is a boolean-filtered slice of a larger frame, so writing columns into
# it with `df[...] = ...` is the pattern that raises pandas'
# SettingWithCopyWarning. `assign` builds a new DataFrame instead; the columns
# are appended in the same order and re-running the cell gives the same result.

df = df.assign(
    female=lambda d: (d['sex'] == 2).astype(int),
    w=lambda d: d['earnwke'] / d['uhours'],
    lnw=lambda d: np.log(d['w']),  # uses the `w` column created just above
)
df.head().T

Unnamed: 0,38,191,540,599,696
Unnamed: 0,80,435,1230,1344,1572
hhid,97973400095118,90947006014265,4003608069611,60901093001001,260018057903296
intmonth,January,January,January,January,January
stfips,AL,AK,CA,CA,CA
weight,3206.3662,332.7025,2996.9309,3716.5958,2985.998
earnwke,2000.0,1413.0,2750.0,1230.0,500.0
uhours,60,40,40,50,40
grade92,43,43,43,44,43
race,1,1,1,4,2
ethnic,,,,,
