In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from mizani.formatters import percent_format
import os
from plotnine import *
import numpy as np
import sys
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from stargazer import stargazer
from statsmodels.tools.eval_measures import mse,rmse

In [3]:
# Reading the data
total_data = pd.read_csv("https://osf.io/download/4ay9x/")

# Checking if the data was correctly loaded
total_data.head()

Unnamed: 0.1,Unnamed: 0,hhid,intmonth,stfips,weight,earnwke,uhours,grade92,race,ethnic,...,ownchild,chldpres,prcitshp,state,ind02,occ2012,class,unionmme,unioncov,lfsr94
0,3,2600310997690,January,AL,3151.6801,1692.0,40,43,1,,...,0,0,"Native, Born In US",63,Employment services (5613),630,"Private, For Profit",No,No,Employed-At Work
1,5,75680310997590,January,AL,3457.1138,450.0,40,41,2,,...,2,6,"Native, Born In US",63,Outpatient care centers (6214),5400,"Private, For Profit",No,No,Employed-Absent
2,6,75680310997590,January,AL,3936.911,1090.0,60,41,2,,...,2,6,"Native, Born In US",63,Motor vehicles and motor vehicle equipment man...,8140,"Private, For Profit",No,No,Employed-At Work
3,10,179140131100930,January,AL,3288.364,769.23,40,40,1,,...,2,4,"Native, Born In US",63,"**Publishing, except newspapers and software (...",8255,"Private, For Profit",Yes,,Employed-At Work
4,11,179140131100930,January,AL,3422.85,826.92,40,43,1,,...,2,4,"Native, Born In US",63,"Banking and related activities (521, 52211,52219)",5940,"Private, For Profit",No,No,Employed-At Work


### Data manipulation
In this part, data is filtered and some additional variables are added for the modeling.

In [13]:
# filtering for civil engineers (occupation code = 1360)
data = total_data[total_data["occ2012"] == 1360]
print(f"There are {data.shape[0]} civil engineers in the dataset")

There are 396 civil engineers in the dataset


In [17]:
# checking data completeness
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 396 entries, 168 to 148438
Data columns (total 23 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  396 non-null    int64  
 1   hhid        396 non-null    int64  
 2   intmonth    396 non-null    object 
 3   stfips      396 non-null    object 
 4   weight      396 non-null    float64
 5   earnwke     396 non-null    float64
 6   uhours      396 non-null    int64  
 7   grade92     396 non-null    int64  
 8   race        396 non-null    int64  
 9   ethnic      36 non-null     float64
 10  age         396 non-null    int64  
 11  sex         396 non-null    int64  
 12  marital     396 non-null    int64  
 13  ownchild    396 non-null    int64  
 14  chldpres    396 non-null    int64  
 15  prcitshp    396 non-null    object 
 16  state       396 non-null    object 
 17  ind02       396 non-null    object 
 18  occ2012     396 non-null    int64  
 19  class       396 non-null    o

In [23]:
# creating hourly wage, binary gender variable (True for males, False for females) and "white" (True if race = 1 - i.e. white, else 0) variables 
data["wage"] = data["earnwke"] / data["uhours"]
data["gender"] = data["sex"] == 1
data["white"] = data["race"] == 1
data

Unnamed: 0.1,Unnamed: 0,hhid,intmonth,stfips,weight,earnwke,uhours,grade92,race,ethnic,...,state,ind02,occ2012,class,unionmme,unioncov,lfsr94,wage,gender,white
168,394,4540720924693,January,AK,938.8809,615.00,40,43,4,,...,94,"Architectural, engineering, and related servic...",1360,"Private, For Profit",No,No,Employed-At Work,15.375000,True,False
179,412,51250720790591,January,AK,374.8012,1250.00,38,44,1,,...,94,Administration of economic programs and space ...,1360,Government - State,Yes,,Employed-At Work,32.894737,True,True
697,1575,260177093001600,January,CA,3478.9719,1346.00,50,43,1,7.0,...,93,Petroleum refining (32411),1360,"Private, For Profit",No,No,Employed-At Work,26.920000,True,True
738,1679,310864092903826,January,CA,3089.1716,769.23,40,43,1,5.0,...,93,Hospitals (622),1360,"Private, For Profit",Yes,,Employed-At Work,19.230750,True,True
1548,3502,972046964079070,January,CA,3371.2601,2076.92,40,43,4,,...,93,Administration of economic programs and space ...,1360,Government - State,Yes,,Employed-At Work,51.923000,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148115,314463,94100920950665,December,VT,263.7911,1194.00,40,40,1,,...,1,** Construction (23),1360,Government - State,Yes,,Employed-At Work,29.850000,True,True
148144,314533,441803003350100,December,VT,274.4192,1955.20,40,43,1,,...,1,Services incidental to transportation (488),1360,Government - State,No,No,Employed-At Work,48.880000,True,True
148172,314602,710008645900880,December,VT,270.2848,1346.15,40,43,1,,...,1,"Architectural, engineering, and related servic...",1360,"Private, For Profit",No,No,Employed-At Work,33.653750,True,True
148181,314621,840130301040503,December,VT,285.5574,1095.00,55,43,1,,...,1,"Architectural, engineering, and related servic...",1360,"Private, For Profit",No,No,Employed-At Work,19.909091,True,True


In [24]:
# creating highest education dummies: 
# "hs" for high-school or lower education
# "ba" for associates degrees or BAs 
# "ma" for masters and professional degrees
# "phd" for PhDs

data["edu_hs"] = data["grade92"] <= 39
data["edu_ba"] = data["grade92"].isin([39, 40, 41, 42, 43])
data["edu_ma"] = data["grade92"].isin([44, 45])
data["edu_phd"] = data["grade92"] == 46
data

Unnamed: 0.1,Unnamed: 0,hhid,intmonth,stfips,weight,earnwke,uhours,grade92,race,ethnic,...,unionmme,unioncov,lfsr94,wage,gender,white,edu_hs,edu_ba,edu_ma,edu_phd
168,394,4540720924693,January,AK,938.8809,615.00,40,43,4,,...,No,No,Employed-At Work,15.375000,True,False,False,True,False,False
179,412,51250720790591,January,AK,374.8012,1250.00,38,44,1,,...,Yes,,Employed-At Work,32.894737,True,True,False,False,True,False
697,1575,260177093001600,January,CA,3478.9719,1346.00,50,43,1,7.0,...,No,No,Employed-At Work,26.920000,True,True,False,True,False,False
738,1679,310864092903826,January,CA,3089.1716,769.23,40,43,1,5.0,...,Yes,,Employed-At Work,19.230750,True,True,False,True,False,False
1548,3502,972046964079070,January,CA,3371.2601,2076.92,40,43,4,,...,Yes,,Employed-At Work,51.923000,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148115,314463,94100920950665,December,VT,263.7911,1194.00,40,40,1,,...,Yes,,Employed-At Work,29.850000,True,True,False,True,False,False
148144,314533,441803003350100,December,VT,274.4192,1955.20,40,43,1,,...,No,No,Employed-At Work,48.880000,True,True,False,True,False,False
148172,314602,710008645900880,December,VT,270.2848,1346.15,40,43,1,,...,No,No,Employed-At Work,33.653750,True,True,False,True,False,False
148181,314621,840130301040503,December,VT,285.5574,1095.00,55,43,1,,...,No,No,Employed-At Work,19.909091,True,True,False,True,False,False


In [25]:
# creating union dummy (True for members, False for non-members)
data["union"] = data["unionmme"] == 1

In [26]:
# creating squared and cubed age variables for potential non-linear associations
data["age_2"] = data["age"] ** 2
data["age_3"] = data["age"] ** 3