In [1]:
# Direct Python to plot all figures inline (i.e., not in a separate window)
%matplotlib inline

# Load libraries
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [2]:
# Directory where NLSY79_TeachingExtract.csv file is located
data =  '/Users/bgraham/Dropbox/Teaching/Berkeley_Courses/Ec240a/Ec240a_Fall2023/Data/NLSY97/'

In [3]:
# Read in NLSY79 Extract as a pandas dataframe
nlsy97 = pd.read_csv(data+'NLSY97.csv') # Reading .csv file as DataFrame
nlsy97.set_index('R0000100', inplace=True)
nlsy97.index.name = 'pubid'

# Read in CPI data as a pandas dataframe
# https://www.bls.gov/cpi/research-series/r-cpi-u-rs-home.htm on 27 Oct 2023
cpi  = pd.read_excel(data+'r-cpi-u-rs-allitems.xlsx', sheet_name='Table 1', skiprows=5) # Reading .xlsx file as DataFrame
cpi.set_index('YEAR', inplace=True)

In [4]:
nlsy97[0:3]

Unnamed: 0_level_0,R0489900,R0490000,R0490100,R0490200,R0490300,R0536300,R0536401,R0536402,R1200100,R1200200,...,Z9060100,Z9060200,Z9060300,Z9060400,Z9060500,Z9060600,Z9060700,Z9060800,Z9060900,Z9083800
pubid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,-4,-4,-4,-4,2,9,1981,19,26,...,0,0,0,0,0,0,26,52,52,16
2,0,-4,-4,-4,-4,1,7,1982,19,19,...,0,0,0,0,0,0,0,0,0,14
3,0,-4,-4,-4,-4,2,9,1983,26,26,...,0,0,0,0,0,0,0,0,0,16


In [5]:
cpi[0:3]

Unnamed: 0_level_0,JAN,FEB,MAR,APR,MAY,JUNE,JULY,AUG,SEP,OCT,NOV,DEC,AVG,Unnamed: 14
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1977,,,,,,,,,,,,100.0,,
1978,100.5,101.1,101.8,102.7,103.6,104.5,105.0,105.5,106.1,106.7,107.3,107.8,104.4,
1979,108.7,109.7,110.7,111.8,113.0,114.1,115.1,116.0,117.1,117.9,118.5,119.5,114.3,


In [6]:
nlsy97['female'] = 1*(nlsy97['R0536300']==2)

nlsy97.rename(columns={'R0536401': "birth_month", "R0536402": "birth_year"})

nlsy97['black'] = 1*(nlsy97['R1482600']==1)
nlsy97['hispanic'] = 1*(nlsy97['R1482600']==2)

nlsy97['asvab'] = np.nan
nlsy97.loc[nlsy97['R9829600']>=0,'asvab'] = nlsy97['R9829600'][nlsy97['R9829600']>=0]/1000

In [7]:
nlsy97['hgc_99'] = np.nan
nlsy97.loc[nlsy97['R3884801']>=0,'hgc_99'] = nlsy97['R3884801'][nlsy97['R3884801']>=0]
nlsy97['hgc_99'].replace(95, 0, inplace=True)

nlsy97['hgc_00'] = np.nan
nlsy97.loc[nlsy97['R5464001']>=0,'hgc_00'] = nlsy97['R5464001'][nlsy97['R5464001']>=0]
nlsy97['hgc_00'].replace(95, 0, inplace=True)

nlsy97['hgc_01'] = np.nan
nlsy97.loc[nlsy97['R7227701']>=0,'hgc_01'] = nlsy97['R7227701'][nlsy97['R7227701']>=0]
nlsy97['hgc_01'].replace(95, 0, inplace=True)

nlsy97['hgc_02'] = np.nan
nlsy97.loc[nlsy97['S1541601']>=0,'hgc_02'] = nlsy97['S1541601'][nlsy97['S1541601']>=0]
nlsy97['hgc_02'].replace(95, 0, inplace=True)

nlsy97['hgc_03'] = np.nan
nlsy97.loc[nlsy97['S2011401']>=0,'hgc_03'] = nlsy97['S2011401'][nlsy97['S2011401']>=0]
nlsy97['hgc_03'].replace(95, 0, inplace=True)

nlsy97['hgc_04'] = np.nan
nlsy97.loc[nlsy97['S3812301']>=0,'hgc_04'] = nlsy97['S3812301'][nlsy97['S3812301']>=0]
nlsy97['hgc_04'].replace(95, 0, inplace=True)

nlsy97['hgc_05'] = np.nan
nlsy97.loc[nlsy97['S5412700']>=0,'hgc_05'] = nlsy97['S5412700'][nlsy97['S5412700']>=0]
nlsy97['hgc_05'].replace(95, 0, inplace=True)

nlsy97['hgc_06'] = np.nan
nlsy97.loc[nlsy97['S7513600']>=0,'hgc_06'] = nlsy97['S7513600'][nlsy97['S7513600']>=0]
nlsy97['hgc_06'].replace(95, 0, inplace=True)

nlsy97['hgc_07'] = np.nan
nlsy97.loc[nlsy97['T0014000']>=0,'hgc_07'] = nlsy97['T0014000'][nlsy97['T0014000']>=0]
nlsy97['hgc_07'].replace(95, 0, inplace=True)

nlsy97['hgc_08'] = np.nan
nlsy97.loc[nlsy97['T2016100']>=0,'hgc_08'] = nlsy97['T2016100'][nlsy97['T2016100']>=0]
nlsy97['hgc_08'].replace(95, 0, inplace=True)

nlsy97['hgc_09'] = np.nan
nlsy97.loc[nlsy97['T3606400']>=0,'hgc_09'] = nlsy97['T3606400'][nlsy97['T3606400']>=0]
nlsy97['hgc_09'].replace(95, 0, inplace=True)

nlsy97['hgc_10'] = np.nan
nlsy97.loc[nlsy97['T5206800']>=0,'hgc_10'] = nlsy97['T5206800'][nlsy97['T5206800']>=0]
nlsy97['hgc_10'].replace(95, 0, inplace=True)

nlsy97['hgc_11'] = np.nan
nlsy97.loc[nlsy97['T6656600']>=0,'hgc_11'] = nlsy97['T6656600'][nlsy97['T6656600']>=0]
nlsy97['hgc_11'].replace(95, 0, inplace=True)

nlsy97['hgc_13'] = np.nan
nlsy97.loc[nlsy97['T8129000']>=0,'hgc_13'] = nlsy97['T8129000'][nlsy97['T8129000']>=0]
nlsy97['hgc_13'].replace(95, 0, inplace=True)

nlsy97['hgc_ever10'] = np.nan
nlsy97.loc[nlsy97['T5206700']>=0,'hgc_ever10'] = nlsy97['T5206700'][nlsy97['T5206700']>=0]
nlsy97['hgc_ever10'].replace(95, 0, inplace=True)

nlsy97['hgc_ever11'] = np.nan
nlsy97.loc[nlsy97['T6656500']>=0,'hgc_ever11'] = nlsy97['T6656500'][nlsy97['T6656500']>=0]
nlsy97['hgc_ever11'].replace(95, 0, inplace=True)

nlsy97['hgc_ever13'] = np.nan
nlsy97.loc[nlsy97['T8128900']>=0,'hgc_ever13'] = nlsy97['T8128900'][nlsy97['T8128900']>=0]
nlsy97['hgc_ever13'].replace(95, 0, inplace=True)

nlsy97['hgc_ever15'] = np.nan
nlsy97.loc[nlsy97['U0008800']>=0,'hgc_ever15'] = nlsy97['U0008800'][nlsy97['U0008800']>=0]
nlsy97['hgc_ever15'].replace(95, 0, inplace=True)

nlsy97['hgc_ever17'] = np.nan
nlsy97.loc[nlsy97['U1845400']>=0,'hgc_ever17'] = nlsy97['U1845400'][nlsy97['U1845400']>=0]
nlsy97['hgc_ever17'].replace(95, 0, inplace=True)

nlsy97['hgc_ever19'] = np.nan
nlsy97.loc[nlsy97['U3443900']>=0,'hgc_ever19'] = nlsy97['U3443900'][nlsy97['U3443900']>=0]
nlsy97['hgc_ever19'].replace(95, 0, inplace=True)

nlsy97['hgc_ever'] = np.nan
nlsy97.loc[nlsy97['Z9083800']>=0,'hgc_ever'] = nlsy97['Z9083800'][nlsy97['Z9083800']>=0]
nlsy97['hgc_ever'].replace(95, 0, inplace=True)


nlsy97[['hgc_99','hgc_00','hgc_01','hgc_02','hgc_03','hgc_04','hgc_05','hgc_06','hgc_07','hgc_08', \
        'hgc_09','hgc_10','hgc_11','hgc_13','hgc_ever10','hgc_ever11','hgc_ever13','hgc_ever15','hgc_ever17','hgc_ever19','hgc_ever']].describe()

Unnamed: 0,hgc_99,hgc_00,hgc_01,hgc_02,hgc_03,hgc_04,hgc_05,hgc_06,hgc_07,hgc_08,...,hgc_10,hgc_11,hgc_13,hgc_ever10,hgc_ever11,hgc_ever13,hgc_ever15,hgc_ever17,hgc_ever19,hgc_ever
count,8128.0,7972.0,7773.0,7815.0,7675.0,7425.0,7267.0,7480.0,7345.0,7419.0,...,7408.0,7342.0,7069.0,7411.0,7345.0,7060.0,7028.0,6663.0,6871.0,8884.0
mean,10.091905,10.689664,11.121832,11.359949,11.430358,11.452795,11.446952,11.440107,11.437304,11.439547,...,11.43372,11.433261,11.443344,13.418837,13.485636,13.645892,13.741747,13.903047,13.957357,13.676835
std,1.429632,1.303433,1.199997,1.162201,1.166798,1.176143,1.180882,1.182117,1.18136,1.180046,...,1.183196,1.174863,1.165588,2.930779,2.954833,3.040799,3.094643,3.130969,3.140395,3.163758
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9.0,10.0,11.0,11.0,11.0,12.0,12.0,12.0,12.0,12.0,...,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0
50%,10.0,11.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,...,12.0,12.0,12.0,13.0,13.0,13.0,14.0,14.0,14.0,13.0
75%,11.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,...,12.0,12.0,12.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0
max,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,...,12.0,12.0,12.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0


In [8]:
# Construct annual earnings measure 1996
# NOTE: No top-coding during this calendar year
nlsy97['earn96'] = np.nan
nlsy97.loc[nlsy97['R0490200']>=0,'earn96'] = nlsy97['R0490200'][nlsy97['R0490200']>=0]   # Reports earnings
nlsy97.loc[pd.isna(nlsy97['earn96']) & (nlsy97['R0489900']==0),'earn96'] = 0             # Reports no earnings (#1)
nlsy97.loc[nlsy97['R0489900']==-1 & (nlsy97['R0490100']==0),'earn96'] = 0                # Reports no earnings (#2)
nlsy97.loc[nlsy97['R0489900']==-2 & (nlsy97['R0490000']==0),'earn96'] = 0                # Reports no earnings (#3)
nlsy97.loc[nlsy97['R0490200'].isin([-1,-2]) & (nlsy97['R0490300']==1),'earn96'] = 2500   # $1 to $5000 range
nlsy97.loc[nlsy97['R0490200'].isin([-1,-2]) & (nlsy97['R0490300']==2),'earn96'] = 7500   # $5001 to $10000 range
nlsy97.loc[nlsy97['R0490200'].isin([-1,-2]) & (nlsy97['R0490300']==3),'earn96'] = 17500  # $10001 to $25000 range
nlsy97.loc[nlsy97['R0490200'].isin([-1,-2]) & (nlsy97['R0490300']==4),'earn96'] = 37500  # $25001 to $50000 range
nlsy97.loc[nlsy97['R0490200'].isin([-1,-2]) & (nlsy97['R0490300']==5),'earn96'] = 75000  # $50001 to $100000 range
nlsy97.loc[nlsy97['R0490200'].isin([-1,-2]) & (nlsy97['R0490300']==6),'earn96'] = 175000 # $100001 to $250000 range

# Keep track of respondents reporting only an earnings range
nlsy97['earn96_range'] = np.nan
nlsy97.loc[~pd.isna(nlsy97['earn96']),'earn96_range'] = 0
nlsy97.loc[nlsy97['R0490300']>=1,'earn96_range'] = 1

# Deflate earnings measures
nlsy97['earn96'] = nlsy97['earn96']*(cpi.loc[2022]['AVG']/cpi.loc[1996]['AVG'])

# Construct annual earnings measure 1997
# NOTE: Top-coded at $14,048 (lowest value of top 2%)
nlsy97['earn97'] = np.nan
nlsy97.loc[nlsy97['R2341200']>=0,'earn97'] = nlsy97['R2341200'][nlsy97['R2341200']>=0]   # Reports earnings
nlsy97.loc[pd.isna(nlsy97['earn97']) & (nlsy97['R2340900']==0),'earn97'] = 0             # Reports no earnings (#1)
nlsy97.loc[nlsy97['R2340900']==-1 & (nlsy97['R2341100']==0),'earn97'] = 0                # Reports no earnings (#2)
nlsy97.loc[nlsy97['R2340900']==-2 & (nlsy97['R2341000']==0),'earn97'] = 0                # Reports no earnings (#3)
nlsy97.loc[nlsy97['R2341200'].isin([-1,-2]) & (nlsy97['R2341300']==1),'earn97'] = 2500   # $1 to $5000 range
nlsy97.loc[nlsy97['R2341200'].isin([-1,-2]) & (nlsy97['R2341300']==2),'earn97'] = 7500   # $5001 to $10000 range
nlsy97.loc[nlsy97['R2341200'].isin([-1,-2]) & (nlsy97['R2341300']==3),'earn97'] = 17500  # $10001 to $25000 range
nlsy97.loc[nlsy97['R2341200'].isin([-1,-2]) & (nlsy97['R2341300']==4),'earn97'] = 37500  # $25001 to $50000 range
nlsy97.loc[nlsy97['R2341200'].isin([-1,-2]) & (nlsy97['R2341300']==5),'earn97'] = 75000  # $50001 to $100000 range
nlsy97.loc[nlsy97['R2341200'].isin([-1,-2]) & (nlsy97['R2341300']==6),'earn97'] = 175000 # $100001 to $250000 range

# Keep track of respondents reporting only an earnings range
nlsy97['earn97_range'] = np.nan
nlsy97.loc[~pd.isna(nlsy97['earn97']),'earn97_range'] = 0
nlsy97.loc[nlsy97['R2341300']>=1,'earn97_range'] = 1

# Deflate earnings measures
nlsy97['earn97'] = nlsy97['earn97']*(cpi.loc[2022]['AVG']/cpi.loc[1997]['AVG'])

# Construct annual earnings measure 1998
# NOTE: Top-coded at $25,249 (lowest value of top 2%)
nlsy97['earn98'] = np.nan
nlsy97.loc[nlsy97['R3650200']>=0,'earn98'] = nlsy97['R3650200'][nlsy97['R3650200']>=0]   # Reports earnings
nlsy97.loc[pd.isna(nlsy97['earn98']) & (nlsy97['R3649900']==0),'earn98'] = 0             # Reports no earnings (#1)
nlsy97.loc[nlsy97['R3649900']==-1 & (nlsy97['R3650100']==0),'earn98'] = 0                # Reports no earnings (#2)
nlsy97.loc[nlsy97['R3649900']==-2 & (nlsy97['R3650000']==0),'earn98'] = 0                # Reports no earnings (#3)
nlsy97.loc[nlsy97['R3650200'].isin([-1,-2]) & (nlsy97['R3650300']==1),'earn98'] = 2500   # $1 to $5000 range
nlsy97.loc[nlsy97['R3650200'].isin([-1,-2]) & (nlsy97['R3650300']==2),'earn98'] = 7500   # $5001 to $10000 range
nlsy97.loc[nlsy97['R3650200'].isin([-1,-2]) & (nlsy97['R3650300']==3),'earn98'] = 17500  # $10001 to $25000 range
nlsy97.loc[nlsy97['R3650200'].isin([-1,-2]) & (nlsy97['R3650300']==4),'earn98'] = 37500  # $25001 to $50000 range
nlsy97.loc[nlsy97['R3650200'].isin([-1,-2]) & (nlsy97['R3650300']==5),'earn98'] = 75000  # $50001 to $100000 range
nlsy97.loc[nlsy97['R3650200'].isin([-1,-2]) & (nlsy97['R3650300']==6),'earn98'] = 175000 # $100001 to $250000 range

# Keep track of respondents reporting only an earnings range
nlsy97['earn98_range'] = np.nan
nlsy97.loc[~pd.isna(nlsy97['earn98']),'earn98_range'] = 0
nlsy97.loc[nlsy97['R3650300']>=1,'earn98_range'] = 1

# Deflate earnings measures
nlsy97['earn98'] = nlsy97['earn98']*(cpi.loc[2022]['AVG']/cpi.loc[1998]['AVG'])

# Construct annual earnings measure 1999
# NOTE: Top-coded at $23,000 (lowest value of top 2%)
nlsy97['earn99'] = np.nan
nlsy97.loc[nlsy97['R5098900']>=0,'earn99'] = nlsy97['R5098900'][nlsy97['R5098900']>=0]   # Reports earnings
nlsy97.loc[pd.isna(nlsy97['earn99']) & (nlsy97['R5098600']==0),'earn99'] = 0             # Reports no earnings (#1)
nlsy97.loc[nlsy97['R5098600']==-1 & (nlsy97['R5098800']==0),'earn99'] = 0                # Reports no earnings (#2)
nlsy97.loc[nlsy97['R5098600']==-2 & (nlsy97['R5098700']==0),'earn99'] = 0                # Reports no earnings (#3)
nlsy97.loc[nlsy97['R5098900'].isin([-1,-2]) & (nlsy97['R5099000']==1),'earn99'] = 2500   # $1 to $5000 range
nlsy97.loc[nlsy97['R5098900'].isin([-1,-2]) & (nlsy97['R5099000']==2),'earn99'] = 7500   # $5001 to $10000 range
nlsy97.loc[nlsy97['R5098900'].isin([-1,-2]) & (nlsy97['R5099000']==3),'earn99'] = 17500  # $10001 to $25000 range
nlsy97.loc[nlsy97['R5098900'].isin([-1,-2]) & (nlsy97['R5099000']==4),'earn99'] = 37500  # $25001 to $50000 range
nlsy97.loc[nlsy97['R5098900'].isin([-1,-2]) & (nlsy97['R5099000']==5),'earn99'] = 75000  # $50001 to $100000 range
nlsy97.loc[nlsy97['R5098900'].isin([-1,-2]) & (nlsy97['R5099000']==6),'earn99'] = 175000 # $100001 to $250000 range

# Keep track of respondents reporting only an earnings range
nlsy97['earn99_range'] = np.nan
nlsy97.loc[~pd.isna(nlsy97['earn99']),'earn99_range'] = 0
nlsy97.loc[nlsy97['R5099000']>=1,'earn99_range'] = 1

# Deflate earnings measures
nlsy97['earn99'] = nlsy97['earn99']*(cpi.loc[2022]['AVG']/cpi.loc[1999]['AVG'])

# Construct annual earnings measure 2000
# NOTE: Top-coded at $28,000 (lowest value of top 2%)
nlsy97['earn00'] = np.nan
nlsy97.loc[nlsy97['R6827500']>=0,'earn00'] = nlsy97['R6827500'][nlsy97['R6827500']>=0]   # Reports earnings
nlsy97.loc[pd.isna(nlsy97['earn00']) & (nlsy97['R6827200']==0),'earn00'] = 0             # Reports no earnings (#1)
nlsy97.loc[nlsy97['R6827200']==-1 & (nlsy97['R6827400']==0),'earn00'] = 0                # Reports no earnings (#2)
nlsy97.loc[nlsy97['R6827200']==-2 & (nlsy97['R6827300']==0),'earn00'] = 0                # Reports no earnings (#3)
nlsy97.loc[nlsy97['R6827500'].isin([-1,-2]) & (nlsy97['R6827600']==1),'earn00'] = 2500   # $1 to $5000 range
nlsy97.loc[nlsy97['R6827500'].isin([-1,-2]) & (nlsy97['R6827600']==2),'earn00'] = 7500   # $5001 to $10000 range
nlsy97.loc[nlsy97['R6827500'].isin([-1,-2]) & (nlsy97['R6827600']==3),'earn00'] = 17500  # $10001 to $25000 range
nlsy97.loc[nlsy97['R6827500'].isin([-1,-2]) & (nlsy97['R6827600']==4),'earn00'] = 37500  # $25001 to $50000 range
nlsy97.loc[nlsy97['R6827500'].isin([-1,-2]) & (nlsy97['R6827600']==5),'earn00'] = 75000  # $50001 to $100000 range
nlsy97.loc[nlsy97['R6827500'].isin([-1,-2]) & (nlsy97['R6827600']==6),'earn00'] = 175000 # $100001 to $250000 range

# Keep track of respondents reporting only an earnings range
nlsy97['earn00_range'] = np.nan
nlsy97.loc[~pd.isna(nlsy97['earn00']),'earn00_range'] = 0
nlsy97.loc[nlsy97['R6827600']>=1,'earn00_range'] = 1

# Deflate earnings measures
nlsy97['earn00'] = nlsy97['earn00']*(cpi.loc[2022]['AVG']/cpi.loc[2000]['AVG'])

# Construct annual earnings measure 2001
# NOTE: Top-coded at $34,000 (lowest value of top 2%)
nlsy97['earn01'] = np.nan
nlsy97.loc[nlsy97['S1055800']>=0,'earn01'] = nlsy97['S1055800'][nlsy97['S1055800']>=0]   # Reports earnings
nlsy97.loc[pd.isna(nlsy97['earn01']) & (nlsy97['S1055500']==0),'earn01'] = 0             # Reports no earnings (#1)
nlsy97.loc[nlsy97['S1055500']==-1 & (nlsy97['S1055700']==0),'earn01'] = 0                # Reports no earnings (#2)
nlsy97.loc[nlsy97['S1055500']==-2 & (nlsy97['S1055600']==0),'earn01'] = 0                # Reports no earnings (#3)
nlsy97.loc[nlsy97['S1055800'].isin([-1,-2]) & (nlsy97['S1055900']==1),'earn01'] = 2500   # $1 to $5000 range
nlsy97.loc[nlsy97['S1055800'].isin([-1,-2]) & (nlsy97['S1055900']==2),'earn01'] = 7500   # $5001 to $10000 range
nlsy97.loc[nlsy97['S1055800'].isin([-1,-2]) & (nlsy97['S1055900']==3),'earn01'] = 17500  # $10001 to $25000 range
nlsy97.loc[nlsy97['S1055800'].isin([-1,-2]) & (nlsy97['S1055900']==4),'earn01'] = 37500  # $25001 to $50000 range
nlsy97.loc[nlsy97['S1055800'].isin([-1,-2]) & (nlsy97['S1055900']==5),'earn01'] = 75000  # $50001 to $100000 range
nlsy97.loc[nlsy97['S1055800'].isin([-1,-2]) & (nlsy97['S1055900']==6),'earn01'] = 175000 # $100001 to $250000 range

# Keep track of respondents reporting only an earnings range
nlsy97['earn01_range'] = np.nan
nlsy97.loc[~pd.isna(nlsy97['earn01']),'earn01_range'] = 0
nlsy97.loc[nlsy97['S1055900']>=1,'earn01_range'] = 1

# Deflate earnings measures
nlsy97['earn01'] = nlsy97['earn01']*(cpi.loc[2022]['AVG']/cpi.loc[2001]['AVG'])

# Construct annual earnings measure 2002
# NOTE: Top-coded at $35,000 (lowest value of top 2%)
nlsy97['earn02'] = np.nan
nlsy97.loc[nlsy97['S3134600']>=0,'earn02'] = nlsy97['S3134600'][nlsy97['S3134600']>=0]   # Reports earnings
nlsy97.loc[pd.isna(nlsy97['earn02']) & (nlsy97['S3134400']==0),'earn02'] = 0             # Reports no earnings (#1)
nlsy97.loc[nlsy97['S3134400'].isin([-1,-2]) & (nlsy97['S3134500']==0),'earn02'] = 0      # Reports no earnings (#2)
nlsy97.loc[nlsy97['S3134600'].isin([-1,-2]) & (nlsy97['S3134700']==1),'earn02'] = 2500   # $1 to $5000 range
nlsy97.loc[nlsy97['S3134600'].isin([-1,-2]) & (nlsy97['S3134700']==2),'earn02'] = 7500   # $5001 to $10000 range
nlsy97.loc[nlsy97['S3134600'].isin([-1,-2]) & (nlsy97['S3134700']==3),'earn02'] = 17500  # $10001 to $25000 range
nlsy97.loc[nlsy97['S3134600'].isin([-1,-2]) & (nlsy97['S3134700']==4),'earn02'] = 37500  # $25001 to $50000 range
nlsy97.loc[nlsy97['S3134600'].isin([-1,-2]) & (nlsy97['S3134700']==5),'earn02'] = 75000  # $50001 to $100000 range
nlsy97.loc[nlsy97['S3134600'].isin([-1,-2]) & (nlsy97['S3134700']==6),'earn02'] = 175000 # $100001 to $250000 range

# Keep track of respondents reporting only an earnings range
nlsy97['earn02_range'] = np.nan
nlsy97.loc[~pd.isna(nlsy97['earn02']),'earn02_range'] = 0
nlsy97.loc[nlsy97['S3134700']>=1,'earn02_range'] = 1

# Deflate earnings measures
nlsy97['earn02'] = nlsy97['earn02']*(cpi.loc[2022]['AVG']/cpi.loc[2002]['AVG'])

# Construct annual earnings measure 2003
# NOTE: Top-coded at $43,000 (lowest value of top 2%)
nlsy97['earn03'] = np.nan
nlsy97.loc[nlsy97['S4799600']>=0,'earn03'] = nlsy97['S4799600'][nlsy97['S4799600']>=0]   # Reports earnings
nlsy97.loc[pd.isna(nlsy97['earn03']) & (nlsy97['S4799400']==0),'earn03'] = 0             # Reports no earnings (#1)
nlsy97.loc[nlsy97['S4799400'].isin([-1,-2]) & (nlsy97['S4799500']==0),'earn02'] = 0      # Reports no earnings (#2)
nlsy97.loc[nlsy97['S4799600'].isin([-1,-2]) & (nlsy97['S4799700']==1),'earn03'] = 2500   # $1 to $5000 range
nlsy97.loc[nlsy97['S4799600'].isin([-1,-2]) & (nlsy97['S4799700']==2),'earn03'] = 7500   # $5001 to $10000 range
nlsy97.loc[nlsy97['S4799600'].isin([-1,-2]) & (nlsy97['S4799700']==3),'earn03'] = 17500  # $10001 to $25000 range
nlsy97.loc[nlsy97['S4799600'].isin([-1,-2]) & (nlsy97['S4799700']==4),'earn03'] = 37500  # $25001 to $50000 range
nlsy97.loc[nlsy97['S4799600'].isin([-1,-2]) & (nlsy97['S4799700']==5),'earn03'] = 75000  # $50001 to $100000 range
nlsy97.loc[nlsy97['S4799600'].isin([-1,-2]) & (nlsy97['S4799700']==6),'earn03'] = 175000 # $100001 to $250000 range
nlsy97.loc[nlsy97['S4799600'].isin([-1,-2]) & (nlsy97['S4799700']==7),'earn03'] = 250000 # $100001 to $250000 range

# Keep track of respondents reporting only an earnings range
nlsy97['earn03_range'] = np.nan
nlsy97.loc[~pd.isna(nlsy97['earn03']),'earn03_range'] = 0
nlsy97.loc[nlsy97['S4799700']>=1,'earn03_range'] = 1

# Deflate earnings measures
nlsy97['earn03'] = nlsy97['earn03']*(cpi.loc[2022]['AVG']/cpi.loc[2003]['AVG'])

# Construct annual earnings measure 2004
# NOTE: Top-coded at $69,342 (lowest value of top 2%)
nlsy97['earn04'] = np.nan
nlsy97.loc[nlsy97['S6501000']>=0,'earn04'] = nlsy97['S6501000'][nlsy97['S6501000']>=0]   # Reports earnings
nlsy97.loc[pd.isna(nlsy97['earn04']) & (nlsy97['S6500800']==0),'earn04'] = 0             # Reports no earnings (#1)
nlsy97.loc[nlsy97['S6500800'].isin([-1,-2]) & (nlsy97['S6500900']==0),'earn04'] = 0      # Reports no earnings (#2)
nlsy97.loc[nlsy97['S6501000'].isin([-1,-2]) & (nlsy97['S6501100']==1),'earn04'] = 2500   # $1 to $5000 range
nlsy97.loc[nlsy97['S6501000'].isin([-1,-2]) & (nlsy97['S6501100']==2),'earn04'] = 7500   # $5001 to $10000 range
nlsy97.loc[nlsy97['S6501000'].isin([-1,-2]) & (nlsy97['S6501100']==3),'earn04'] = 17500  # $10001 to $25000 range
nlsy97.loc[nlsy97['S6501000'].isin([-1,-2]) & (nlsy97['S6501100']==4),'earn04'] = 37500  # $25001 to $50000 range
nlsy97.loc[nlsy97['S6501000'].isin([-1,-2]) & (nlsy97['S6501100']==5),'earn04'] = 75000  # $50001 to $100000 range
nlsy97.loc[nlsy97['S6501000'].isin([-1,-2]) & (nlsy97['S6501100']==6),'earn04'] = 175000 # $100001 to $250000 range
nlsy97.loc[nlsy97['S6501000'].isin([-1,-2]) & (nlsy97['S6501100']==7),'earn04'] = 250000 # $100001 to $250000 range

# Keep track of respondents reporting only an earnings range
nlsy97['earn04_range'] = np.nan
nlsy97.loc[~pd.isna(nlsy97['earn04']),'earn04_range'] = 0
nlsy97.loc[nlsy97['S6501100']>=1,'earn04_range'] = 1

# Deflate earnings measures
nlsy97['earn04'] = nlsy97['earn04']*(cpi.loc[2022]['AVG']/cpi.loc[2004]['AVG'])

# Construct annual earnings measure 2005
# NOTE: Top-coded at $80,471 (lowest value of top 2%)
nlsy97['earn05'] = np.nan
nlsy97.loc[nlsy97['S8496500']>=0,'earn05'] = nlsy97['S8496500'][nlsy97['S8496500']>=0]   # Reports earnings
nlsy97.loc[pd.isna(nlsy97['earn05']) & (nlsy97['S8496300']==0),'earn05'] = 0             # Reports no earnings (#1)
nlsy97.loc[nlsy97['S8496300'].isin([-1,-2]) & (nlsy97['S8496400']==0),'earn05'] = 0      # Reports no earnings (#2)
nlsy97.loc[nlsy97['S8496500'].isin([-1,-2]) & (nlsy97['S8496600']==1),'earn05'] = 2500   # $1 to $5000 range
nlsy97.loc[nlsy97['S8496500'].isin([-1,-2]) & (nlsy97['S8496600']==2),'earn05'] = 7500   # $5001 to $10000 range
nlsy97.loc[nlsy97['S8496500'].isin([-1,-2]) & (nlsy97['S8496600']==3),'earn05'] = 17500  # $10001 to $25000 range
nlsy97.loc[nlsy97['S8496500'].isin([-1,-2]) & (nlsy97['S8496600']==4),'earn05'] = 37500  # $25001 to $50000 range
nlsy97.loc[nlsy97['S8496500'].isin([-1,-2]) & (nlsy97['S8496600']==5),'earn05'] = 75000  # $50001 to $100000 range
nlsy97.loc[nlsy97['S8496500'].isin([-1,-2]) & (nlsy97['S8496600']==6),'earn05'] = 175000 # $100001 to $250000 range
nlsy97.loc[nlsy97['S8496500'].isin([-1,-2]) & (nlsy97['S8496600']==7),'earn05'] = 250000 # $100001 to $250000 range

# Keep track of respondents reporting only an earnings range
nlsy97['earn05_range'] = np.nan
nlsy97.loc[~pd.isna(nlsy97['earn05']),'earn05_range'] = 0
nlsy97.loc[nlsy97['S8496600']>=1,'earn05_range'] = 1

# Deflate earnings measures
nlsy97['earn05'] = nlsy97['earn05']*(cpi.loc[2022]['AVG']/cpi.loc[2005]['AVG'])

# Construct annual earnings measure 2006
# NOTE: Top-coded at $70,000 (lowest value of top 2%)
nlsy97['earn06'] = np.nan
nlsy97.loc[nlsy97['T0889800']>=0,'earn06'] = nlsy97['T0889800'][nlsy97['T0889800']>=0]   # Reports earnings
nlsy97.loc[pd.isna(nlsy97['earn06']) & (nlsy97['T0889600']==0),'earn06'] = 0             # Reports no earnings (#1)
nlsy97.loc[nlsy97['T0889600'].isin([-1,-2]) & (nlsy97['T0889700']==0),'earn06'] = 0      # Reports no earnings (#2)
nlsy97.loc[nlsy97['T0889800'].isin([-1,-2]) & (nlsy97['T0889900']==1),'earn06'] = 2500   # $1 to $5000 range
nlsy97.loc[nlsy97['T0889800'].isin([-1,-2]) & (nlsy97['T0889900']==2),'earn06'] = 7500   # $5001 to $10000 range
nlsy97.loc[nlsy97['T0889800'].isin([-1,-2]) & (nlsy97['T0889900']==3),'earn06'] = 17500  # $10001 to $25000 range
nlsy97.loc[nlsy97['T0889800'].isin([-1,-2]) & (nlsy97['T0889900']==4),'earn06'] = 37500  # $25001 to $50000 range
nlsy97.loc[nlsy97['T0889800'].isin([-1,-2]) & (nlsy97['T0889900']==5),'earn06'] = 75000  # $50001 to $100000 range
nlsy97.loc[nlsy97['T0889800'].isin([-1,-2]) & (nlsy97['T0889900']==6),'earn06'] = 175000 # $100001 to $250000 range
nlsy97.loc[nlsy97['T0889800'].isin([-1,-2]) & (nlsy97['T0889900']==7),'earn06'] = 250000 # $100001 to $250000 range

# Keep track of respondents reporting only an earnings range
nlsy97['earn06_range'] = np.nan
nlsy97.loc[~pd.isna(nlsy97['earn06']),'earn06_range'] = 0
nlsy97.loc[nlsy97['T0889900']>=1,'earn06_range'] = 1

# Deflate earnings measures
nlsy97['earn06'] = nlsy97['earn06']*(cpi.loc[2022]['AVG']/cpi.loc[2006]['AVG'])

# Construct annual earnings measure 2007
# NOTE: Top-coded at $74,900 (lowest value of top 2%)
nlsy97['earn07'] = np.nan
nlsy97.loc[nlsy97['T3003000']>=0,'earn07'] = nlsy97['T3003000'][nlsy97['T3003000']>=0]   # Reports earnings
nlsy97.loc[pd.isna(nlsy97['earn07']) & (nlsy97['T3002800']==0),'earn07'] = 0             # Reports no earnings (#1)
nlsy97.loc[nlsy97['T3002800'].isin([-1,-2]) & (nlsy97['T3002900']==0),'earn07'] = 0      # Reports no earnings (#2)
nlsy97.loc[nlsy97['T3003000'].isin([-1,-2]) & (nlsy97['T3003100']==1),'earn07'] = 2500   # $1 to $5000 range
nlsy97.loc[nlsy97['T3003000'].isin([-1,-2]) & (nlsy97['T3003100']==2),'earn07'] = 7500   # $5001 to $10000 range
nlsy97.loc[nlsy97['T3003000'].isin([-1,-2]) & (nlsy97['T3003100']==3),'earn07'] = 17500  # $10001 to $25000 range
nlsy97.loc[nlsy97['T3003000'].isin([-1,-2]) & (nlsy97['T3003100']==4),'earn07'] = 37500  # $25001 to $50000 range
nlsy97.loc[nlsy97['T3003000'].isin([-1,-2]) & (nlsy97['T3003100']==5),'earn07'] = 75000  # $50001 to $100000 range
nlsy97.loc[nlsy97['T3003000'].isin([-1,-2]) & (nlsy97['T3003100']==6),'earn07'] = 175000 # $100001 to $250000 range
nlsy97.loc[nlsy97['T3003000'].isin([-1,-2]) & (nlsy97['T3003100']==7),'earn07'] = 250000 # $100001 to $250000 range

# Keep track of respondents reporting only an earnings range
nlsy97['earn07_range'] = np.nan
nlsy97.loc[~pd.isna(nlsy97['earn07']),'earn07_range'] = 0
nlsy97.loc[nlsy97['T3003100']>=1,'earn07_range'] = 1

# Deflate earnings measures
nlsy97['earn07'] = nlsy97['earn07']*(cpi.loc[2022]['AVG']/cpi.loc[2007]['AVG'])

# Construct annual earnings measure 2008
# NOTE: Top-coded at $84,000 (lowest value of top 2%)
nlsy97['earn08'] = np.nan
nlsy97.loc[nlsy97['T4406000']>=0,'earn08'] = nlsy97['T4406000'][nlsy97['T4406000']>=0]   # Reports earnings
nlsy97.loc[pd.isna(nlsy97['earn08']) & (nlsy97['T4405800']==0),'earn08'] = 0             # Reports no earnings (#1)
nlsy97.loc[nlsy97['T4405800'].isin([-1,-2]) & (nlsy97['T4405900']==0),'earn08'] = 0      # Reports no earnings (#2)
nlsy97.loc[nlsy97['T4406000'].isin([-1,-2]) & (nlsy97['T4406100']==1),'earn08'] = 2500   # $1 to $5000 range
nlsy97.loc[nlsy97['T4406000'].isin([-1,-2]) & (nlsy97['T4406100']==2),'earn08'] = 7500   # $5001 to $10000 range
nlsy97.loc[nlsy97['T4406000'].isin([-1,-2]) & (nlsy97['T4406100']==3),'earn08'] = 17500  # $10001 to $25000 range
nlsy97.loc[nlsy97['T4406000'].isin([-1,-2]) & (nlsy97['T4406100']==4),'earn08'] = 37500  # $25001 to $50000 range
nlsy97.loc[nlsy97['T4406000'].isin([-1,-2]) & (nlsy97['T4406100']==5),'earn08'] = 75000  # $50001 to $100000 range
nlsy97.loc[nlsy97['T4406000'].isin([-1,-2]) & (nlsy97['T4406100']==6),'earn08'] = 175000 # $100001 to $250000 range
nlsy97.loc[nlsy97['T4406000'].isin([-1,-2]) & (nlsy97['T4406100']==7),'earn08'] = 250000 # $100001 to $250000 range

# Keep track of respondents reporting only an earnings range
nlsy97['earn08_range'] = np.nan
nlsy97.loc[~pd.isna(nlsy97['earn08']),'earn08_range'] = 0
nlsy97.loc[nlsy97['T4406100']>=1,'earn08_range'] = 1

# Deflate earnings measures
nlsy97['earn08'] = nlsy97['earn08']*(cpi.loc[2022]['AVG']/cpi.loc[2008]['AVG'])

# Construct annual earnings measure 2009
# NOTE: Top-coded at $88,000 (lowest value of top 2%)
nlsy97['earn09'] = np.nan
nlsy97.loc[nlsy97['T6055500']>=0,'earn09'] = nlsy97['T6055500'][nlsy97['T6055500']>=0]   # Reports earnings
nlsy97.loc[pd.isna(nlsy97['earn09']) & (nlsy97['T6055300']==0),'earn09'] = 0             # Reports no earnings (#1)
nlsy97.loc[nlsy97['T6055300'].isin([-1,-2]) & (nlsy97['T6055400']==0),'earn09'] = 0      # Reports no earnings (#2)
nlsy97.loc[nlsy97['T6055500'].isin([-1,-2]) & (nlsy97['T6055600']==1),'earn09'] = 2500   # $1 to $5000 range
nlsy97.loc[nlsy97['T6055500'].isin([-1,-2]) & (nlsy97['T6055600']==2),'earn09'] = 7500   # $5001 to $10000 range
nlsy97.loc[nlsy97['T6055500'].isin([-1,-2]) & (nlsy97['T6055600']==3),'earn09'] = 17500  # $10001 to $25000 range
nlsy97.loc[nlsy97['T6055500'].isin([-1,-2]) & (nlsy97['T6055600']==4),'earn09'] = 37500  # $25001 to $50000 range
nlsy97.loc[nlsy97['T6055500'].isin([-1,-2]) & (nlsy97['T6055600']==5),'earn09'] = 75000  # $50001 to $100000 range
nlsy97.loc[nlsy97['T6055500'].isin([-1,-2]) & (nlsy97['T6055600']==6),'earn09'] = 175000 # $100001 to $250000 range
nlsy97.loc[nlsy97['T6055500'].isin([-1,-2]) & (nlsy97['T6055600']==7),'earn09'] = 250000 # $100001 to $250000 range

# Keep track of respondents reporting only an earnings range
nlsy97['earn09_range'] = np.nan
nlsy97.loc[~pd.isna(nlsy97['earn09']),'earn09_range'] = 0
nlsy97.loc[nlsy97['T6055600']>=1,'earn09_range'] = 1

# Deflate earnings measures
nlsy97['earn09'] = nlsy97['earn09']*(cpi.loc[2022]['AVG']/cpi.loc[2009]['AVG'])


# Construct annual earnings measure 2010
# NOTE: Top-coded at $94,000 (lowest value of top 2%)
nlsy97['earn10'] = np.nan
nlsy97.loc[nlsy97['T7545600']>=0,'earn10'] = nlsy97['T7545600'][nlsy97['T7545600']>=0]   # Reports earnings
nlsy97.loc[pd.isna(nlsy97['earn10']) & (nlsy97['T7545400']==0),'earn10'] = 0             # Reports no earnings (#1)
nlsy97.loc[nlsy97['T7545400'].isin([-1,-2]) & (nlsy97['T7545500']==0),'earn10'] = 0      # Reports no earnings (#2)
nlsy97.loc[nlsy97['T7545600'].isin([-1,-2]) & (nlsy97['T7545700']==1),'earn10'] = 2500   # $1 to $5000 range
nlsy97.loc[nlsy97['T7545600'].isin([-1,-2]) & (nlsy97['T7545700']==2),'earn10'] = 7500   # $5001 to $10000 range
nlsy97.loc[nlsy97['T7545600'].isin([-1,-2]) & (nlsy97['T7545700']==3),'earn10'] = 17500  # $10001 to $25000 range
nlsy97.loc[nlsy97['T7545600'].isin([-1,-2]) & (nlsy97['T7545700']==4),'earn10'] = 37500  # $25001 to $50000 range
nlsy97.loc[nlsy97['T7545600'].isin([-1,-2]) & (nlsy97['T7545700']==5),'earn10'] = 75000  # $50001 to $100000 range
nlsy97.loc[nlsy97['T7545600'].isin([-1,-2]) & (nlsy97['T7545700']==6),'earn10'] = 175000 # $100001 to $250000 range
nlsy97.loc[nlsy97['T7545600'].isin([-1,-2]) & (nlsy97['T7545700']==7),'earn10'] = 250000 # $100001 to $250000 range

# Keep track of respondents reporting only an earnings range
nlsy97['earn10_range'] = np.nan
nlsy97.loc[~pd.isna(nlsy97['earn10']),'earn10_range'] = 0
nlsy97.loc[nlsy97['T7545700']>=1,'earn10_range'] = 1

# Deflate earnings measures
nlsy97['earn10'] = nlsy97['earn10']*(cpi.loc[2022]['AVG']/cpi.loc[2010]['AVG'])

# Construct annual earnings measure 2012
# NOTE: Top-coded at $112,000 (lowest value of top 2%)
nlsy97['earn12'] = np.nan
nlsy97.loc[nlsy97['T8976700']>=0,'earn12'] = nlsy97['T8976700'][nlsy97['T8976700']>=0]   # Reports earnings
nlsy97.loc[pd.isna(nlsy97['earn12']) & (nlsy97['T8976500']==0),'earn12'] = 0             # Reports no earnings (#1)
nlsy97.loc[nlsy97['T8976500'].isin([-1,-2]) & (nlsy97['T8976600']==0),'earn12'] = 0      # Reports no earnings (#2)
nlsy97.loc[nlsy97['T8976700'].isin([-1,-2]) & (nlsy97['T8976800']==1),'earn12'] = 2500   # $1 to $5000 range
nlsy97.loc[nlsy97['T8976700'].isin([-1,-2]) & (nlsy97['T8976800']==2),'earn12'] = 7500   # $5001 to $10000 range
nlsy97.loc[nlsy97['T8976700'].isin([-1,-2]) & (nlsy97['T8976800']==3),'earn12'] = 17500  # $10001 to $25000 range
nlsy97.loc[nlsy97['T8976700'].isin([-1,-2]) & (nlsy97['T8976800']==4),'earn12'] = 37500  # $25001 to $50000 range
nlsy97.loc[nlsy97['T8976700'].isin([-1,-2]) & (nlsy97['T8976800']==5),'earn12'] = 75000  # $50001 to $100000 range
nlsy97.loc[nlsy97['T8976700'].isin([-1,-2]) & (nlsy97['T8976800']==6),'earn12'] = 175000 # $100001 to $250000 range
nlsy97.loc[nlsy97['T8976700'].isin([-1,-2]) & (nlsy97['T8976800']==7),'earn12'] = 250000 # $100001 to $250000 range

# Keep track of respondents reporting only an earnings range
nlsy97['earn12_range'] = np.nan
nlsy97.loc[~pd.isna(nlsy97['earn12']),'earn12_range'] = 0
nlsy97.loc[nlsy97['T8976800']>=1,'earn12_range'] = 1

# Deflate earnings measures
nlsy97['earn12'] = nlsy97['earn12']*(cpi.loc[2022]['AVG']/cpi.loc[2012]['AVG'])

# Construct annual earnings measure 2014
# NOTE: Top-coded at $135,000 (lowest value of top 2%)
nlsy97['earn14'] = np.nan
nlsy97.loc[nlsy97['U0956900']>=0,'earn14'] = nlsy97['U0956900'][nlsy97['U0956900']>=0]   # Reports earnings
nlsy97.loc[pd.isna(nlsy97['earn14']) & (nlsy97['U0956700']==0),'earn14'] = 0             # Reports no earnings (#1)
nlsy97.loc[nlsy97['U0956700'].isin([-1,-2]) & (nlsy97['U0956800']==0),'earn14'] = 0      # Reports no earnings (#2)
nlsy97.loc[nlsy97['U0956900'].isin([-1,-2]) & (nlsy97['U0957000']==1),'earn14'] = 2500   # $1 to $5000 range
nlsy97.loc[nlsy97['U0956900'].isin([-1,-2]) & (nlsy97['U0957000']==2),'earn14'] = 7500   # $5001 to $10000 range
nlsy97.loc[nlsy97['U0956900'].isin([-1,-2]) & (nlsy97['U0957000']==3),'earn14'] = 17500  # $10001 to $25000 range
nlsy97.loc[nlsy97['U0956900'].isin([-1,-2]) & (nlsy97['U0957000']==4),'earn14'] = 37500  # $25001 to $50000 range
nlsy97.loc[nlsy97['U0956900'].isin([-1,-2]) & (nlsy97['U0957000']==5),'earn14'] = 75000  # $50001 to $100000 range
nlsy97.loc[nlsy97['U0956900'].isin([-1,-2]) & (nlsy97['U0957000']==6),'earn14'] = 175000 # $100001 to $250000 range
nlsy97.loc[nlsy97['U0956900'].isin([-1,-2]) & (nlsy97['U0957000']==7),'earn14'] = 250000 # $100001 to $250000 range

# Keep track of respondents reporting only an earnings range
nlsy97['earn14_range'] = np.nan
nlsy97.loc[~pd.isna(nlsy97['earn14']),'earn14_range'] = 0
nlsy97.loc[nlsy97['U0957000']>=1,'earn14_range'] = 1

# Deflate earnings measures
nlsy97['earn14'] = nlsy97['earn14']*(cpi.loc[2022]['AVG']/cpi.loc[2014]['AVG'])

# Construct annual earnings measure 2016
# NOTE: Top-coded at $149,000 (lowest value of top 2%)
nlsy97['earn16'] = np.nan
nlsy97.loc[nlsy97['U2857200']>=0,'earn16'] = nlsy97['U2857200'][nlsy97['U2857200']>=0]   # Reports earnings
nlsy97.loc[pd.isna(nlsy97['earn16']) & (nlsy97['U2857000']==0),'earn16'] = 0             # Reports no earnings (#1)
nlsy97.loc[nlsy97['U2857000'].isin([-1,-2]) & (nlsy97['U2857100']==0),'earn16'] = 0      # Reports no earnings (#2)
nlsy97.loc[nlsy97['U2857200'].isin([-1,-2]) & (nlsy97['U2857300']==1),'earn16'] = 2500   # $1 to $5000 range
nlsy97.loc[nlsy97['U2857200'].isin([-1,-2]) & (nlsy97['U2857300']==2),'earn16'] = 7500   # $5001 to $10000 range
nlsy97.loc[nlsy97['U2857200'].isin([-1,-2]) & (nlsy97['U2857300']==3),'earn16'] = 17500  # $10001 to $25000 range
nlsy97.loc[nlsy97['U2857200'].isin([-1,-2]) & (nlsy97['U2857300']==4),'earn16'] = 37500  # $25001 to $50000 range
nlsy97.loc[nlsy97['U2857200'].isin([-1,-2]) & (nlsy97['U2857300']==5),'earn16'] = 75000  # $50001 to $100000 range
nlsy97.loc[nlsy97['U2857200'].isin([-1,-2]) & (nlsy97['U2857300']==6),'earn16'] = 175000 # $100001 to $250000 range
nlsy97.loc[nlsy97['U2857200'].isin([-1,-2]) & (nlsy97['U2857300']==7),'earn16'] = 250000 # $100001 to $250000 range

# Keep track of respondents reporting only an earnings range
nlsy97['earn16_range'] = np.nan
nlsy97.loc[~pd.isna(nlsy97['earn16']),'earn16_range'] = 0
nlsy97.loc[nlsy97['U2857300']>=1,'earn16_range'] = 1

# Deflate earnings measures
nlsy97['earn16'] = nlsy97['earn16']*(cpi.loc[2022]['AVG']/cpi.loc[2016]['AVG'])

# Construct annual earnings measure 2018
# NOTE: Top-coded at $178,000 (lowest value of top 2%)
nlsy97['earn18'] = np.nan
nlsy97.loc[nlsy97['U4282300']>=0,'earn18'] = nlsy97['U4282300'][nlsy97['U4282300']>=0]   # Reports earnings
nlsy97.loc[pd.isna(nlsy97['earn18']) & (nlsy97['U4282100']==0),'earn18'] = 0             # Reports no earnings (#1)
nlsy97.loc[nlsy97['U4282100'].isin([-1,-2]) & (nlsy97['U4282200']==0),'earn18'] = 0      # Reports no earnings (#2)
nlsy97.loc[nlsy97['U4282300'].isin([-1,-2]) & (nlsy97['U4282400']==1),'earn18'] = 2500   # $1 to $5000 range
nlsy97.loc[nlsy97['U4282300'].isin([-1,-2]) & (nlsy97['U4282400']==2),'earn18'] = 7500   # $5001 to $10000 range
nlsy97.loc[nlsy97['U4282300'].isin([-1,-2]) & (nlsy97['U4282400']==3),'earn18'] = 17500  # $10001 to $25000 range
nlsy97.loc[nlsy97['U4282300'].isin([-1,-2]) & (nlsy97['U4282400']==4),'earn18'] = 37500  # $25001 to $50000 range
nlsy97.loc[nlsy97['U4282300'].isin([-1,-2]) & (nlsy97['U4282400']==5),'earn18'] = 75000  # $50001 to $100000 range
nlsy97.loc[nlsy97['U4282300'].isin([-1,-2]) & (nlsy97['U4282400']==6),'earn18'] = 175000 # $100001 to $250000 range
nlsy97.loc[nlsy97['U4282300'].isin([-1,-2]) & (nlsy97['U4282400']==7),'earn18'] = 250000 # $100001 to $250000 range

# Keep track of respondents reporting only an earnings range
nlsy97['earn18_range'] = np.nan
nlsy97.loc[~pd.isna(nlsy97['earn18']),'earn18_range'] = 0
nlsy97.loc[nlsy97['U4282400']>=1,'earn18_range'] = 1

# Deflate earnings measures
nlsy97['earn18'] = nlsy97['earn18']*(cpi.loc[2022]['AVG']/cpi.loc[2018]['AVG'])

In [9]:
nlsy97[['earn97','earn97_range','earn98','earn98_range','earn99','earn99_range', \
        'earn00','earn00_range','earn01','earn01_range','earn02','earn02_range', \
        'earn03','earn03_range','earn04','earn04_range','earn05','earn05_range', \
        'earn06','earn06_range','earn07','earn07_range','earn08','earn08_range',\
        'earn09','earn09_range','earn10','earn10_range','earn12','earn12_range',\
        'earn14','earn14_range','earn16','earn16_range','earn18','earn18_range']].describe()

Unnamed: 0,earn97,earn97_range,earn98,earn98_range,earn99,earn99_range,earn00,earn00_range,earn01,earn01_range,...,earn10,earn10_range,earn12,earn12_range,earn14,earn14_range,earn16,earn16_range,earn18,earn18_range
count,8341.0,8341.0,8144.0,8144.0,8002.0,8002.0,7816.0,7816.0,7792.0,7792.0,...,7354.0,7354.0,7065.0,7065.0,7003.0,7003.0,6644.0,6645.0,6862.0,6862.0
mean,1542.851499,0.061863,2944.712002,0.093197,5228.377046,0.127343,7958.254153,0.111182,10741.705502,0.186473,...,35347.656072,0.07207,38902.618388,0.077849,42885.315082,0.102242,48576.823031,0.055681,54126.821427,0.040367
std,3519.989258,0.240921,6472.282364,0.290727,8943.241954,0.333378,11577.131997,0.314378,14025.114265,0.389513,...,35814.116317,0.258621,39714.497691,0.267952,44619.46736,0.302988,50500.695596,0.229322,60689.188064,0.196833
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4047.842402,0.0,7684.77293,0.0,9307.592752,0.0,12230.725624,0.0,14028.718504,0.0
50%,0.0,0.0,90.196488,0.0,1413.595414,0.0,3417.821782,0.0,4153.831344,0.0,...,29684.177611,0.0,32019.887207,0.0,35989.358643,0.0,40361.394558,0.0,43255.215389,0.0
75%,1462.711864,0.0,3607.859532,0.0,5300.982801,0.0,11962.376238,0.0,15950.71236,0.0,...,51272.670419,0.0,56355.001484,0.0,61461.137475,0.0,67268.99093,0.0,73650.772148,0.0
max,68564.618644,1.0,67647.366221,1.0,66262.285012,1.0,128168.316832,1.0,290768.19407,1.0,...,236124.140088,1.0,320198.872069,1.0,310253.091746,1.0,305768.14059,1.0,383978.885126,1.0


In [10]:
nlsy97['avg_earn_2014_to_2018'] = nlsy97[['earn14','earn16','earn18']].mean(1)

In [11]:
nlsy97[['avg_earn_2014_to_2018', 'hgc_ever', 'asvab', 'female', 'black', 'hispanic']].dropna(how='any', axis=0).to_csv(data+'nlsy97ss.csv',index=True)