In [1]:
import pandas as pd
import numpy as np

In [2]:
csv_path = "/Users/alina/Documents/ECO 726_Policy/cwhsc_min.csv"

df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,white,byr,year,eligible,type,smplsz,var_cm,nomearn,cpi
0,0,51,81,0,TOTAL,44.968002,42235.441,12219.284,1.394
1,1,51,81,0,TAXAB,160.916,9900.9258,16528.439,1.394
2,0,52,81,0,TAXAB,33.0,27642.42,14171.673,1.394
3,1,51,81,0,ADJ,192.88499,12715.132,18027.689,1.394
4,1,51,81,0,TOTAL,184.011,28836.432,17668.795,1.394


In [3]:
#collapsing average of variables and reshaping to be wide
wide = df.pivot_table(
    index=["white", "byr", "year", "eligible"],
    columns="type",
    values="nomearn"
).reset_index()

wide.head()

type,white,byr,year,eligible,ADJ,TAXAB,TOTAL
0,0,50,81,0,14256.892,13704.102,14241.764
1,0,50,81,1,14910.902,14238.623,15044.354
2,0,50,82,0,15560.586,14846.608,16141.942
3,0,50,82,1,15896.007,15131.768,16467.982
4,0,50,83,0,16639.266,16189.537,17626.727


In [4]:
#separating elgible and noneligible 
wide_eligible = wide[wide["eligible"]==1].set_index(["white", "byr", "year"])
wide_noneligible = wide[wide["eligible"]==0].set_index(["white", "byr", "year"])

#finding difference between eligible and noneligible
diff = wide_eligible[["ADJ", "TAXAB", "TOTAL"]] - wide_noneligible[["ADJ", "TAXAB", "TOTAL"]]

#renaming columns for easier calculation later
diff = diff.rename(columns={
    "TAXAB": "FICA effects",           #column 1, FICA earnings
    "ADJ": "AdjFICA effects",          #column 2, Adjusted FICA earnings
    "TOTAL": "TotalW2 effects"         #column 3, total W-2 earnings
}).reset_index()

diff.head()

type,white,byr,year,AdjFICA effects,FICA effects,TotalW2 effects
0,0,50,81,654.01,534.521,802.59
1,0,50,82,335.421,285.16,326.04
2,0,50,83,169.08,96.066,315.48
3,0,50,84,-65.064,-76.869,-287.444
4,0,51,81,450.563881,371.793116,393.121123


In [5]:
#averaging CPI for birth cohorts and years
cpi_by = (
    df.groupby(["byr", "year"])["cpi"]
    .mean()
    .reset_index()
    .rename(columns={"cpi": "cpi_year"})
)

#putting CPI back into table
diff = diff.merge(cpi_by, on=["byr", "year"], how="left")

#estimating the first stage estimates (p^e-p^n, column 4)
sippp_map = {50: 0.159, 51: 0.136, 52: 0.105}
diff["p^e-p^n"] = diff["byr"].map(sippp_map)

diff.head()

Unnamed: 0,white,byr,year,AdjFICA effects,FICA effects,TotalW2 effects,cpi_year,p^e-p^n
0,0,50,81,654.01,534.521,802.59,1.394,0.159
1,0,50,82,335.421,285.16,326.04,1.48,0.159
2,0,50,83,169.08,96.066,315.48,1.527,0.159
3,0,50,84,-65.064,-76.869,-287.444,1.592,0.159
4,0,51,81,450.563881,371.793116,393.121123,1.394,0.136


In [6]:
#math for wald estimator (column 5)
diff["Service Effect 1978"] = diff["AdjFICA effects"] / (diff["p^e-p^n"] * diff["cpi_year"])

diff.head()

Unnamed: 0,white,byr,year,AdjFICA effects,FICA effects,TotalW2 effects,cpi_year,p^e-p^n,Service Effect 1978
0,0,50,81,654.01,534.521,802.59,1.394,0.159,2950.695943
1,0,50,82,335.421,285.16,326.04,1.48,0.159,1425.382458
2,0,50,83,169.08,96.066,315.48,1.527,0.159,696.395695
3,0,50,84,-65.064,-76.869,-287.444,1.592,0.159,-257.039917
4,0,51,81,450.563881,371.793116,393.121123,1.394,0.136,2376.592166


In [7]:
#replicating table 3 from Anrgist
#keeping whites only 
table3 = diff[diff["white"] == 1].copy()

#dropping CPI and white so it's not in final table
table3 = table3.drop(columns=["cpi_year"])
table3 = table3.drop(columns=["white"])

#renaming columns to match with paper
table3 = table3.rename(columns={
    "byr": "Cohort",
    "year": "Year",
    "FICA effects": "FICA Earnings",
    "AdjFICA effects": "Adjusted FICA Earnings",
    "TotalW2 effects": "Total W-2 Earnings",
    "p^e-p^n": "p^e-p^n",
    "Service Effect 1978": "Service Effect in 1978 $"
})

#sorting by year and cohort to match paper 
table3 = table3.sort_values(["Cohort", "Year"])

#rounding decimals to only include one point 
table3 = table3.round(1)
table3

Unnamed: 0,Cohort,Year,Adjusted FICA Earnings,FICA Earnings,Total W-2 Earnings,p^e-p^n,Service Effect in 1978 $
12,50,81,-487.8,-435.8,-589.7,0.2,-2200.9
13,50,82,-396.1,-320.2,-305.5,0.2,-1683.2
14,50,83,-450.1,-349.6,-512.9,0.2,-1853.9
15,50,84,-638.8,-484.4,-1143.3,0.2,-2523.4
16,51,81,-419.4,-346.8,-63.2,0.1,-2212.4
17,51,82,-277.5,-115.4,-101.5,0.1,-1378.8
18,51,83,-444.1,-299.4,-891.4,0.1,-2138.6
19,51,84,-572.2,-391.8,-825.4,0.1,-2642.7
20,52,81,-383.9,-333.5,-431.5,0.1,-2622.6
21,52,82,-268.0,-249.8,-522.7,0.1,-1724.9


In [9]:
table3.to_latex(
    "table3_python.tex",
    index=False,
    float_format="%.1f",
    caption="Table 3 - Wald Estimates",
    label="tab:wald",
    longtable=False,
    escape=False
)
