In [16]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
from linearmodels.iv import IV2SLS

In [17]:
file_path = r"C:\Users\jeffr\Downloads\NEW7080.dta"
df = pd.read_stata(file_path)

In [18]:
rename_dict = {
    'v1': 'AGE', 'v2': 'AGEQ', 'v4': 'EDUC', 'v5': 'ENOCENT', 'v6': 'ESOCENT',
    'v9': 'LWKLYWGE', 'v10': 'MARRIED', 'v11': 'MIDATL', 'v12': 'MT',
    'v13': 'NEWENG', 'v16': 'CENSUS', 'v18': 'QOB', 'v19': 'RACE',
    'v20': 'SMSA', 'v21': 'SOATL', 'v24': 'WNOCENT', 'v25': 'WSOCENT', 'v27': 'YOB'
}
df.rename(columns=rename_dict, inplace=True)

In [19]:
df.loc[df['YOB'] >= 1900, 'YOB'] -= 1900
for i in range(10):
    df[f'YR{i}'] = ((df['YOB'] == 20+i) | (df['YOB'] == 30+i) | (df['YOB'] == 40+i)).astype(int)

In [20]:
for i in range(1, 5):
    df[f'QTR{i}'] = (df['QOB'] == i).astype(int)

In [21]:
for j in range(1, 4):
    for i in range(10):
        df[f'QTR{j}YR{i}'] = df[f'QTR{j}'] * df[f'YR{i}']

In [22]:
df = df.drop(columns=['QTR3YR7', 'QTR3YR9'])

In [23]:
df['COHORT'] = 2029
df.loc[(df['YOB'] <= 39) & (df['YOB'] >= 30), 'COHORT'] = 3039
df.loc[(df['YOB'] <= 49) & (df['YOB'] >= 40), 'COHORT'] = 4049
df.loc[df['CENSUS'] == 80, 'AGEQ'] -= 1900
df['AGEQSQ'] = df['AGEQ'] ** 2

In [24]:
df = df[df['COHORT'] < 2030]

In [25]:
# Regressions
models = {}

In [26]:
# Model 1
X = sm.add_constant(df[['EDUC'] + [f'YR{i}' for i in range(9)]])
y = df['LWKLYWGE']
models['model1'] = sm.OLS(y, X).fit()

In [39]:
# Define variables for IV regression
y = df['LWKLYWGE']
X = df[['YR0', 'YR1', 'YR2', 'YR3', 'YR4', 'YR5', 'YR6', 'YR7', 'YR8']]
Z = df[['QTR1YR0', 'QTR1YR1', 'QTR1YR2', 'QTR1YR3', 'QTR1YR4', 'QTR1YR5', 'QTR1YR6', 'QTR1YR7', 'QTR1YR8', 'QTR1YR9',
          'QTR2YR0', 'QTR2YR1', 'QTR2YR2', 'QTR2YR3', 'QTR2YR4', 'QTR2YR5', 'QTR2YR6', 'QTR2YR7', 'QTR2YR8', 'QTR2YR9',
          'QTR3YR0', 'QTR3YR1', 'QTR3YR2', 'QTR3YR3', 'QTR3YR4', 'QTR3YR5', 'QTR3YR6',  'QTR3YR8']]

# Perform IV regression (2SLS)
iv_model2 = IV2SLS(y, X, None, Z).fit()
iv_model2

0,1,2,3
Dep. Variable:,LWKLYWGE,R-squared:,0.8857
Estimator:,IV-2SLS,Adj. R-squared:,0.8857
No. Observations:,247199,F-statistic:,1.387e+07
Date:,"Sat, Jul 06 2024",P-value (F-stat),0.0000
Time:,19:27:40,Distribution:,chi2(9)
Cov. Estimator:,robust,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
YR0,5.1323,0.0043,1191.4,0.0000,5.1238,5.1407
YR1,5.1498,0.0041,1253.1,0.0000,5.1417,5.1578
YR2,5.1439,0.0043,1204.5,0.0000,5.1355,5.1522
YR3,5.1540,0.0042,1213.9,0.0000,5.1457,5.1624
YR4,5.1598,0.0041,1247.4,0.0000,5.1517,5.1679
YR5,5.1629,0.0041,1253.4,0.0000,5.1549,5.1710
YR6,5.1682,0.0041,1267.1,0.0000,5.1602,5.1762
YR7,5.1626,0.0041,1259.9,0.0000,5.1546,5.1706
YR8,5.1663,0.0040,1278.6,0.0000,5.1584,5.1742


In [40]:
X3 = df[['EDUC', 'YR0', 'YR1', 'YR2', 'YR3', 'YR4', 'YR5', 'YR6', 'YR7', 'YR8', 'AGEQ', 'AGEQSQ']]
X3 = sm.add_constant(X3)
model3 = sm.OLS(df['LWKLYWGE'], X3).fit()

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1dfaaad31d0>

In [30]:
y = df['LWKLYWGE']
X = sm.add_constant(df[[f'YR{i}' for i in range(9)] + ['AGEQ', 'AGEQSQ']])


# Endogenous variable (EDUC)
endog = df['EDUC']

# Perform IV regression (2SLS)
iv_model4 = IV2SLS(y, X, endog, Z).fit()

# Print the summary of the IV regression model
print(iv_model4.summary)

                          IV-2SLS Estimation Summary                          
Dep. Variable:               LWKLYWGE   R-squared:                      0.1023
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1023
No. Observations:              247199   F-statistic:                    102.66
Date:                Sat, Jul 06 2024   P-value (F-stat)                0.0000
Time:                        19:22:50   Distribution:                 chi2(12)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          0.1338     1.6605     0.0806     0.9358     -3.1207      3.3882
YR0           -0.1134     0.0714    -1.5880     0.11

In [31]:
X5 = df[['EDUC', 'YR0', 'YR1', 'YR2', 'YR3', 'YR4', 'YR5', 'YR6', 'YR7', 'YR8', 'RACE', 'MARRIED', 'SMSA', 'NEWENG', 'MIDATL',
          'ENOCENT', 'WNOCENT', 'SOATL', 'ESOCENT', 'WSOCENT', 'MT']]
X5 = sm.add_constant(X5)
model5 = sm.OLS(df['LWKLYWGE'], X5).fit()

In [1]:
X_iv6 = df[[f'YR{i}' for i in range(9)] + ['RACE', 'MARRIED', 'SMSA', 'NEWENG', 'MIDATL', 'ENOCENT', 'WNOCENT', 'SOATL', 'ESOCENT', 'WSOCENT', 'MT']]
Z_iv6 = df[[f'QTR{j}YR{i}' for j in range(1, 4) for i in range(10)] + [f'YR{i}' for i in range(9)]]
Z_iv6 = sm.add_constant(Z_iv6)
models['model6'] = IV2SLS(y, X_iv6, Z_iv6, df['EDUC']).fit()

NameError: name 'df' is not defined

In [35]:
X7 = df[['EDUC', 'YR0', 'YR1', 'YR2', 'YR3', 'YR4', 'YR5', 'YR6', 'YR7', 'YR8', 'RACE', 'MARRIED', 'SMSA', 'NEWENG', 'MIDATL',
           'ENOCENT', 'WNOCENT', 'SOATL', 'ESOCENT', 'WSOCENT', 'MT', 'AGEQ', 'AGEQSQ']]
X7 = sm.add_constant(X7)
model7 = sm.OLS(df['LWKLYWGE'], X7).fit()

In [38]:
# Define the dependent variable
y = df['LWKLYWGE']

# Define the exogenous variables
exog_vars = ['YR0', 'YR1', 'YR2', 'YR3', 'YR4', 'YR5', 'YR6', 'YR7', 'YR8', 
             'RACE', 'MARRIED', 'SMSA', 'NEWENG', 'MIDATL', 'ENOCENT', 
             'WNOCENT', 'SOATL', 'ESOCENT', 'WSOCENT', 'MT', 'AGEQ', 'AGEQSQ']
X = sm.add_constant(df[exog_vars])

# Define the instrument variables
instr_vars = [f'QTR{j}YR{i}' for j in range(1, 4) for i in range(10)] + [f'YR{i}' for i in range(9)]


# Perform the IV regression
iv_model = IV2SLS(y, X, df['EDUC'], Z).fit()
iv_model

0,1,2,3
Dep. Variable:,LWKLYWGE,R-squared:,0.2065
Estimator:,IV-2SLS,Adj. R-squared:,0.2064
No. Observations:,247199,F-statistic:,2.883e+04
Date:,"Sat, Jul 06 2024",P-value (F-stat),0.0000
Time:,19:26:29,Distribution:,chi2(23)
Cov. Estimator:,robust,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,0.9994,1.6002,0.6246,0.5323,-2.1370,4.1359
YR0,-0.0680,0.0671,-1.0133,0.3109,-0.1994,0.0635
YR1,-0.0669,0.0614,-1.0906,0.2754,-0.1873,0.0534
YR2,-0.0660,0.0531,-1.2412,0.2145,-0.1701,0.0382
YR3,-0.0601,0.0472,-1.2718,0.2035,-0.1526,0.0325
YR4,-0.0525,0.0403,-1.3024,0.1928,-0.1316,0.0265
YR5,-0.0343,0.0323,-1.0605,0.2889,-0.0976,0.0291
YR6,-0.0248,0.0257,-0.9628,0.3357,-0.0752,0.0257
YR7,-0.0095,0.0166,-0.5735,0.5663,-0.0421,0.0230
