In [54]:
import pandas as pd
import numpy as np

# Load the Angrist extract and, in the same step, discard the columns that are
# generated later (e.g. education, wage), the ones that are directly correlated
# with those columns (year of birth), and the irrelevant ones (census).
df = pd.read_stata("angrist.dta").drop(
    columns=['v2', 'v4', 'v8', 'v9', 'v16', 'v22', 'v27']
)

# helper function to generate synthetic data
def generate_synthetic_data(df, n, rng=None):
    """Build a synthetic frame by resampling each column's empirical distribution.

    Every column of ``df`` is sampled on its own: the distinct observed values
    are drawn with replacement, weighted by their relative frequency. Because
    columns are drawn independently, the marginal distribution of each column
    is reproduced but relationships *between* columns are not.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. Missing values are ignored when the frequencies are
        computed (``value_counts`` drops them by default).
    n : int
        Number of synthetic rows to generate.
    rng : None, int or numpy.random.Generator, optional
        Source of randomness. ``None`` (the default) keeps the historical
        behaviour of drawing from NumPy's global ``np.random`` state, so
        ``np.random.seed(...)`` is still honoured. An int seed or a
        ``Generator`` makes the call reproducible without touching global
        state.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with the same column labels, in the same order, as ``df``.

    Raises
    ------
    ValueError
        If ``n > 0`` and a column contains no non-missing values to sample.
    """
    # np.random (module) and Generator both expose choice(a, size=..., p=...).
    sampler = np.random if rng is None else np.random.default_rng(rng)
    synthetic_data = {}

    for col in df.columns:
        value_counts = df[col].value_counts(normalize=True)  # relative frequencies
        if value_counts.empty and n > 0:
            raise ValueError(
                f"column {col!r} has no non-missing values to sample from"
            )
        values = value_counts.index.tolist()   # distinct observed values
        probabilities = value_counts.values    # matching probabilities (sum to 1)

        synthetic_data[col] = sampler.choice(values, size=n, p=probabilities)

    return pd.DataFrame(synthetic_data)

# Draw 100,000 synthetic rows from the loaded frame and preview the first few.
synthetic_df = generate_synthetic_data(df, 100_000)
synthetic_df.head()

Unnamed: 0,v1,v3,v5,v6,v7,v10,v11,v12,v13,v14,v15,v17,v18,v19,v20,v21,v23,v24,v25,v26
0,44,2,0,0,16,1,1,0,0,0,1,17,3,0,0,0,52,0,0,0
1,44,2,1,0,14,1,0,1,0,0,0,26,4,0,1,0,45,0,0,0
2,32,2,0,0,14,1,0,0,0,0,1,34,1,0,0,0,52,0,0,0
3,42,2,0,0,11,0,0,0,0,0,0,37,4,0,1,0,52,0,0,0
4,44,2,0,0,14,1,1,0,0,0,1,39,3,0,0,0,52,0,0,0


In [47]:
def generate_synthetic_data(df, num_samples=1000, rng=None):
    """Build a synthetic frame by resampling each column's empirical distribution.

    Every column of ``df`` is sampled on its own: the distinct observed values
    are drawn with replacement, weighted by their relative frequency. Because
    columns are drawn independently, the marginal distribution of each column
    is reproduced but relationships *between* columns are not.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. Missing values are ignored when the frequencies are
        computed (``value_counts`` drops them by default).
    num_samples : int, default 1000
        Number of synthetic rows to generate.
    rng : None, int or numpy.random.Generator, optional
        Source of randomness. ``None`` (the default) keeps the historical
        behaviour of drawing from NumPy's global ``np.random`` state, so
        ``np.random.seed(...)`` is still honoured. An int seed or a
        ``Generator`` makes the call reproducible without touching global
        state.

    Returns
    -------
    pandas.DataFrame
        ``num_samples`` rows with the same column labels, in the same order,
        as ``df``.

    Raises
    ------
    ValueError
        If ``num_samples > 0`` and a column contains no non-missing values.
    """
    # np.random (module) and Generator both expose choice(a, size=..., p=...).
    sampler = np.random if rng is None else np.random.default_rng(rng)
    synthetic_data = {}

    for col in df.columns:
        value_counts = df[col].value_counts(normalize=True)  # relative frequencies
        if value_counts.empty and num_samples > 0:
            raise ValueError(
                f"column {col!r} has no non-missing values to sample from"
            )
        values = value_counts.index.tolist()   # distinct observed values
        probabilities = value_counts.values    # matching probabilities (sum to 1)

        synthetic_data[col] = sampler.choice(values, size=num_samples, p=probabilities)

    return pd.DataFrame(synthetic_data)

# Draw 100,000 synthetic rows from the loaded frame and preview the first few.
synthetic_df = generate_synthetic_data(df, num_samples=100_000)
synthetic_df.head()

Unnamed: 0,v3,v5,v6,v7,v10,v11,v12,v13,v14,v15,v17,v19,v20,v21,v23,v24,v25,v26
0,2,0,0,14,1,0,1,0,0,0,47,0,0,0,20,0,0,0
1,2,0,0,15,1,0,0,0,0,0,49,0,0,0,52,0,0,0
2,2,0,0,14,0,0,0,0,0,0,27,0,0,0,52,0,0,0
3,2,0,0,10,1,0,0,0,0,0,42,0,0,0,5,0,0,1
4,2,0,0,18,1,1,0,0,0,0,36,1,1,0,17,0,0,0


In [48]:
# Sanity check: print the same four summary statistics for the synthetic frame
# first and the original frame second, so the two can be compared by eye.
stats_rows = ['mean', 'std', 'min', 'max']
for frame in (synthetic_df, df):
    summary = frame.describe().loc[stats_rows]
    print(summary)

            v3        v5        v6         v7       v10       v11       v12  \
mean  1.880000  0.201770  0.063320  15.022690  0.843960  0.165250  0.048410   
std   0.565122  0.401323  0.243539   3.236044  0.362895  0.371408  0.214632   
min   0.000000  0.000000  0.000000   0.000000  0.000000  0.000000  0.000000   
max   3.000000  1.000000  1.000000  22.000000  1.000000  1.000000  1.000000   

           v13      v14       v15        v17       v19      v20       v21  \
mean  0.055870  0.00658  0.128780  30.775940  0.082340  0.21340  0.165300   
std   0.229672  0.08085  0.334958  14.664968  0.274883  0.40971  0.371453   
min   0.000000  0.00000  0.000000   1.000000  0.000000  0.00000  0.000000   
max   1.000000  1.00000  1.000000  99.000000  1.000000  1.00000  1.000000   

           v23       v24       v25       v26  
mean  38.63186  0.075390  0.094170  0.149830  
std   19.93551  0.264021  0.292067  0.365161  
min    0.00000  0.000000  0.000000 -1.000000  
max   52.00000  1.000000  1.00

In [49]:
# Relabel the synthetic covariates as x2, x3, ... (x1 is used by the simulated
# frame they are concatenated with). The number of labels is taken from
# synthetic_df itself rather than from `df`: `df` is rebound to a 4-column
# simulated frame in other cells, which made this assignment fail with a
# length mismatch whenever the cell was re-run afterwards.
synthetic_df.columns = [f'x{i}' for i in range(2, len(synthetic_df.columns) + 2)]
synthetic_df

Unnamed: 0,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19
0,2,0,0,14,1,0,1,0,0,0,47,0,0,0,20,0,0,0
1,2,0,0,15,1,0,0,0,0,0,49,0,0,0,52,0,0,0
2,2,0,0,14,0,0,0,0,0,0,27,0,0,0,52,0,0,0
3,2,0,0,10,1,0,0,0,0,0,42,0,0,0,5,0,0,1
4,2,0,0,18,1,1,0,0,0,0,36,1,1,0,17,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2,0,0,12,1,0,0,0,0,0,8,1,0,0,52,0,0,0
99996,2,0,0,11,1,0,0,0,0,0,39,0,0,0,49,0,0,0
99997,2,0,1,14,1,0,1,0,0,0,36,0,0,0,52,0,1,0
99998,1,0,0,11,1,0,0,0,0,0,45,0,0,0,5,0,0,0


In [51]:
N = 100000  # sample size of the simulated data set

# Author's note: "maybe this works?" -- vanilla does not see the effect of
# compulsory schooling on wages.
# The four draws below are made in a fixed order (u1, v1, qob, age) so that a
# seeded run consumes the random stream identically.
u1 = np.random.normal(loc=0, scale=1, size=N)     # education shock
v1 = np.random.normal(loc=0, scale=1, size=N)     # wage shock
qob = np.random.randint(low=1, high=5, size=N)    # qob: integers 1..4 (high is exclusive)
age = np.random.randint(low=30, high=51, size=N)  # age: integers 30..50


# Data-generating process: qob shifts education, education shifts wages with
# a coefficient of 3.5.
educ = 10 + 10 * qob + u1
wage = 5 + 3.5 * educ + v1

# Assemble the frame with the y/x/z/t column naming used downstream
df = pd.DataFrame(dict(
    y1=wage,   # outcome: wages
    x1=age,    # other covariate: age
    z1=qob,    # instrument
    t1=educ,   # treatment: education
))

In [52]:
# Put the simulated y/x/z/t columns and the synthetic covariates side by side;
# rows are matched on the index of the two frames.
combined_df = pd.concat((df, synthetic_df), axis="columns")
print(combined_df)

               y1  x1  z1         t1  x2  x3  x4  x5  x6  x7  ...  x10  x11  \
0      136.261940  31   3  37.874034   2   0   0  14   1   0  ...    0    0   
1      104.398756  39   2  28.590830   2   0   0  15   1   0  ...    0    0   
2      142.348621  45   3  39.689950   2   0   0  14   0   0  ...    0    0   
3       75.545057  39   1  19.618804   2   0   0  10   1   0  ...    0    0   
4       73.887589  48   1  19.853521   2   0   0  18   1   1  ...    0    0   
...           ...  ..  ..        ...  ..  ..  ..  ..  ..  ..  ...  ...  ...   
99995  116.102439  31   2  31.524157   2   0   0  12   1   0  ...    0    0   
99996   74.339567  33   1  20.025006   2   0   0  11   1   0  ...    0    0   
99997  108.541587  46   2  29.323880   2   0   1  14   1   0  ...    0    0   
99998  102.210638  34   2  27.805130   1   0   0  11   1   0  ...    0    0   
99999  150.446263  50   3  41.503265   3   0   0  11   1   0  ...    0    0   

       x12  x13  x14  x15  x16  x17  x18  x19  
0  

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from mliv.inference import Vanilla2SLS
from mliv.utils import CausalDataset

# train_test_split, Vanilla2SLS and CausalDataset are used below but were only
# imported by cells further down the notebook, so this cell raised NameError
# on a fresh kernel. Importing them here makes the cell self-contained.

# Seed for reproducibility -- currently disabled; uncomment to get the same
# draws (and therefore the same estimate) on every run.
# np.random.seed(42)

# Simulate data
N = 100000  # Number of observations

# maybe this works? vanilla does not see the effect of compulsory schooling on wages
u1 = np.random.normal(0, 1, N) # educ shock
v1 = np.random.normal(0, 1, N)  # wage shock
qob = np.random.randint(1, 5, size=N)    # integers 1..4 (upper bound is exclusive)
age = np.random.randint(30, 41, size=N)  # integers 30..40


# Data-generating process: qob shifts education; education shifts wages with a
# coefficient of 3.5, which is the value the estimate at the end of the cell
# can be compared against.
educ = 10 + 10 * qob + u1
wage = 5 + 3.5 * educ + v1

# Create DataFrame with labeled columns
df = pd.DataFrame({
    'y1': wage,        # Outcome: wages
    'x1': age,         # Other covariate: age
    'z1': qob,         # Instrument
    't1': educ         # Treatment: education
})

# Save DataFrame to CSV
df.to_csv('generated_data.csv', index=False)

# Reload from disk and split 70% / 15% / 15% into train / validation / test
# (30% is held out first, then halved).
df = pd.read_csv("generated_data.csv")
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
train_df.to_csv("train.csv", index=False)
val_df.to_csv("valid.csv", index=False)
test_df.to_csv("test.csv", index=False)

# NOTE(review): CausalDataset presumably picks up train.csv / valid.csv /
# test.csv from the given directory -- confirm against the mliv documentation.
data = CausalDataset('./')

model = Vanilla2SLS()
model.fit(data)
ITE = model.predict(data.train)
ATE,_ = model.ATE(data.train)

ATE


Run -1-th experiment for Vanilla2SLS. 
End. --------------------


3.4997517247474366

In [9]:
from mliv.inference import Vanilla2SLS
from mliv.utils import CausalDataset
from mliv.inference import Poly2SLS

In [16]:
from sklearn.model_selection import train_test_split

# Reload the generated data and split it 70% / 15% / 15%: 30% is held out
# first, then that hold-out is halved into validation and test.
df = pd.read_csv("generated_data.csv")
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Write the three splits, in the same order as before, without the index.
for csv_name, split_frame in (
    ("train.csv", train_df),
    ("valid.csv", val_df),
    ("test.csv", test_df),
):
    split_frame.to_csv(csv_name, index=False)

# Build the dataset from the current directory and fit the vanilla 2SLS model.
data = CausalDataset('./')
model = Vanilla2SLS()
model.fit(data)

# Predictions and the ATE are both computed on the training split.
ITE = model.predict(data.train)
ATE, _ = model.ATE(data.train)

ATE

Run -1-th experiment for Vanilla2SLS. 
End. --------------------


3.1659505273539454

In [None]:
### vanilla 2sls works, simple model?
u1 = np.random.normal(0, 1, N) # educ shock
v1 = np.random.normal(0, 1, N)  # wage shock
qob = np.random.randint(1, 5, size=N) 
age = np.random.randint(30, 41, size=N)

educ = 10 + 5 * qob + u1
wage = 5 + 3.5 * educ + v1

# weak instrument test, vanilla 2sls doesn't work
educ = 10 + 0.01 * qob + u1
wage = 5 + 3.5 * educ + v1

In [55]:
# Raw random draws for the next simulation. Relies on `N` from an earlier cell.
# NOTE(review): epsilon and nu are not referenced by any of the cells shown
# after this one -- confirm whether they are still needed.
epsilon = np.random.normal(0, 1, N)  # wage shock
nu = np.random.normal(0, 1, N) # educ shock

# x1 = np.random.normal(0, 1, N) # age

# t1 is a Binomial(10 trials, p=0.5) count, i.e. an integer between 0 and 10.
t1 = np.random.binomial(10, 0.5, N)  # compulsory
# z1 is labelled qob but is drawn as a continuous standard normal here, unlike
# the integer qob used in the earlier simulation cells.
z1 = np.random.normal(0, 1, N) # qob
e = np.random.normal(0, 1, N)  # additional standard-normal noise term

In [None]:
# Two alternative data-generating processes are written out below. Variant B
# overwrites variant A, so only variant B is saved and estimated.
# Relies on t1, z1 and e from an earlier cell, and on LinearRegression being
# imported by another cell.

# Variant A -- gives nothing
x1 = e + 1.5 * t1 + 0.5 * z1  #educ
y1 = 10 + 3 * x1

# Variant B -- gives 1.5
# Here y1 depends on t1 only (coefficient 1.5) and not on x1 at all, so a
# schooling coefficient of 0 and a treatment coefficient of 1.5 are what the
# regressions below should return.
x1 = 3 + 1.5 * t1 + 0.5 * z1  #educ
y1 = 10 + 3 + 1.5 * t1


# Create DataFrame with labeled columns
df = pd.DataFrame({
    'y1': y1,        # Outcome: Wages
    'x1': x1,        # Schooling (educ), the regressor predicted in stage 1
    'z1': z1,        # Instrument
    't1': t1         # Treatment
})

# Save DataFrame to CSV
df.to_csv('generated_data.csv', index=False)
print("Data saved to 'generated_data.csv'")

# ------------------------- 2SLS Implementation -------------------------
# Stage 1: Regress schooling (x1) on the instrument (z1) and the treatment (t1)
X_first_stage = np.column_stack((z1, t1))  # IV and Treatment as predictors
first_stage_model = LinearRegression()
first_stage_model.fit(X_first_stage, x1)
x1_hat = first_stage_model.predict(X_first_stage)  # Predicted schooling

# Stage 2: Regress outcome (y1) on predicted schooling (x1_hat) and treatment (t1)
X_second_stage = np.column_stack((x1_hat, t1))  # Predicted schooling + Treatment
second_stage_model = LinearRegression()
second_stage_model.fit(X_second_stage, y1)
# Only the fitted coefficients are read; this manual two-step procedure does
# not compute standard errors.
beta_s_2sls = second_stage_model.coef_[0]  # Effect of schooling on wages (from 2SLS)
beta_t_2sls = second_stage_model.coef_[1]  # Effect of treatment on wages (from 2SLS)

# Print 2SLS estimates
print(f"Estimated effect of schooling (2SLS): {beta_s_2sls:.4f}")
print(f"Estimated effect of treatment (2SLS): {beta_t_2sls:.4f}")

Data saved to 'generated_data.csv'
Estimated effect of schooling (2SLS): -0.0000
Estimated effect of treatment (2SLS): 1.5000
