In [10]:
import pandas as pd
import numpy as np

# Columns left out of the synthetic covariates: ones that are generated later
# in the notebook (e.g. education, wage), ones directly correlated with those
# (year of birth), and irrelevant ones (census).
excluded_columns = ['v2', 'v4', 'v8', 'v9', 'v16', 'v22', 'v27']
df = pd.read_stata("angrist.dta").drop(columns=excluded_columns)

# helper function to generate synthetic data
def generate_synthetic_data(df, n, rng=None):
    """Build a synthetic frame by resampling every column independently.

    For each column of ``df`` the relative frequency of every observed value
    is computed, and ``n`` values are drawn from that empirical distribution
    with replacement. Columns are sampled one at a time, so each column's
    marginal distribution is reproduced but no dependence between columns is
    carried over.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are resampled.
    n : int
        Number of rows to generate.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state is
        used, so ``np.random.seed`` still controls the result.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with the same column labels, in the same order, as ``df``.

    Raises
    ------
    ValueError
        If rows are requested from a column that has no non-missing values.
    """
    # Both the np.random module and Generator/RandomState objects expose a
    # compatible ``choice(a, size=..., p=...)``.
    sampler = np.random if rng is None else rng
    synthetic_data = {}

    for col in df.columns:
        # value_counts drops missing values by default, so missing entries are
        # never sampled into the synthetic column.
        value_counts = df[col].value_counts(normalize=True)
        if value_counts.empty and n > 0:
            raise ValueError(
                f"column {col!r} has no non-missing values to sample from"
            )

        synthetic_data[col] = sampler.choice(
            value_counts.index.tolist(),
            size=n,
            p=value_counts.values,
        )

    return pd.DataFrame(synthetic_data)

# Draw 100,000 synthetic rows from the filtered frame and preview the first five.
synthetic_df = generate_synthetic_data(df=df, n=100_000)
synthetic_df.head(5)

Unnamed: 0,v1,v3,v5,v6,v7,v10,v11,v12,v13,v14,v15,v17,v18,v19,v20,v21,v23,v24,v25,v26
0,42,2,0,0,11,0,0,1,0,0,0,26,3,0,0,0,48,0,0,0
1,47,2,1,0,20,1,1,0,0,0,0,41,4,0,0,1,35,1,0,0
2,33,2,0,0,18,1,0,0,0,0,0,47,3,0,0,0,52,0,0,0
3,42,2,0,0,14,1,0,0,0,0,0,48,3,0,1,0,5,1,0,1
4,40,3,0,0,18,0,0,0,0,0,1,36,3,0,0,0,5,0,0,0


In [11]:
# Sanity check: print the same four summary statistics for the synthetic frame
# and then for the original frame, so the two can be compared by eye.
stat_rows = ['mean', 'std', 'min', 'max']
for frame in (synthetic_df, df):
    summary = frame.describe().loc[stat_rows]
    print(summary)

             v1        v3        v5        v6         v7       v10      v11  \
mean  39.948790  1.883640  0.203950  0.064720  15.022500  0.843000  0.16472   
std    5.920983  0.565087  0.402934  0.246032   3.239234  0.363803  0.37093   
min   30.000000  0.000000  0.000000  0.000000   0.000000  0.000000  0.00000   
max   50.000000  3.000000  1.000000  1.000000  22.000000  1.000000  1.00000   

           v12       v13       v14       v15        v17       v18       v19  \
mean  0.048620  0.054780  0.006870  0.126400  30.718710  2.519370  0.081050   
std   0.215073  0.227551  0.082601  0.332301  14.776483  1.111727  0.272913   
min   0.000000  0.000000  0.000000  0.000000   1.000000  1.000000  0.000000   
max   1.000000  1.000000  1.000000  1.000000  99.000000  4.000000  1.000000   

           v20       v21        v23       v24       v25      v26  
mean  0.212920  0.167060  38.629130  0.077300  0.093970  0.15194  
std   0.409374  0.373031  19.939165  0.267068  0.291788  0.36690  
min   0

In [12]:
from sklearn.model_selection import train_test_split
import os

In [13]:
# basic test set
N = 100000
filepath = './basic/'

u1 = np.random.normal(0, 1, N)  # educ shock
v1 = np.random.normal(0, 1, N)  # wage shock
qob = np.random.randint(1, 5, size=N)    # instrument, uniform on {1, 2, 3, 4}
age = np.random.randint(30, 51, size=N)  # uniform on {30, ..., 50}

# straightforward, test set: ./basic/
educ = 10 + 1.5 * qob + u1
wage = 5 + 2.5 * educ + v1

df = pd.DataFrame({
    'y1': wage,   # Outcome: wages
    'x1': age,    # Covariate: age
    'z1': qob,    # Instrument
    't1': educ,   # Treatment: education
})

angrist = pd.read_stata("angrist.dta")
synth = generate_synthetic_data(angrist, N)
# Name the synthetic columns x2, x3, ... so they continue after 'x1' (age).
synth.columns = ['x' + str(i) for i in range(2, len(angrist.columns) + 2)]
# NOTE(review): `angrist` is re-read unfiltered here, so `synth` also contains
# the columns that the first cell drops before building `synthetic_df` --
# confirm whether they should be excluded here as well.

# Attach the `synth` frame drawn and renamed in this cell. The `synthetic_df`
# from the first cell keeps its original v* labels, so it must not be used here.
combined_df = pd.concat([df, synth], axis=1)

os.makedirs(filepath, exist_ok=True)
# 70% train; the remaining 30% is split evenly into validation and test.
train_df, temp_df = train_test_split(combined_df, test_size=0.3)
val_df, test_df = train_test_split(temp_df, test_size=0.5)
train_df.to_csv(filepath + "train.csv", index=False)
val_df.to_csv(filepath + "valid.csv", index=False)
test_df.to_csv(filepath + "test.csv", index=False)

In [14]:
# one weak instrument
N = 100000
filepath = './weak/'

u1 = np.random.normal(0, 1, N)  # educ shock
v1 = np.random.normal(0, 1, N)  # wage shock
qob = np.random.randint(1, 5, size=N)    # instrument, uniform on {1, 2, 3, 4}
age = np.random.randint(30, 51, size=N)  # uniform on {30, ..., 50}

# The instrument enters the first stage with a small coefficient (0.1).
educ = 10 + 0.1 * qob + u1
wage = 5 + 2.5 * educ + v1

df = pd.DataFrame({
    'y1': wage,   # Outcome: wages
    'x1': age,    # Covariate: age
    'z1': qob,    # Instrument
    't1': educ,   # Treatment: education
})

angrist = pd.read_stata("angrist.dta")
synth = generate_synthetic_data(angrist, N)
# Name the synthetic columns x2, x3, ... so they continue after 'x1' (age).
synth.columns = ['x' + str(i) for i in range(2, len(angrist.columns) + 2)]
# NOTE(review): `angrist` is re-read unfiltered here, so `synth` also contains
# the columns that the first cell drops before building `synthetic_df` --
# confirm whether they should be excluded here as well.

# Attach the `synth` frame drawn and renamed in this cell. The `synthetic_df`
# from the first cell keeps its original v* labels, so it must not be used here.
combined_df = pd.concat([df, synth], axis=1)

os.makedirs(filepath, exist_ok=True)
# 70% train; the remaining 30% is split evenly into validation and test.
train_df, temp_df = train_test_split(combined_df, test_size=0.3)
val_df, test_df = train_test_split(temp_df, test_size=0.5)
train_df.to_csv(filepath + "train.csv", index=False)
val_df.to_csv(filepath + "valid.csv", index=False)
test_df.to_csv(filepath + "test.csv", index=False)

In [16]:
# strong but endogenous instrument
N = 100000
filepath = './endog/'

u1 = np.random.normal(0, 1, N)  # educ shock
v1 = np.random.normal(0, 1, N)  # wage shock
g1 = np.random.normal(0, 1, N)  # 'health' shock
qob = np.random.randint(1, 5, size=N)    # instrument, uniform on {1, 2, 3, 4}
age = np.random.randint(30, 51, size=N)  # uniform on {30, ..., 50}

educ = 10 + 1.5 * qob + u1
# Arbitrary unobserved variable for the endogenous effect: it depends on the
# instrument and enters the wage equation, but is not written to the dataset.
health = 3 + 0.5 * qob + g1
wage = 5 + 2.5 * educ + health + v1

df = pd.DataFrame({
    'y1': wage,   # Outcome: wages
    'x1': age,    # Covariate: age
    'z1': qob,    # Instrument
    't1': educ,   # Treatment: education
})

angrist = pd.read_stata("angrist.dta")
synth = generate_synthetic_data(angrist, N)
# Name the synthetic columns x2, x3, ... so they continue after 'x1' (age).
synth.columns = ['x' + str(i) for i in range(2, len(angrist.columns) + 2)]
# NOTE(review): `angrist` is re-read unfiltered here, so `synth` also contains
# the columns that the first cell drops before building `synthetic_df` --
# confirm whether they should be excluded here as well.

# Attach the `synth` frame drawn and renamed in this cell. The `synthetic_df`
# from the first cell keeps its original v* labels, so it must not be used here.
combined_df = pd.concat([df, synth], axis=1)

os.makedirs(filepath, exist_ok=True)
# 70% train; the remaining 30% is split evenly into validation and test.
train_df, temp_df = train_test_split(combined_df, test_size=0.3)
val_df, test_df = train_test_split(temp_df, test_size=0.5)
train_df.to_csv(filepath + "train.csv", index=False)
val_df.to_csv(filepath + "valid.csv", index=False)
test_df.to_csv(filepath + "test.csv", index=False)

In [None]:
# many weak
N = 100000
filepath = './manyweak/'

angrist = pd.read_stata("angrist.dta")
synth = generate_synthetic_data(angrist, N)
# Name the synthetic columns x2, x3, ... so they continue after 'x1' (age).
synth.columns = ['x' + str(i) for i in range(2, len(angrist.columns) + 2)]
# NOTE(review): `angrist` is re-read unfiltered here, so `synth` also contains
# the columns that the first cell drops before building `synthetic_df` --
# confirm whether they should be excluded here as well.

u1 = np.random.normal(0, 1, N)  # educ shock
v1 = np.random.normal(0, 1, N)  # wage shock
qob = np.random.randint(1, 5, size=N)    # uniform on {1, 2, 3, 4}
age = np.random.randint(30, 51, size=N)  # uniform on {30, ..., 50}

df = pd.DataFrame({
    'u1': u1,
    'v1': v1,
    'x1': age,
    'z1': qob,
})

# Attach the `synth` frame drawn and renamed in this cell. The `synthetic_df`
# from the first cell has its own 'v1' column, which would collide with the
# wage shock 'v1' and leave two columns under one label.
combined_df = pd.concat([df, synth], axis=1)
combined_df['const'] = 1
# Each weight below is looked up by column label, so labels must be unique.
assert combined_df.columns.is_unique, "duplicate column names in combined_df"

# First stage: t1 = 10 * const + 1 * u1 + 0 * v1 + sum_j w_j * column_j, where
# every remaining column gets a random weight w_j = 0.1 + N(0, 1).
weights = {'v1': 0, 'const': 10, 'u1': 1}
for col in combined_df.columns:
    if col not in weights:
        weights[col] = 0.1 + np.random.normal(0, 1)

selected_columns = list(weights.keys())
weight_vector = np.array([weights[col] for col in selected_columns], dtype=float)
# One matrix-vector product instead of a Python-level loop over all N rows.
t1 = combined_df[selected_columns].to_numpy(dtype=float) @ weight_vector
combined_df['t1'] = t1

# wage = 5 + 2.5 educ + v1
weights = {'const': 5, 'v1': 1, 't1': 2.5}
selected_columns = ['const', 'v1', 't1']
weight_vector = np.array([weights[col] for col in selected_columns], dtype=float)
y1 = combined_df[selected_columns].to_numpy(dtype=float) @ weight_vector
combined_df['y1'] = y1

os.makedirs(filepath, exist_ok=True)
# 70% train; the remaining 30% is split evenly into validation and test.
train_df, temp_df = train_test_split(combined_df, test_size=0.3)
val_df, test_df = train_test_split(temp_df, test_size=0.5)
train_df.to_csv(filepath + "train.csv", index=False)
val_df.to_csv(filepath + "valid.csv", index=False)
test_df.to_csv(filepath + "test.csv", index=False)

In [None]:
# many weak, one strong
N = 100000
filepath = './manyweak1strong/'

angrist = pd.read_stata("angrist.dta")
synth = generate_synthetic_data(angrist, N)
# Name the synthetic columns x2, x3, ... so they continue after 'x1' (age).
synth.columns = ['x' + str(i) for i in range(2, len(angrist.columns) + 2)]
# NOTE(review): `angrist` is re-read unfiltered here, so `synth` also contains
# the columns that the first cell drops before building `synthetic_df` --
# confirm whether they should be excluded here as well.

u1 = np.random.normal(0, 1, N)  # educ shock
v1 = np.random.normal(0, 1, N)  # wage shock
qob = np.random.randint(1, 5, size=N)    # uniform on {1, 2, 3, 4}
age = np.random.randint(30, 51, size=N)  # uniform on {30, ..., 50}

df = pd.DataFrame({
    'u1': u1,
    'v1': v1,
    'x1': age,
    'z1': qob,
})

# Attach the `synth` frame drawn and renamed in this cell. The `synthetic_df`
# from the first cell has its own 'v1' column, which would collide with the
# wage shock 'v1' and leave two columns under one label.
combined_df = pd.concat([df, synth], axis=1)
combined_df['const'] = 1
# Each weight below is looked up by column label, so labels must be unique.
assert combined_df.columns.is_unique, "duplicate column names in combined_df"

# First stage: t1 = 10 * const + 1.5 * z1 + 1 * u1 + 0 * v1 + sum_j w_j * column_j,
# where every remaining column gets a random weight w_j = 0.1 + N(0, 1).
# Only z1 has a fixed coefficient (1.5).
weights = {'z1': 1.5, 'v1': 0, 'const': 10, 'u1': 1}
for col in combined_df.columns:
    if col not in weights:
        weights[col] = 0.1 + np.random.normal(0, 1)

selected_columns = list(weights.keys())
weight_vector = np.array([weights[col] for col in selected_columns], dtype=float)
# One matrix-vector product instead of a Python-level loop over all N rows.
t1 = combined_df[selected_columns].to_numpy(dtype=float) @ weight_vector
combined_df['t1'] = t1

# wage = 5 + 2.5 educ + v1
weights = {'const': 5, 'v1': 1, 't1': 2.5}
selected_columns = ['const', 'v1', 't1']
weight_vector = np.array([weights[col] for col in selected_columns], dtype=float)
y1 = combined_df[selected_columns].to_numpy(dtype=float) @ weight_vector
combined_df['y1'] = y1

os.makedirs(filepath, exist_ok=True)
# 70% train; the remaining 30% is split evenly into validation and test.
train_df, temp_df = train_test_split(combined_df, test_size=0.3)
val_df, test_df = train_test_split(temp_df, test_size=0.5)
train_df.to_csv(filepath + "train.csv", index=False)
val_df.to_csv(filepath + "valid.csv", index=False)
test_df.to_csv(filepath + "test.csv", index=False)

In [3]:
from mliv.inference import Vanilla2SLS
from mliv.utils import CausalDataset

In [19]:
# Load the dataset stored under ./manyweak/ (the directory the "many weak"
# cell writes train.csv / valid.csv / test.csv into).
data = CausalDataset('./manyweak/')

# Fit mliv's Vanilla2SLS estimator on the loaded dataset.
model = Vanilla2SLS()
model.fit(data)
# Predictions on the training split; `ITE` is not used again in this cell.
# NOTE(review): presumably individual treatment effects -- confirm against mliv.
ITE = model.predict(data.train)
# Only the first of the two values returned by model.ATE is kept.
# NOTE(review): presumably the average treatment effect estimate -- confirm
# what the discarded second value is.
ATE,_ = model.ATE(data.train)

# Last expression of the cell, so the notebook displays the value.
ATE

Run -1-th experiment for Vanilla2SLS. 
End. --------------------


1.462296907614629