### Load Data

In [6]:
import pandas as pd

# Column names applied to every file below; the files are read with
# header=None, so these names are assigned positionally.
columns = [
    "training",   # Treatment assignment indicator
    "age",        # Age of participant
    "education",  # Years of education
    "black",      # Indicator for whether the individual is black
    "hispanic",   # Indicator for whether the individual is hispanic
    "married",    # Indicator for whether the individual is married
    "no_degree",  # Indicator for whether the individual has no high-school diploma
    "re74",       # Real earnings in 1974, prior to study participation
    "re75",       # Real earnings in 1975, prior to study participation
    "re78",       # Real earnings in 1978, after study end
]

# Treated and control samples plus the PSID / CPS comparison groups.
file_names = [
    "http://www.nber.org/~rdehejia/data/nswre74_treated.txt",
    "http://www.nber.org/~rdehejia/data/nswre74_control.txt",
    "http://www.nber.org/~rdehejia/data/psid_controls.txt",
    "http://www.nber.org/~rdehejia/data/psid2_controls.txt",
    "http://www.nber.org/~rdehejia/data/psid3_controls.txt",
    "http://www.nber.org/~rdehejia/data/cps_controls.txt",
    "http://www.nber.org/~rdehejia/data/cps2_controls.txt",
    "http://www.nber.org/~rdehejia/data/cps3_controls.txt",
]

# sep=r"\s+" is the documented replacement for delim_whitespace=True,
# which is deprecated as of pandas 2.2.
files = [
    pd.read_csv(file_name, sep=r"\s+", header=None, names=columns)
    for file_name in file_names
]

# Stack all files into one frame with a fresh 0..n-1 index, then shuffle the
# rows reproducibly. sample() keeps each row's index label, so the index is
# no longer in sorted order afterwards.
lalonde = pd.concat(files, ignore_index=True)
lalonde = lalonde.sample(frac=1.0, random_state=42)  # Shuffle

print(lalonde.shape)
lalonde.head()

(22106, 10)


Unnamed: 0,training,age,education,black,hispanic,married,no_degree,re74,re75,re78
16827,0.0,26.0,13.0,0.0,0.0,0.0,0.0,58.778,50.12903,31.03226
5412,0.0,27.0,12.0,0.0,0.0,1.0,0.0,16297.18,13429.21,19562.14
15399,0.0,26.0,12.0,0.0,0.0,0.0,0.0,5217.527,3174.242,25564.67
13077,0.0,38.0,16.0,0.0,0.0,1.0,0.0,23713.01,9178.984,18814.41
2189,0.0,55.0,8.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [7]:
import warnings

from causalnex.structure import StructureModel
from causalnex.structure.notears import from_pandas

warnings.filterwarnings("ignore")  # silence warnings

# Learn a structure model from the data using causalnex's NOTEARS module.
# from_pandas returns the model itself, so no empty StructureModel() needs to
# be created beforehand.
sm = from_pandas(lalonde)

# Prune the learned graph with a 0.8 threshold, keeping only the stronger edges.
sm.remove_edges_below_threshold(0.8)

### DAG

In [12]:
from causalvis import DAG

In [13]:
# Build a causalvis DAG from the learned structure model `sm`.
dg = DAG(nx_graph=sm)
# Bare last expression so the notebook renders the DAG object as the cell output.
dg

DAG(component='DAG', props={'attributes': None, 'graph': {'nodes': [{'x': 0.0348850974558358, 'y': -0.39330873…

In [14]:
# confounds = [c["name"] for c in dg.confounds]
# Collect the "name" entry of every item listed in the DAG's `prognostics`.
prognostics = [item["name"] for item in dg.prognostics]

In [None]:
# `confounders` is not defined in any cell above, so selecting with it raised
# a NameError. Build it here from the DAG, mirroring how `prognostics` is built.
# NOTE(review): assumes `dg.confounds` exists and holds dicts with a "name"
# key, like `dg.prognostics` - confirm against the causalvis DAG API.
confounders = [c["name"] for c in dg.confounds]

# Restrict the data to the confounder columns and inspect their dtypes.
X = lalonde[confounders]
X.dtypes

In [4]:
# Add 0/1 indicator columns "re74=0" and "re75=0" marking rows whose earnings
# in that year are exactly zero.
# assign() is used instead of join(..., rsuffix="=0") so that re-running this
# cell overwrites the indicator columns rather than appending duplicates.
lalonde = lalonde.assign(
    **{f"{col}=0": lalonde[col].eq(0).astype(int) for col in ("re74", "re75")}
)
lalonde.head()

Unnamed: 0,training,age,education,black,hispanic,married,no_degree,re74,re75,re78,re74=0,re75=0
16827,0.0,26.0,13.0,0.0,0.0,0.0,0.0,58.778,50.12903,31.03226,0,0
5412,0.0,27.0,12.0,0.0,0.0,1.0,0.0,16297.18,13429.21,19562.14,0,0
15399,0.0,26.0,12.0,0.0,0.0,0.0,0.0,5217.527,3174.242,25564.67,0,0
13077,0.0,38.0,16.0,0.0,0.0,1.0,0.0,23713.01,9178.984,18814.41,0,0
2189,0.0,55.0,8.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1,1


In [5]:
# One-hot encode "education"; drop_first=True leaves out the column for the
# first level so the dummies are not perfectly collinear.
lalonde = pd.get_dummies(
    data=lalonde,
    columns=["education"],
    drop_first=True,
)
print(lalonde.shape)
lalonde.head()

(22106, 29)


Unnamed: 0,training,age,black,hispanic,married,no_degree,re74,re75,re78,re74=0,...,education_9.0,education_10.0,education_11.0,education_12.0,education_13.0,education_14.0,education_15.0,education_16.0,education_17.0,education_18.0
16827,0.0,26.0,0.0,0.0,0.0,0.0,58.778,50.12903,31.03226,0,...,0,0,0,0,1,0,0,0,0,0
5412,0.0,27.0,0.0,0.0,1.0,0.0,16297.18,13429.21,19562.14,0,...,0,0,0,1,0,0,0,0,0,0
15399,0.0,26.0,0.0,0.0,0.0,0.0,5217.527,3174.242,25564.67,0,...,0,0,0,1,0,0,0,0,0,0
13077,0.0,38.0,0.0,0.0,1.0,0.0,23713.01,9178.984,18814.41,0,...,0,0,0,0,0,0,0,1,0,0
2189,0.0,55.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Split off the treatment indicator (a) and the outcome (y).
# pop() removes each column from `lalonde` in place, so this cell cannot be
# re-run without first rebuilding `lalonde`.
a, y = lalonde.pop("training"), lalonde.pop("re78")

# X is the same object as `lalonde`, now holding only the remaining columns.
X = lalonde
X.shape, a.shape, y.shape

((22106, 27), (22106,), (22106,))