# Constructing the DAG

In [1]:
from causalvis import DAG

In [2]:
import json

In [3]:
import pandas as pd

data = pd.read_csv('../student-por.csv', delimiter=';')
data.head(5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [4]:
drop_col = ['school','sex','age','Mjob', 'Fjob','reason','guardian']
data = data.drop(columns=drop_col)
data.head(5)

Unnamed: 0,address,famsize,Pstatus,Medu,Fedu,traveltime,studytime,failures,schoolsup,famsup,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,U,GT3,A,4,4,2,2,0,yes,no,...,4,3,4,1,1,3,4,0,11,11
1,U,GT3,T,1,1,1,2,0,no,yes,...,5,3,3,1,1,3,2,9,11,11
2,U,LE3,T,1,1,1,2,0,yes,no,...,4,3,2,2,3,3,6,12,13,12
3,U,GT3,T,4,2,1,3,0,no,yes,...,3,2,2,1,1,5,0,14,14,14
4,U,GT3,T,3,3,1,2,0,no,yes,...,4,3,2,1,2,5,0,11,13,13


In [5]:
import numpy as np

struct_data = data.copy()
non_numeric_columns = list(struct_data.select_dtypes(exclude=[np.number]).columns)

print(non_numeric_columns)

['address', 'famsize', 'Pstatus', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']


In [6]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in non_numeric_columns:
    struct_data[col] = le.fit_transform(struct_data[col])

struct_data.head(5)

Unnamed: 0,address,famsize,Pstatus,Medu,Fedu,traveltime,studytime,failures,schoolsup,famsup,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,1,0,0,4,4,2,2,0,1,0,...,4,3,4,1,1,3,4,0,11,11
1,1,0,1,1,1,1,2,0,0,1,...,5,3,3,1,1,3,2,9,11,11
2,1,1,1,1,1,1,2,0,1,0,...,4,3,2,2,3,3,6,12,13,12
3,1,0,1,4,2,1,3,0,0,1,...,3,2,2,1,1,5,0,14,14,14
4,1,0,1,3,3,1,2,0,0,1,...,4,3,2,1,2,5,0,11,13,13


### Editing the DAG

You can pass all the attributes in the dataset to the DAG widget and construct your own DAG.
If you would like to skip this step, you can load the example from the saved DAG file in the next cell.

In [7]:
DAG(data=struct_data)

DAG(component='DAG', props={'attributes': ['absences', 'activities', 'address', 'Dalc', 'failures', 'famrel', …

### Identifying control variables

Once we are satisfied with the DAG, we can set the treatment and outcome variables. In this case, we are interested in `G1` as the outcome variable, while `absences` is the treatment variable of interest. After selecting these variables, we see that other attributes in the DAG are automatically colored to reflect their relationship to the treatment and outcome.

From the downloads button, we can get a list of the `confounds` and `prognostic` factors to control for.

In [8]:
with open('../student_por_DAG.json', 'r') as d:
    graph = json.load(d)

In [9]:
DAG(graph=graph)

DAG(component='DAG', props={'attributes': None, 'graph': {'nodes': [{'x': 500, 'y': 250, 'id': 1658176204175, …

In [10]:
confounds = ["failures", "address", "paid", "internet", "Pstatus"]
prognostics = ["higher", "schoolsup", "studytime"]

# Cohort Construction and Refinement

In [31]:
from causalvis import CohortEvaluator

In [19]:
data.shape

(649, 26)

In [20]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = pd.Series(le.fit_transform(data['G1']))
y.mean()

8.403697996918336

In [43]:
a = pd.Series(le.fit_transform(data['absences']))

def toBinary(x):
    if x < 10: return 0
    else: return 1
a = a.apply(toBinary)

a.mean()

0.10785824345146379

In [46]:
confounders = confounds + prognostics

X = data[confounders]
X.dtypes

failures      int64
address      object
paid         object
internet     object
Pstatus      object
higher       object
schoolsup    object
studytime     int64
dtype: object

In [47]:
X = pd.get_dummies(X, prefix_sep='=', drop_first=True)
X.head()

Unnamed: 0,failures,studytime,address=U,paid=yes,internet=yes,Pstatus=T,higher=yes,schoolsup=yes
0,0,2,1,0,0,0,1,1
1,0,2,1,0,1,1,1,0
2,0,2,1,0,1,1,1,1
3,0,3,1,0,1,1,1,0
4,0,2,1,0,0,1,1,0


In [48]:
from sklearn.linear_model import LogisticRegression
from causallib.estimation import IPW

lr = LogisticRegression(solver='lbfgs', max_iter=1000)
ipw = IPW(lr)

In [49]:
ipw.fit(X, a)

IPW(clip_max=None, clip_min=None, use_stabilized=False, verbose=False,
    learner=LogisticRegression(max_iter=1000))

In [50]:
propMatrix = ipw.compute_propensity_matrix(X, a).to_dict(orient="records")

In [51]:
unadjustedData = []

confounds = X.to_dict(orient="records")

for i in range(len(confounds)):
    newDataInstance = confounds[i]
    newDataInstance['treatment'] = a[i]
    newDataInstance['outcome'] = y[i]
    newDataInstance['propensity'] = propMatrix[i]
    
    unadjustedData.append(newDataInstance)

In [52]:
CohortEvaluator(unadjustedCohort=unadjustedData)

CohortEvaluator(component='CohortEvaluator', props={'unadjustedCohort': [{'failures': 0, 'studytime': 2, 'addr…

Since the population is well balanced, we can go ahead and estimate the treatment effect. This can be done quite easily in causallib, using the `estimate_population_outcome` function. From the result, we can see that having < 10 absences a year causes a slightly higher score for `G1`.

In [53]:
ipw.estimate_population_outcome(X, a, y)

0    8.439417
1    7.776299
dtype: float64