In [3]:
import numpy as np
import pandas as pd
import sklearn

# The Cohort Evaluator Module

After a cohort has been selected for treatment effect estimation, the cohort evaluator module can be used to visualize the propensity distribution of the selected cohort. It also visualizes the standard mean differences and distributions of each covariate.

In [4]:
from causalvis import CohortEvaluator

### Load Data

For this example, we are using the [UCI Student Performance dataset](https://archive.ics.uci.edu/ml/datasets/Student+Performance), which includes data from a two Portuguese schools of student grades throughout the year. The treatment of interest is the `absences` variable, which records whether the student had more than or less than 4 absences in a year. The outcome variable is `G1`, which is their grade during the 1st period of the year.

This example is modified from [causalnex](https://causalnex.readthedocs.io/en/latest/03_tutorial/01_first_tutorial.html).

In [5]:
data = pd.read_csv('../student-por.csv', delimiter=';')
data.head(5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [6]:
drop_col = ['school','sex','age','Mjob', 'Fjob','reason','guardian']
data = data.drop(columns=drop_col)
data.head(5)

Unnamed: 0,address,famsize,Pstatus,Medu,Fedu,traveltime,studytime,failures,schoolsup,famsup,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,U,GT3,A,4,4,2,2,0,yes,no,...,4,3,4,1,1,3,4,0,11,11
1,U,GT3,T,1,1,1,2,0,no,yes,...,5,3,3,1,1,3,2,9,11,11
2,U,LE3,T,1,1,1,2,0,yes,no,...,4,3,2,2,3,3,6,12,13,12
3,U,GT3,T,4,2,1,3,0,no,yes,...,3,2,2,1,1,5,0,14,14,14
4,U,GT3,T,3,3,1,2,0,no,yes,...,4,3,2,1,2,5,0,11,13,13


In [7]:
import numpy as np

struct_data = data.copy()
non_numeric_columns = list(struct_data.select_dtypes(exclude=[np.number]).columns)

print(non_numeric_columns)

['address', 'famsize', 'Pstatus', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']


In [8]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in non_numeric_columns:
    struct_data[col] = le.fit_transform(struct_data[col])

struct_data.head(5)

Unnamed: 0,address,famsize,Pstatus,Medu,Fedu,traveltime,studytime,failures,schoolsup,famsup,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,1,0,0,4,4,2,2,0,1,0,...,4,3,4,1,1,3,4,0,11,11
1,1,0,1,1,1,1,2,0,0,1,...,5,3,3,1,1,3,2,9,11,11
2,1,1,1,1,1,1,2,0,1,0,...,4,3,2,2,3,3,6,12,13,12
3,1,0,1,4,2,1,3,0,0,1,...,3,2,2,1,1,5,0,14,14,14
4,1,0,1,3,3,1,2,0,0,1,...,4,3,2,1,2,5,0,11,13,13


In [12]:
data.shape

(649, 26)

In [13]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = pd.Series(le.fit_transform(data['G1']))
y.mean()

8.403697996918336

In [14]:
a = pd.Series(le.fit_transform(data['absences']))
a.mean()

3.596302003081664

In [15]:
def toBinary(x):
    if x < 4: return 0
    else: return 1
a = a.apply(toBinary)

In [16]:
confounds = ["failures", "address", "paid", "internet", "Pstatus"]
prognostics = ["higher", "schoolsup", "studytime"]

In [17]:
confounds = confounds + prognostics

In [18]:
X = data[confounds]
X.dtypes

failures      int64
address      object
paid         object
internet     object
Pstatus      object
higher       object
schoolsup    object
studytime     int64
dtype: object

In [19]:
X = pd.get_dummies(X, prefix_sep='=', drop_first=True)
X.head()

Unnamed: 0,failures,studytime,address=U,paid=yes,internet=yes,Pstatus=T,higher=yes,schoolsup=yes
0,0,2,1,0,0,0,1,1
1,0,2,1,0,1,1,1,0
2,0,2,1,0,1,1,1,1
3,0,3,1,0,1,1,1,0
4,0,2,1,0,0,1,1,0


### Calculate Propensity Scores

Using the causallib IPW module, we can perform inverse propensity weighting and obtain the propensity score for each instance in the data set.

Note that while we use causallib for this example, any other method of calculating the propensity score would work just as well.

In [20]:
from sklearn.linear_model import LogisticRegression
from causallib.estimation import IPW

lr = LogisticRegression(solver='lbfgs', max_iter=1000)
ipw = IPW(lr)

In [21]:
ipw.fit(X, a)

IPW(clip_max=None, clip_min=None, use_stabilized=False, verbose=False,
    learner=LogisticRegression(max_iter=1000))

In [22]:
propMatrix = ipw.compute_propensity_matrix(X, a).to_dict(orient="records")

### Prepare Data

The `CohortEvaluator` module expects data in the form of a `List` of `[{instance_1}, {instance_2}, ...]`, where each data instance is represented by a `Dict` of relevant attributes such that:

<pre>
{confound1: x1,
 confound2: x2,
 ...,
 treatment: 0 or 1,
 outcome: y1,
 propensity: {0: propensity_1, 1:propensity_2}
}</pre>

In [23]:
unadjustedData = []

confounds = X.to_dict(orient="records")

for i in range(len(confounds)):
    newDataInstance = confounds[i]
    newDataInstance['treatment'] = a[i]
    newDataInstance['outcome'] = y[i]
    newDataInstance['propensity'] = propMatrix[i]
    
    unadjustedData.append(newDataInstance)

# Uncomment the following line to see an example of the data format
# unadjustedData[0:1]

### Cohort Evaluator with Propensity Scores

Once the data has been prepared, it can be passed to the CohortEvaluator using the `unadjustedCohort` prop.

On load, the propensity score distribution plot is visualized on the left, and the standardized mean difference plot (or Love plot) is visualized on the right. The buttons above this SMD plot can be used to toggle between the summary view and the details view. In the details view, the distribution of each covariate is visualized. By default, only covariates with an adjusted SMD > 0.1 are shown.

In [24]:
ceval = CohortEvaluator(unadjustedCohort=unadjustedData)
ceval

CohortEvaluator(component='CohortEvaluator', props={'unadjustedCohort': [{'failures': 0, 'studytime': 2, 'addr…

### Selecting subgroups

In situations where the propensity distributions of the treatment and control groups are highly unbalanced (such as the above), it is often useful to select the unbalanced subpopulation to exclude or characterize them in some way. This can be done by brushing (clicking and dragging) over relevant value ranges in the propensity score plot. The selected items can be downloaded using the `Download` button above the visualization, or by accessing the python variable `ceval.selection`. The inverse of the selection can also be obtained by accessing the python variable `ceval.iselection`.

Task:

1) in the above visualization, **select** the range of elements in the propensity score plot that are imbalanced

- Click and drag over the propensity score plot to select a range of elements

2) **print the first three items of the selection** in the following cell

3) **print the first three items of the inverse selection**

- Add a cell and access the inverse selection using `ceval.iselection["confounds"][0:3]`

In [21]:
ceval.selection["confounds"]

[]

### Estimate the Average Treatment Effect (ATE)

Since the selected cohort is well-balanced, we can go on to estimate the ATE. In this case, we will simply call the causallib function.

In [25]:
outcomes = ipw.estimate_population_outcome(X, a, y)
outcomes

0    8.662879
1    8.040071
dtype: float64