In [25]:
from causalvis import CohortEvaluator

In [26]:
import numpy as np
import pandas as pd

def read_data_from_UCI():
    """Reads the bank-marketing data table from a zip file directly from UCI"""
    import zipfile
    import io
    from urllib import request

    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"
    with request.urlopen(url) as r:
        with zipfile.ZipFile(io.BytesIO(r.read())) as zf:
            csv_file = zf.open("bank-additional/bank-additional-full.csv")
            df = pd.read_csv(csv_file, sep=";")
    return df

In [27]:
data = read_data_from_UCI()
data.shape

(41188, 21)

In [28]:
data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [29]:
print(data['y'].unique())

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = pd.Series(le.fit_transform(data['y']))
y.mean()

['no' 'yes']


0.11265417111780131

In [30]:
print(data['contact'].unique())
a = pd.Series(le.fit_transform(data['contact']))
a.mean()

['telephone' 'cellular']


0.3652520151500437

In [31]:
confounders = ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

In [32]:
confounders += ['month', 'campaign']

In [33]:
X = data[confounders]
X.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
month              object
campaign            int64
dtype: object

In [34]:
X = pd.get_dummies(X, prefix_sep='=', drop_first=True)
X.head()

Unnamed: 0,age,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,campaign,job=blue-collar,...,poutcome=success,month=aug,month=dec,month=jul,month=jun,month=mar,month=may,month=nov,month=oct,month=sep
0,56,999,0,1.1,93.994,-36.4,4.857,5191.0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,57,999,0,1.1,93.994,-36.4,4.857,5191.0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,37,999,0,1.1,93.994,-36.4,4.857,5191.0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,40,999,0,1.1,93.994,-36.4,4.857,5191.0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,56,999,0,1.1,93.994,-36.4,4.857,5191.0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [35]:
from sklearn.linear_model import LogisticRegression
from causallib.estimation import IPW

lr = LogisticRegression(solver='lbfgs', max_iter=1000)
#lr = LogisticRegression(penalty='l1', solver='saga', max_iter=1000)
#lr = GradientBoostingClassifier()
ipw = IPW(lr)

In [36]:
ipw.fit(X, a)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


IPW(clip_max=None, clip_min=None, use_stabilized=False, verbose=False,
    learner=LogisticRegression(max_iter=1000))

In [37]:
outcomes = ipw.estimate_population_outcome(X, a, y)
outcomes

0    0.153863
1    0.208020
dtype: float64

In [38]:
propMatrix = ipw.compute_propensity_matrix(X, a).to_dict(orient="records")

In [39]:
unadjustedData = []

confounds = X.to_dict(orient="records")

for i in range(len(confounds)):
    newDataInstance = confounds[i]
    newDataInstance['treatment'] = a[i]
    newDataInstance['outcome'] = y[i]
    newDataInstance['propensity'] = propMatrix[i]
    
    unadjustedData.append(newDataInstance)

In [40]:
ceval = CohortEvaluator(unadjustedCohort=unadjustedData)
ceval

CohortEvaluator(component='CohortEvaluator', props={'unadjustedCohort': [{'age': 56, 'pdays': 999, 'previous':…

In [52]:
# df = pd.DataFrame(ceval.iselection["treatments"])
ceval.iselection

{'confounds': [], 'propensity': [], 'treatment': []}

# Refine Cohort

In [27]:
indExclude = (X['cons.price.idx'] > 93.92) & (X['euribor3m'] > 4.5)
a[indExclude].mean()

1.0

In [51]:
y2 = y.loc[~indExclude].reset_index(drop=True)
a2 = a.loc[~indExclude].reset_index(drop=True)
X2 = X.loc[~indExclude].reset_index(drop=True)
print(y2.mean())
print(a2.mean())

0.14498640322192008
0.1000654022236756


In [52]:
ipw.fit(X2, a2)

IPW(clip_max=None, clip_min=None, use_stabilized=False, verbose=False,
    learner=LogisticRegression(max_iter=1000))

In [53]:
propMatrix2 = ipw.compute_propensity_matrix(X2, a2).to_dict(orient="records")

In [57]:
refinedData = []

confounds2 = X2.to_dict(orient="records")

for i in range(len(confounds2)):
    newDataInstance = confounds2[i]
    newDataInstance['treatment'] = a2[i]
    newDataInstance['outcome'] = y2[i]
    newDataInstance['propensity'] = propMatrix2[i]
    
    refinedData.append(newDataInstance)

In [58]:
CohortEvaluator(unadjustedCohort=refinedData)

CohortEvaluator(component='CohortEvaluator', props={'unadjustedCohort': [{'age': 41, 'pdays': 999, 'previous':…