<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load Data</a></span></li><li><span><a href="#Data-Preparation" data-toc-modified-id="Data-Preparation-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data Preparation</a></span><ul class="toc-item"><li><span><a href="#Hillstrom" data-toc-modified-id="Hillstrom-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Hillstrom</a></span></li><li><span><a href="#Mayo-PBC" data-toc-modified-id="Mayo-PBC-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Mayo PBC</a></span></li><li><span><a href="#CMF-Microfinance" data-toc-modified-id="CMF-Microfinance-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>CMF Microfinance</a></span></li></ul></li><li><span><a href="#Iterative-Modeling" data-toc-modified-id="Iterative-Modeling-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Iterative Modeling</a></span></li><li><span><a href="#Evaluation-Table" data-toc-modified-id="Evaluation-Table-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Evaluation Table</a></span></li></ul></div>

In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier

from causeinfer.data import hillstrom, mayo_pbc, cmf_micro
from causeinfer.utilities import plot_unit_distributions, train_test_split
from causeinfer.utilities import over_sample, mutli_cross_tab
from causeinfer.standard_algorithms import TwoModel, InteractionTerm
from causeinfer.standard_algorithms import BinaryClassTransformation
from causeinfer.standard_algorithms import QuaternaryClassTransformation
from causeinfer.evaluation import qini_score, auuc_score
from causeinfer.evaluation import plot_cum_effect, plot_cum_gain, plot_qini
from causeinfer.evaluation import plot_batch_responses, signal_to_noise
from causeinfer.evaluation import iterate_model, eval_table

# Notebook display configuration: limit pandas to 16 printed rows, show every
# column, and widen the notebook container to 99% of the page.
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated and fails on
# recent IPython releases.
from IPython.display import display, HTML

pd.set_option("display.max_rows", 16)
pd.set_option("display.max_columns", None)
display(HTML("<style>.container { width:99% !important; }</style>"))

In [None]:
# Show the current working directory. The loaders below are given relative
# "datasets/..." paths, which presumably resolve against this directory.
os.getcwd()

# Load Data

In [6]:
# Fetch the Hillstrom and Mayo PBC raw files. Each downloader reports when the
# file is already present instead of downloading it again.
for download in (hillstrom.download_hillstrom, mayo_pbc.download_mayo_pbc):
    download()

The dataset already exists at /Users/andrewmcallister/Documents/learning/programming/causeinfer/examples/datasets/hillstrom.csv
The dataset already exists at /Users/andrewmcallister/Documents/learning/programming/causeinfer/examples/datasets/mayo_pbc.text


In [None]:
# All three loaders receive the same formatting flags; only the loader and the
# file path differ between datasets.
loader_flags = {"format_covariates": True, "normalize": True}

data_hillstrom = hillstrom.load_hillstrom(
    user_file_path="datasets/hillstrom.csv", **loader_flags
)
data_mayo_pbc = mayo_pbc.load_mayo_pbc(
    user_file_path="datasets/mayo_pbc.text", **loader_flags
)
data_cmf_micro = cmf_micro.load_cmf_micro(
    user_file_path="datasets/cmf_micro", **loader_flags
)

# Data Preparation

## Hillstrom

In [None]:
# Covariates, response and treatment are stored under separate keys.
# Response options: response_visit, response_spend or response_conversion.
# Treatment coding: 0 is control, 1 is the men's campaign, 2 is the women's.
X_hillstrom, y_hillstrom, w_hillstrom = (
    data_hillstrom[key] for key in ("features", "response_visit", "treatment")
)

In [None]:
# Counts for treatment: row positions of each Hillstrom arm
# (0 is control, 1 is the men's campaign, 2 is the women's campaign).
# The treatment vector is w_hillstrom; the notebook never defines a bare `w`.
control_indexes = [i for i, e in enumerate(w_hillstrom) if e == 0]
mens_indexes = [i for i, e in enumerate(w_hillstrom) if e == 1]
womens_indexes = [i for i, e in enumerate(w_hillstrom) if e == 2]

# Plain lists, so `+` concatenates the two groups of positions.
womens_mens_indexes = womens_indexes + mens_indexes

print(len(control_indexes))
print(len(mens_indexes))
print(len(womens_indexes))
print(len(womens_mens_indexes))

In [None]:
# Slice covariates, response and treatment down to the control rows and to the
# women's-campaign rows, using the positions collected in the previous cell.
hillstrom_arrays = (X_hillstrom, y_hillstrom, w_hillstrom)

X_control_hillstrom, y_control_hillstrom, w_control_hillstrom = (
    values[control_indexes] for values in hillstrom_arrays
)
X_women, y_women, w_women = (
    values[womens_indexes] for values in hillstrom_arrays
)

In [None]:
# Change 2s to 1s in women's campaign.
# Each element is mapped rather than filtered, so the length always matches
# X_women / y_women and re-running the cell leaves the already-recoded values
# unchanged (a filter on `== 2` would return an empty list on a second run).
w_women = [1 if treatment == 2 else treatment for treatment in w_women]
w_women[:5]

In [None]:
# Over-sampling of control
X_os_hillstrom, y_os_hillstrom, w_os_hillstrom = over_sample(X_1=X_control_hillstrom, y_1=y_control_hillstrom, w_1=w_control_hillstrom, 
                                                             sample_2_size=len(X_women), shuffle=True)

In [None]:
# Stack the over-sampled control rows on top of the women's-campaign rows.
X_split_hillstrom = np.concatenate((X_os_hillstrom, X_women), axis=0)
y_split_hillstrom = np.concatenate((y_os_hillstrom, y_women), axis=0)
w_split_hillstrom = np.concatenate((w_os_hillstrom, w_women), axis=0)

# Sanity check: the first dimension should be equal for all three.
X_split_hillstrom.shape, y_split_hillstrom.shape, w_split_hillstrom.shape

In [None]:
# 70/30 train/test split of the stacked Hillstrom data with a fixed seed.
(
    X_train_hillstrom, X_test_hillstrom,
    y_train_hillstrom, y_test_hillstrom,
    w_train_hillstrom, w_test_hillstrom,
) = train_test_split(
    X_split_hillstrom,
    y_split_hillstrom,
    w_split_hillstrom,
    percent_train=0.7,
    random_state=42,
    maintain_proportions=True,
)

## Mayo PBC

In [None]:
# Covariates, treatments and responses are loaded separately
X_mayo = data_mayo_pbc["features"]

# 0 is the patient is alive, 1 is a liver transplant, 2 is deceased
y_mayo = data_mayo_pbc["response"]

w_mayo = data_mayo_pbc["treatment"]

In [None]:
# Counts for response
alive_indexes = [i for i, e in enumerate(y) if e == 0]
transplant_indexes = [i for i, e in enumerate(y) if e == 1]
deceased_indexes = [i for i, e in enumerate(y) if e == 2]

transplant_deceased_indexes = transplant_indexes + deceased_indexes

print(len(alive_indexes))
print(len(transplant_indexes))
print(len(deceased_indexes))
print(len(transplant_deceased_indexes))

In [None]:
# Counts for treatment
control_indexes = [i for i, e in enumerate(w) if e == 0]
treatment_indexes = [i for i, e in enumerate(w) if e == 1]

print(len(control_indexes))
print(len(treatment_indexes))

In [None]:
# Slice covariates, response and treatment into control and treatment rows,
# using the positions collected in the previous cell.
mayo_arrays = (X_mayo, y_mayo, w_mayo)

X_control_mayo, y_control_mayo, w_control_mayo = (
    values[control_indexes] for values in mayo_arrays
)
X_treatment_mayo, y_treatment_mayo, w_treatment_mayo = (
    values[treatment_indexes] for values in mayo_arrays
)

In [None]:
# Over-sampling of control
X_os_mayo, y_os_mayo, w_os_mayo = over_sample(X_1=X_control_mayo, y_1=y_control_mayo, w_1=w_control_mayo, 
                                              sample_2_size=len(X_treatment_mayo), shuffle=True)

In [None]:
# Stack the over-sampled control rows on top of the treatment rows.
X_split_mayo = np.append(X_os_mayo, X_treatment_mayo, axis=0)
y_split_mayo = np.append(y_os_mayo, y_treatment_mayo, axis=0)
w_split_mayo = np.append(w_os_mayo, w_treatment_mayo, axis=0)

# Sanity check on the arrays built above (the un-suffixed X_split / y_split /
# w_split names do not exist): the first dimension should be equal for all three.
X_split_mayo.shape, y_split_mayo.shape, w_split_mayo.shape

In [None]:
# 70/30 train/test split of the stacked Mayo PBC data with a fixed seed.
(
    X_train_mayo, X_test_mayo,
    y_train_mayo, y_test_mayo,
    w_train_mayo, w_test_mayo,
) = train_test_split(
    X_split_mayo,
    y_split_mayo,
    w_split_mayo,
    percent_train=0.7,
    random_state=42,
    maintain_proportions=True,
)

## CMF Microfinance

In [None]:
# Covariates, response and treatment are stored under separate keys.
# Response options: response_biz_index or response_women_emp.
X_cmf, y_cmf, w_cmf = (
    data_cmf_micro[key] for key in ("features", "response_biz_index", "treatment")
)

In [None]:
# Counts for treatment
control_indexes = [i for i, e in enumerate(w_cmf) if e == 0]
treatment_indexes = [i for i, e in enumerate(w_cmf) if e == 1]

print(len(control_indexes))
print(len(treatment_indexes))

In [None]:
# Slice covariates, response and treatment into control and treatment rows,
# using the positions collected in the previous cell.
cmf_arrays = (X_cmf, y_cmf, w_cmf)

X_control_cmf, y_control_cmf, w_control_cmf = (
    values[control_indexes] for values in cmf_arrays
)
X_treatment_cmf, y_treatment_cmf, w_treatment_cmf = (
    values[treatment_indexes] for values in cmf_arrays
)

In [None]:
# Over-sampling of control
X_os_cmf, y_os_cmf, w_os_cmf = over_sample(X_1=X_control_cmf, y_1=y_control_cmf, w_1=w_control_cmf, 
                               sample_2_size=len(X_treatment_cmf), shuffle=True)

In [None]:
# Stack the over-sampled control rows on top of the treatment rows.
X_split_cmf = np.concatenate((X_os_cmf, X_treatment_cmf), axis=0)
y_split_cmf = np.concatenate((y_os_cmf, y_treatment_cmf), axis=0)
w_split_cmf = np.concatenate((w_os_cmf, w_treatment_cmf), axis=0)

# Sanity check: the first dimension should be equal for all three.
X_split_cmf.shape, y_split_cmf.shape, w_split_cmf.shape

In [None]:
# 70/30 train/test split of the stacked CMF data with a fixed seed.
(
    X_train_cmf, X_test_cmf,
    y_train_cmf, y_test_cmf,
    w_train_cmf, w_test_cmf,
) = train_test_split(
    X_split_cmf,
    y_split_cmf,
    w_split_cmf,
    percent_train=0.7,
    random_state=42,
    maintain_proportions=True,
)

# Iterative Modeling

In [None]:
# Per-dataset bundle of the prediction method name and the train/test splits,
# keyed by the dataset's display name.
dataset_keys = {
    'Hillstrom': dict(
        pred_type='predict_proba',
        X_train=X_train_hillstrom,
        y_train=y_train_hillstrom,
        w_train=w_train_hillstrom,
        X_test=X_test_hillstrom,
        y_test=y_test_hillstrom,
        w_test=w_test_hillstrom,
    ),
    'Mayo PBC': dict(
        pred_type='predict_proba',
        X_train=X_train_mayo,
        y_train=y_train_mayo,
        w_train=w_train_mayo,
        X_test=X_test_mayo,
        y_test=y_test_mayo,
        w_test=w_test_mayo,
    ),
    'CMF Microfinance': dict(
        pred_type='predict',
        X_train=X_train_cmf,
        y_train=y_train_cmf,
        w_train=w_train_cmf,
        X_test=X_test_cmf,
        y_test=y_test_cmf,
        w_test=w_test_cmf,
    ),
}

In [None]:
# Base estimator used to build every uplift model in the next cell.
# No random_state is set, so repeated fits are not seeded.
sklearn_base_model = RandomForestClassifier()

In [None]:
# Give every wrapper its own unfitted copy of the base estimator.
# Passing the same instance everywhere would make TwoModel's treatment and
# control models one shared object, so the later fit would replace the earlier
# one (unless the wrapper copies its estimators internally, which is not
# visible from this notebook). `clone` keeps the base model's parameters.
tm = TwoModel(treatment_model=clone(sklearn_base_model),
              control_model=clone(sklearn_base_model))
it = InteractionTerm(model=clone(sklearn_base_model))
bct = BinaryClassTransformation(model=clone(sklearn_base_model))
qct = QuaternaryClassTransformation(model=clone(sklearn_base_model))

In [None]:
# Iteration count forwarded to iterate_model as `n`; also used there to derive
# the notify_iter argument.
n=10

In [None]:
# Container for the iterate_model results; populated by the modelling loop below.
model_eval_dict = {}

In [None]:
# Target format: model_eval_dict should be filled as a two-level dictionary (dataset -> model -> results).

In [9]:
# Toy example of a two-level dictionary. The dictionary is created here so the
# cell runs on a fresh kernel and gives the same result when re-run; the first
# entry matches the content shown in the displayed output.
two_level_dict = {'1': {'2': 'Second level term'}}
two_level_dict['2'] = {'2': 'Another second level str'}

In [10]:
# Display the example dictionary to check its two-level layout.
two_level_dict

{'1': {'2': 'Second level term'}, '2': {'2': 'Another second level str'}}

In [None]:
# Run every applicable uplift model on every dataset and store the results in a
# two-level dictionary: model_eval_dict[dataset name][model name] -> result list.
# Splits and the prediction method are read from dataset_keys, so each dataset
# is modelled on its own data.
class_transformation_datasets = ['Hillstrom', 'Mayo PBC']

for data_key, data in dataset_keys.items():
    # The two class-transformation models are only run on Hillstrom and
    # Mayo PBC; any other dataset gets the two-model and interaction-term
    # approaches only.
    if data_key in class_transformation_datasets:
        models = [tm, it, bct, qct]
    else:
        models = [tm, it]

    # Start from an empty inner dictionary so that every model adds its own
    # entry and re-running the cell does not keep stale results.
    model_eval_dict[data_key] = {}

    for model in models:
        # Class name of the wrapper, e.g. 'TwoModel'.
        model_name = type(model).__name__
        print('Starting {} iterations:'.format(model_name))

        avg_preds, all_preds, \
        avg_eval, eval_variance, \
        eval_sd, all_evals = iterate_model(model=model,
                                           X_train=data['X_train'],
                                           y_train=data['y_train'],
                                           w_train=data['w_train'],
                                           X_test=data['X_test'],
                                           y_test=data['y_test'],
                                           w_test=data['w_test'],
                                           tau_test=None, n=n,
                                           pred_type=data['pred_type'],
                                           eval_type='qini',
                                           normalize_eval=False,
                                           # At least 1, so n < 10 does not produce 0.
                                           notify_iter=max(1, n // 10))

        model_eval_dict[data_key][model_name] = [avg_preds, all_preds,
                                                 avg_eval, eval_variance,
                                                 eval_sd, all_evals]
        print('-----')

# Evaluation Table

In [None]:
# Display the collected results.
model_eval_dict

In [None]:
# model_eval_dict is keyed by dataset name first and by model name second; each
# leaf is [avg_preds, all_preds, avg_eval, eval_variance, eval_sd, all_evals].
# The evaluation table below is built for Hillstrom, so read that dataset's
# inner dictionary rather than the top-level (dataset) keys.
hillstrom_results = model_eval_dict['Hillstrom']

iter_models = list(hillstrom_results.keys())
iter_evals = [results[2] for results in hillstrom_results.values()]  # avg_eval
iter_vars = [results[3] for results in hillstrom_results.values()]   # eval_variance
iter_sds = [results[4] for results in hillstrom_results.values()]    # eval_sd

In [None]:
# Build the evaluation table from the lists prepared above and display it.
df_model_eval = eval_table(
    models=iter_models,
    datasets='Hillstrom',
    evals=iter_evals,
    variances=iter_vars,
    sds=iter_sds,
    annotate=True,
)
df_model_eval