**Hillstrom Dataset**

An email marketing dataset from Kevin Hillstrom's MineThatData blog.

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load Data</a></span><ul class="toc-item"><li><span><a href="#Exploration-Data" data-toc-modified-id="Exploration-Data-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Exploration Data</a></span></li><li><span><a href="#Modeling-Data" data-toc-modified-id="Modeling-Data-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Modeling Data</a></span></li></ul></li><li><span><a href="#Data-Exploration" data-toc-modified-id="Data-Exploration-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data Exploration</a></span><ul class="toc-item"><li><span><a href="#Full-Data-Visualization" data-toc-modified-id="Full-Data-Visualization-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Full Data Visualization</a></span></li><li><span><a href="#Modeling-Data-Preparation" data-toc-modified-id="Modeling-Data-Preparation-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Modeling Data Preparation</a></span></li></ul></li><li><span><a href="#Standard-Models" data-toc-modified-id="Standard-Models-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Standard Models</a></span><ul class="toc-item"><li><span><a href="#Two-Model" data-toc-modified-id="Two-Model-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Two Model</a></span></li><li><span><a href="#Interaction-Term" data-toc-modified-id="Interaction-Term-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Interaction Term</a></span></li><li><span><a href="#Class-Transformations" data-toc-modified-id="Class-Transformations-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Class Transformations</a></span><ul class="toc-item"><li><span><a href="#Binary-Transformation" data-toc-modified-id="Binary-Transformation-3.3.1"><span class="toc-item-num">3.3.1&nbsp;&nbsp;</span>Binary Transformation</a></span></li><li><span><a href="#Quaternary-Transformation" 
data-toc-modified-id="Quaternary-Transformation-3.3.2"><span class="toc-item-num">3.3.2&nbsp;&nbsp;</span>Quaternary Transformation</a></span></li></ul></li></ul></li><li><span><a href="#Generalized-Random-Forest" data-toc-modified-id="Generalized-Random-Forest-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Generalized Random Forest</a></span></li><li><span><a href="#Evaluation" data-toc-modified-id="Evaluation-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Evaluation</a></span><ul class="toc-item"><li><span><a href="#Visual" data-toc-modified-id="Visual-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Visual</a></span></li><li><span><a href="#Variance" data-toc-modified-id="Variance-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Variance</a></span></li><li><span><a href="#Econometric" data-toc-modified-id="Econometric-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>Econometric</a></span></li></ul></li><li><span><a href="#Summary" data-toc-modified-id="Summary-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Summary</a></span></li></ul></div>

In [2]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier

from causeinfer.data import hillstrom
from causeinfer.utilities import plot_unit_distributions, train_test_split
from causeinfer.utilities import over_sample, mutli_cross_tab
from causeinfer.standard_algorithms import TwoModel, InteractionTerm
from causeinfer.standard_algorithms import BinaryClassTransformation
from causeinfer.standard_algorithms import QuaternaryClassTransformation
from causeinfer.evaluation import qini_score, auuc_score
from causeinfer.evaluation import plot_cum_effect, plot_cum_gain, plot_qini
from causeinfer.evaluation import plot_batch_responses, signal_to_noise
from causeinfer.evaluation import iterate_model, eval_table

# Notebook display configuration: cap printed rows, show every column.
pd.set_option("display.max_rows", 16)
pd.set_option("display.max_columns", None)
# Import `display` from the public `IPython.display` module; importing it from
# `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML
# Inject CSS so elements with the `.container` class use 99% of the page width.
display(HTML("<style>.container { width:99% !important; }</style>"))

ModuleNotFoundError: No module named 'causeinfer.data'

In [None]:
# Show the current working directory. The load cells below pass the relative
# path "datasets/hillstrom.csv" (presumably resolved against this directory).
os.getcwd()

In [None]:
# NOTE(review): `helpers_py` is not imported or defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel, and `head_shape`
# is not used by any visible cell. TODO: import the helper module in the
# imports cell or remove this cell.
head_shape = helpers_py.head_shape

# Load Data

In [None]:
# Fetch the Hillstrom dataset with causeinfer's download helper.
# NOTE(review): presumably writes "datasets/hillstrom.csv", the path the load
# cells below read from — confirm the default destination.
hillstrom.download_hillstrom()

## Exploration Data

In [None]:
# Load the dataset with covariate formatting and normalization switched off,
# so the exploration plots work on the values as stored in the CSV.
data_raw = hillstrom.load_hillstrom(
    user_file_path="datasets/hillstrom.csv",
    format_covariates=False,
    normalize=False,
)

# Wrap the full data array in a DataFrame labelled with the loader's column names.
df_full = pd.DataFrame(
    data=data_raw["dataset_full"],
    columns=data_raw["dataset_full_names"],
)

# Preview the first rows, then report (rows, columns) as the cell output.
display(df_full.head())
df_full.shape

## Modeling Data

In [None]:
# Load the dataset again, this time with covariate formatting and
# normalization enabled, for use in the modeling sections.
data_hillstrom = hillstrom.load_hillstrom(
    user_file_path="datasets/hillstrom.csv",
    format_covariates=True,
    normalize=True,
)

# DataFrame view of the formatted data, labelled with the loader's column names.
df = pd.DataFrame(
    data=data_hillstrom["dataset_full"],
    columns=data_hillstrom["dataset_full_names"],
)

In [None]:
# Pull covariates (X), the response (y) and the treatment assignment (w) out of
# the loader's dictionary in one pass.
# The response key can be response_visit, response_spend or response_conversion.
# Treatment coding: 0 is control, 1 is the men's campaign, 2 is the women's.
X, y, w = (
    data_hillstrom[key]
    for key in ("features", "response_visit", "treatment")
)

# Data Exploration

In [None]:
# Use seaborn's "whitegrid" style for the plots that follow.
sns.set(style="whitegrid")

## Full Data Visualization

In [None]:
# Two side-by-side distribution plots on a shared y-axis: the left call passes
# treatment=None, the right call passes the 'treatment' column.
# NOTE(review): 'variable', 'x', 'y' and 'title' look like placeholder values —
# confirm df_full has a column with that name and set real labels and a title.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(20, 5))

# The stray trailing comma after this call was removed; it wrapped the call's
# return value in a discarded one-element tuple.
plot_unit_distributions(df=df_full, variable='variable', treatment=None,
                        plot_x_label='x', plot_y_label='y', plot_title='title',
                        bins=None, figsize=None, fontsize=25, axis=ax1)

plot_unit_distributions(df=df_full, variable='variable', treatment='treatment',
                        plot_x_label='x', plot_y_label='y', plot_title='title',
                        bins=None, figsize=None, fontsize=25, axis=ax2)

## Modeling Data Preparation

In [None]:
# Tabulate the 'visit' and 'conversion' outcome columns against the
# 'treatment' column, with margins and normalization switched on.
mutli_cross_tab(
    df=df,
    w_col='treatment',
    y_cols=['visit', 'conversion'],
    label_limit=6,
    margins=True,
    normalize=True,
)

In [None]:
# Mean visit / conversion / spend per treatment group, plus an "All" margin row.
# The aggregation is named by string: passing the `np.mean` callable triggers a
# pandas FutureWarning (pandas >= 2.1), and "mean" gives the same column labels.
df.pivot_table(values=['visit', 'conversion', 'spend'],
               index='treatment',
               aggfunc=["mean"],
               margins=True)

In [None]:
# Positional indexes of the units in each treatment group
# (0 = control, 1 = men's campaign, 2 = women's campaign).
control_indexes, mens_indexes, womens_indexes = (
    [position for position, assigned in enumerate(w) if assigned == group_code]
    for group_code in (0, 1, 2)
)

# Both campaigns together: women's indexes first, then men's (list concatenation).
womens_mens_indexes = womens_indexes + mens_indexes

# Print the group sizes: control, men's, women's, then both campaigns combined.
for group_indexes in (control_indexes, mens_indexes,
                      womens_indexes, womens_mens_indexes):
    print(len(group_indexes))

In [None]:
# Select the control rows of the covariates, response and treatment arrays.
X_control, y_control, w_control = (
    values[control_indexes] for values in (X, y, w)
)

# Select the women's-campaign rows of the same three arrays.
X_women, y_women, w_women = (
    values[womens_indexes] for values in (X, y, w)
)

In [None]:
# Relabel the women's campaign from 2 to 1 so treatment is binary (0 = control,
# 1 = treated) for the models below.
# The labels are rebuilt from the original `w` rather than from `w_women`
# itself, so the cell is idempotent: the self-referential version produced an
# empty list on a second run, because no 2s were left to match.
w_women = [1 for code in w[womens_indexes] if code == 2]
w_women[:5]

In [None]:
# Over-sample the control group, passing the size of the women's-campaign
# sample as the second sample's size.
# NOTE(review): shuffle=True is used with no visible seed — check whether
# over_sample accepts one, for reproducibility.
X_os, y_os, w_os = over_sample(
    X_1=X_control,
    y_1=y_control,
    w_1=w_control,
    sample_2_size=len(X_women),
    shuffle=True,
)

In [None]:
# Stack the over-sampled control rows on top of the women's-campaign rows for
# covariates, responses and treatments alike.
X_split, y_split, w_split = (
    np.append(control_part, women_part, axis=0)
    for control_part, women_part in (
        (X_os, X_women),
        (y_os, y_women),
        (w_os, w_women),
    )
)

# The three shapes should agree in their first dimension.
tuple(stacked.shape for stacked in (X_split, y_split, w_split))

In [None]:
# 70/30 train/test split using causeinfer's own train_test_split, with a fixed
# seed and maintain_proportions switched on.
(X_train, X_test,
 y_train, y_test,
 w_train, w_test) = train_test_split(
    X_split,
    y_split,
    w_split,
    percent_train=0.7,
    random_state=42,
    maintain_proportions=True,
)

In [None]:
# Shapes of every train/test partition, in the order they were returned.
tuple(
    part.shape
    for part in (X_train, X_test, y_train, y_test, w_train, w_test)
)

In [None]:
# (value, count) rows for the response in the train split, then the test split.
for responses in (y_train, y_test):
    print(np.array(np.unique(responses, return_counts=True)).T)

In [None]:
# (value, count) rows for the treatment in the train split, then the test split.
for treatments in (w_train, w_test):
    print(np.array(np.unique(treatments, return_counts=True)).T)

# Standard Models

In [None]:
# Signal-to-noise ratio computed on the full modeling sample (before the
# train/test split), then shown as the cell output.
# NOTE(review): the exact definition lives in causeinfer.evaluation.
sn_ratio = signal_to_noise(y=y_split, w=w_split)
sn_ratio

## Two Model

In [None]:
# Two Model approach: one classifier for the treatment model, one for the
# control model.
# The forests are seeded (same value as the train/test split) so that
# Restart & Run All reproduces the same predictions and evaluation curves.
tm = TwoModel(treatment_model=RandomForestClassifier(random_state=42),
              control_model=RandomForestClassifier(random_state=42))
tm.fit(X=X_train, y=y_train, w=w_train)

In [None]:
# Predict on the held-out test covariates; the Evaluation section reads
# elements 0 and 1 of each returned row.
tm_probas = tm.predict_proba(X=X_test)

In [None]:
# Inspect the Two Model predictions.
tm_probas

## Interaction Term

In [None]:
# Interaction Term approach built on a single classifier.
# The forest is seeded (same value as the train/test split) so that
# Restart & Run All reproduces the same predictions and evaluation curves.
it = InteractionTerm(model=RandomForestClassifier(random_state=42))
it.fit(X=X_train, y=y_train, w=w_train)

In [None]:
# Predict on the held-out test covariates; the Evaluation section reads
# elements 0 and 1 of each returned row.
it_probas = it.predict_proba(X=X_test)

In [None]:
# Inspect the Interaction Term predictions.
it_probas

## Class Transformations

### Binary Transformation

In [None]:
# Binary class transformation approach built on a single classifier.
# The forest is seeded (same value as the train/test split) so that
# Restart & Run All reproduces the same predictions and evaluation curves.
bct = BinaryClassTransformation(model=RandomForestClassifier(random_state=42))
bct.fit(X=X_train, y=y_train, w=w_train)

In [None]:
# Predict on the held-out test covariates with regularization switched off;
# the Evaluation section reads elements 0 and 1 of each returned row.
# NOTE(review): see causeinfer's docs for what `regularize` changes.
bct_probas = bct.predict_proba(X=X_test, regularize=False)

In [None]:
# Inspect the binary class transformation predictions.
bct_probas

### Quaternary Transformation

In [None]:
# Quaternary class transformation approach built on a single classifier.
# The forest is seeded (same value as the train/test split) so that
# Restart & Run All reproduces the same predictions and evaluation curves.
qct = QuaternaryClassTransformation(model=RandomForestClassifier(random_state=42))
qct.fit(X=X_train, y=y_train, w=w_train)

In [None]:
# Predict on the held-out test covariates with regularization switched off;
# the Evaluation section reads elements 0 and 1 of each returned row.
# NOTE(review): see causeinfer's docs for what `regularize` changes.
qct_probas = qct.predict_proba(X=X_test, regularize=False)

In [None]:
# Inspect the quaternary class transformation predictions.
qct_probas

# Generalized Random Forest

# Evaluation

In [None]:
# Per-unit effect estimate for each model: element 0 minus element 1 of every
# predicted row.
# NOTE(review): presumably treatment minus control probability — confirm the
# column order returned by causeinfer's predict_proba.
tm_effects = [row[0] - row[1] for row in tm_probas]
it_effects = [row[0] - row[1] for row in it_probas]
bct_effects = [row[0] - row[1] for row in bct_probas]
qct_effects = [row[0] - row[1] for row in qct_probas]

In [None]:
# Gather the test outcomes, test treatments and each model's effect estimates
# under the column names used by the evaluation helpers below.
eval_dict = dict(
    y_test=y_test,
    w_test=w_test,
    two_model=tm_effects,
    interaction_term=it_effects,
    binary_trans=bct_effects,
    quaternary_trans=qct_effects,
)

In [None]:
# One column per eval_dict entry, kept in the dictionary's insertion order.
df_eval = pd.DataFrame(data=eval_dict, columns=list(eval_dict))

# Preview the first rows, then report (rows, columns) as the cell output.
display(df_eval.head())
df_eval.shape

In [None]:
# Model columns are every eval_dict key except the outcome and treatment columns.
models = [name for name in eval_dict if name not in ('y_test', 'w_test')]

## Visual

In [None]:
# Left panel: cumulative effect curves; right panel: batch responses.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharey=False, figsize=(20, 5))

plot_cum_effect(
    df=df_eval,
    n=100,
    models=models,
    percent_of_pop=False,
    outcome_col='y_test',
    treatment_col='w_test',
    random_seed=42,
    figsize=(10, 5),
    fontsize=20,
    axis=ax1,
    legend_metrics=False,
)

plot_batch_responses(
    df=df_eval,
    n=10,
    models=models,
    outcome_col='y_test',
    treatment_col='w_test',
    normalize=False,
    figsize=None,
    fontsize=15,
    axis=ax2,
)

In [None]:
# Left panel: cumulative gain curves; right panel: Qini curves.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharey=False, figsize=(20, 5))

# Both plots take exactly the same settings apart from the target axis.
curve_settings = dict(
    df=df_eval,
    n=100,
    models=models,
    percent_of_pop=True,
    outcome_col='y_test',
    treatment_col='w_test',
    normalize=True,
    random_seed=42,
    figsize=None,
    fontsize=20,
    legend_metrics=True,
)

plot_cum_gain(axis=ax1, **curve_settings)

plot_qini(axis=ax2, **curve_settings)

## Variance

## Econometric

# Summary