In [1]:
### IPW ###

In [2]:
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so it also hides warnings raised by
# pandas/statsmodels in later cells (e.g. chained-assignment warnings) —
# consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd

In [None]:
# Source of the raw e-mail campaign data (downloaded on every run).
EMAIL_DATA_URL = 'http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv'

email_data = pd.read_csv(EMAIL_DATA_URL)
email_data.head(3)

In [None]:
# Drop the "Womens E-Mail" arm so only two segments remain to compare.
# .copy() makes male_df an independent frame, so adding a column below does
# not write into a filtered slice of email_data (chained assignment).
male_df = email_data[email_data["segment"] != "Womens E-Mail"].copy()

# Treatment indicator: 1 for customers who received the men's e-mail, else 0.
# Vectorized comparison instead of a per-row Python lambda.
male_df["treatment"] = (male_df["segment"] == 'Mens E-Mail').astype('int64')

male_df.head(3)

In [None]:
# Split the sample by experimental arm.
treatment_data = male_df[male_df["treatment"] == 1]
control_data = male_df[male_df["treatment"] == 0]

# Deliberately introduce selection bias. A customer "matches the rule" when
# history > 300, or recency < 6, or the channel is 'Multichannel'.
# Fix: the last condition previously compared the numeric `recency` column to
# the string 'Multichannel', which can never be true; the value belongs to the
# `channel` column.

# Treated arm: randomly discard half of the customers that do NOT match the
# rule, so matching customers become over-represented among the treated.
treatment_biased = treatment_data.drop(treatment_data[~(
    (treatment_data['history'] > 300) |
    (treatment_data['recency'] < 6) |
    (treatment_data['channel'] == 'Multichannel')
)].sample(frac=0.5, random_state=1).index)

# Control arm: randomly discard half of the customers that DO match the rule,
# so matching customers become under-represented among the controls.
control_biased = control_data.drop(control_data[
    (control_data['history'] > 300) |
    (control_data['recency'] < 6) |
    (control_data['channel'] == 'Multichannel')
].sample(frac=0.5, random_state=1).index)

# Recombine both arms into the biased analysis sample.
biased_data = pd.concat([treatment_biased, control_biased])
biased_data.head(3)

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Propensity-score model: binomial GLM of the treatment flag on recency,
# history and channel, fitted on the biased sample.
propensity_formula = 'treatment ~ recency + history + channel'
ps_model = smf.glm(
    formula=propensity_formula,
    data=biased_data,
    family=sm.families.Binomial(),
).fit()

In [None]:
# Store the model's predictions for the rows it was fitted on as the
# propensity score of each customer.
biased_data['pscore'] = ps_model.predict()

# Per-arm copies of the scored sample, each re-indexed from 0.
biased_data_treatment = biased_data.query('treatment == 1').reset_index(drop=True)
biased_data_control = biased_data.query('treatment == 0').reset_index(drop=True)

In [None]:
# Preview the first three rows of the biased sample.
biased_data.head(3)

In [None]:
# Inverse-probability-weighted spend:
#   treated rows -> spend / pscore
#   control rows -> spend / (1 - pscore)
# Computed column-wise instead of with a per-row apply(axis=1).
# Series.where keeps pscore where the row is treated and substitutes
# (1 - pscore) everywhere else.
ipw_denominator = biased_data['pscore'].where(
    biased_data['treatment'] == 1,
    1 - biased_data['pscore'],
)
biased_data['spend_weighted'] = biased_data['spend'] / ipw_denominator
biased_data.head(3)

In [None]:
# Normalised IPW estimate of the effect on spend: the difference between the
# inverse-probability-weighted mean spend of the treated and of the controls.
treated_flag = biased_data['treatment']
control_flag = 1 - biased_data['treatment']
propensity = biased_data['pscore']
spend = biased_data['spend']

# Weighted mean spend of the treated arm (weights 1 / pscore).
treated_weighted_mean = sum(treated_flag * spend / propensity) / sum(treated_flag / propensity)
# Weighted mean spend of the control arm (weights 1 / (1 - pscore)).
control_weighted_mean = sum(control_flag * spend / (1 - propensity)) / sum(control_flag / (1 - propensity))

effect = treated_weighted_mean - control_weighted_mean

effect

In [None]:
### IPW with ML ###

In [None]:
from sklearn import model_selection
import numpy as np

# Split the two-arm sample 50/50 with a fixed seed: one half trains the
# conversion model, the other half is used for the simulated mailing.
male_df_train, male_df_test = model_selection.train_test_split(
    male_df, test_size=0.5, random_state=1
)

# Only control rows (treatment == 0) of the training half are kept for fitting.
male_df_train = male_df_train.loc[male_df_train['treatment'] == 0]

In [None]:
# Conversion model: binomial GLM of `conversion` on recency, history segment,
# channel and zip code, fitted on the filtered training half.
conversion_formula = 'conversion ~ recency + history_segment + channel + zip_code'
predicted_model = smf.glm(
    formula=conversion_formula,
    data=male_df_train,
    family=sm.families.Binomial(),
).fit()

In [None]:
# Score the held-out half and simulate a model-driven mailing policy.
#   pred_cv      : predicted conversion probability from predicted_model
#   pred_cv_rank : percentile rank of pred_cv (values in (0, 1])
#   mail_assign  : one Bernoulli draw per customer with p = pred_cv_rank
#
# Changes from the previous version:
# * The draws come from a dedicated, seeded RandomState, so the notebook gives
#   the same assignment on every run (the global np.random state was unseeded).
# * The draws are made in one vectorized call instead of one call per row.
# * .assign() builds a new frame rather than writing columns into the frame
#   returned by train_test_split.
assignment_rng = np.random.RandomState(1)
male_df_test = male_df_test.assign(
    pred_cv=predicted_model.predict(male_df_test),
    pred_cv_rank=lambda scored: scored['pred_cv'].rank(pct=True),
    mail_assign=lambda scored: assignment_rng.binomial(
        n=1, p=scored['pred_cv_rank'].values
    ),
)

In [None]:
# Keep only customers whose simulated assignment agrees with what actually
# happened in the experiment (both 1 or both 0). Both columns hold 0/1 flags,
# so this is a plain equality test.
assignment_agrees = male_df_test['mail_assign'] == male_df_test['treatment']
ml_male_df = male_df_test[assignment_agrees]

ml_male_df.head(3)

In [None]:
# Inverse-probability-weighted spend, using pred_cv_rank as the assignment
# probability:
#   treated rows -> spend / pred_cv_rank
#   control rows -> spend / (1 - pred_cv_rank)
# Series.where keeps pred_cv_rank for treated rows and substitutes
# (1 - pred_cv_rank) for the rest, replacing the per-row apply(axis=1).
ml_ipw_denominator = ml_male_df['pred_cv_rank'].where(
    ml_male_df['treatment'] == 1,
    1 - ml_male_df['pred_cv_rank'],
)
# .assign() returns a new frame, so the column is not written into the
# filtered slice of male_df_test that ml_male_df was created from.
ml_male_df = ml_male_df.assign(
    spend_weighted=ml_male_df['spend'] / ml_ipw_denominator
)

ml_male_df.head(3)


In [None]:
# Normalised IPW estimate of the effect on spend for the simulated policy:
# weighted mean spend of the treated (weights 1 / pred_cv_rank) minus the
# weighted mean spend of the controls (weights 1 / (1 - pred_cv_rank)).
#
# Each arm is evaluated on its own rows only. The previous whole-frame formula
# multiplied by a 0/1 flag, so a treated row with pred_cv_rank == 1.0 produced
# 0 * spend / (1 - 1.0) = NaN in the control term and turned the whole
# estimate into NaN.
ml_treated = ml_male_df[ml_male_df['treatment'] == 1]
ml_control = ml_male_df[ml_male_df['treatment'] == 0]

effect = (
    np.average(ml_treated['spend'], weights=1 / ml_treated['pred_cv_rank']) -
    np.average(ml_control['spend'], weights=1 / (1 - ml_control['pred_cv_rank']))
)

effect


Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0


In [4]:
# Drop the "Womens E-Mail" arm so only two segments remain to compare.
# .copy() makes male_df an independent frame, so adding a column below does
# not write into a filtered slice of email_data (chained assignment).
male_df = email_data[email_data["segment"] != "Womens E-Mail"].copy()

# Treatment indicator: 1 for customers who received the men's e-mail, else 0.
# Vectorized comparison instead of a per-row Python lambda.
male_df["treatment"] = (male_df["segment"] == 'Mens E-Mail').astype('int64')

male_df.head(3)

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0,0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,1
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0.0,1


In [6]:
# Split the sample by experimental arm.
treatment_data = male_df[male_df["treatment"] == 1]
control_data = male_df[male_df["treatment"] == 0]

# Deliberately introduce selection bias. A customer "matches the rule" when
# history > 300, or recency < 6, or the channel is 'Multichannel'.
# Fix: the last condition previously compared the numeric `recency` column to
# the string 'Multichannel', which can never be true; the value belongs to the
# `channel` column.

# Treated arm: randomly discard half of the customers that do NOT match the
# rule, so matching customers become over-represented among the treated.
treatment_biased = treatment_data.drop(treatment_data[~(
    (treatment_data['history'] > 300) |
    (treatment_data['recency'] < 6) |
    (treatment_data['channel'] == 'Multichannel')
)].sample(frac=0.5, random_state=1).index)

# Control arm: randomly discard half of the customers that DO match the rule,
# so matching customers become under-represented among the controls.
control_biased = control_data.drop(control_data[
    (control_data['history'] > 300) |
    (control_data['recency'] < 6) |
    (control_data['channel'] == 'Multichannel')
].sample(frac=0.5, random_state=1).index)

# Recombine both arms into the biased analysis sample.
biased_data = pd.concat([treatment_biased, control_biased])
biased_data.head(3)

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,1
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0.0,1
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,Mens E-Mail,1,0,0.0,1


In [7]:
import statsmodels.formula.api as smf

In [None]:
# NOTE(review): this binds the `smf.glm` constructor itself to `ps_model`;
# no model is built or fitted here. Looks like an unfinished cell — TODO
# confirm the intended formula/data and complete the call.
ps_model = smf.glm

In [None]:
import numpy as np
import pandas as pd

import rdata
from rdd import rdd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from causalimpact import CausalImpact
from statsmodels.stats.weightstats import ttest_ind
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

from propensity_score_matching import PropensityScoreMatching