In [2]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import pandas as pd

email_data = pd.read_csv("http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv")
email_data.head()

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0


In [9]:
email_data.shape

(64000, 12)

In [8]:
mail_df = email_data[email_data["segment"] != "Womens E-Mail"]
mail_df.shape

(42613, 12)

In [10]:
mail_df["treatment"] = mail_df["segment"].map(lambda x: 1 if x == "Mens E-Mail" else 0)
mail_df.head()

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0,0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,1
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0.0,1
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,Mens E-Mail,1,0,0.0,1
14,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,No E-Mail,0,0,0.0,0


In [14]:
mail_df.isnull().sum()

recency            0
history_segment    0
history            0
mens               0
womens             0
zip_code           0
newbie             0
channel            0
segment            0
visit              0
conversion         0
spend              0
treatment          0
dtype: int64

In [16]:
import numpy as np

summary_by_segment = pd.pivot_table(
    data=mail_df,
    values=["conversion", "spend", "visit"],
    index=["treatment"],
    aggfunc={"conversion": np.mean, "spend": np.mean, "visit": np.ma.count}
)

summary_by_segment.columns = ["conversion_rate", "spend_mean", "count"]
summary_by_segment

Unnamed: 0_level_0,conversion_rate,spend_mean,count
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.005726,0.652789,21306
1,0.012531,1.422617,21307


In [24]:
from statsmodels.stats.weightstats import ttest_ind

rct_ttest = ttest_ind(
    mail_df[mail_df["treatment"] == 1]["spend"],
    mail_df[mail_df["treatment"] == 0]["spend"],
    usevar="pooled",
)

for key, val in zip(["t", "p-value", "df"], rct_ttest):
    print(f"{key} = {val}")

t = 5.300090294465469
p-value = 1.1632008726058926e-07
df = 42611.0


In [37]:
treatment_data = mail_df[mail_df["treatment"] == 1]
control_data = mail_df[mail_df["treatment"] == 0]

treatment_biased = treatment_data.drop(treatment_data[~(
    (treatment_data['history'] > 300) |
    (treatment_data['recency'] < 6) |
    (treatment_data['recency'] == 'Multichannel')
)].sample(frac=0.5, random_state=1).index)

control_biased = control_data.drop(control_data[
    (control_data['history'] > 300) |
    (control_data['recency'] < 6) |
    (control_data['recency'] == 'Multichannel')
].sample(frac=0.5, random_state=1).index)

biased_data = pd.concat([treatment_biased, control_biased], axis=0)
biased_data.head()

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,1
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0.0,1
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,Mens E-Mail,1,0,0.0,1
16,5,1) $0 - $100,29.99,1,0,Surburban,0,Phone,Mens E-Mail,0,0,0.0,1
17,9,2) $100 - $200,112.35,1,0,Rural,0,Web,Mens E-Mail,0,0,0.0,1


In [39]:
summary_by_segment_biased = pd.pivot_table(
    data=biased_data,
    values=["conversion", "spend", "visit"],
    index=["treatment"],
    aggfunc={"conversion": np.mean, "spend": np.mean, "visit": np.ma.count},
)

summary_by_segment_biased.columns = ["conversion_rate", "spend_mean", "count"]
summary_by_segment_biased

Unnamed: 0_level_0,conversion_rate,spend_mean,count
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.005165,0.634625,14907
1,0.013575,1.560682,17017


In [40]:
rct_ttest_biased = ttest_ind(
    biased_data[biased_data["treatment"] == 1]["spend"],
    biased_data[biased_data["treatment"] == 0]["spend"],
    usevar="pooled"
)

for key, val in zip(["t", "p-value", "df"], rct_ttest_biased):
    print(f"{key} = {val}")

t = 5.173854873955885
p-value = 2.30698009745056e-07
df = 31922.0
