In [1]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd

In [None]:
# Download the e-mail campaign CSV directly from its public URL and preview
# the first three rows.
email_data = pd.read_csv(
    filepath_or_buffer="http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv"
)
email_data.head(n=3)

In [None]:
# Drop the "Womens E-Mail" segment and take an explicit copy, so that adding a
# column below writes to an independent frame rather than to a slice of
# `email_data` (the pattern that triggers pandas' chained-assignment warning).
male_df = email_data[email_data["segment"] != "Womens E-Mail"].copy()

# Binary treatment indicator: 1 for rows in the "Mens E-Mail" segment, 0 for
# every remaining row. Vectorised comparison instead of a per-row lambda;
# cast to int64 to match the dtype the original mapping produced.
male_df["treatment"] = (male_df["segment"] == "Mens E-Mail").astype("int64")
male_df.head(3)

In [None]:
# Split the sample by treatment status.
treatment_data = male_df[male_df["treatment"] == 1]
control_data = male_df[male_df["treatment"] == 0]


def _selection_mask(df):
    """Return a boolean Series flagging the rows used to bias the sample.

    A row is flagged when any of the following holds:
      * ``history`` is greater than 300,
      * ``recency`` is less than 6,
      * ``channel`` equals ``"Multichannel"``.

    NOTE: the third condition previously compared ``recency`` (a numeric
    column) with the string ``"Multichannel"``, which is always False, so the
    channel criterion was silently ignored. ``channel`` is the column that
    carries that label.
    # presumably these rows are customers more likely to purchase — confirm
    # against the dataset description.
    """
    return (
        (df["history"] > 300)
        | (df["recency"] < 6)
        | (df["channel"] == "Multichannel")
    )


# Treated group: randomly discard half of the rows that are NOT flagged.
treatment_biased = treatment_data.drop(
    treatment_data[~_selection_mask(treatment_data)]
    .sample(frac=0.5, random_state=1)
    .index
)

# Control group: randomly discard half of the rows that ARE flagged.
control_biased = control_data.drop(
    control_data[_selection_mask(control_data)]
    .sample(frac=0.5, random_state=1)
    .index
)

# Stack both groups back into a single, selection-biased sample.
biased_data = pd.concat([treatment_biased, control_biased], axis=0)
biased_data.head(3)

In [None]:
import statsmodels.formula.api as smf

In [None]:
# OLS of spend on the treatment flag and the `history` covariate, fitted on
# the selection-biased sample.
biased_reg = smf.ols("spend ~ treatment + history", data=biased_data).fit()

In [None]:
# Display only the second table of the fitted model's summary (index 1), which
# in statsmodels' OLS summary holds the per-term coefficient estimates.
biased_reg.summary().tables[1]

In [None]:
# Reference regression: spend on treatment alone, fitted on the full
# `male_df` (before any biased sub-sampling).
rct_reg = smf.ols("spend ~ treatment", data=male_df).fit()

# Show the second summary table (index 1) only.
rct_reg.summary().tables[1]

In [None]:
# Same single-regressor specification, but fitted on the selection-biased
# sample so it can be compared with the `male_df` fit.
nonrct_reg = smf.ols("spend ~ treatment", data=biased_data).fit()

# Show the second summary table (index 1) only.
nonrct_reg.summary().tables[1]

In [None]:
# Multiple regression on the biased sample: treatment plus the covariates
# history, channel and recency.
nonrct_mreg = smf.ols(
    "spend ~ treatment + history + channel + recency",
    data=biased_data,
).fit()

# Show the second summary table (index 1) only.
nonrct_mreg.summary().tables[1]

In [None]:
# Three unfitted OLS specifications on the biased sample:
#   model_1 - spend on treatment, recency and channel (history left out)
#   model_2 - model_1 plus history
#   model_3 - history itself regressed on treatment, recency and channel
models = dict(
    model_1=smf.ols("spend ~ treatment + recency + channel", data=biased_data),
    model_2=smf.ols("spend ~ treatment + recency + channel + history", data=biased_data),
    model_3=smf.ols("history ~ treatment + recency + channel", data=biased_data),
)

# Fit every specification and store the fitted result under the same key.
results = {}
for name, model in models.items():
    results[name] = model.fit()

In [None]:
# Omitted-variable-bias check: the product of (history's coefficient in the
# model that includes it) and (treatment's coefficient in the regression of
# history on the other covariates) is printed next to the difference between
# the treatment coefficients of the model without and with history.
history_coef = results["model_2"].params["history"]
treatment_on_history = results["model_3"].params["treatment"]
OVB = history_coef * treatment_on_history

treatment_without_history = results["model_1"].params["treatment"]
treatment_with_history = results["model_2"].params["treatment"]
coef_gap = treatment_without_history - treatment_with_history

print(f"OVB={OVB:.4f}, coef_gap={coef_gap:.4f}")

In [None]:
# Regress the treatment flag on visit (plus channel, recency and history) on
# the biased sample, to see how visit relates to treatment.
corr_visit_treatment = smf.ols(
    "treatment ~ visit + channel + recency + history",
    data=biased_data,
).fit()

# Show the second summary table (index 1) only.
corr_visit_treatment.summary().tables[1]

In [None]:
# Spend regression on the biased sample that additionally includes visit as
# a regressor alongside treatment, channel, recency and history.
bad_control_reg = smf.ols(
    "spend ~ treatment + visit + channel + recency + history",
    data=biased_data,
).fit()

# Show the second summary table (index 1) only.
bad_control_reg.summary().tables[1]