In [12]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Double Machine Learning Implementation for Bank Marketing Dataset
# Final complete script (Google Colab friendly)
# This includes: preprocessing, DML cross-fitting, ATE, CATE, and saving outputs.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from scipy.stats import norm
import os

# ----------------------------------------------------
# 0. Create output folder
# ----------------------------------------------------
os.makedirs("plots", exist_ok=True)

# ----------------------------------------------------
# 1. Load Dataset
# ----------------------------------------------------
df = pd.read_csv("/content/drive/MyDrive/datasets/bank.csv")

# Convert outcome to binary
df["deposit"] = df["deposit"].map({"yes": 1, "no": 0})

# Convert treatment to binary
df["campaign"] = (df["campaign"] > 1).astype(int)

Y = df["deposit"]
T = df["campaign"]

# Prepare covariates
X = df.drop(columns=["deposit", "campaign"])
X = pd.get_dummies(X, drop_first=True)

# ----------------------------------------------------
# 2. Save outcome distribution plot
# ----------------------------------------------------
plt.figure(figsize=(6,4))
plt.hist(Y)
plt.title("Outcome Distribution (deposit)")
plt.savefig("plots/outcome_distribution.png")
plt.close()

# ----------------------------------------------------
# 3. Cross-Fitting Function
# ----------------------------------------------------
def cross_fit(X, Y, T, folds=5):
    n = len(Y)
    m_hat = np.zeros(n)
    g_hat = np.zeros(n)

    kf = KFold(n_splits=folds, shuffle=True, random_state=7)

    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]
        T_train, T_test = T.iloc[train_idx], T.iloc[test_idx]

        # Outcome model: Lasso
        m_model = make_pipeline(SimpleImputer(), StandardScaler(), LassoCV(cv=3))
        m_model.fit(X_train, Y_train)
        m_hat[test_idx] = m_model.predict(X_test)

        # Propensity model: Random Forest
        g_model = make_pipeline(SimpleImputer(), RandomForestClassifier(n_estimators=200))
        g_model.fit(X_train, T_train)
        g_hat[test_idx] = g_model.predict_proba(X_test)[:, 1]

    # clip extreme values
    g_hat = np.clip(g_hat, 0.01, 0.99)

    return m_hat, g_hat

# ----------------------------------------------------
# 4. Run Cross-Fitting
# ----------------------------------------------------
m_hat, g_hat = cross_fit(X, Y, T)

# ----------------------------------------------------
# 5. ATE Estimation
# ----------------------------------------------------
res_y = Y - m_hat
res_t = T - g_hat

theta = np.sum(res_y * res_t) / np.sum(res_t**2)

psi = (res_y * res_t) - theta * (res_t**2)
se = np.sqrt(np.var(psi) / (len(Y) * (np.mean(res_t**2)**2)))

ci_low = theta - 1.96 * se
ci_high = theta + 1.96 * se
z_value = theta / se
p_value = 2 * (1 - norm.cdf(abs(z_value)))

print("\n===== ATE RESULTS =====")
print("ATE:", theta)
print("Std Error:", se)
print("95% CI:", (ci_low, ci_high))
print("p-value:", p_value)

# ----------------------------------------------------
# 6. CATE Estimation
# ----------------------------------------------------
pseudo = (Y - m_hat) / (T - g_hat)
pseudo = np.where(np.isfinite(pseudo), pseudo, 0)

cate_model = RandomForestRegressor(n_estimators=300)
cate_model.fit(X, pseudo)
cate_vals = cate_model.predict(X)

df["CATE"] = cate_vals
df.to_csv("plots/dml_results.csv", index=False)

# Save CATE plot
plt.figure(figsize=(6,4))
plt.hist(cate_vals, bins=30)
plt.title("CATE Distribution")
plt.savefig("plots/cate_distribution.png")
plt.close()

print("\nFiles saved in /plots/ folder:")
print(" - outcome_distribution.png")
print(" - cate_distribution.png")
print(" - dml_results.csv")



===== ATE RESULTS =====
ATE: -0.03208280726765286
Std Error: 0.007700853387709566
95% CI: (np.float64(-0.047176479907563615), np.float64(-0.01698913462774211))
p-value: 3.0980508896938375e-05

Files saved in /plots/ folder:
 - outcome_distribution.png
 - cate_distribution.png
 - dml_results.csv
