<a href="https://colab.research.google.com/github/alfredqbit/datasciencecoursera/blob/master/sepulvedaADDS-8515-6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ANOVA and MANOVA on Digital Marketing Conversion Dataset

Dataset: `digital_marketing_campaign_dataset.csv`
Source: Kaggle / Opendatabay (Digital Marketing Conversion Dataset)

Goals:
 - One-way ANOVA: PreviousPurchases ~ CampaignChannel
 - Two-way ANOVA: ClickThroughRate ~ CampaignType * IncomeSegment
 - MANOVA: (ClickThroughRate, TimeOnSite) ~ CampaignChannel

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.multivariate.manova import MANOVA
from statsmodels.graphics.factorplots import interaction_plot

# Directory that receives every figure saved by this notebook. It is created
# up front so that the later savefig calls always have a target folder.
FIG_DIR = "figures"
os.makedirs(name=FIG_DIR, exist_ok=True)

# Use seaborn's white-grid style for all figures drawn below.
sns.set(style="whitegrid")

# Step 1: Load and Explore the Dataset

In [None]:
# Adjust the filename/path as needed
df = pd.read_csv("digital_marketing_campaign_dataset.csv")

print("Head:")
display(df.head())

print("\nInfo:")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line).
df.info()

print("\nSummary statistics (numeric variables):")
display(df.describe())

# Convert the key categorical columns to category dtype and report how many
# rows fall into each level. Columns that are absent from the file are
# skipped instead of raising.
cat_cols = ["CampaignChannel", "CampaignType", "Gender"]
for col in cat_cols:
    if col not in df.columns:
        continue
    df[col] = df[col].astype("category")
    print(f"\nValue counts for {col}:")
    print(df[col].value_counts())

# Step 2: Feature Engineering (Income Segments)

We create three income-based segments (Low, Medium, High)
via tertiles, to use as a factor in the two-way ANOVA.

In [None]:
if "Income" not in df.columns:
    raise ValueError("Expected 'Income' column not found in dataset.")

# Rows without Income cannot be binned. Take an explicit copy so the column
# assignment below writes to an independent frame rather than to the result
# of a filtering operation (which can trigger SettingWithCopyWarning).
df = df.dropna(subset=["Income"]).copy()

# Tertile split: q=3 asks for three quantile-based bins. With labels given,
# pd.qcut already returns a categorical column, so no extra dtype conversion
# is required afterwards.
df["IncomeSegment"] = pd.qcut(
    df["Income"],
    q=3,
    labels=["Low", "Medium", "High"],
)

print("\nIncomeSegment distribution:")
print(df["IncomeSegment"].value_counts())

# Step 3: One-way ANOVA (PreviousPurchases ~ CampaignChannel)

In [None]:
# Guard: the response and the factor must both exist before modelling.
if not {"PreviousPurchases", "CampaignChannel"}.issubset(df.columns):
    raise ValueError("Expected 'PreviousPurchases' or 'CampaignChannel' missing.")

# Keep only the rows in which both variables are observed.
df_1way = df.dropna(subset=["PreviousPurchases", "CampaignChannel"])

# Fit the single-factor linear model, then derive its Type II ANOVA table.
model_1way = ols(
    formula="PreviousPurchases ~ C(CampaignChannel)",
    data=df_1way,
).fit()
anova_1way = sm.stats.anova_lm(model_1way, typ=2)

print("One-way ANOVA: PreviousPurchases ~ CampaignChannel")
display(anova_1way)

# Fitted values and residuals are reused by the diagnostic cell that follows.
fitted_1way = model_1way.fittedvalues
resid_1way = model_1way.resid

Assumption Checks: One-way ANOVA

In [None]:
# Levene's test across channels.
# observed=True restricts the grouping to channel levels that actually occur
# in df_1way; for a categorical key this keeps unobserved levels from
# contributing an empty sample and makes the grouping behaviour explicit.
groups_pp = [
    g["PreviousPurchases"].values
    for _, g in df_1way.groupby("CampaignChannel", observed=True)
]
lev_stat, lev_p = stats.levene(*groups_pp)
print(f"Levene's test: stat={lev_stat:.3f}, p={lev_p:.3f}")

# Shapiro-Wilk for residuals
# NOTE(review): SciPy documents that the Shapiro-Wilk p-value may be
# inaccurate for large samples (N > 5000) - check the sample size and rely
# on the Q-Q plot below if that applies.
sh_stat, sh_p = stats.shapiro(resid_1way)
print(f"Shapiro-Wilk (residuals): stat={sh_stat:.3f}, p={sh_p:.3f}")

# Q-Q plot
# fit=True standardises the residuals (fitted location/scale) before they are
# compared with standard-normal quantiles. Without it the 45-degree reference
# line is only meaningful for data that already has zero mean and unit
# variance, which raw residuals generally do not.
sm.qqplot(resid_1way, line="45", fit=True)
plt.title("One-way ANOVA residuals Q-Q plot")
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "oneway_resid_qq.png"), dpi=300)
plt.show()

# Residual vs fitted: a horizontal band around zero supports homoscedasticity.
plt.figure()
plt.scatter(fitted_1way, resid_1way, alpha=0.6)
plt.axhline(0, color="gray", linewidth=0.8)
plt.xlabel("Fitted values")
plt.ylabel("Residuals")
plt.title("One-way ANOVA: Residuals vs Fitted")
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "oneway_resid_vs_fitted.png"), dpi=300)
plt.show()

Boxplot: PreviousPurchases by CampaignChannel

In [None]:
# One box per campaign channel showing the spread of prior purchase counts.
fig_box, ax_box = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df_1way, x="CampaignChannel", y="PreviousPurchases", ax=ax_box)
ax_box.set_title("Previous Purchases by Campaign Channel")
fig_box.tight_layout()

# Persist the figure alongside the other outputs before showing it inline.
boxplot_path = os.path.join(FIG_DIR, "oneway_prev_purchases_channel_boxplot.png")
fig_box.savefig(boxplot_path, dpi=300)
plt.show()

# Step 4: Two-way ANOVA (ClickThroughRate ~ CampaignType * IncomeSegment)

In [None]:
required_cols = ["ClickThroughRate", "CampaignType", "IncomeSegment"]
missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns for two-way ANOVA: {missing}")

# Listwise deletion on exactly the columns the model uses (single source of
# truth: required_cols).
df_2way = df.dropna(subset=required_cols)

# Full factorial model: both main effects plus their interaction.
model_2way = ols("ClickThroughRate ~ C(CampaignType) * C(IncomeSegment)",
                 data=df_2way).fit()
anova_2way = sm.stats.anova_lm(model_2way, typ=2)
print("Two-way ANOVA: ClickThroughRate ~ CampaignType * IncomeSegment")
display(anova_2way)

resid_2way = model_2way.resid

# Shapiro-Wilk on residuals
sh2_stat, sh2_p = stats.shapiro(resid_2way)
print(f"Shapiro-Wilk (two-way residuals): stat={sh2_stat:.3f}, p={sh2_p:.3f}")

# Levene's test across CampaignType x IncomeSegment cells.
# observed=True keeps only factor combinations that actually occur in the
# data; with categorical keys this prevents an empty cell from being handed
# to the test and makes the grouping behaviour explicit.
group_ctr = [
    g["ClickThroughRate"].values
    for _, g in df_2way.groupby(["CampaignType", "IncomeSegment"], observed=True)
]
lev2_stat, lev2_p = stats.levene(*group_ctr)
print(f"Levene's test (two-way): stat={lev2_stat:.3f}, p={lev2_p:.3f}")

Interaction Plot: CampaignType x IncomeSegment on CTR

In [None]:
# Create the axes explicitly and pass them to interaction_plot. When no `ax`
# is supplied the function opens a figure of its own, which would leave a
# separately created (8, 5) figure blank and ignore the requested size.
fig_int, ax_int = plt.subplots(figsize=(8, 5))

# interaction_plot looks at x[0] and recodes string x-levels itself, so the
# factors are passed with a fresh 0..n-1 index (rows may have been dropped
# above) and the x factor as plain Python strings.
# NOTE(review): statsmodels' recoding appears to accept only str/object
# dtype for x - confirm against the installed version.
x_factor = df_2way["CampaignType"].astype(object).reset_index(drop=True)
trace_factor = df_2way["IncomeSegment"].reset_index(drop=True)
ctr_response = df_2way["ClickThroughRate"].reset_index(drop=True)

# One marker per IncomeSegment level (Low, Medium, High).
interaction_plot(x_factor,
                 trace_factor,
                 ctr_response,
                 markers=["o", "s", "D"],
                 ms=6,
                 ax=ax_int)
ax_int.set_ylabel("Mean ClickThroughRate")
ax_int.set_title("Interaction: CampaignType x IncomeSegment on CTR")
fig_int.tight_layout()
fig_int.savefig(os.path.join(FIG_DIR, "twoway_interaction_ctr.png"), dpi=300)
plt.show()

# Step 5: MANOVA (ClickThroughRate, TimeOnSite ~ CampaignChannel)

In [None]:
# Both response variables and the grouping factor must be present.
manova_cols = ["ClickThroughRate", "TimeOnSite", "CampaignChannel"]
missing_m = [name for name in manova_cols if name not in df.columns]
if len(missing_m) > 0:
    raise ValueError(f"Missing required columns for MANOVA: {missing_m}")

# Drop every row that lacks any of the three columns used by the model.
df_manova = df.dropna(subset=manova_cols)

# Two responses on the left-hand side, channel as the single factor.
formula_manova = "ClickThroughRate + TimeOnSite ~ C(CampaignChannel)"
manova = MANOVA.from_formula(formula=formula_manova, data=df_manova)

print("MANOVA results (CTR, TimeOnSite ~ CampaignChannel):")
print(manova.mv_test())

MANOVA Assumption Checks

In [None]:
# 1. Univariate Shapiro-Wilk tests (approximate multivariate normality)
# Each dependent variable is tested on its own over the whole MANOVA sample.
for dv, dv_values in df_manova[["ClickThroughRate", "TimeOnSite"]].items():
    stat, p = stats.shapiro(dv_values)
    print(f"Shapiro-Wilk for {dv}: stat={stat:.3f}, p={p:.3f}")

# 2. Box's M test for equality of covariance matrices across channels
from numpy.linalg import det, inv
from scipy.stats import chi2

def box_m_test(df_y, groups):
    """
    Box's M test for equality of covariance matrices across groups.

    Parameters
    ----------
    df_y : DataFrame or 2-D array-like of dependent variables (numeric),
        one row per observation and one column per variable.
    groups : array-like group labels, one per row of ``df_y``.

    Returns
    -------
    M, chi2_approx, df, p_value
        Box's M statistic, its chi-square approximation ``M * (1 - C)``,
        the degrees of freedom ``(k - 1) * p * (p + 1) / 2`` and the
        upper-tail p-value of the approximation.

    Raises
    ------
    ValueError
        If the inputs have mismatching lengths, fewer than two groups are
        present, a group has too few observations for a non-singular
        covariance matrix, or a covariance matrix is singular.
    """
    y = np.asarray(df_y, dtype=float)
    if y.ndim != 2:
        raise ValueError("df_y must be two-dimensional (observations x variables).")

    # Normalise labels (list, Series, Categorical, ...) to a plain ndarray so
    # that the boolean masks below are positional.
    labels = np.asarray(groups)
    if labels.shape[0] != y.shape[0]:
        raise ValueError("groups must have one label per row of df_y.")

    g_levels = np.unique(labels)
    k = len(g_levels)
    N, p = y.shape
    if k < 2:
        raise ValueError("Box's M test needs at least two groups.")

    ns = np.empty(k)
    log_dets = np.empty(k)
    Sp = np.zeros((p, p))  # accumulates the pooled covariance numerator
    for i, g in enumerate(g_levels):
        Yg = y[labels == g, :]
        n_g = Yg.shape[0]
        if n_g <= p:
            # With n_g <= p observations the sample covariance is singular.
            raise ValueError(
                f"Group {g!r} has {n_g} observations; more than {p} are required."
            )
        S_g = np.atleast_2d(np.cov(Yg, rowvar=False))

        # slogdet returns log|S| directly, avoiding the under/overflow that
        # log(det(S)) suffers when the determinant is extremely small or large.
        sign, log_det = np.linalg.slogdet(S_g)
        if sign <= 0:
            raise ValueError(f"Covariance matrix of group {g!r} is singular.")

        ns[i] = n_g
        log_dets[i] = log_det
        Sp += (n_g - 1) * S_g

    # pooled covariance
    Sp /= (N - k)
    sign_p, log_det_p = np.linalg.slogdet(Sp)
    if sign_p <= 0:
        raise ValueError("Pooled covariance matrix is singular.")

    M = (N - k) * log_det_p - np.sum((ns - 1) * log_dets)

    # correction factor
    C = ((2 * p**2 + 3 * p - 1) / (6 * (p + 1) * (k - 1))) * (
        np.sum(1.0 / (ns - 1)) - 1.0 / (N - k)
    )

    chi2_approx = M * (1 - C)
    df_val = (k - 1) * p * (p + 1) / 2
    # Survival function keeps precision for very small p-values (1 - cdf
    # rounds them to exactly 0).
    p_value = chi2.sf(chi2_approx, df_val)

    return M, chi2_approx, df_val, p_value

# Response matrix and the matching channel label for every MANOVA row.
Y_man = df_manova.loc[:, ["ClickThroughRate", "TimeOnSite"]]
groups = df_manova["CampaignChannel"].values

# Run Box's M and unpack (statistic, chi-square approximation, df, p-value).
box_result = box_m_test(Y_man, groups)
M, chi2_val, df_box, p_box = box_result
print(f"Box's M test: M={M:.3f}, chi2={chi2_val:.3f}, df={df_box:.0f}, p={p_box:.3f}")


Follow-up Univariate ANOVAs (CTR and TimeOnSite ~ CampaignChannel)

In [None]:
# Follow up the MANOVA with a separate one-way ANOVA on channel for each
# response variable.
for dv in ("ClickThroughRate", "TimeOnSite"):
    print(f"\nUnivariate ANOVA: {dv} ~ CampaignChannel")
    formula_dv = f"{dv} ~ C(CampaignChannel)"
    model_dv = ols(formula_dv, data=df_manova).fit()
    anova_dv = sm.stats.anova_lm(model_dv, typ=2)
    display(anova_dv)

# Step 6: Canonical Discriminant Analysis & Plot

In [None]:
Y = Y_man.values
overall_mean = Y.mean(axis=0)
channels = np.unique(groups)

# Size the SSCP matrices from the data instead of hard-coding two responses.
n_dv = Y.shape[1]
W = np.zeros((n_dv, n_dv))
B = np.zeros((n_dv, n_dv))

for ch in channels:
    Yg = Y[groups == ch, :]
    n_g = Yg.shape[0]
    mean_g = Yg.mean(axis=0)
    # within-group SSCP
    W += (Yg - mean_g).T @ (Yg - mean_g)
    # between-group SSCP
    diff = (mean_g - overall_mean).reshape(-1, 1)
    B += n_g * (diff @ diff.T)

# The inverse is taken through the np namespace so this cell does not depend
# on the bare `inv` name imported inside an earlier cell.
eigvals, eigvecs = np.linalg.eig(np.linalg.inv(W) @ B)

# W^-1 B is not symmetric, so eig() may return a complex dtype (imaginary
# parts from rounding). Keep the real parts so sorting and plotting operate
# on real numbers.
eigvals = eigvals.real
eigvecs = eigvecs.real

# sort eigenvalues descending
idx = np.argsort(eigvals)[::-1]
eigvals = eigvals[idx]
eigvecs = eigvecs[:, idx]

canonical_coeffs = eigvecs  # columns are canonical directions

# canonical scores: centred responses projected onto the canonical directions
Y_centered = Y - overall_mean
canonical_scores = Y_centered @ canonical_coeffs

# Only the first two canonical variates are plotted.
can_df = pd.DataFrame(canonical_scores[:, :2], columns=["Can1", "Can2"])
can_df["CampaignChannel"] = groups

plt.figure(figsize=(7, 5))
sns.scatterplot(x="Can1", y="Can2",
                hue="CampaignChannel", data=can_df, s=60, alpha=0.8)
plt.axhline(0, color="gray", linewidth=0.8)
plt.axvline(0, color="gray", linewidth=0.8)
plt.title("Canonical Score Plot: CTR & TimeOnSite by CampaignChannel")
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "manova_canonical_scores.png"), dpi=300)
plt.show()