# In [None]:
# src/hypothesis_testing.py
"""
Task 3 - A/B Hypothesis Testing
Saves results to results/hypothesis_results.csv and plots to results/*.png

Tests included:
- Chi-square: Claim Frequency vs Province
- Chi-square: Claim Frequency vs Top N PostalCode (Zip)
- ANOVA (or Kruskal-Wallis): Margin across top N PostalCode
- Chi-square: Claim Frequency vs Gender
- t-test (Welch): Margin by Gender
"""

import os
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, f_oneway, kruskal, ttest_ind
import statsmodels.stats.multicomp as mc
import matplotlib.pyplot as plt
import seaborn as sns

# -----------------------
# Config
# -----------------------
# Significance level applied by every hypothesis test in this script.
ALPHA = 0.05
# Zipcode-based tests only look at the N postal codes with the most records.
TOP_N_ZIPCODES = 10
# Folder that receives the CSV summaries and the PNG plots.
RESULTS_DIR = "results"
# Input files in priority order: the first path that exists is loaded.
INPUT_CANDIDATES = [
    "data/processed/cleaned_data_sample.csv",
    "data/processed/cleaned_data.csv",
]

# Create the output folder up front so later saves do not hit a missing directory.
os.makedirs(RESULTS_DIR, exist_ok=True)

# -----------------------
# Load data (sample first, fallback to full)
# -----------------------
df = None
for p in INPUT_CANDIDATES:
    if os.path.exists(p):
        print(f"Loading {p}")
        # The file may be pipe- or comma-separated. pd.read_csv does NOT raise
        # when given a pipe-separated file with the default separator (it just
        # returns a single wide column), so a try/except fallback never fires.
        # Decide from the header line instead.
        with open(p, encoding="utf-8", errors="replace") as fh:
            header_line = fh.readline()
        sep = "|" if header_line.count("|") > header_line.count(",") else ","
        df = pd.read_csv(p, sep=sep)
        break

if df is None:
    raise FileNotFoundError("No processed data found. Run preprocess.py first.")

# Clean column names
df.columns = df.columns.str.strip()

# Every metric below is derived from these two columns; fail early with a
# message that names what is missing instead of a bare KeyError further down.
required_cols = ("TotalPremium", "TotalClaims")
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise KeyError(f"Required column(s) missing from input data: {missing_cols}")

# Ensure numeric columns (unparseable values become NaN)
for col in required_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# -----------------------
# Create metrics
# -----------------------
# 1 when the row has a positive claim amount, else 0 (NaN claims count as 0).
df["ClaimFrequency"] = (df["TotalClaims"] > 0).astype(int)
# Premium minus claims for the row.
df["Margin"] = df["TotalPremium"] - df["TotalClaims"]
# Claim amount for rows with a positive claim, NaN otherwise.
df["Severity"] = df["TotalClaims"].where(df["TotalClaims"] > 0, np.nan)
# small check
n_rows = len(df)
print(f"Rows loaded: {n_rows}")

# Helper to record results: one dict per executed (or skipped) test
results = []

# -----------------------
# Utility functions
# -----------------------
def chi2_test_category_vs_freq(data, category_col, min_count=5, alpha=None):
    """
    Chi-square test of independence between category_col and ClaimFrequency.

    Builds a contingency table of ``data[category_col]`` (missing values are
    grouped under the label "MISSING") against ``data["ClaimFrequency"]`` and
    passes it to ``scipy.stats.chi2_contingency``.

    Parameters
    ----------
    data : DataFrame containing ``category_col`` and ``ClaimFrequency``.
    category_col : name of the categorical column to test.
    min_count : expected-frequency threshold used for the
        ``low_expected_cells`` diagnostic. No rows are removed.
    alpha : significance level; when None the module-level ALPHA is used.

    Returns
    -------
    dict with keys test, feature, stat, p_value, conclusion, size_rows,
    size_cols and low_expected_cells (number of cells whose expected
    frequency is below ``min_count``; a non-zero value means the chi-square
    approximation may be unreliable). If there is nothing to tabulate, a dict
    with test, feature and message is returned instead.
    """
    if alpha is None:
        alpha = ALPHA
    no_data = {"test": "chi2", "feature": category_col, "message": "No data for chi-square test"}
    if data.empty:
        return no_data
    table = pd.crosstab(data[category_col].fillna("MISSING"), data["ClaimFrequency"])
    if table.size == 0:
        # Nothing left to tabulate (e.g. every ClaimFrequency value is missing).
        return no_data
    stat, p, _dof, expected = chi2_contingency(table)
    conclusion = "Reject H0" if p < alpha else "Fail to reject H0"
    return dict(
        test="chi2",
        feature=category_col,
        stat=float(stat),
        p_value=float(p),
        conclusion=conclusion,
        size_rows=table.shape[0],
        size_cols=table.shape[1],
        low_expected_cells=int((expected < min_count).sum()),
    )

def anova_margin_across_groups(data, group_col, top_n=None, alpha=None):
    """
    One-way ANOVA on Margin across the groups of group_col.

    Rows with a missing group or Margin are dropped. If top_n is given, only
    the top_n most frequent groups (by remaining row count) are compared.

    Kruskal-Wallis is used only as a fallback when ``f_oneway`` itself raises;
    no normality or variance check is performed here. When the ANOVA is
    significant, Tukey HSD pairwise comparisons are attached.

    Parameters
    ----------
    data : DataFrame containing ``group_col`` and ``Margin``.
    group_col : name of the grouping column.
    top_n : optional number of most frequent groups to keep.
    alpha : significance level; when None the module-level ALPHA is used.

    Returns
    -------
    dict with keys test ("anova" or "kruskal"), feature, stat, p_value,
    conclusion and n_groups. A significant ANOVA also carries
    "tukey_summary": a DataFrame of pairwise results, or a string describing
    why Tukey HSD failed. With fewer than two groups a dict with test,
    feature and message is returned.
    """
    if alpha is None:
        alpha = ALPHA

    def _verdict(p_value):
        # NaN compares False against alpha, which would silently read as
        # "Fail to reject H0" even though the test was undefined.
        if np.isnan(p_value):
            return "Inconclusive (p-value undefined)"
        return "Reject H0" if p_value < alpha else "Fail to reject H0"

    grp = data[[group_col, "Margin"]].dropna()
    if top_n:
        top_groups = grp[group_col].value_counts().head(top_n).index
        grp = grp[grp[group_col].isin(top_groups)]
    groups = [g["Margin"].to_numpy() for _, g in grp.groupby(group_col)]
    # Need at least 2 groups with data
    if len(groups) < 2:
        return {"test":"anova","feature":group_col,"message":"Not enough groups"}
    # ANOVA
    try:
        f_stat, p = f_oneway(*groups)
    except Exception:
        # Deliberate best-effort: if ANOVA cannot be computed, fall back to the
        # rank-based Kruskal-Wallis test rather than aborting the whole run.
        kw_stat, kw_p = kruskal(*groups)
        return {"test":"kruskal","feature":group_col,"stat":float(kw_stat),"p_value":float(kw_p),"conclusion":_verdict(kw_p),"n_groups":len(groups)}
    res = {"test":"anova","feature":group_col,"stat":float(f_stat),"p_value":float(p),"conclusion":_verdict(p),"n_groups":len(groups)}
    # If significant, run Tukey HSD (pairwise)
    if p < alpha:
        try:
            tukey = mc.pairwise_tukeyhsd(endog=grp["Margin"], groups=grp[group_col], alpha=alpha)
            # summary() is the public accessor for the results table; its
            # first row holds the column headers.
            header, *rows = tukey.summary().data
            res["tukey_summary"] = pd.DataFrame(data=rows, columns=header)
        except Exception as e:
            res["tukey_summary"] = f"Failed to run Tukey: {e}"
    return res

def ttest_margin_between_two_groups(data, group_col, groupA, groupB, metric="Margin", alpha=None):
    """
    Welch's t-test (unequal variances) on metric between two groups.

    Parameters
    ----------
    data : DataFrame containing ``group_col`` and ``metric``.
    group_col : column whose values identify the groups.
    groupA, groupB : the two values of ``group_col`` to compare.
    metric : numeric column to compare (default "Margin").
    alpha : significance level; when None the module-level ALPHA is used.

    Returns
    -------
    dict with keys test, feature, stat, p_value, conclusion, n_a and n_b.
    If either group has fewer than 5 non-missing values, a dict with test,
    feature and message is returned instead.
    """
    if alpha is None:
        alpha = ALPHA
    feature = f"{group_col}:{groupA} vs {groupB}"
    a = data[data[group_col]==groupA][metric].dropna()
    b = data[data[group_col]==groupB][metric].dropna()
    if len(a) < 5 or len(b) < 5:
        return {"test":"ttest","feature":feature,"message":"Insufficient samples"}
    # Missing values were dropped above, so no nan_policy is needed.
    stat, p = ttest_ind(a, b, equal_var=False)
    if np.isnan(p):
        # NaN compares False against alpha; do not report it as a
        # "Fail to reject H0" result.
        conclusion = "Inconclusive (p-value undefined)"
    else:
        conclusion = "Reject H0" if p < alpha else "Fail to reject H0"
    return {"test":"ttest","feature":feature,"stat":float(stat),"p_value":float(p),"conclusion":conclusion,"n_a":len(a),"n_b":len(b)}

# -----------------------
# TEST A: H0: There are no risk differences across provinces
# Use chi-square on ClaimFrequency vs Province
# -----------------------
if "Province" in df.columns:
    print("\nRunning chi-square: ClaimFrequency vs Province")
    a_res = chi2_test_category_vs_freq(df, "Province")
    results.append(a_res)
    # Plot the aggregate loss ratio (sum of claims / sum of premium) per province.
    # Vectorised sums replace a per-group Python lambda in groupby.apply, which
    # is slower and relies on grouping-column behaviour deprecated in pandas 2.2+.
    prov_totals = df.groupby("Province")[["TotalClaims", "TotalPremium"]].sum()
    # A non-positive premium total has no meaningful ratio, so it becomes NaN.
    prov_premium = prov_totals["TotalPremium"].where(prov_totals["TotalPremium"] > 0)
    loss_by_prov = (prov_totals["TotalClaims"] / prov_premium).sort_values(ascending=False)
    loss_by_prov.plot(kind="bar", figsize=(12,5))
    plt.ylabel("Loss Ratio (TotalClaims/TotalPremium)")
    plt.title("Loss Ratio by Province")
    plt.tight_layout()
    plt.savefig(os.path.join(RESULTS_DIR,"lossratio_by_province.png"))
    plt.close()
else:
    print("Province column not found; skipping province test")

# -----------------------
# TEST B: H0: There are no risk differences between zip codes
# For practicality we will analyze top N zipcodes by record count
# Use chi-square on freq vs postalcode (top N)
# -----------------------
if "PostalCode" in df.columns:
    print(f"\nRunning chi-square: ClaimFrequency vs top {TOP_N_ZIPCODES} PostalCode")
    # Restrict to the most frequent postal codes so the contingency table stays manageable.
    top_zips = df["PostalCode"].value_counts().head(TOP_N_ZIPCODES).index.tolist()
    df_topzip = df[df["PostalCode"].isin(top_zips)].copy()
    if df_topzip.shape[0] < 50:
        results.append({"test":"chi2","feature":"PostalCode_topN","message":"Too few rows for top zip test"})
    else:
        b_res = chi2_test_category_vs_freq(df_topzip, "PostalCode")
        results.append(b_res)
        # Plot the aggregate loss ratio (sum of claims / sum of premium) per postal code.
        # Vectorised sums replace a per-group Python lambda in groupby.apply, which
        # is slower and relies on grouping-column behaviour deprecated in pandas 2.2+.
        zip_totals = df_topzip.groupby("PostalCode")[["TotalClaims", "TotalPremium"]].sum()
        # A non-positive premium total has no meaningful ratio, so it becomes NaN.
        zip_premium = zip_totals["TotalPremium"].where(zip_totals["TotalPremium"] > 0)
        loss_by_zip = (zip_totals["TotalClaims"] / zip_premium).sort_values(ascending=False)
        loss_by_zip.plot(kind="bar", figsize=(12,5))
        plt.ylabel("Loss Ratio (TotalClaims/TotalPremium)")
        plt.title(f"Loss Ratio by Top {TOP_N_ZIPCODES} Postal Codes")
        plt.tight_layout()
        plt.savefig(os.path.join(RESULTS_DIR,"lossratio_top_zipcodes.png"))
        plt.close()
else:
    print("PostalCode column not found; skipping zipcode test")

# -----------------------
# TEST C: H0: There is no significant margin (profit) difference between zip codes
# We'll run ANOVA on Margin across top N zipcodes
# -----------------------
if "PostalCode" not in df.columns:
    print("PostalCode column not found; skipping margin-by-zipcode ANOVA")
else:
    print(f"\nRunning ANOVA on Margin across top {TOP_N_ZIPCODES} PostalCodes")
    anova_res = anova_margin_across_groups(df, "PostalCode", top_n=TOP_N_ZIPCODES)
    # Persist the post-hoc table only when the ANOVA step produced one as a DataFrame.
    posthoc = anova_res.get("tukey_summary") if isinstance(anova_res, dict) else None
    if isinstance(posthoc, pd.DataFrame):
        tukey_path = os.path.join(RESULTS_DIR, "tukey_margin_posthoc_top_zipcodes.csv")
        posthoc.to_csv(tukey_path, index=False)
    results.append(anova_res)

# -----------------------
# TEST D: H0: There is no significant risk difference between Women and Men
# Use chi-square for ClaimFrequency vs Gender; t-test for Margin by Gender
# -----------------------
if "Gender" in df.columns:
    print("\nRunning chi-square: ClaimFrequency vs Gender")
    chi_gender = chi2_test_category_vs_freq(df, "Gender")
    results.append(chi_gender)

    # t-test on Margin (Welch)
    genders = df["Gender"].dropna().unique()
    if len(genders) >= 2:
        gvals = [g for g in genders if str(g).strip() != ""]
        if len(gvals) >= 2:
            # The hypothesis is Women vs Men, so pick those two labels
            # explicitly. Taking the first two values in order of appearance
            # could compare against a third category (e.g. an "unspecified"
            # label) instead.
            by_label = {str(g).strip().lower(): g for g in gvals}
            male = next((by_label[k] for k in ("m", "male", "man", "men") if k in by_label), None)
            female = next((by_label[k] for k in ("f", "female", "woman", "women") if k in by_label), None)
            if male is not None and female is not None:
                gA, gB = male, female
            else:
                # Labels not recognised: fall back to the first two categories.
                gA, gB = gvals[0], gvals[1]
            print(f"\nRunning Welch t-test for Margin between {gA} and {gB}")
            t_res = ttest_margin_between_two_groups(df, "Gender", gA, gB, metric="Margin")
            results.append(t_res)
        else:
            results.append({"test":"ttest","feature":"Gender_margin","message":"Not enough gender categories for t-test"})
    else:
        results.append({"test":"ttest","feature":"Gender_margin","message":"Not enough gender data for t-test"})
else:
    print("Gender column not found; skipping gender tests")

# -----------------------
# Save results summary to CSV
# -----------------------
def normalise_result(r):
    """
    Return a shallow copy of result dict r that is safe to put in the summary CSV.

    When "tukey_summary" holds a DataFrame it is removed and replaced by
    "tukey_summary_file", the path of the post-hoc CSV under RESULTS_DIR.
    Any other value is left untouched. The input dict is not modified.
    """
    flat = dict(r)
    if isinstance(flat.get("tukey_summary"), pd.DataFrame):
        flat.pop("tukey_summary")
        flat["tukey_summary_file"] = os.path.join(RESULTS_DIR, "tukey_margin_posthoc_top_zipcodes.csv")
    return flat

# Flatten every recorded result into a CSV-friendly dict; anything that is not
# a dict is stored as its string form under "result".
results_norm = [
    normalise_result(entry) if isinstance(entry, dict) else {"result": str(entry)}
    for entry in results
]
res_csv = os.path.join(RESULTS_DIR, "hypothesis_results.csv")
res_df = pd.DataFrame(results_norm)
res_df.to_csv(res_csv, index=False)
print(f"\nSaved hypothesis test summary to {res_csv}")

# Echo the same table to the console
print("\n--- Test Summary ---")
summary_text = res_df.to_string(index=False)
print(summary_text)

print("\nCompleted Task 3 hypothesis tests. Check the results/ folder for plots and CSV outputs.")


# Output recorded from the last notebook run:
# FileNotFoundError: [Errno 2] No such file or directory: 'data/processed/cleaned_data.csv'