In [None]:
# build_predictions_static_and_dm_from_csv.py
# -*- coding: utf-8 -*-

import os, re
import numpy as np
import pandas as pd

# ========= Path configuration =========
# ROOT is the folder that is walked recursively for predictions_daily.csv files.
ROOT = r"/Users/june/Documents/University of Manchester/Data Science/ERP/Project code/4_Portfolio"
# CSV_DIR receives one predictions_window<w>.csv per window;
# OUTPUT_DIR receives the DM summary CSVs and the heat-map figures.
CSV_DIR    = os.path.join(ROOT, "Predictions_by_window")
OUTPUT_DIR = os.path.join(ROOT, "DM_from_static_figures_fixed")
# Both folders are created as soon as this module is executed or imported.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(CSV_DIR,    exist_ok=True)

# ========= Options =========
USE_VALUE_WEIGHT = False   # True: cross-section is value-weighted (by market_cap); False: equal-weighted
MIN_TIME_T = 12           # Minimum number of distinct dates in the loss-difference series to compute DM
ALPHA = 0.05              # Significance level for "winner" labels and for a single asterisk in the plots
ALPHA_STRICT = 0.01       # Stricter significance level, shown as a double asterisk in the plots
NONSIG_ALPHA = 0.70       # Opacity of the gray overlay on non-significant cells (not a significance level)

# ========= Possible synonymous column names =========
# Each list is passed to _find_col, which matches case-insensitively and
# returns the first candidate (in list order) that exists in the CSV header.
TRUE_CAND   = ["y_true","true","target","y","label","ret","return","excess_ret","excess_return"]
PRED_CAND   = ["y_pred","pred","prediction","forecast","yhat","y_hat","pred_mean","preds"]
DATE1_CAND  = ["signal_date","signaldate","sig_date"]
DATE2_CAND  = ["ret_date","retdate","date","trddt"]
PERMNO_CAND = ["permno","PERMNO","id","sid","ticker"]
MCAP_CAND   = ["market_cap","mktcap","cap","size"]
MODEL_CAND  = ["model","Model","MODEL"]
WINDOW_CAND = ["window","win","h","H"]

def _find_col(cols, candidates):
    """Return the first candidate that names a column, ignoring case.

    Candidates are tried in order; the value returned is the column name
    exactly as it is spelled in *cols*. When two columns differ only by
    case, the later one in *cols* is the one returned. Returns None when no
    candidate matches.
    """
    by_lower = {}
    for col in cols:
        by_lower[col.lower()] = col

    for cand in candidates:
        key = cand.lower()
        if key in by_lower:
            return by_lower[key]
    return None

def _extract_window_from_path(path):
    """Parse the window length out of a path such as ".../window_21/...".

    Looks (case-insensitively) for the first "window" that is followed by
    digits, optionally separated by one of "_", "-" or whitespace. Returns
    the digits as an int, or NaN when the path contains no such token.
    """
    found = re.search(r'window[_\-\s]?(\d+)', path, re.I)
    if found is None:
        return np.nan
    return int(found.group(1))

# ========= Model name normalization =========
# Exact (normalised) spellings -> canonical label. Built once at import time
# because canonical_model_name is applied to whole columns of model names.
_CANONICAL_MODEL_NAMES = {
    'ols':'OLS','ridge':'Ridge','lasso':'Lasso','enet':'Enet','pls':'PLS','pcr':'PCR',
    'rf':'RF','random forest':'RF','xgb':'XGB','arima':'ARIMA',
    'lstm':'LSTM','nbeats':'NBEATS','autoformer':'Autoformer',
    'nn1':'NN1','nn2':'NN2','nn3':'NN3','nn4':'NN4','nn5':'NN5',
    'timesfm1.0':'TimesFm1.0','timesfm2.0':'TimesFm2.0','timesfm1':'TimesFm1.0','timesfm2':'TimesFm2.0',
}


def canonical_model_name(name: str) -> str:
    """Map a free-form model name (folder name, CSV value) to a canonical label.

    The name is stripped, runs of whitespace/underscores/hyphens are
    collapsed to a single space, and the result is lower-cased before
    matching. Exact table hits are tried first, then family heuristics
    (random forest, XGBoost, Chronos, Moirai/Uni2ts, TimesFM, NN<k>).
    Names that match nothing are returned stripped but otherwise unchanged.

    Args:
        name: Any object; it is converted with ``str``.

    Returns:
        The canonical label, or the stripped input if it is not recognised.
    """
    raw = str(name).strip()
    s = re.sub(r'[\s_\-]+', ' ', raw).lower()

    if s in _CANONICAL_MODEL_NAMES:
        return _CANONICAL_MODEL_NAMES[s]
    if 'random' in s and 'forest' in s:
        return 'RF'
    if 'xgb' in s:  # also covers "xgboost"
        return 'XGB'
    if 'chronos' in s:
        for size in ('tiny', 'mini', 'small', 'base', 'large'):
            if size in s:
                return f"Chronos-{size.capitalize()}"
        return 'Chronos-Base'
    if 'moirai' in s or 'uni2ts' in s or s.startswith('units'):
        for size in ('small', 'base', 'large'):
            if size in s:
                return f"Uni2ts-{size.capitalize()}"
        return 'Uni2ts-Base'
    if 'timesfm' in s:
        # Read the single-digit version that directly follows "timesfm".
        # Testing for a '2' anywhere in the name mislabels checkpoints such
        # as "timesfm-1.0-200m", whose size tag contains a 2.
        ver = re.search(r'timesfm\s*v?\s*(\d)(?!\d)', s)
        if ver:
            return 'TimesFm2.0' if ver.group(1) == '2' else 'TimesFm1.0'
        # No explicit version next to the family name: keep the old heuristic.
        return 'TimesFm2.0' if '2' in s else 'TimesFm1.0'
    m = re.match(r'(nn)\s*(\d+)', s)
    if m:
        return f"NN{m.group(2)}"
    return raw

# ========= Aggregate predictions_daily.csv =========
def _read_predictions_csv(path: str) -> pd.DataFrame:
    """Read one predictions CSV, accepting either ',' or ';' as delimiter."""
    try:
        df = pd.read_csv(path)
    except pd.errors.ParserError:
        # Ragged rows under ',' typically mean the file is ';'-delimited.
        return pd.read_csv(path, sep=';')
    # A ';'-delimited file usually parses without error as ONE column whose
    # header still contains the semicolons, so check for that case as well.
    if df.shape[1] == 1 and ';' in str(df.columns[0]):
        df = pd.read_csv(path, sep=';')
    return df


def collect_predictions(root: str) -> pd.DataFrame:
    """Gather every ``predictions_daily.csv`` under *root* into one long frame.

    Column names are resolved through the ``*_CAND`` synonym lists. Files
    without a recognisable y_true / y_pred column are skipped with a message.
    When a file has no model column, the name of its parent folder is used;
    when it has no window column, the window is parsed from the file path.

    Args:
        root: Directory that is walked recursively.

    Returns:
        DataFrame with columns signal_date, ret_date, permno, y_true, y_pred,
        market_cap, model (canonical name) and window (nullable Int64). Rows
        lacking y_true, y_pred, model or window are dropped.

    Raises:
        FileNotFoundError: no ``predictions_daily.csv`` exists under *root*.
        ValueError: files were found but none had both y_true and y_pred.
    """
    files = sorted(
        os.path.join(dirpath, fname)
        for dirpath, _, fnames in os.walk(root)
        for fname in fnames
        if fname.lower() == "predictions_daily.csv"
    )
    if not files:
        raise FileNotFoundError("No predictions_daily.csv found.")

    frames = []
    for p in files:
        df = _read_predictions_csv(p)

        col_true = _find_col(df.columns, TRUE_CAND)
        col_pred = _find_col(df.columns, PRED_CAND)
        if col_true is None or col_pred is None:
            print(f"[SKIP] Missing y_true/y_pred -> {p}")
            continue

        col_perm = _find_col(df.columns, PERMNO_CAND)
        col_mdl  = _find_col(df.columns, MODEL_CAND)
        col_win  = _find_col(df.columns, WINDOW_CAND)
        col_sig  = _find_col(df.columns, DATE1_CAND)
        col_ret  = _find_col(df.columns, DATE2_CAND)
        col_mcap = _find_col(df.columns, MCAP_CAND)

        # Optional columns fall back to a scalar that pandas broadcasts.
        tmp = pd.DataFrame({
            "signal_date": df[col_sig] if col_sig else pd.NaT,
            "ret_date":    df[col_ret] if col_ret else pd.NaT,
            "permno":      df[col_perm] if col_perm else np.nan,
            "y_true":      df[col_true],
            "y_pred":      df[col_pred],
            "market_cap":  df[col_mcap] if col_mcap else np.nan,
            "model":       df[col_mdl]  if col_mdl  else os.path.basename(os.path.dirname(p)),
            "window":      df[col_win]  if col_win  else _extract_window_from_path(p)
        })

        # Canonicalise each distinct name once instead of once per row.
        # Missing names stay NaN (rather than becoming the string "nan"),
        # so the dropna on "model" below can actually remove them.
        names = tmp["model"]
        lookup = {n: canonical_model_name(n) for n in names.dropna().unique()}
        tmp["model"]  = names.map(lookup)
        tmp["window"] = pd.to_numeric(tmp["window"], errors="coerce").astype("Int64")
        frames.append(tmp)

    if not frames:
        # pd.concat([]) would raise a ValueError with an unhelpful message.
        raise ValueError(
            "predictions_daily.csv files were found, but none contains both "
            "a y_true and a y_pred column."
        )

    all_df = pd.concat(frames, ignore_index=True)
    all_df = all_df.dropna(subset=["y_true","y_pred","model","window"], how="any")

    for c in ["signal_date","ret_date"]:
        try:
            all_df[c] = pd.to_datetime(all_df[c], errors="coerce")
        except (ValueError, TypeError) as exc:
            # Best effort: keep the raw column, but say so instead of hiding it.
            print(f"[WARN] could not parse {c} as dates: {exc}")
    return all_df

# ========= Split CSV by window =========
def export_csv_by_window(df_all: pd.DataFrame, out_dir: str):
    """Write one ``predictions_window<w>.csv`` per distinct window value.

    Rows whose window is not numeric are dropped. Every output file has the
    same fixed column set (missing columns are added empty) and both date
    columns are written as YYYY-MM-DD text. The input frame is not modified.

    Args:
        df_all: Long predictions frame; must contain a "window" column.
        out_dir: Existing directory that receives the CSV files.

    Returns:
        dict mapping each window (int) to the path of the file written.
    """
    columns = ["signal_date","ret_date","permno","y_true","y_pred","market_cap","model","window"]
    date_columns = ("signal_date", "ret_date")

    frame = df_all.copy()
    frame["window"] = pd.to_numeric(frame["window"], errors="coerce")
    frame = frame.dropna(subset=["window"]).astype({"window": int})

    # Guarantee the full schema, then fix the column order.
    for name in columns:
        if name not in frame.columns:
            frame[name] = pd.NA
    frame = frame.loc[:, columns]

    for name in date_columns:
        parsed = pd.to_datetime(frame[name], errors="coerce")
        frame[name] = parsed.dt.strftime("%Y-%m-%d")

    wins = sorted(set(frame["window"].tolist()))
    print("[DEBUG] windows detected:", wins)

    written = {}
    for win in wins:
        subset = frame[frame["window"] == win]
        out_path = os.path.join(out_dir, f"predictions_window{int(win)}.csv")
        subset.to_csv(out_path, index=False)
        written[int(win)] = out_path
        print(f"[OK] write {out_path} rows={len(subset)}")
    return written

# ========= Main process =========
if __name__ == "__main__":
    # Step 1: gather every predictions_daily.csv under ROOT into one frame,
    # then split it into one CSV per window inside CSV_DIR.
    all_pred = collect_predictions(ROOT)
    window_csv_map = export_csv_by_window(all_pred, CSV_DIR)

    # Plotting / statistics libraries are imported only when run as a script.
    import matplotlib.pyplot as plt
    import matplotlib.patches as mpatches
    from scipy import stats
    from statsmodels.tsa.stattools import acovf

    # Windows for which a per-window CSV was written, in ascending order.
    WINDOWS = sorted(window_csv_map.keys())

    # Row/column order of the heat maps, expressed with display names.
    # Models not listed here are appended alphabetically by
    # sort_models_by_desired.
    DESIRED_ORDER = [
        "OLS","Ridge","Lasso","Enet","PCR","PLS",
        "RF","XGB",
        "NN1","NN2","NN3","NN4","NN5",
        "LSTM","NBEATS","Autoformer",
        "Chronos-T5 (Tiny)","Chronos-T5 (Mini)","Chronos-T5 (Small)","Chronos-T5 (Base)","Chronos-T5 (Large)",
        "TimesFm1.0","TimesFm2.0",
        "Moirai-1.1-R (Small)","Moirai-1.1-R (Base)","Moirai-1.1-R (Large)"
    ]
    # Canonical model name (as produced by canonical_model_name) -> label
    # shown in tables and figures. Names without an entry are shown as-is.
    DISPLAY_MAP = {
        "Chronos-Tiny":"Chronos-T5 (Tiny)","Chronos-Mini":"Chronos-T5 (Mini)",
        "Chronos-Small":"Chronos-T5 (Small)","Chronos-Base":"Chronos-T5 (Base)","Chronos-Large":"Chronos-T5 (Large)",
        "Uni2ts-Small":"Moirai-1.1-R (Small)","Uni2ts-Base":"Moirai-1.1-R (Base)","Uni2ts-Large":"Moirai-1.1-R (Large)"
    }
    def display_name(x):
        """Return the figure/table label for model *x* (unmapped names pass through)."""
        if x in DISPLAY_MAP:
            return DISPLAY_MAP[x]
        return x
    def normalize_key(s):
        """Lower-case *s* and keep only the characters 0-9 and a-z."""
        lowered = str(s).lower()
        return re.sub(r"[^0-9a-z]","", lowered)
    def sort_models_by_desired(model_list):
        """Order models as in DESIRED_ORDER; unmatched models follow alphabetically.

        Duplicates are removed (first occurrence kept). A model matches a
        DESIRED_ORDER entry when, after normalize_key, either string contains
        the other. Each model is placed at the first entry it matches.
        """
        # model -> normalised key, in first-seen order and without duplicates
        pending = {}
        for name in model_list:
            if name not in pending:
                pending[name] = normalize_key(name)

        ordered = []
        for target in DESIRED_ORDER:
            wanted = normalize_key(target)
            hits = [name for name, key in pending.items()
                    if wanted in key or key in wanted]
            for name in hits:
                ordered.append(name)
                del pending[name]

        return ordered + sorted(pending)

    # ---- NW helper: bandwidth and variance (for time series) ----
    def nw_bandwidth(T: int) -> int:
        """Lag truncation floor(4 * (T/100)^(2/9)) for a series of length T; 0 when T <= 1."""
        # Common Bartlett rule; you can also fix to 6 or use other data-driven rules
        if not T > 1:
            return 0
        lag = 4.0 * (T/100.0)**(2.0/9.0)
        return int(np.floor(lag))

    def nw_var_of_mean_ts(series: pd.Series) -> float:
        """Bartlett-weighted long-run variance of the sample mean of *series*.

        Sums the lag-0 autocovariance and twice the weighted autocovariances
        up to the bandwidth (at least 1 lag, weight 1 - k/(L+1)), then divides
        by the series length. Returns NaN for series with at most one value.
        """
        values = np.asarray(series.values, dtype=float)
        n_obs = len(values)
        if n_obs <= 1:
            return np.nan

        max_lag = max(1, nw_bandwidth(n_obs))
        autocov = acovf(values, fft=False, demean=True)

        long_run = autocov[0]
        last = min(max_lag + 1, n_obs)  # exclusive upper lag bound
        for lag, gamma in enumerate(autocov[1:last], start=1):
            weight = 1.0 - lag / float(max_lag + 1)
            long_run += 2.0 * weight * gamma
        return long_run / n_obs

    # ---- Cross-sectional mean difference by date (equal-weighted / value-weighted) ----
    def cs_mean_diff_by_date(values_diff: np.ndarray, dates: np.ndarray, weights: np.ndarray | None):
        """Collapse per-observation differences to one cross-sectional mean per date.

        Uses a weighted average (positive, non-missing weights only) when
        USE_VALUE_WEIGHT is set and *weights* is given; otherwise a plain
        mean. The result is indexed by date in ascending order.
        """
        frame = pd.DataFrame({"date": pd.to_datetime(dates), "d": values_diff})

        weighted = USE_VALUE_WEIGHT and weights is not None
        if not weighted:
            per_date = frame.dropna(subset=["d"]).groupby("date")["d"].mean()
            return per_date.sort_index()

        frame["w"] = weights
        usable = frame.dropna(subset=["d","w"]).query("w > 0")

        def _weighted_mean(grp):
            return np.average(grp["d"].values, weights=grp["w"].values)

        per_date = usable.groupby("date").apply(_weighted_mean)
        return per_date.sort_index()

    # Load every per-window CSV back into memory, keyed by window.
    per_win_df = {}
    for w in WINDOWS:
        csv_path = window_csv_map[w]
        try:
            frame = pd.read_csv(csv_path)
        except Exception:
            # Retry with ';' as the delimiter.
            frame = pd.read_csv(csv_path, sep=';')
        per_win_df[w] = frame
        print(f"[DEBUG] load {csv_path} rows={len(frame)}")

    # ---- For each window: align -> DM (MSE/DA) time series NW -> McNemar ----
    def _dm_from_ts(ts):
        """Diebold-Mariano statistic for a per-date loss-difference series.

        Returns (DM, two-sided normal p-value, number of dates). DM and the
        p-value are NaN when fewer than MIN_TIME_T dates are available.
        """
        n_dates = len(ts)
        if n_dates < MIN_TIME_T:
            return np.nan, np.nan, n_dates
        var_mean = nw_var_of_mean_ts(ts)
        if not np.isfinite(var_mean) or var_mean <= 0:
            # Unusable HAC estimate: fall back to the plain variance of the mean.
            var_mean = ts.var(ddof=1) / max(n_dates, 1)
        dm = float(ts.mean() / np.sqrt(var_mean))
        return dm, float(2.0 * stats.norm.sf(abs(dm))), n_dates

    def _pick_winner(model_a, model_b, dm, pval):
        """Name the significantly better model (lower loss) or "NoSig"."""
        if pd.notna(pval) and pval < ALPHA:
            if dm < 0:
                return model_a
            if dm > 0:
                return model_b
        return "NoSig"

    mse_cols = ["ModelA","ModelB","window","N_time","DM","pval","winner"]
    da_cols = mse_cols + ["mcnemar_onlyA","mcnemar_onlyB","mcnemar_bc","mcnemar_z","mcnemar_pval"]

    results = {}
    for w in WINDOWS:
        df = per_win_df.get(int(w))
        if df is None or df.empty:
            print(f"[SKIP] window{w} has no data.")
            continue

        core = ["ret_date","permno","y_true","y_pred","model"]
        need = core + ["market_cap"]
        if any(c not in df.columns for c in core):
            raise ValueError(f"window{w} missing core columns.")

        # reindex instead of df[need]: an absent optional market_cap column
        # becomes an all-NaN column rather than raising KeyError.
        d = df.reindex(columns=need)
        d["Model"] = d["model"].astype(str).map(display_name)
        d["ret_date"] = pd.to_datetime(d["ret_date"], errors="coerce")
        d = d.dropna(subset=["ret_date","permno","y_true","y_pred"])
        d["market_cap"] = pd.to_numeric(d["market_cap"], errors="coerce")
        if d.empty:
            print(f"[SKIP] window{w} has no valid rows.")
            continue

        # One row per (ret_date, permno), one column per model.
        keys = ["ret_date","permno"]
        pivot_pred = d.pivot_table(index=keys, columns="Model", values="y_pred", aggfunc="last").sort_index()
        # y_true / market_cap taken from the first row of each key and put in
        # the same row order as the prediction matrix.
        aligned = (d.drop_duplicates(keys)
                    .set_index(keys)
                    .reindex(pivot_pred.index))
        ytrue        = aligned["y_true"].astype(float).values
        mcap_aligned = aligned["market_cap"].astype(float).values
        dates_full   = pivot_pred.index.get_level_values(0).to_numpy()

        model_names = sort_models_by_desired(list(pivot_pred.columns))

        valid_true = np.isfinite(ytrue)
        pred_mat  = {m: pivot_pred[m].astype(float).values for m in model_names}
        valid_mat = {m: valid_true & np.isfinite(pred_mat[m]) for m in model_names}

        def _blank():
            return pd.DataFrame(np.nan, index=model_names, columns=model_names)

        dm_stats_mse, dm_pvals_mse = _blank(), _blank()
        dm_stats_da,  dm_pvals_da  = _blank(), _blank()
        mcn_std,      mcn_pval     = _blank(), _blank()

        rows_mse, rows_da = [], []

        for mi in model_names:
            for mj in model_names:
                if mi == mj:
                    continue
                # Only observations where both models and the target are finite.
                mask = valid_mat[mi] & valid_mat[mj]
                if mask.sum() < 5:
                    continue

                yi = pred_mat[mi][mask]
                yj = pred_mat[mj][mask]
                yt = ytrue[mask]
                dd = dates_full[mask]
                ww = mcap_aligned[mask] if USE_VALUE_WEIGHT else None

                # --- DM on squared error: loss(row) - loss(col), so a
                # negative statistic means the row model has the lower loss.
                d_loss = (yt - yi)**2 - (yt - yj)**2
                DMm, pmm, Tts = _dm_from_ts(cs_mean_diff_by_date(d_loss, dd, ww))

                dm_stats_mse.loc[mi, mj] = DMm
                dm_pvals_mse.loc[mi, mj] = pmm
                rows_mse.append({
                    "ModelA": mi, "ModelB": mj, "window": int(w),
                    "N_time": Tts, "DM": DMm, "pval": pmm,
                    "winner": _pick_winner(mi, mj, DMm, pmm),
                })

                # --- DM on directional accuracy: the loss is 1 when the sign
                # of the prediction differs from the sign of the outcome.
                corr_i = (np.sign(yi) == np.sign(yt)).astype(float)
                corr_j = (np.sign(yj) == np.sign(yt)).astype(float)
                d_da = (1.0 - corr_i) - (1.0 - corr_j)
                DMd, pmd, Tts2 = _dm_from_ts(cs_mean_diff_by_date(d_da, dd, ww))

                dm_stats_da.loc[mi, mj] = DMd
                dm_pvals_da.loc[mi, mj] = pmd

                # --- McNemar on the discordant pairs (exact binomial p-value).
                onlyA = int(np.sum((corr_i == 1) & (corr_j == 0)))
                onlyB = int(np.sum((corr_i == 0) & (corr_j == 1)))
                n = onlyA + onlyB
                z, p_mc = np.nan, np.nan
                if n > 0:
                    z = (onlyA - onlyB) / np.sqrt(n)
                    try:
                        p_mc = stats.binomtest(min(onlyA, onlyB), n=n).pvalue
                    except AttributeError:
                        # Older SciPy releases only provide binom_test.
                        p_mc = stats.binom_test(min(onlyA, onlyB), n=n, p=0.5)
                    mcn_std.loc[mi, mj]  = z
                    mcn_pval.loc[mi, mj] = p_mc

                rows_da.append({
                    "ModelA": mi, "ModelB": mj, "window": int(w),
                    "N_time": Tts2, "DM": DMd, "pval": pmd,
                    "winner": _pick_winner(mi, mj, DMd, pmd),
                    "mcnemar_onlyA": onlyA, "mcnemar_onlyB": onlyB,
                    "mcnemar_bc": (onlyA - onlyB) if n > 0 else np.nan,
                    "mcnemar_z": z,
                    "mcnemar_pval": p_mc,
                })

        # columns= keeps the header even when no model pair qualified;
        # selecting columns from an empty frame would raise KeyError.
        pd.DataFrame(rows_mse, columns=mse_cols).to_csv(
            os.path.join(OUTPUT_DIR, f"DM_summary_mse_w{w}.csv"), index=False)
        pd.DataFrame(rows_da, columns=da_cols).to_csv(
            os.path.join(OUTPUT_DIR, f"DM_summary_da_w{w}.csv"), index=False)

        results[int(w)] = dict(
            dm_stats_mse=dm_stats_mse, dm_pvals_mse=dm_pvals_mse,
            dm_stats_da=dm_stats_da,   dm_pvals_da=dm_pvals_da,
            mcnemar_std=mcn_std,       mcnemar_pval=mcn_pval,
            models=dm_stats_mse.index.tolist()
        )

    # ---- Plotting: non-significant cells gray + asterisks ----
    def _draw_heat(ax, arr, labels, pvals_df, vmax, title=None,
                   alpha=NONSIG_ALPHA, alpha_lo=ALPHA, alpha_hi=ALPHA_STRICT):
        """Draw one signed heat map on *ax* and return the image artist.

        Cells with p <= alpha_lo show their value plus "*" (or "**" when
        p <= alpha_hi); every other cell is covered by a light-gray
        rectangle of opacity *alpha*. The colour scale is symmetric,
        from -vmax to +vmax.
        """
        def _pvalue_at(row, col):
            # Positions outside pvals_df count as "no p-value".
            try:
                return pvals_df.iat[row, col]
            except Exception:
                return np.nan

        def _significant(pv):
            return pd.notna(pv) and pv <= alpha_lo

        image = ax.imshow(arr, cmap='bwr', vmin=-vmax, vmax=vmax, aspect='auto', interpolation='nearest')
        size = arr.shape[0]
        ax.set_xticks(range(size))
        ax.set_yticks(range(size))
        ax.set_xticklabels(labels, rotation=90, fontsize=8)
        ax.set_yticklabels(labels, fontsize=8)

        # Pass 1: annotate significant cells with their value and asterisks.
        for (row, col), value in np.ndenumerate(arr):
            pv = _pvalue_at(row, col)
            if not (_significant(pv) and np.isfinite(value)):
                continue
            ax.text(col, row, f"{value:.2f}", ha="center", va="center", fontsize=6.2, zorder=3)
            marker = "**" if pv <= alpha_hi else "*"
            ax.text(col + 0.46, row - 0.46, marker, ha="right", va="top",
                    fontsize=6.6, fontweight="bold", color="black", zorder=4)

        # Pass 2: gray out everything that is not significant.
        for row, col in np.ndindex(size, size):
            if _significant(_pvalue_at(row, col)):
                continue
            cover = mpatches.Rectangle((col-0.5, row-0.5), 1, 1,
                                       facecolor="lightgray", edgecolor=None,
                                       alpha=alpha, zorder=2)
            ax.add_patch(cover)

        if title:
            ax.set_title(title, fontsize=10)
        return image

    def _vmax(*mats):
        """Return the largest finite absolute value across *mats* (1.0 if none).

        None entries, NaN and +/-inf values are ignored, so an empty or
        all-NaN matrix neither raises nor warns, and a single infinite entry
        no longer discards the finite values of the same matrix. The 1.0
        fallback keeps the colour scale usable when nothing finite and
        non-zero is present.
        """
        vmax = 0.0
        for a in mats:
            if a is None:
                continue
            vals = np.abs(np.asarray(a, dtype=float))
            vals = vals[np.isfinite(vals)]
            if vals.size:
                vmax = max(vmax, float(vals.max()))
        return vmax or 1.0

    def save_per_window_plots():
        """Save three heat maps (DM-MSE, DM-DA, McNemar) for every window in `results`.

        The McNemar statistic is negated before plotting so that, as in the
        DM panels, negative values mean the row model is better. Each figure
        uses its own colour scale.
        """
        for w, r in results.items():
            labels = r["dm_stats_mse"].index.tolist()
            n_labels = len(labels)
            figsize = (max(8, n_labels*0.38), max(6, n_labels*0.34))

            panels = (
                (r["dm_stats_mse"].values.astype(float), r["dm_pvals_mse"],
                 f"DM (MSE) w={w}  (negative = row is better; * p≤0.05, ** p≤0.01)",
                 f"DM_signed_MSE_w{w}.png"),
                (r["dm_stats_da"].values.astype(float), r["dm_pvals_da"],
                 f"DM (DA)  w={w}  (negative = row is better; * p≤0.05, ** p≤0.01)",
                 f"DM_signed_DA_w{w}.png"),
                (-r["mcnemar_std"].values.astype(float), r["mcnemar_pval"],
                 f"McNemar Test w={w}  (negative = row is better; * p≤0.05, ** p≤0.01)",
                 f"McNemar_std_w{w}.png"),
            )

            for matrix, pvals, heading, filename in panels:
                fig, ax = plt.subplots(figsize=figsize)
                image = _draw_heat(ax, matrix, labels, pvals, _vmax(matrix), title=heading)
                cbar = fig.colorbar(mappable=image, ax=ax, fraction=0.05)
                cbar.ax.set_ylabel("Value", rotation=270, labelpad=12)
                out = os.path.join(OUTPUT_DIR, filename)
                fig.savefig(out, dpi=200, bbox_inches="tight")
                plt.close(fig)
                print("Saved:", out)

    def save_2x2_composites():
        """Save one 2x2 figure per metric, one panel per window.

        Skipped when `results` holds fewer than four windows. All panels of
        a figure share the model order of the first window and one common
        colour scale. With more than four windows only the first four (in
        ascending order) are drawn.
        """
        if len(results) < 4:
            print("Not all windows present; skip 2x2 composites.")
            return
        wins = sorted(results)
        labels = results[wins[0]]["dm_stats_mse"].index.tolist()

        def _on_common_axes(frame):
            # Put every window's matrix on the first window's model order.
            return frame.reindex(index=labels, columns=labels)

        def grid_for(metric_key, title, fname, is_mcnemar=False):
            mats, pvals = [], []
            for win in wins:
                entry = results[win]
                if is_mcnemar:
                    # Negated so that negative = row is better, as for DM.
                    mats.append(-_on_common_axes(entry["mcnemar_std"]).values.astype(float))
                    pvals.append(_on_common_axes(entry["mcnemar_pval"]))
                else:
                    pkey = metric_key.replace("dm_stats", "dm_pvals", 1)
                    mats.append(_on_common_axes(entry[metric_key]).values.astype(float))
                    pvals.append(_on_common_axes(entry[pkey]))

            vmax = _vmax(*mats)
            fig, axes = plt.subplots(2, 2, figsize=(18, 14), constrained_layout=True)
            panels = axes.ravel()
            images = [
                _draw_heat(ax, arr, labels, pv, vmax, title=f"w={ww}")
                for ax, arr, pv, ww in zip(panels, mats, pvals, wins)
            ]
            cbar = fig.colorbar(mappable=images[0], ax=panels.tolist(), fraction=0.02, pad=0.02)
            cbar.ax.set_ylabel("Value", rotation=270, labelpad=12)
            fig.suptitle(title, fontsize=14)
            out = os.path.join(OUTPUT_DIR, fname)
            fig.savefig(out, dpi=220, bbox_inches="tight")
            plt.close(fig)
            print("Saved 2x2:", out)

        grid_for("dm_stats_mse", "DM (MSE) across windows  (negative = row is better; * p≤0.05, ** p≤0.01)", "DM_MSE_2x2.png")
        grid_for("dm_stats_da",  "DM (DA)  across windows  (negative = row is better; * p≤0.05, ** p≤0.01)", "DM_DA_2x2.png")
        grid_for(None,           "McNemar Test across windows  (negative = row is better; * p≤0.05, ** p≤0.01)", "McNemar_std_2x2.png", is_mcnemar=True)

    # Render all figures, then report where the outputs were written.
    save_per_window_plots()
    save_2x2_composites()
    weighting_label = "Value-weighted" if USE_VALUE_WEIGHT else "Equal-weighted"
    print("All done. CSVs at:", CSV_DIR, "| Figures at:", OUTPUT_DIR,
          "| Weighted:", weighting_label)
