In [None]:
# =========================
# Notebook 2: Daily feature aggregation (ROOT-ONLY)
# =========================
import warnings
from pathlib import Path
import numpy as np
import pandas as pd

# Suppress FutureWarning messages for the remainder of the notebook run.
warnings.filterwarnings("ignore", category=FutureWarning)

# -------------------------
# Configuration (root files only)
# -------------------------
# Every path below is a bare filename, so it is resolved against the current
# working directory.
CONFIG = {
    "DATE_START": "2021-01-01",
    "DATE_END": "2023-12-31",
    "TICKERS": ["AAPL", "AMZN", "MSFT", "TSLA", "AMD"],
    "PX_TIDY": "historical_stock_yfinance_full_2021_2023_tidy.csv",
    "CLIP_STRATEGY": "expanding",   # or "static"
    "ACTIVITY_K": 3,
    "SRC_TW_SCORED": "twitter_scored.csv",
    "SRC_RD_SCORED": "reddit_scored.csv",
    "SRC_SP_SCORED": "sp500_scored.csv",
    "OUT_TW_DAILY": "twitter_daily.csv",
    "OUT_RD_DAILY": "reddit_daily.csv",
    "OUT_SP_DAILY": "sp500_daily.csv",
}

# Parsed once here; used as inclusive bounds when the trading calendar is built.
DATE_START_TS = pd.to_datetime(CONFIG["DATE_START"])
DATE_END_TS   = pd.to_datetime(CONFIG["DATE_END"])
TICKERS = CONFIG["TICKERS"]

print("Notebook 2 config OK (root-only).")

# =========================
# Pre-flight: root files must exist (root-only)
# =========================
# The three scored sources plus the price file are all required inputs.
need = [
    CONFIG["SRC_TW_SCORED"],
    CONFIG["SRC_RD_SCORED"],
    CONFIG["SRC_SP_SCORED"],
    CONFIG["PX_TIDY"],
]

# Fail fast, listing every absent file at once rather than only the first.
missing = [p for p in need if not Path(p).exists()]
if missing:
    raise FileNotFoundError(f"Missing required root files: {missing}")

print("All required root files are present.")

def quick_summary(path, tag):
    """Print the column names of the CSV at *path*, labelled with *tag*.

    Only the first five rows are read. This is a best-effort diagnostic:
    any failure is reported on stdout instead of being raised.
    """
    try:
        preview = pd.read_csv(path, nrows=5)
        column_names = list(preview.columns)
        print(f"{tag}: columns ->", column_names)
    except Exception as err:
        print(f"{tag}: unable to read head ->", err)

# Print the header of each scored input so schema problems are visible early.
quick_summary(CONFIG["SRC_TW_SCORED"], "twitter_scored")
quick_summary(CONFIG["SRC_RD_SCORED"], "reddit_scored")
quick_summary(CONFIG["SRC_SP_SCORED"], "sp500_scored")

# =========================
# Trading-day index from price tidy file (root)
# =========================
def build_trading_day_index(px_path, tickers, start_ts, end_ts):
    """Return the unique (date, ticker) pairs found in the tidy price file.

    The CSV must have a ``Date`` column and a ticker column named ``Ticker``,
    ``ticker`` or ``Symbol``. Rows are restricted to *tickers* and to the
    inclusive window [*start_ts*, *end_ts*], de-duplicated, and sorted by
    ticker then date with a fresh 0..n-1 index.

    Raises:
        KeyError: if no ticker column can be identified.
    """
    prices = pd.read_csv(px_path, parse_dates=["Date"])

    # Normalise the header names to lower-case `date` / `ticker`.
    renames = {"Date": "date"}
    if "Ticker" in prices.columns:
        renames["Ticker"] = "ticker"
    prices = prices.rename(columns=renames)

    if "ticker" not in prices.columns:
        if "Symbol" not in prices.columns:
            raise KeyError("Price file must include a ticker column (Ticker or Symbol).")
        prices = prices.rename(columns={"Symbol": "ticker"})

    in_universe = prices["ticker"].isin(tickers)
    in_window = (prices["date"] >= start_ts) & (prices["date"] <= end_ts)
    selected = prices.loc[in_universe & in_window, ["date", "ticker"]]

    unique_pairs = selected.drop_duplicates()
    ordered = unique_pairs.sort_values(["ticker", "date"])
    return ordered.reset_index(drop=True)

# Build the (date, ticker) calendar once; aggregate_one and align_check below
# read TRADING_CAL as a module-level global.
TRADING_CAL = build_trading_day_index(
    CONFIG["PX_TIDY"], TICKERS, DATE_START_TS, DATE_END_TS
)
# The calendar's date column: one entry per (ticker, date) row, so a date
# repeats for every ticker that has it.
TRADING_DAYS_INDEX = TRADING_CAL["date"]
print("Trading calendar:", TRADING_DAYS_INDEX.min().date(), "->", TRADING_DAYS_INDEX.max().date())

# =========================
# Robust aggregation helpers (v2, root-only)
# =========================
def winsor_series(x: pd.Series, lower_q=0.01, upper_q=0.99):
    """Clip *x* to its own [lower_q, upper_q] quantile range.

    The bounds come from the full series, so each clipped value depends on
    every observation in *x* (including later ones).
    """
    floor = x.quantile(lower_q)
    ceiling = x.quantile(upper_q)
    return x.clip(lower=floor, upper=ceiling)

def expanding_winsor(x: pd.Series, lower_q=0.01, upper_q=0.99):
    """Winsorise *x* using only strictly earlier observations.

    Each value is clipped to the [lower_q, upper_q] quantiles of the raw
    values that precede it in *x*. The first ten values pass through
    unchanged because fewer than ten prior observations exist. The result
    keeps the index of *x*.
    """
    raw = x.values
    clipped = []
    for pos, current in enumerate(raw):
        # `pos` equals the number of prior observations available.
        if pos < 10:
            clipped.append(current)
            continue
        past = pd.Series(raw[:pos])  # history excludes the current value
        lo = past.quantile(lower_q)
        hi = past.quantile(upper_q)
        if current < lo:
            current = lo
        if current > hi:
            current = hi
        clipped.append(current)
    return pd.Series(clipped, index=x.index)

def select_score_column(df):
    """Return the name of the sentiment-score column to aggregate.

    Candidates are tried in priority order: ``s_finbert``, ``s_deberta``,
    ``score``.

    Raises:
        KeyError: if none of the candidates is a column of *df*.
    """
    candidates = ("s_finbert", "s_deberta", "score")
    chosen = next((name for name in candidates if name in df.columns), None)
    if chosen is None:
        raise KeyError("No sentiment score column found. Expected s_finbert or s_deberta.")
    return chosen

def aggregate_one(df_scored: pd.DataFrame, src_prefix: str, clip="expanding", activity_k=3) -> pd.DataFrame:
    """Aggregate item-level sentiment scores into daily per-ticker features.

    Steps, in order: winsorise the score per ticker, aggregate per
    (ticker, date), derive expanding z-score / EWMA / activity flags per
    ticker on the days that have items, left-join onto the module-level
    ``TRADING_CAL`` calendar, fill days without items with 0.0, and prefix
    every feature column with *src_prefix*.

    Args:
        df_scored: item-level frame with ``ticker``, ``date`` and one score
            column accepted by ``select_score_column``. ``author``,
            ``p_pos``, ``p_neu`` and ``p_neg`` are used when present.
        src_prefix: string prepended to every column except ``ticker``/``date``.
        clip: ``"expanding"`` (bounds from prior items only) or ``"static"``
            (bounds from the ticker's full series).
        activity_k: minimum ``count_items`` for the ``active_k`` flag.

    Returns:
        One row per ``TRADING_CAL`` row, sorted by ticker then date.

    Raises:
        ValueError: if *clip* is neither ``"expanding"`` nor ``"static"``.
        KeyError: propagated from ``select_score_column`` when no score
            column exists.
    """
    sc_col = select_score_column(df_scored)
    scored = df_scored.copy()  # never mutate the caller's frame
    scored["date"] = pd.to_datetime(scored["date"])
    # Sort so that the expanding winsoriser sees items in date order per ticker.
    scored = scored.sort_values(["ticker","date"])

    # winsorise score with no look-ahead (expanding) or static
    if clip == "expanding":
        scored["_score_w"] = scored.groupby("ticker", group_keys=False)[sc_col].apply(expanding_winsor)
    elif clip == "static":
        scored["_score_w"] = scored.groupby("ticker", group_keys=False)[sc_col].apply(winsor_series)
    else:
        raise ValueError("clip must be 'expanding' or 'static'")

    # base aggregation on winsorised score
    g = scored.groupby(["ticker","date"], as_index=False)
    ag_score = g["_score_w"].agg(
        mean_s = "mean",
        median_s = "median",
        std_s = "std",
        q10_s = lambda s: s.quantile(0.10),
        q90_s = lambda s: s.quantile(0.90),
    )
    # Number of scored items per (ticker, date).
    ag_count = g.size().rename(columns={"size":"count_items"})

    ag = ag_score.merge(ag_count, on=["ticker","date"], how="left")

    # unique authors if present
    if "author" in scored.columns:
        ua = scored.groupby(["ticker","date"])["author"].nunique().reset_index(name="unique_authors")
        ag = ag.merge(ua, on=["ticker","date"], how="left")
    else:
        ag["unique_authors"] = 0.0

    # shares (prob >= 0.5) and daily means of FinBERT probabilities if present
    for pcol in ["p_pos","p_neu","p_neg"]:
        if pcol in scored.columns:
            # share >= 0.5
            sh = scored.groupby(["ticker","date"])[pcol].apply(lambda s: (s >= 0.5).mean()).reset_index(name=f"{pcol}_share")
            ag = ag.merge(sh, on=["ticker","date"], how="left")
        else:
            ag[f"{pcol}_share"] = 0.0

    # Daily mean probabilities are only emitted when all three columns exist.
    if set(["p_pos","p_neu","p_neg"]).issubset(scored.columns):
        pm = scored.groupby(["ticker","date"])[["p_pos","p_neu","p_neg"]].mean().reset_index()
        pm = pm.rename(columns={
            "p_pos":"mean_p_pos",
            "p_neu":"mean_p_neu",
            "p_neg":"mean_p_neg"
        })
        ag = ag.merge(pm, on=["ticker","date"], how="left")
    else:
        ag["mean_p_pos"] = 0.0
        ag["mean_p_neu"] = 0.0
        ag["mean_p_neg"] = 0.0

    # expanding z, EWMAs on mean_s (computed only on active days, then aligned back)
    # At this point `ag` only holds days that have at least one item.
    z_rows = []
    for t, tdf in ag.groupby("ticker"):
        tdf = tdf.sort_values("date").reset_index(drop=True)
        # shift(1) keeps the current day out of its own reference statistics.
        mu = tdf["mean_s"].expanding().mean().shift(1)
        sd = tdf["mean_s"].expanding().std().shift(1)
        zmean = (tdf["mean_s"] - mu) / sd
        # Undefined z (no history yet, or zero spread) is reported as 0.0.
        tdf["z_mean_s"] = zmean.replace([np.inf, -np.inf], np.nan).fillna(0.0)
        tdf["ewma3_s"] = tdf["mean_s"].ewm(span=3, adjust=False).mean()
        tdf["ewma7_s"] = tdf["mean_s"].ewm(span=7, adjust=False).mean()
        z_rows.append(tdf)
    ag = pd.concat(z_rows, ignore_index=True)

    # activity flags on active rows (zero_day handled AFTER calendar merge)
    act_rows = []
    for t, tdf in ag.groupby("ticker"):
        tdf = tdf.sort_values("date")
        # Rolling references are shifted by one row so they exclude the current day.
        r90 = tdf["count_items"].rolling(60, min_periods=20).quantile(0.9).shift(1)
        rmed = tdf["count_items"].rolling(21, min_periods=10).median().shift(1)
        # While a reference is still NaN it is treated as 0.0, so any positive
        # count compares as greater than it.
        tdf["burst_day"] = (tdf["count_items"] > r90.fillna(0.0)).astype(float)
        tdf["active_k"]  = (tdf["count_items"] >= activity_k).astype(float)
        tdf["active_rl"] = (tdf["count_items"] > rmed.fillna(0.0)).astype(float)
        act_rows.append(tdf)
    ag = pd.concat(act_rows, ignore_index=True)

    # calendar merge (left join), THEN compute zero_day correctly
    # NOTE: TRADING_CAL is read from module scope, not passed as a parameter.
    ag = TRADING_CAL.merge(ag, on=["ticker","date"], how="left").sort_values(["ticker","date"]).reset_index(drop=True)
    ag["zero_day"] = ((ag["count_items"].isna()) | (ag["count_items"] == 0)).astype(float)

    # fill NaNs for neutral/no-item days
    # (this also zeroes std_s on single-item days, where the sample std is NaN,
    # and sets the EWMA columns to 0.0 on no-item days rather than carrying them forward)
    ag = ag.fillna({
        "mean_s":0.0,"median_s":0.0,"std_s":0.0,"q10_s":0.0,"q90_s":0.0,
        "count_items":0.0,"unique_authors":0.0,
        "p_pos_share":0.0,"p_neg_share":0.0,"p_neu_share":0.0,
        "mean_p_pos":0.0,"mean_p_neu":0.0,"mean_p_neg":0.0,
        "z_mean_s":0.0,"ewma3_s":0.0,"ewma7_s":0.0,
        "burst_day":0.0,"active_k":0.0,"active_rl":0.0
    })

    # prefix columns
    pref = src_prefix
    rename_map = {c: f"{pref}{c}" for c in ag.columns if c not in ["ticker","date"]}
    out = ag.rename(columns=rename_map)
    return out

# =========================
# Main: build and save daily CSVs (write to ROOT)
# =========================
# Load the three item-level scored sources.
tw_scored = pd.read_csv(CONFIG["SRC_TW_SCORED"], parse_dates=["date"])
rd_scored = pd.read_csv(CONFIG["SRC_RD_SCORED"], parse_dates=["date"])
sp_scored = pd.read_csv(CONFIG["SRC_SP_SCORED"], parse_dates=["date"])

# Aggregate each source with its own column prefix but identical settings.
tw = aggregate_one(tw_scored, "Tw_", clip=CONFIG["CLIP_STRATEGY"], activity_k=CONFIG["ACTIVITY_K"])
rd = aggregate_one(rd_scored, "Rd_", clip=CONFIG["CLIP_STRATEGY"], activity_k=CONFIG["ACTIVITY_K"])
nw = aggregate_one(sp_scored, "Nw_SP500_", clip=CONFIG["CLIP_STRATEGY"], activity_k=CONFIG["ACTIVITY_K"])

# Enforce a deterministic row order before writing.
tw = tw.sort_values(["ticker","date"]).reset_index(drop=True)
rd = rd.sort_values(["ticker","date"]).reset_index(drop=True)
nw = nw.sort_values(["ticker","date"]).reset_index(drop=True)

tw.to_csv(CONFIG["OUT_TW_DAILY"], index=False)
rd.to_csv(CONFIG["OUT_RD_DAILY"], index=False)
nw.to_csv(CONFIG["OUT_SP_DAILY"], index=False)

# Report the absolute path and size of each file just written.
for f in [CONFIG["OUT_TW_DAILY"], CONFIG["OUT_RD_DAILY"], CONFIG["OUT_SP_DAILY"]]:
    p = Path(f)
    print("Written:", p.resolve(), "size=", p.stat().st_size, "bytes")

# =========================
# Post-run verification
# =========================
# Feature stems (column names without the source prefix) every daily file may contain.
_DAILY_FEATURE_STEMS = [
    "mean_s", "median_s", "std_s", "q10_s", "q90_s",
    "count_items", "unique_authors",
    "p_pos_share", "p_neg_share", "p_neu_share",
    "mean_p_pos", "mean_p_neu", "mean_p_neg",
    "z_mean_s", "ewma3_s", "ewma7_s",
    "zero_day", "burst_day", "active_k", "active_rl",
]

# Source prefix -> allowed feature stems. Each prefix gets its own list copy so
# editing one entry cannot silently change the others.
ALLOWED = {
    prefix: list(_DAILY_FEATURE_STEMS)
    for prefix in ("Tw_", "Rd_", "Nw_SP500_")
}


def check_df(df, pref):
    """Validate the feature columns of a daily frame and summarise coverage.

    Every column other than ``ticker``/``date`` must start with *pref* and,
    once that leading prefix is removed, be listed in ``ALLOWED[pref]``.

    Args:
        df: daily feature frame with ``ticker`` and ``date`` columns.
        pref: source prefix; must be a key of ``ALLOWED``.

    Returns:
        DataFrame with one row per ticker and the ``min``, ``max`` and
        ``count`` of its dates.

    Raises:
        AssertionError: if any feature column is unprefixed or not allowed.
    """
    allowed = ALLOWED[pref]
    feature_cols = [c for c in df.columns if c not in ("ticker", "date")]
    # Strip only the leading prefix: str.replace would also accept columns that
    # lack the prefix entirely and would remove occurrences inside the name.
    bad = [
        c for c in feature_cols
        if not c.startswith(pref) or c[len(pref):] not in allowed
    ]
    if bad:
        raise AssertionError(f"Unexpected columns for {pref}: {bad}")
    return df.groupby("ticker")["date"].agg(["min", "max", "count"]).reset_index()

# Re-read the files from disk so the checks run on what was actually written.
tw_v = pd.read_csv(CONFIG["OUT_TW_DAILY"], parse_dates=["date"])
rd_v = pd.read_csv(CONFIG["OUT_RD_DAILY"], parse_dates=["date"])
nw_v = pd.read_csv(CONFIG["OUT_SP_DAILY"], parse_dates=["date"])

# check_df raises on unexpected columns; otherwise print per-ticker date coverage.
print("Twitter coverage:\n", check_df(tw_v, "Tw_").to_string(index=False))
print("Reddit coverage:\n", check_df(rd_v, "Rd_").to_string(index=False))
print("SP500 coverage:\n", check_df(nw_v, "Nw_SP500_").to_string(index=False))

def align_check(df_name, df):
    """Assert that *df* has a value for every row of the trading calendar.

    The module-level ``TRADING_CAL`` is left-joined with *df* and the
    source's ``*_mean_s`` column is inspected for gaps. *df_name* selects
    the column: ``"Twitter"`` and ``"Reddit"`` map to their own prefixes,
    any other name is treated as the SP500 news source.

    Raises:
        AssertionError: if any calendar row has no matching value.
    """
    probe_by_source = {"Twitter": "Tw_mean_s", "Reddit": "Rd_mean_s"}
    probe_col = probe_by_source.get(df_name, "Nw_SP500_mean_s")

    joined = TRADING_CAL.merge(df, on=["ticker","date"], how="left")
    miss = joined[probe_col].isna().sum()
    if miss > 0:
        raise AssertionError(f"{df_name}: missing days vs calendar = {miss}")
    print(f"{df_name}: aligned to trading calendar.")

# Each call raises AssertionError if its file misses any calendar day.
align_check("Twitter", tw_v)
align_check("Reddit", rd_v)
align_check("News", nw_v)

# Cross-source key alignment
def keyset(df):
    """Return the set of (ticker, 'YYYY-MM-DD') keys present in *df*."""
    day_labels = df["date"].dt.strftime("%Y-%m-%d")
    return {(ticker, day) for ticker, day in zip(df["ticker"], day_labels)}
# All three daily files should cover exactly the same (ticker, date) keys.
keys_tw = keyset(tw_v); keys_rd = keyset(rd_v); keys_nw = keyset(nw_v)
if not (keys_tw == keys_rd == keys_nw):
    # Only a warning here: a mismatch is printed, not raised.
    print("WARNING: cross-source key mismatch.")
else:
    print("Cross-source key alignment: OK")

if tw_v.empty or rd_v.empty or nw_v.empty:
    print("\nOne or more daily files are empty. Check inputs and date windows in Notebook 1.")
else:
    print("\nDaily feature files look OK.")

# =========================
# Optional diagnostics: first non-zero days per ticker
# =========================
def first_nonzero(df, prefix, name):
    """Print, per ticker, the earliest day on which the source has items.

    Rows whose ``{prefix}count_items`` is positive are sorted by ticker and
    date and the first row of each ticker is printed together with its
    ``{prefix}mean_s``. ``None`` is printed when no row qualifies.
    """
    count_col = f"{prefix}count_items"
    has_items = df[df[count_col] > 0]
    ordered = has_items.sort_values(["ticker","date"])
    firsts = ordered.groupby("ticker").head(1)

    print(f"\n{name} first non-zero days:")
    if firsts.empty:
        print("None")
        return
    shown = firsts[["ticker","date", count_col, f"{prefix}mean_s"]]
    print(shown.to_string(index=False))

# Show where each source's data actually begins for every ticker.
first_nonzero(tw_v, "Tw_", "Twitter")
first_nonzero(rd_v, "Rd_", "Reddit")
first_nonzero(nw_v, "Nw_SP500_", "News")

# =========================
# Single-file downloads (Colab)
# =========================
# Best effort: outside Colab the import fails and the reason is printed
# instead of stopping the notebook.
try:
    from google.colab import files
    for f in [CONFIG["OUT_TW_DAILY"], CONFIG["OUT_RD_DAILY"], CONFIG["OUT_SP_DAILY"]]:
        if Path(f).exists():
            files.download(f)
except Exception as e:
    print("Direct download not available in this environment:", e)

# =========================
# Daily CSV Validation Pack (strict, EMR-aligned)
# =========================
import json, math
import numpy as np
import pandas as pd
from pathlib import Path

# ---- Config (adjust only if your filenames differ)
# Paths and threshold used by the validation pack. These are literal values,
# independent of the CONFIG dict defined earlier in the notebook.
CFG = {
    "TW": "twitter_daily.csv",
    "RD": "reddit_daily.csv",
    "NW": "sp500_daily.csv",
    "PX_TIDY": "historical_stock_yfinance_full_2021_2023_tidy.csv",
    "ACTIVITY_K": 3,
}
# Source prefix -> feature names (without prefix) permitted in that daily file.
# This rebinds the ALLOWED name defined earlier with the same contents.
ALLOWED = {
    "Tw_": ["mean_s","median_s","std_s","q10_s","q90_s",
            "count_items","unique_authors",
            "p_pos_share","p_neg_share","p_neu_share",
            "mean_p_pos","mean_p_neu","mean_p_neg",
            "z_mean_s","ewma3_s","ewma7_s","zero_day","burst_day","active_k","active_rl"],
    "Rd_": ["mean_s","median_s","std_s","q10_s","q90_s",
            "count_items","unique_authors",
            "p_pos_share","p_neg_share","p_neu_share",
            "mean_p_pos","mean_p_neu","mean_p_neg",
            "z_mean_s","ewma3_s","ewma7_s","zero_day","burst_day","active_k","active_rl"],
    "Nw_SP500_": ["mean_s","median_s","std_s","q10_s","q90_s",
                  "count_items","unique_authors",
                  "p_pos_share","p_neg_share","p_neu_share",
                  "mean_p_pos","mean_p_neu","mean_p_neg",
                  "z_mean_s","ewma3_s","ewma7_s","zero_day","burst_day","active_k","active_rl"],
}

# ---- Helpers
def _read_strict(path):
    if not Path(path).exists():
        return None, [f"Missing file: {path}"]
    try:
        df = pd.read_csv(path, parse_dates=["date"])
    except Exception as e:
        return None, [f"Unreadable CSV {path}: {e}"]
    # Drop accidental index columns
    for junk in [c for c in df.columns if c.lower().startswith("unnamed:")]:
        df = df.drop(columns=junk)
    return df, []

def _build_calendar(px_path):
    if not Path(px_path).exists():
        return None, [f"Missing tidy price file: {px_path}"]
    px = pd.read_csv(px_path, parse_dates=["Date"])
    if "Ticker" in px.columns:
        px = px.rename(columns={"Date":"date","Ticker":"ticker"})
    else:
        px = px.rename(columns={"Date":"date"})
        if "ticker" not in px.columns:
            if "Symbol" in px.columns:
                px = px.rename(columns={"Symbol":"ticker"})
            else:
                return None, ["Tidy price file has no ticker column (Ticker/Symbol)."]
    cal = px[["date","ticker"]].drop_duplicates().sort_values(["ticker","date"])
    return cal, []

def _smoke(df, pref, allowed):
    issues = []
    base = {"date","ticker"}
    if not base.issubset(df.columns):
        issues.append(f"{pref} missing base cols {base - set(df.columns)}")
    cols = [c for c in df.columns if c not in base]
    bad = [c for c in cols if c.replace(pref,"") not in allowed]
    if bad:
        issues.append(f"{pref} unexpected cols: {bad}")
    dups = df.duplicated(["ticker","date"]).sum()
    if dups:
        issues.append(f"{pref} duplicate (ticker,date) rows: {dups}")
    if not np.issubdtype(df["date"].dtype, np.datetime64):
        issues.append(f"{pref} date not datetime64")
    # integer-like checks for counts
    for c in [f"{pref}count_items", f"{pref}unique_authors"]:
        if c in df.columns:
            nonint = (~(df[c].fillna(0) % 1 == 0)).sum()
            if nonint:
                issues.append(f"{c} not integer-like rows: {nonint}")
            neg = (df[c] < 0).sum()
            if neg:
                issues.append(f"{c} negative rows: {neg}")
    return issues

def _range_checks(df, pref):
    issues, infos = [], []
    EPS = 1e-10
    c = lambda s: f"{pref}{s}"
    # shares in [0,1]
    for s in ["p_pos_share","p_neg_share","p_neu_share"]:
        if c(s) in df.columns:
            x = df[c(s)]
            bad = ((x < -EPS) | (x > 1+EPS)).sum()
            if bad: issues.append(f"{c(s)} out of [0,1] by >{EPS}: {bad}")
    # daily mean probabilities in [0,1]
    for s in ["mean_p_pos","mean_p_neu","mean_p_neg"]:
        if c(s) in df.columns:
            x = df[c(s)]
            bad = ((x < -EPS) | (x > 1+EPS)).sum()
            if bad: issues.append(f"{c(s)} out of [0,1] by >{EPS}: {bad}")
    # std >= 0
    if (df[c("std_s")] < -EPS).any():
        issues.append(f"{c('std_s')} negative by >{EPS}")
    # q10 <= q90
    if (df[c("q10_s")] > df[c("q90_s")] + EPS).any():
        issues.append(f"{c('q10_s')} > {c('q90_s')} by >{EPS}")
    # flags in {0,1}
    for s in ["zero_day","burst_day","active_k","active_rl"]:
        if c(s) in df.columns:
            x = df[c(s)]
            bad = (~x.isin([0,1])).sum()
            if bad: issues.append(f"{c(s)} not binary: {bad}")
    # median between q10 and q90 on active days (must hold)
    msk = df[c("count_items")] > 0
    if ((df.loc[msk, c("median_s")] < df.loc[msk, c("q10_s")] - EPS) |
        (df.loc[msk, c("median_s")] > df.loc[msk, c("q90_s")] + EPS)).any():
        issues.append(f"{c('median_s')} outside [q10,q90] on active days")
    # mean outside band = info only
    skew_rows = df.loc[msk & (
        (df[c("mean_s")] < df[c("q10_s")] - EPS) |
        (df[c("mean_s")] > df[c("q90_s")] + EPS)
    ), ["ticker","date",c("count_items"),c("q10_s"),c("mean_s"),c("q90_s")]]
    if not skew_rows.empty:
        infos.append(("mean_outside_q10_q90", skew_rows.sort_values(["ticker","date"]).reset_index(drop=True)))
    return issues, infos

def _audit_time_guard_active_only(df, pref, activity_k):
    """Recompute the derived columns from the stored ones and report mismatches.

    For each ticker, ``z_mean_s``, ``burst_day`` and ``active_rl`` are
    rebuilt from ``mean_s``/``count_items`` using only rows with
    ``count_items > 0`` and compared with the stored values; ``active_k`` is
    rebuilt on all rows. Rows without items are expected to hold 0.0.

    Args:
        df: daily feature frame for one source.
        pref: column prefix of that source.
        activity_k: threshold used for the ``active_k`` flag.

    Returns:
        List of mismatch messages, one per (column, ticker) that disagrees.
    """
    # Recompute exactly how the notebook defines z/flags (active-only references)
    issues = []
    c = lambda s: f"{pref}{s}"
    for t, tdf in df.groupby("ticker"):
        tdf = tdf.sort_values("date").reset_index(drop=True)
        # Reference statistics are built from days that have items only.
        active = tdf[tdf[c("count_items")] > 0].copy()

        # Expanding mean/std shifted by one row: the current day is excluded.
        mu  = active[c("mean_s")].expanding().mean().shift(1)
        sd  = active[c("mean_s")].expanding().std().shift(1)
        z   = ((active[c("mean_s")] - mu) / sd).replace([np.inf,-np.inf], np.nan).fillna(0.0)
        # Re-align to the full date list; days without items get 0.0.
        z_full = tdf[["date"]].merge(active[["date"]].assign(_z=z.values), on="date", how="left")["_z"].fillna(0.0)
        if not np.allclose(z_full.values, tdf[c("z_mean_s")].values, atol=1e-8, rtol=1e-5):
            issues.append(f"{pref} z_mean_s mismatch {t}")

        # Rolling references, also shifted by one row; NaN references become 0.0.
        r90  = active[c("count_items")].rolling(60, min_periods=20).quantile(0.9).shift(1).fillna(0.0)
        rmed = active[c("count_items")].rolling(21, min_periods=10).median().shift(1).fillna(0.0)
        burst = (active[c("count_items")] > r90).astype(float)
        arl   = (active[c("count_items")] > rmed).astype(float)

        burst_full = tdf[["date"]].merge(active[["date"]].assign(_b=burst.values), on="date", how="left")["_b"].fillna(0.0)
        arl_full   = tdf[["date"]].merge(active[["date"]].assign(_a=arl.values),   on="date", how="left")["_a"].fillna(0.0)

        # Flags are compared exactly (no tolerance).
        if (burst_full != tdf[c("burst_day")]).any(): issues.append(f"{pref} burst_day mismatch {t}")
        if (arl_full   != tdf[c("active_rl")]).any(): issues.append(f"{pref} active_rl mismatch {t}")

        # active_k depends on the day's own count only, so it is checked on all rows.
        ak = (tdf[c("count_items")] >= activity_k).astype(float)
        if (ak != tdf[c("active_k")]).any(): issues.append(f"{pref} active_k mismatch {t}")
    return issues

def _coverage(df, pref, cal):
    issues = []
    g = df.groupby("ticker")["date"].agg(["min","max","count"]).reset_index()
    if cal is not None:
        merged = cal.merge(df[["ticker","date"]], on=["ticker","date"], how="left", indicator=True)
        miss = (merged["_merge"] != "both").sum()
        if miss:
            by_t = merged[merged["_merge"] != "both"].groupby("ticker").size()
            issues.append(f"{pref} missing trading days vs calendar: total {miss} -> {by_t.to_dict()}")
    for t, tdf in df.groupby("ticker"):
        if not tdf["date"].is_monotonic_increasing:
            issues.append(f"{pref} dates not monotonic for {t}")
    return issues, g

def validate_daily_file(path, pref, allowed_cols, calendar, activity_k):
    """Run every validation stage on one daily CSV.

    Returns ``(report, frame)``. *report* is a dict with keys ``file``,
    ``pref``, ``errors`` and ``infos``. If the file cannot be read, the
    report carries only the read errors and *frame* is ``None``; otherwise
    the smoke, range, coverage and time-guard checks contribute in that
    order and the loaded frame is returned.
    """
    errors, infos = [], []
    rep = {"file": path, "pref": pref, "errors": errors, "infos": infos}

    frame, read_errors = _read_strict(path)
    if read_errors:
        errors.extend(read_errors)
        return rep, None

    errors.extend(_smoke(frame, pref, allowed_cols))

    range_errors, range_infos = _range_checks(frame, pref)
    errors.extend(range_errors)
    infos.extend(range_infos)

    # The per-ticker coverage table is computed but not part of the report.
    coverage_errors, _coverage_table = _coverage(frame, pref, calendar)
    errors.extend(coverage_errors)

    errors.extend(_audit_time_guard_active_only(frame, pref, activity_k))
    return rep, frame

# ---- Run validations
# Build the reference calendar; if that fails, continue without the
# calendar-coverage check rather than aborting.
calendar, cal_errs = _build_calendar(CFG["PX_TIDY"])
if cal_errs:
    print("Calendar error:", cal_errs[0])
    calendar = None  # proceed with partial checks

# One report per source; the loaded frames are kept for the cross-source check.
reports = []
rep_tw, df_tw = validate_daily_file(CFG["TW"], "Tw_", ALLOWED["Tw_"], calendar, CFG["ACTIVITY_K"]); reports.append(rep_tw)
rep_rd, df_rd = validate_daily_file(CFG["RD"], "Rd_", ALLOWED["Rd_"], calendar, CFG["ACTIVITY_K"]); reports.append(rep_rd)
rep_nw, df_nw = validate_daily_file(CFG["NW"], "Nw_SP500_", ALLOWED["Nw_SP500_"], calendar, CFG["ACTIVITY_K"]); reports.append(rep_nw)

# Cross-source key alignment
def _keyset(df):
    return set(zip(df["ticker"], df["date"].dt.strftime("%Y-%m-%d")))
# The cross-source comparison only runs when all three files were readable.
if all(d is not None for d in [df_tw, df_rd, df_nw]):
    k_tw, k_rd, k_nw = _keyset(df_tw), _keyset(df_rd), _keyset(df_nw)
    if not (k_tw == k_rd == k_nw):
        # Each count covers keys found in that source and in neither of the
        # other two; a key shared by exactly two sources appears in none of
        # these counts.
        only_tw = len(k_tw - k_rd - k_nw)
        only_rd = len(k_rd - k_tw - k_nw)
        only_nw = len(k_nw - k_tw - k_rd)
        reports.append({"file":"CROSS-SOURCE","pref":"*","errors":[f"Key mismatch Tw:{only_tw} Rd:{only_rd} Nw:{only_nw}"],"infos":[]})
    else:
        reports.append({"file":"CROSS-SOURCE","pref":"*","errors":[], "infos":["Cross-source key alignment: PASS"]})

# ---- Print compact report and write JSON
any_errors = False
for rep in reports:
    tag = rep["file"]
    if rep["errors"]:
        any_errors = True
        print(f"[FAIL] {tag} ->")
        # Show at most ten errors per file to keep the output compact.
        for e in rep["errors"][:10]:
            print("   -", e)
        if len(rep["errors"]) > 10:
            print(f"   ... and {len(rep['errors'])-10} more")
    else:
        print(f"[PASS] {tag}")
    for info in rep["infos"]:
        # Infos are either (label, DataFrame) tuples or plain strings.
        if isinstance(info, tuple):
            label, frame = info[0], info[1]
            print(f"  INFO {label}:")
            print(frame.to_string(index=False))
        else:
            print(" ", info)

# default=str stringifies anything json cannot encode natively (e.g. the
# DataFrames carried inside info tuples).
Path("daily_validation_report.json").write_text(json.dumps(reports, default=str, indent=2))
print("\nValidation JSON -> daily_validation_report.json")
print("\nVIABILITY:", "PASS ✅" if not any_errors else "FAIL ❌")

Notebook 2 config OK (root-only).
All required root files are present.
twitter_scored: columns -> ['date', 'ticker', 'text', 'author', 's_finbert', 'p_pos', 'p_neu', 'p_neg', 's_deberta', 's_distil', 'model_finbert', 'model_deberta', 'model_distil']
reddit_scored: columns -> ['date', 'ticker', 'text', 'author', 's_finbert', 'p_pos', 'p_neu', 'p_neg', 's_deberta', 's_distil', 'model_finbert', 'model_deberta', 'model_distil']
sp500_scored: columns -> ['date', 'ticker', 'text', 'author', 's_finbert', 'p_pos', 'p_neu', 'p_neg', 's_deberta', 's_distil', 'model_finbert', 'model_deberta', 'model_distil']
Trading calendar: 2021-01-04 -> 2023-12-29
Written: /content/twitter_daily.csv size= 577463 bytes
Written: /content/reddit_daily.csv size= 397789 bytes
Written: /content/sp500_daily.csv size= 394580 bytes
Twitter coverage:
 ticker        min        max  count
  AAPL 2021-01-04 2023-12-29    753
   AMD 2021-01-04 2023-12-29    753
  AMZN 2021-01-04 2023-12-29    753
  MSFT 2021-01-04 2023-12-2

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[PASS] twitter_daily.csv
[PASS] reddit_daily.csv
[PASS] sp500_daily.csv
[PASS] CROSS-SOURCE
  Cross-source key alignment: PASS

Validation JSON -> daily_validation_report.json

VIABILITY: PASS ✅


In [None]:
# === NB2 tighten-ups: explicit row-count gate + human-readable summary (fixed) ===
import numpy as np
import pandas as pd
from pathlib import Path

# Settings for this cell. Each value is taken from the notebook-level CONFIG
# when that dict (and key) exists, otherwise a literal default is used, so the
# cell also runs in a fresh kernel.
CFG = {
    "PX_TIDY": (CONFIG["PX_TIDY"] if "CONFIG" in globals() and "PX_TIDY" in CONFIG
                else "historical_stock_yfinance_full_2021_2023_tidy.csv"),
    "TICKERS": (CONFIG["TICKERS"] if "CONFIG" in globals() and "TICKERS" in CONFIG
                else ["AAPL","AMZN","MSFT","TSLA","AMD"]),
    # (output path, column prefix) pairs, one per source.
    "OUTS": [
        (CONFIG["OUT_TW_DAILY"] if "CONFIG" in globals() and "OUT_TW_DAILY" in CONFIG else "twitter_daily.csv", "Tw_"),
        (CONFIG["OUT_RD_DAILY"] if "CONFIG" in globals() and "OUT_RD_DAILY" in CONFIG else "reddit_daily.csv", "Rd_"),
        (CONFIG["OUT_SP_DAILY"] if "CONFIG" in globals() and "OUT_SP_DAILY" in CONFIG else "sp500_daily.csv", "Nw_SP500_"),
    ],
}

def _calendar_from_prices(px_path: str, tickers: list[str]) -> pd.DataFrame:
    px = pd.read_csv(px_path, parse_dates=["Date"])
    if "Ticker" in px.columns:
        px = px.rename(columns={"Date":"date","Ticker":"ticker"})
    else:
        px = px.rename(columns={"Date":"date"})
        if "ticker" not in px.columns and "Symbol" in px.columns:
            px = px.rename(columns={"Symbol":"ticker"})
    px = px[px["ticker"].isin(tickers)].copy()
    cal = (px[["date","ticker"]].drop_duplicates()
           .sort_values(["ticker","date"]).reset_index(drop=True))
    return cal

# Reuse the calendar built earlier in the notebook when it exists; otherwise
# rebuild it from the price file. The rebuilt calendar is filtered by ticker
# only, not by date window.
CAL = (TRADING_CAL[["date","ticker"]].drop_duplicates()
       .sort_values(["ticker","date"]).reset_index(drop=True)
       if "TRADING_CAL" in globals() else _calendar_from_prices(CFG["PX_TIDY"], CFG["TICKERS"]))

# Every daily file must have exactly one row per calendar (ticker, date).
expected_rows = int(len(CAL))
per_ticker_expected = CAL.groupby("ticker")["date"].nunique()

summary_rows, errors = [], []

for out_path, pref in CFG["OUTS"]:
    p = Path(out_path)
    # A missing file is recorded as a warning; the remaining files are still checked.
    if not p.exists():
        errors.append(f"Missing daily file: {out_path}")
        continue

    df = pd.read_csv(p, parse_dates=["date"])

    # 1) Explicit row-count hard gate
    if len(df) != expected_rows:
        raise AssertionError(f"{out_path}: rows {len(df)} != expected {expected_rows} from trading calendar.")

    # 2) Per-ticker coverage hard gate
    # (compares the number of distinct dates per ticker, not the dates themselves)
    per_ticker_got = df.groupby("ticker")["date"].nunique().reindex(per_ticker_expected.index, fill_value=0)
    mism = (per_ticker_got - per_ticker_expected)
    if (mism != 0).any():
        bad = mism[mism != 0].to_dict()
        raise AssertionError(f"{out_path}: per-ticker day-count mismatch vs calendar -> {bad}")

    # Duplicate key hard gate
    dups = int(df.duplicated(["ticker","date"]).sum())
    if dups:
        raise AssertionError(f"{out_path}: duplicate (ticker,date) rows = {dups}")

    # Non-finite numeric guard (FIXED)
    # Counts NaN and +/-inf across all numeric columns.
    arr = df.select_dtypes(include=[np.number]).to_numpy()
    nonfinite = int((~np.isfinite(arr)).sum())
    if nonfinite:
        raise AssertionError(f"{out_path}: found {nonfinite} non-finite numeric values.")

    # Human-readable summary row
    summary_rows.append({
        "file": out_path,
        "min_date": df["date"].min().date().isoformat(),
        "max_date": df["date"].max().date().isoformat(),
        "rows": int(len(df)),
        "expected_rows": expected_rows,
        "dupes": dups,
        # Hard-coded: this line is only reached after the row-count and
        # per-ticker gates above have passed.
        "missing_vs_calendar": 0,
        "tickers": ",".join(sorted(df["ticker"].unique()))
    })

# Explicit column list keeps the CSV header stable even when no file was summarised.
summary_df = pd.DataFrame(summary_rows, columns=[
    "file","min_date","max_date","rows","expected_rows","dupes","missing_vs_calendar","tickers"
])
summary_df.to_csv("daily_summary_report.csv", index=False)
print("NB2 tighten-ups: wrote daily_summary_report.csv")
# `errors` only ever holds missing-file messages; gate failures raise instead.
if errors:
    print("NB2 tighten-ups warnings:")
    for e in errors:
        print(" -", e)
else:
    print("NB2 tighten-ups: all daily files pass explicit gates.")

NB2 tighten-ups: wrote daily_summary_report.csv
NB2 tighten-ups: all daily files pass explicit gates.
