In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# --- Prices / market data ---
px = pd.read_csv("Data/10_day_run/merged_dataset.csv")
px["Week"] = pd.to_datetime(px["Week"], errors="coerce")
px = px.sort_values("Week").reset_index(drop=True)

In [3]:
# Forward weekly return (trade from Week t close to Week t+1 close)
px["ret_fwd"] = px["Close"].pct_change().shift(-1)

In [5]:
# --- Load model predictions (unweighted) ---
mkt = pd.read_csv("Data/10_day_run/day4_LR_market_only_preds.csv")
nfv = pd.read_csv("Data/10_day_run/day4_LR_news_filings_volume_preds.csv")
rw  = pd.read_csv("Data/10_day_run/rw_predictions_all.csv")

In [6]:
for df in (mkt, nfv, rw):
    df["Week"] = pd.to_datetime(df["Week"], errors="coerce")
    if "y_pred" in df.columns: df["y_pred"] = df["y_pred"].astype(int)
    if "y_true" in df.columns: df["y_true"] = df["y_true"].astype(int)


In [7]:
# --- Build signals s_t = P(Up) - P(Down) ---
def add_signal(df):
    df = df.copy()
    # Ensure prob columns exist in [-1, 0, 1] order
    pcols = [c for c in df.columns if c.startswith("p_")]
    need = {"p_-1","p_0","p_1"}
    if not need.issubset(set(pcols)):
        raise ValueError("Preds must include columns p_-1, p_0, p_1.")
    df["sig"] = df["p_1"] - df["p_-1"]
    return df[["Week","split_id","y_true","y_pred","sig"]]

mkt_s = add_signal(mkt).rename(columns={"sig":"sig_mkt"})
nfv_s = add_signal(nfv).rename(columns={"sig":"sig_nfv"})

In [8]:
# --- RW position from label at week t: +1 for Up, 0 for Neutral, -1 for Down ---
rw_pos = rw[["Week","split_id","y_pred"]].copy()
rw_pos["pos_rw"] = rw_pos["y_pred"].map({-1:-1, 0:0, 1:1})
rw_pos = rw_pos[["Week","split_id","pos_rw"]]

In [9]:
# --- Merge to a master OOS panel (keep entries present in each preds file separately) ---
base_cols = ["Week","Close","ret_fwd"]
base = px[base_cols].copy()

mkt_panel = base.merge(mkt_s, on="Week", how="inner").merge(rw_pos, on=["Week","split_id"], how="left")
nfv_panel = base.merge(nfv_s, on="Week", how="inner").merge(rw_pos, on=["Week","split_id"], how="left")


In [10]:
# --- For fair head-to-head: common subset across both LR panels and RW ---
common_weeks = set(mkt_panel["Week"]).intersection(set(nfv_panel["Week"]))
common = base[base["Week"].isin(common_weeks)].merge(mkt_s, on="Week", how="inner")\
                                             .merge(nfv_s, on=["Week","split_id"], how="inner")\
                                             .merge(rw_pos, on=["Week","split_id"], how="left")


In [11]:
# Save for inspection
mkt_panel.to_csv("Data/10_day_run/day6_mkt_panel.csv", index=False)
nfv_panel.to_csv("Data/10_day_run/day6_nfv_panel.csv", index=False)
common.to_csv("Data/10_day_run/day6_common_panel.csv", index=False)

print(f"OOS rows — LR market_only: {len(mkt_panel)}, LR news+filings+vol: {len(nfv_panel)}, common: {len(common)}")

OOS rows — LR market_only: 120, LR news+filings+vol: 120, common: 120


## Strategy builders, transaction costs, and metrics

In [12]:
from scipy.stats import pearsonr, spearmanr

In [14]:
def positions_from_signal(sig, rule="sign", tercile=0.3333):
    sig = sig.copy()
    if rule == "sign":
        # +1 if s>0, -1 if s<0, 0 if s==0
        pos = sig.apply(lambda x: 1 if x>0 else (-1 if x<0 else 0))
    elif rule == "terciles":
        # long top tercile, short bottom tercile, 0 otherwise
        lo = sig.quantile(tercile)
        hi = sig.quantile(1 - tercile)
        def f(x):
            if x <= lo: return -1
            if x >= hi: return  1
            return 0
        pos = sig.apply(f)
    else:
        raise ValueError("Unknown rule")
    return pos

def apply_costs_and_returns(df, pos_col, cost_bps=5.0):
    df = df.copy().sort_values("Week").reset_index(drop=True)
    # cost per side in fraction
    c = cost_bps / 10000.0
    # turnover relative to previous position
    df["pos_prev"] = df[pos_col].shift(1).fillna(0)
    df["turnover"] = (df[pos_col] - df["pos_prev"]).abs()
    df["cost"] = df["turnover"] * c
    # Realized net return using position decided at week t for next week's ret_fwd_t
    df["ret_gross"] = df[pos_col] * df["ret_fwd"]
    df["ret_net"] = df["ret_gross"] - df["cost"]
    return df

def sharpe_annualized(ret, periods_per_year=52):
    r = ret.dropna()
    if r.std(ddof=1) == 0 or len(r) < 3: return np.nan
    return (r.mean() / r.std(ddof=1)) * np.sqrt(periods_per_year)

def sortino_annualized(ret, periods_per_year=52):
    r = ret.dropna()
    downside = r[r < 0]
    if downside.std(ddof=1) == 0 or len(r) < 3: return np.nan
    return (r.mean() / downside.std(ddof=1)) * np.sqrt(periods_per_year)

def max_drawdown(ret):
    # ret is weekly net return series
    eq = (1 + ret.fillna(0)).cumprod()
    peak = eq.cummax()
    dd = eq/peak - 1.0
    return dd.min()

def info_coeff(sig, fwd_ret):
    s = pd.Series(sig).astype(float)
    r = pd.Series(fwd_ret).astype(float)
    # align
    idx = s.index.intersection(r.index)
    s = s.loc[idx]; r = r.loc[idx]
    if len(s) < 5: 
        return {"pearson": np.nan, "spearman": np.nan}
    pr = pearsonr(s, r)[0]
    sr = spearmanr(s, r)[0]
    return {"pearson": pr, "spearman": sr}

# Moving-block bootstrap for Sharpe & IC CIs
def mbb_indices(n, block_len=10, rng=None):
    if rng is None: rng = np.random.default_rng(42)
    k = int(np.ceil(n / block_len))
    starts = rng.integers(low=0, high=n, size=k)
    idx = []
    for s in starts:
        idx.extend([(s + i) % n for i in range(block_len)])
    return np.array(idx[:n], dtype=int)

def bootstrap_metrics(df, pos_col, B=2000, block_len=10):
    # compute base series
    r = df["ret_net"].values
    s = df[pos_col].astype(float).values
    sig = df.get("sig_mkt", df.get("sig_nfv", None))
    if sig is None:
        # when evaluating RW/BH we don't have sig; use zeros to avoid IC comp
        sig = np.zeros_like(r)
    fwd = df["ret_fwd"].values
    n = len(r)
    rng = np.random.default_rng(7)

    # point estimates
    sh = sharpe_annualized(pd.Series(r))
    so = sortino_annualized(pd.Series(r))
    dd = max_drawdown(pd.Series(r))
    to = pd.Series(df["turnover"]).mean()
    icp = info_coeff(pd.Series(sig), pd.Series(fwd))["pearson"]
    ics = info_coeff(pd.Series(sig), pd.Series(fwd))["spearman"]

    # bootstrap
    sh_b, icp_b, ics_b = [], [], []
    for _ in range(B):
        idx = mbb_indices(n, block_len=block_len, rng=rng)
        r_b = r[idx]
        sh_b.append(sharpe_annualized(pd.Series(r_b)))
        # ICs require resampling (sig, fwd) *pairwise*
        sig_b = sig[idx]; fwd_b = fwd[idx]
        icp_b.append(np.corrcoef(sig_b, fwd_b)[0,1] if np.std(sig_b)>0 and np.std(fwd_b)>0 else np.nan)
        ics_b.append(spearmanr(sig_b, fwd_b)[0] if len(sig_b)>3 else np.nan)

    def ci(arr):
        arr = np.array([x for x in arr if np.isfinite(x)])
        if len(arr)==0: return (np.nan, np.nan)
        return (np.nanpercentile(arr, 2.5), np.nanpercentile(arr, 97.5))

    return {
        "Sharpe": sh, "Sharpe_CI_low": ci(sh_b)[0], "Sharpe_CI_high": ci(sh_b)[1],
        "Sortino": so, "MaxDD": dd, "Turnover": to,
        "IC_Pearson": icp, "ICp_CI_low": ci(icp_b)[0], "ICp_CI_high": ci(icp_b)[1],
        "IC_Spearman": ics, "ICs_CI_low": ci(ics_b)[0], "ICs_CI_high": ci(ics_b)[1],
        "n_weeks": len(df)
    }


In [23]:
# --- Patch: safe correlations + silence constant-input warnings ---

import numpy as np, warnings
from scipy.stats import pearsonr, spearmanr
try:
    # Newer SciPy
    from scipy.stats import ConstantInputWarning
    warnings.filterwarnings("ignore", category=ConstantInputWarning)
except Exception:
    # Fallback: silence by message substring if ConstantInputWarning isn't available
    warnings.filterwarnings("ignore", message="An input array is constant; the correlation coefficient is not defined.")

def _clean_pair(x, y):
    x = np.asarray(x, float); y = np.asarray(y, float)
    m = np.isfinite(x) & np.isfinite(y)
    return x[m], y[m]

def safe_pearson(x, y):
    x, y = _clean_pair(x, y)
    if x.size < 3 or np.nanstd(x) == 0 or np.nanstd(y) == 0:
        return np.nan
    return pearsonr(x, y)[0]

def safe_spearman(x, y):
    x, y = _clean_pair(x, y)
    if x.size < 3 or np.nanstd(x) == 0 or np.nanstd(y) == 0:
        return np.nan
    return spearmanr(x, y)[0]

# Replace info_coeff to use the safe versions
def info_coeff(sig, fwd_ret):
    return {"pearson": safe_pearson(sig, fwd_ret), "spearman": safe_spearman(sig, fwd_ret)}

# Also update the bootstrap loop to use safe_* (edit in your Cell 2 if needed):
#   icp_b.append(safe_pearson(sig_b, fwd_b))
#   ics_b.append(safe_spearman(sig_b, fwd_b))

## Build strategies, apply costs, evaluate (own sample + common intersection)

In [24]:
# Build positions for LR signals
def add_positions(panel, sig_col, rule):
    df = panel.copy().sort_values("Week").reset_index(drop=True)
    df[f"pos_{rule}"] = positions_from_signal(df[sig_col], rule=rule)
    return df

# RW sign strategy
def add_rw_position(panel):
    df = panel.copy().sort_values("Week").reset_index(drop=True)
    df["pos_rw"] = df["pos_rw"].fillna(0)
    return df

# Buy & Hold
def add_bh_position(panel):
    df = panel.copy().sort_values("Week").reset_index(drop=True)
    df["pos_bh"] = 1
    return df

In [25]:
# --- Build all panels with positions ---
mkt_sign = add_positions(mkt_panel, "sig_mkt", rule="sign")
mkt_terc = add_positions(mkt_panel, "sig_mkt", rule="terciles")
nfv_sign = add_positions(nfv_panel, "sig_nfv", rule="sign")
nfv_terc = add_positions(nfv_panel, "sig_nfv", rule="terciles")

mkt_rw   = add_rw_position(mkt_panel)
nfv_rw   = add_rw_position(nfv_panel)

mkt_bh   = add_bh_position(mkt_panel)
nfv_bh   = add_bh_position(nfv_panel)

In [26]:
# --- Common intersection (for fair head-to-head) ---
com = common.sort_values("Week").reset_index(drop=True)
com_mkt_sign = com.copy(); com_mkt_sign["pos_sign"] = positions_from_signal(com["sig_mkt"], "sign")
com_mkt_terc = com.copy(); com_mkt_terc["pos_terciles"] = positions_from_signal(com["sig_mkt"], "terciles")
com_nfv_sign = com.copy(); com_nfv_sign["pos_sign_nfv"] = positions_from_signal(com["sig_nfv"], "sign")
com_nfv_terc = com.copy(); com_nfv_terc["pos_terciles_nfv"] = positions_from_signal(com["sig_nfv"], "terciles")
com_rw = com.copy(); com_rw["pos_rw"] = com_rw["pos_rw"].fillna(0)
com_bh = com.copy(); com_bh["pos_bh"] = 1

In [27]:
# --- Evaluate at two cost levels (5 bps and 10 bps per side) ---
def evaluate_suite(panel, label_prefix, cost_bps_list=(5.0, 10.0), variants=None):
    if variants is None:
        variants = []  # list of (pos_col, tag)
    rows = []
    equity_series = {}
    for bps in cost_bps_list:
        for pos_col, tag in variants:
            df_eval = apply_costs_and_returns(panel[["Week","ret_fwd",pos_col]].rename(columns={pos_col:"pos"}), "pos", cost_bps=bps)
            met = bootstrap_metrics(df_eval.rename(columns={"pos":"pos_eval"}), "pos_eval", B=2000, block_len=10)
            met.update({"strategy": f"{label_prefix}_{tag}", "cost_bps": bps})
            rows.append(met)
            # equity curve (for bps=5 only to keep file small)
            if bps == 5.0:
                eq = (1 + df_eval["ret_net"].fillna(0)).cumprod()
                equity_series[f"{label_prefix}_{tag}"] = pd.Series(eq.values, index=panel["Week"].values)
    return pd.DataFrame(rows), pd.DataFrame(equity_series)

In [31]:
# In evaluate_suite(), keep the signal column:
def evaluate_suite(panel, label_prefix, cost_bps_list=(5.0, 10.0), variants=None, sig_col=None):
    rows = []; equity_series = {}
    for bps in cost_bps_list:
        for pos_col, tag in variants:
            cols = ["Week","ret_fwd", pos_col]
            if sig_col and sig_col in panel.columns:
                cols.append(sig_col)
            df_eval = panel[cols].rename(columns={pos_col:"pos"})
            if sig_col and sig_col in df_eval.columns:
                # bootstrap_metrics will see 'sig_mkt' or 'sig_nfv'
                new_sig_name = "sig_mkt" if "mkt" in label_prefix else "sig_nfv"
                df_eval = df_eval.rename(columns={sig_col: new_sig_name})
            df_eval = apply_costs_and_returns(df_eval, "pos", cost_bps=bps)
            met = bootstrap_metrics(df_eval.rename(columns={"pos":"pos_eval"}), "pos_eval", B=2000, block_len=10)
            met.update({"strategy": f"{label_prefix}_{tag}", "cost_bps": bps})
            rows.append(met)
            if bps == 5.0:
                eq = (1 + df_eval["ret_net"].fillna(0)).cumprod()
                equity_series[f"{label_prefix}_{tag}"] = pd.Series(eq.values, index=panel["Week"].values)
    return pd.DataFrame(rows), pd.DataFrame(equity_series)

In [32]:
# Own-sample evaluations
own_metrics = []
own_equity = []

m, e = evaluate_suite(mkt_sign, "LRmkt", variants=[("pos_sign","sign")])
own_metrics.append(m); own_equity.append(e)
m, e = evaluate_suite(mkt_terc, "LRmkt", variants=[("pos_terciles","terc")])
own_metrics.append(m); own_equity.append(e)

m, e = evaluate_suite(nfv_sign, "LRnfv", variants=[("pos_sign","sign")])
own_metrics.append(m); own_equity.append(e)
m, e = evaluate_suite(nfv_terc, "LRnfv", variants=[("pos_terciles","terc")])
own_metrics.append(m); own_equity.append(e)

m, e = evaluate_suite(mkt_rw, "RW", variants=[("pos_rw","sign")])
own_metrics.append(m); own_equity.append(e)

m, e = evaluate_suite(mkt_bh, "BH", variants=[("pos_bh","long")])
own_metrics.append(m); own_equity.append(e)

own_metrics = pd.concat(own_metrics, ignore_index=True)
own_equity = pd.concat(own_equity, axis=1)
own_metrics.to_csv("Data/10_day_run/day6_metrics_own_sample.csv", index=False)
own_equity.to_csv("Data/10_day_run/day6_equity_own_sample_cost5bps.csv", index=True)

In [33]:
# Common-sample evaluations (use com_ panels)
common_metrics = []
common_equity = []

m, e = evaluate_suite(com_mkt_sign.rename(columns={"pos_sign":"pos"}), "LRmkt", variants=[("pos","sign")])
common_metrics.append(m); common_equity.append(e)
m, e = evaluate_suite(com_mkt_terc.rename(columns={"pos_terciles":"pos"}), "LRmkt", variants=[("pos","terc")])
common_metrics.append(m); common_equity.append(e)
m, e = evaluate_suite(com_nfv_sign.rename(columns={"pos_sign_nfv":"pos"}), "LRnfv", variants=[("pos","sign")])
common_metrics.append(m); common_equity.append(e)
m, e = evaluate_suite(com_nfv_terc.rename(columns={"pos_terciles_nfv":"pos"}), "LRnfv", variants=[("pos","terc")])
common_metrics.append(m); common_equity.append(e)
m, e = evaluate_suite(com_rw.rename(columns={"pos_rw":"pos"}), "RW", variants=[("pos","sign")])
common_metrics.append(m); common_equity.append(e)
m, e = evaluate_suite(com_bh.rename(columns={"pos_bh":"pos"}), "BH", variants=[("pos","long")])
common_metrics.append(m); common_equity.append(e)

common_metrics = pd.concat(common_metrics, ignore_index=True)
common_equity = pd.concat(common_equity, axis=1)
common_metrics.to_csv("Data/10_day_run/day6_metrics_common_sample.csv", index=False)
common_equity.to_csv("Data/10_day_run/day6_equity_common_sample_cost5bps.csv", index=True)

print("Saved:\n- Data/10_day_run/day6_metrics_own_sample.csv\n- Data/10_day_run/day6_metrics_common_sample.csv\n- Data/10_day_run/day6_equity_own_sample_cost5bps.csv\n- Data/10_day_run/day6_equity_common_sample_cost5bps.csv")

Saved:
- Data/10_day_run/day6_metrics_own_sample.csv
- Data/10_day_run/day6_metrics_common_sample.csv
- Data/10_day_run/day6_equity_own_sample_cost5bps.csv
- Data/10_day_run/day6_equity_common_sample_cost5bps.csv


In [34]:
# Compact table (common sample, cost=5 bps)
tab = pd.read_csv("Data/10_day_run/day6_metrics_common_sample.csv")
tab5 = tab[tab["cost_bps"]==5.0].copy()
cols = ["strategy","Sharpe","Sharpe_CI_low","Sharpe_CI_high","Sortino","MaxDD","Turnover",
        "IC_Pearson","ICp_CI_low","ICp_CI_high","IC_Spearman","ICs_CI_low","ICs_CI_high","n_weeks"]
tab5 = tab5.sort_values(["strategy"]).reset_index(drop=True)
tab5.to_csv("Data/10_day_run/day6_summary_table_cost5bps.csv", index=False)
tab5

Unnamed: 0,Sharpe,Sharpe_CI_low,Sharpe_CI_high,Sortino,MaxDD,Turnover,IC_Pearson,ICp_CI_low,ICp_CI_high,IC_Spearman,ICs_CI_low,ICs_CI_high,n_weeks,strategy,cost_bps
0,1.22748,0.129803,2.606359,1.855044,-0.176062,0.008333,,,,,,,120,BH_long,5.0
1,0.733946,-0.392658,2.06243,1.073875,-0.176062,0.508333,,,,,,,120,LRmkt_sign,5.0
2,0.735555,-0.563441,1.86049,1.180042,-0.11772,0.691667,,,,,,,120,LRmkt_terc,5.0
3,0.85351,-0.203566,1.998063,1.410556,-0.193934,0.675,,,,,,,120,LRnfv_sign,5.0
4,0.532202,-1.035379,1.967679,0.75914,-0.251549,0.716667,,,,,,,120,LRnfv_terc,5.0
5,-0.008578,-0.811462,0.878248,-0.009497,-0.227147,0.85,,,,,,,120,RW_sign,5.0


# Feature Importance

In [3]:
# === LR (market-only) coefficients across rolling splits ===
import numpy as np, pandas as pd
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# --- Load & target (same as Day 4) ---
df = pd.read_csv("Data/10_day_run/merged_dataset.csv")
df["Week"] = pd.to_datetime(df["Week"], errors="coerce")
df = df.sort_values("Week").reset_index(drop=True)
df["Return_protocol"] = df["Close"].pct_change()

delta = 0.01
def to_label(r):
    if pd.isna(r): return np.nan
    if r >  delta: return  1
    if r < -delta: return -1
    return 0

df["Target_protocol"] = df["Return_protocol"].apply(to_label)
df = df.dropna(subset=["Target_protocol"]).reset_index(drop=True)
y = df["Target_protocol"].astype(int)

# --- Feature set: market-only + lags ---
def add_lags(frame, cols, lags=(1,2)):
    out = frame.copy()
    for L in lags:
        for c in cols:
            if c in out.columns:
                out[f"{c}_lag{L}"] = out[c].shift(L)
    return out

market_feats = [c for c in ["Volume"] if c in df.columns]
dfX = add_lags(df, market_feats, lags=(1,2))
feat_cols = []
for c in market_feats:
    if c in dfX.columns:
        feat_cols += [c, f"{c}_lag1", f"{c}_lag2"]
feat_cols = [c for c in feat_cols if c in dfX.columns]
X = dfX[feat_cols].copy()

# --- Rolling-origin splits ---
N = len(df)
initial_train_weeks = max(52, int(0.5 * N))
test_weeks = max(16, int(0.1 * N))
splits = []
train_end = initial_train_weeks - 1
sid = 1
while True:
    te_start = train_end + 1
    te_end   = te_start + test_weeks - 1
    if te_start >= N: break
    if te_end >= N: te_end = N - 1
    splits.append((sid, 0, int(train_end), int(te_start), int(te_end)))
    sid += 1
    if te_end >= N - 1: break
    train_end = te_end

# --- Fit LR per split, collect coefficients ---
rows = []
for sid, tr0, tr1, te0, te1 in splits:
    X_tr = X.iloc[tr0:tr1+1].copy()
    y_tr = y.iloc[tr0:tr1+1].copy()
    # Drop lag NaNs
    mask = ~X_tr.isna().any(axis=1)
    X_tr, y_tr = X_tr[mask], y_tr[mask]
    if len(X_tr) < 30: 
        continue

    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=2000, multi_class="multinomial"))
    ])
    grid = {"clf__C": [0.3, 1.0, 3.0], "clf__class_weight": [None, "balanced"]}
    gs = GridSearchCV(pipe, grid, scoring="accuracy", cv=TimeSeriesSplit(n_splits=3),
                      n_jobs=-1, refit=True)
    gs.fit(X_tr, y_tr)
    best = gs.best_estimator_
    clf  = best.named_steps["clf"]
    classes = clf.classes_.tolist()  # e.g., [-1,0,1]
    coefs = clf.coef_               # shape [n_classes, n_features]
    # Map per-class coefficients to dict
    for ci, cval in enumerate(classes):
        for fj, fname in enumerate(feat_cols):
            rows.append({"split_id": sid, "class": int(cval), "feature": fname, "coef": float(coefs[ci, fj])})

coef_df = pd.DataFrame(rows)
coef_df.to_csv("Data/10_day_run/day6_lr_market_only_coefs_per_split.csv", index=False)

# --- Up–Down contrast: beta(+1) - beta(-1) per split ---
up = coef_df[coef_df["class"]==1].rename(columns={"coef":"coef_up"})[["split_id","feature","coef_up"]]
dn = coef_df[coef_df["class"]==-1].rename(columns={"coef":"coef_dn"})[["split_id","feature","coef_dn"]]
ud = pd.merge(up, dn, on=["split_id","feature"], how="inner")
ud["coef_up_minus_down"] = ud["coef_up"] - ud["coef_dn"]
ud.to_csv("Data/10_day_run/day6_lr_market_only_upminusdown_per_split.csv", index=False)

# --- Summary with bootstrap CIs across splits ---
def ci_percentile(a, low=2.5, high=97.5, B=4000, seed=7):
    rng = np.random.default_rng(seed)
    boot = []
    a = np.asarray(a, float)
    n = len(a)
    if n == 0: return (np.nan, np.nan)
    for _ in range(B):
        idx = rng.integers(0, n, n)
        boot.append(np.nanmean(a[idx]))
    return (np.nanpercentile(boot, low), np.nanpercentile(boot, high))

summ_rows = []
for fname, g in ud.groupby("feature", sort=False):
    vals = g["coef_up_minus_down"].values
    mu = float(np.nanmean(vals))
    lo, hi = ci_percentile(vals)
    summ_rows.append({"feature": fname, "mean_up_minus_down": mu, "ci_low": lo, "ci_high": hi, "n_splits": len(vals)})

lr_summary = pd.DataFrame(summ_rows).sort_values("mean_up_minus_down", ascending=False)
lr_summary.to_csv("Data/10_day_run/day6_lr_market_only_upminusdown_summary.csv", index=False)
lr_summary



Unnamed: 0,feature,mean_up_minus_down,ci_low,ci_high,n_splits
2,Volume_lag2,0.117413,0.090191,0.145222,6
1,Volume_lag1,-0.000162,-0.032388,0.036488,6
0,Volume,-0.324777,-0.367159,-0.287511,6


In [7]:
# === Random Forest (unweighted, news+filings+volume) — permutation importance ===
import numpy as np, pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import log_loss

# --- Load unweighted dataset & target (same as above) ---
df = pd.read_csv("Data/10_day_run/merged_dataset.csv")
df["Week"] = pd.to_datetime(df["Week"], errors="coerce")
df = df.sort_values("Week").reset_index(drop=True)
df["Return_protocol"] = df["Close"].pct_change()

delta = 0.01
def to_label(r):
    if pd.isna(r): return np.nan
    if r >  delta: return  1
    if r < -delta: return -1
    return 0

df["Target_protocol"] = df["Return_protocol"].apply(to_label)
df = df.dropna(subset=["Target_protocol"]).reset_index(drop=True)
y = df["Target_protocol"].astype(int)

# --- Feature families (unweighted) ---
news_feats = [c for c in ["mean_news_sentiment","smoothed_news_sentiment","num_news_articles","low_coverage_week"] if c in df.columns]
filings_feats = [c for c in ["sent_10k_mean","10q_mda_sent","10q_risk_sent","opt_vs_caut","sent_8k_mean","count_10k","count_10q","count_8k"] if c in df.columns]
market_feats = [c for c in ["Volume"] if c in df.columns]

def add_lags(frame, cols, lags=(1,2)):
    out = frame.copy()
    for L in lags:
        for c in cols:
            if c in out.columns:
                out[f"{c}_lag{L}"] = out[c].shift(L)
    return out

dfX = add_lags(df, news_feats + filings_feats + market_feats, lags=(1,2))

feat_cols = []
for c in (news_feats + filings_feats + market_feats):
    if c in dfX.columns:
        feat_cols += [c, f"{c}_lag1", f"{c}_lag2"]
feat_cols = [c for c in feat_cols if c in dfX.columns]
X = dfX[feat_cols].copy()

# --- Rolling-origin splits (same) ---
N = len(df)
initial_train_weeks = max(52, int(0.5 * N))
test_weeks = max(16, int(0.1 * N))
splits = []
train_end = initial_train_weeks - 1
sid = 1
while True:
    te_start = train_end + 1
    te_end   = te_start + test_weeks - 1
    if te_start >= N: break
    if te_end >= N: te_end = N - 1
    splits.append((sid, 0, int(train_end), int(te_start), int(te_end)))
    sid += 1
    if te_end >= N - 1: break
    train_end = te_end

# --- Optional: load best params per split if available ---
try:
    best_params = pd.read_csv("Data/10_day_run/rf_unweighted_best_params.csv")
except Exception:
    best_params = None

# Set to False to ignore the CSV and always use safe defaults
USE_BEST_PARAMS = True

def _is_nan(x):
    return x is None or (isinstance(x, float) and np.isnan(x))

def _coerce_int(x, default):
    try:
        return int(x)
    except Exception:
        return default

def _coerce_class_weight(x, default='balanced_subsample'):
    """Return 'balanced', 'balanced_subsample', dict, or None; never NaN."""
    if _is_nan(x): 
        return default
    if isinstance(x, dict):
        return x
    if isinstance(x, str):
        xs = x.strip().lower()
        if xs in {'balanced','balanced_subsample'}:
            return xs
        if xs in {'none','null','nan',''}:
            return None
    # Anything else → default
    return default

def _coerce_max_features(x, default='sqrt'):
    """Allow 'sqrt','log2','auto', float in (0,1], int >=1; never NaN."""
    if _is_nan(x):
        return default
    if isinstance(x, str):
        xs = x.strip().lower()
        if xs in {'sqrt','log2','auto'}:
            return xs
        # maybe a number in string form
        try:
            f = float(xs)
            return f
        except Exception:
            return default
    # numeric is fine
    return x

def params_for_split(sid):
    # Fallback defaults (sensible & stable for our data size)
    fallback = dict(
        n_estimators=400,
        max_depth=None,
        min_samples_leaf=5,
        max_features='sqrt',
        class_weight='balanced_subsample',
        random_state=42,
        n_jobs=-1
    )
    if (not USE_BEST_PARAMS) or (best_params is None) or ('split_id' not in getattr(best_params, 'columns', [])):
        return fallback

    row = best_params.loc[best_params['split_id'] == sid]
    if row.empty:
        return fallback

    r = row.iloc[0].to_dict()
    return dict(
        n_estimators=_coerce_int(r.get('n_estimators', 400), 400),
        max_depth=None if _is_nan(r.get('max_depth', np.nan)) else _coerce_int(r.get('max_depth'), None),
        min_samples_leaf=_coerce_int(r.get('min_samples_leaf', 5), 5),
        max_features=_coerce_max_features(r.get('max_features', 'sqrt')),
        class_weight=_coerce_class_weight(r.get('class_weight', 'balanced_subsample')),
        random_state=42,
        n_jobs=-1
    )

print("RF params patch loaded. Set USE_BEST_PARAMS =", USE_BEST_PARAMS)

# --- Permutation importance per split on TEST fold (neg_log_loss) ---
imp_rows = []
for sid, tr0, tr1, te0, te1 in splits:
    X_tr = X.iloc[tr0:tr1+1].copy(); y_tr = y.iloc[tr0:tr1+1].copy()
    X_te = X.iloc[te0:te1+1].copy(); y_te = y.iloc[te0:te1+1].copy()
    # Drop NaNs from lags
    tr_mask = ~X_tr.isna().any(axis=1); te_mask = ~X_te.isna().any(axis=1)
    X_tr, y_tr = X_tr[tr_mask], y_tr[tr_mask]
    X_te, y_te = X_te[te_mask], y_te[te_mask]
    if len(X_tr) < 50 or len(X_te) < 10: 
        continue

    rf = RandomForestClassifier(**params_for_split(sid))
    rf.fit(X_tr, y_tr)

    # neg_log_loss permutation importance (higher => more important)
    pi = permutation_importance(
        rf, X_te, y_te, scoring="neg_log_loss", n_repeats=50, random_state=13, n_jobs=-1
    )
    for fname, imp, imp_sd in zip(feat_cols, pi.importances_mean, pi.importances_std):
        imp_rows.append({"split_id": sid, "feature": fname, "imp_mean": float(imp), "imp_sd": float(imp_sd)})

rf_pi = pd.DataFrame(imp_rows)
rf_pi.to_csv("Data/10_day_run/day6_rf_unweighted_perm_importance_per_split.csv", index=False)

# --- Aggregate across splits with percentile CIs ---
def ci_percentile(a, low=2.5, high=97.5, B=4000, seed=11):
    rng = np.random.default_rng(seed)
    a = np.asarray(a, float)
    if len(a)==0: return (np.nan, np.nan)
    boot = []
    for _ in range(B):
        idx = rng.integers(0, len(a), len(a))
        boot.append(np.nanmean(a[idx]))
    return (np.nanpercentile(boot, low), np.nanpercentile(boot, high))

summ = []
for fname, g in rf_pi.groupby("feature", sort=False):
    vals = g["imp_mean"].values
    mu = float(np.nanmean(vals))
    lo, hi = ci_percentile(vals)
    summ.append({"feature": fname, "mean_perm_importance": mu, "ci_low": lo, "ci_high": hi, "n_splits": len(vals)})

rf_summary = pd.DataFrame(summ).sort_values("mean_perm_importance", ascending=False)
rf_summary.to_csv("Data/10_day_run/day6_rf_unweighted_perm_importance_summary.csv", index=False)
rf_summary.head(15)


RF params patch loaded. Set USE_BEST_PARAMS = True


Unnamed: 0,feature,mean_perm_importance,ci_low,ci_high,n_splits
34,count_8k_lag1,0.003474,0.001407,0.005889011,5
17,10q_mda_sent_lag2,0.003132,-4.3e-05,0.007103073,5
3,smoothed_news_sentiment,0.003105,-0.004432,0.009501608,5
26,sent_8k_mean_lag2,0.002254,-0.002879,0.007761763,5
32,count_10q_lag2,0.001842,-0.000862,0.006119091,5
29,count_10k_lag2,0.001668,2.8e-05,0.00355197,5
33,count_8k,0.001159,-0.001712,0.003450109,5
22,opt_vs_caut_lag1,0.001039,-0.004753,0.007240678,5
23,opt_vs_caut_lag2,0.000779,-0.002173,0.003734224,5
37,Volume_lag1,0.000588,-0.000446,0.001654804,5
