### H1

In [1]:
import pandas as pd
import numpy as np

df_crypto = pd.read_csv('new_cleaned_crypto_data.csv')
df_crypto['date'] = pd.to_datetime(df_crypto['date'])

df_alt = df_crypto[df_crypto['is_alt'] == 1].copy()
df_alt['date'] = pd.to_datetime(df_alt['date'])
df_alt = df_alt.sort_values(['symbol', 'date'])

df_last = df_alt.groupby('symbol').tail(1)
q33 = df_last['CapMrktEstUSD'].quantile(0.33)
q66 = df_last['CapMrktEstUSD'].quantile(0.66)
def assign_cap_group(mcap):
    if mcap >= q66:
        return 'High'
    elif mcap >= q33:
        return 'Mid'
    else:
        return 'Low'

df_last['cap_group'] = df_last['CapMrktEstUSD'].apply(assign_cap_group)
cap_map = df_last.set_index('symbol')['cap_group'].to_dict()
df_crypto['cap_group'] = df_crypto['symbol'].map(cap_map)

df_crypto['log_daily_return'] =  100 * np.log(df_crypto['PriceUSD'] / df_crypto['PriceUSD'].shift(1))

df_crypto = df_crypto.dropna(subset=['log_daily_return'])

# compute log levels
df_crypto['log_volume'] = np.log(df_crypto['volume_trusted_spot_usd_1d'].replace(0, np.nan))
df_crypto['log_market_cap'] = np.log(df_crypto['CapMrktEstUSD'].replace(0, np.nan))
df_crypto['log_TxCnt'] = np.log(df_crypto['TxCnt'].replace(0, np.nan))
df_crypto['log_AdrActCnt'] = np.log(df_crypto['AdrActCnt'].replace(0, np.nan))
# compute log differences
df_crypto['log_diff_volume'] = 100 * np.log(df_crypto['volume_trusted_spot_usd_1d'] / df_crypto['volume_trusted_spot_usd_1d'].shift(1))
df_crypto['log_diff_market_cap'] = 100 * np.log(df_crypto['CapMrktEstUSD'] / df_crypto['CapMrktEstUSD'].shift(1))
df_crypto['log_diff_TxCnt'] = 100 * np.log(df_crypto['TxCnt'] / df_crypto['TxCnt'].shift(1))
df_crypto['log_diff_AdrActCnt'] = 100 * np.log(df_crypto['AdrActCnt'] / df_crypto['AdrActCnt'].shift(1))

#compute log levels for economic indicators
df_crypto['log_IndPro'] = np.log(df_crypto['IndPro'].replace(0, np.nan))
df_crypto['log_TotRes'] = np.log(df_crypto['TotRes'].replace(0, np.nan))
df_crypto['log_CPIPrc'] = np.log(df_crypto['CPIPrc'].replace(0, np.nan))
df_crypto['log_UnemRt'] = np.log(df_crypto['UnemRt'].replace(0, np.nan))

# compute log differences for economic indicators
df_crypto['log_diff_IndPro'] = 100 * np.log(df_crypto['IndPro'] / df_crypto['IndPro'].shift(1))
df_crypto['log_diff_TotRes'] = 100 * np.log(df_crypto['TotRes'] / df_crypto['TotRes'].shift(1))
df_crypto['log_diff_CPIPrc'] = 100 * np.log(df_crypto['CPIPrc'] / df_crypto['CPIPrc'].shift(1))
df_crypto['log_diff_UnemRt'] = 100 * np.log(df_crypto['UnemRt'] / df_crypto['UnemRt'].shift(1))

# compute differences for sentiment proxies (VIX,TwitSIX, EPU_DUS, InvSIX, fng_value, ConSIX)
df_crypto['diff_VIX'] = df_crypto['VIX'].diff()
df_crypto['diff_TwitSIX'] = df_crypto['TwitSIX'].diff()
df_crypto['diff_EPU_DUS'] = df_crypto['EPU_DUS'].diff()
df_crypto['diff_InvSIX'] = df_crypto['InvSIX'].diff()
df_crypto['diff_fng_value'] = df_crypto['fng_value'].diff()
df_crypto['diff_ConSIX'] = df_crypto['ConSIX'].diff()

df_crypto.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300934 entries, 1 to 300934
Data columns (total 52 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   Unnamed: 0                  300934 non-null  int64         
 1   symbol                      300934 non-null  object        
 2   date                        300934 non-null  datetime64[ns]
 3   AdrActCnt                   208655 non-null  float64       
 4   CapMrktEstUSD               230887 non-null  float64       
 5   PriceUSD                    300934 non-null  float64       
 6   TxCnt                       215205 non-null  float64       
 7   volume_trusted_spot_usd_1d  290425 non-null  float64       
 8   fng_value                   277273 non-null  float64       
 9   fng_classification          277273 non-null  object        
 10  VIX                         300934 non-null  float64       
 11  ConSIX                      300934 non-null 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_last['cap_group'] = df_last['CapMrktEstUSD'].apply(assign_cap_group)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [13]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.stattools import durbin_watson

# ---------- 1) PREPROCESSING (two-part, per-symbol, NA-safe) ----------
base_cols = ['AdrActCnt', 'volume_trusted_spot_usd_1d', 'TxCnt']

for col in base_cols:
    prev = df_crypto.groupby('symbol')[col].shift(1)   # per-symbol lag
    curr = df_crypto[col]
    df_crypto[f'jump_{col}'] = ((prev == 0) & (curr > 0)).astype(int)
    # set to 0 outside >0→>0 regime so we don't drop rows
    df_crypto[f'log_diff_{col}'] = np.where((prev > 0) & (curr > 0),
                                            np.log(curr/prev), 0.0)

# ---------- 2) ELIGIBILITY (robust) ----------
def min_required_n(regressors, floor=365, margin=30):
    p = 1 + len(regressors)  # + intercept
    return max(floor, p + margin)

def _symbol_eligible(sub, target, regressors):
    if any(r not in sub.columns for r in regressors):
        return False, "missing_columns"
    cc = sub.dropna(subset=[target])  # X have no NA after step 1
    if len(cc) < min_required_n(regressors):
        return False, f"too_few_obs:{len(cc)}"
    # require variation for continuous vars (allow jump_* to be constant)
    for r in regressors:
        if not r.startswith('jump_') and cc[r].nunique() < 2:
            return False, f"no_variation:{r}"
    return True, "ok"

# ---------- 3) REGRESSION ----------
def run_symbol_regression(df, symbol, target, regressors):
    sub = df[df['symbol'] == symbol].sort_values('date').copy()
    if target not in sub.columns:
        sub[target] = sub['log_daily_return'].shift(-1)

    ok, reason = _symbol_eligible(sub, target, regressors)
    if not ok:
        return None

    sub = sub.dropna(subset=[target])  # only target must be present

    # standardize only continuous predictors (leave jump_* as is)
    cont = [r for r in regressors if not r.startswith('jump_')]
    if cont:
        sub[cont] = StandardScaler().fit_transform(sub[cont])

    rhs = ' + '.join(regressors)
    fit = smf.ols(f"{target} ~ {rhs}", data=sub).fit()
    if fit.df_resid <= 8:   # guard for reliable HAC/AIC
        return None

    n = len(sub)
    max_hac_lag = int(n ** 0.25)

    best_aic, best_lag, best_res = np.inf, 0, None
    for lag in range(max_hac_lag + 1):
        r = fit.get_robustcov_results(cov_type='HAC', maxlags=lag, use_correction=True)
        if r.aic < best_aic:
            best_aic, best_lag, best_res = r.aic, lag, r

    dw_val = float(durbin_watson(fit.resid))

    out = {
        'symbol': symbol, 'n': n, 'nobs': n,
        'n_regs': len(regressors), 'regs_used': ','.join(regressors),
        'opt_hac_lag': best_lag, 'r2': best_res.rsquared,
        'adj_r2': best_res.rsquared_adj, 'aic': best_res.aic, 'bic': best_res.bic,
        'log_likelihood': best_res.llf, 'fstat': best_res.fvalue, 'f_pval': best_res.f_pvalue,
        'durbin_watson': dw_val
    }
    for name, b, se, t, p in zip(best_res.model.exog_names,
                                 best_res.params, best_res.bse, best_res.tvalues, best_res.pvalues):
        out[f'coef_{name}']   = b
        out[f'stderr_{name}'] = se
        out[f'tval_{name}']   = t
        out[f'pval_{name}']   = p
    return out

def run_all_symbols(df, target, regressors):
    df = df.sort_values(['symbol','date']).copy()
    # crea il target t+1 se non presente
    if target not in df.columns:
        df[target] = df.groupby('symbol')['log_daily_return'].shift(-1)

    results = []
    kept, dropped = [], []

    for sym in df['symbol'].unique():
        sub = df[df['symbol'] == sym]
        if _symbol_eligible(sub, target, regressors):
            r = run_symbol_regression(df, sym, target, regressors)
            if r is not None:
                results.append(r)
                kept.append(sym)
            else:
                dropped.append(sym)
        else:
            dropped.append(sym)

    out = pd.DataFrame(results)
    # Log sintetico (opzionale)
    print(f"Kept {len(set(kept))} symbols; Dropped {len(set(dropped))}: {sorted(set(dropped))[:10]}...")
    return out

# ---------------------- 3) SET UP YOUR LOOP ----------------------
target = 'log_daily_next'

base_regressors = [
    'log_daily_return',
    # 'jump_AdrActCnt',
    'log_diff_AdrActCnt',
    # 'jump_volume_trusted_spot_usd_1d',
    'log_diff_volume_trusted_spot_usd_1d',
    # 'jump_TxCnt',
    'log_diff_TxCnt',
    'log_UnemRt', 'log_IndPro', 'log_CPIPrc', 'log_TotRes',
]

sentiment_proxies = {
    'no_sentiment': None,
    'EPU_DUS':      'EPU_DUS',
    'VIX':          'VIX',
    'InvSIX':       'InvSIX',
    'TwitSIX':      'TwitSIX',
    'ConSIX':       'ConSIX',
    'fng_value':    'fng_value'
}

for name, proxy in sentiment_proxies.items():
    regs = base_regressors.copy()
    if proxy is not None:
        regs.insert(1, proxy)   # sentiment subito dopo il ritorno laggato
    summary_df = run_all_symbols(df_crypto, target, regs)
    fn = f"Regressions/crypto_regression_summary_{name}.csv"
    summary_df.to_csv(fn, index=False)
    print(f"Saved {summary_df['symbol'].nunique()} symbols → {fn}")


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Kept 85 symbols; Dropped 40: ['ALCX', 'APE', 'API3', 'ATOM', 'AUDIO', 'AVAX', 'BADGER', 'BIT', 'BNT', 'CEL']...
Saved 85 symbols → Regressions/crypto_regression_summary_no_sentiment.csv
Kept 85 symbols; Dropped 40: ['ALCX', 'APE', 'API3', 'ATOM', 'AUDIO', 'AVAX', 'BADGER', 'BIT', 'BNT', 'CEL']...
Saved 85 symbols → Regressions/crypto_regression_summary_EPU_DUS.csv
Kept 85 symbols; Dropped 40: ['ALCX', 'APE', 'API3', 'ATOM', 'AUDIO', 'AVAX', 'BADGER', 'BIT', 'BNT', 'CEL']...
Saved 85 symbols → Regressions/crypto_regression_summary_VIX.csv
Kept 85 symbols; Dropped 40: ['ALCX', 'APE', 'API3', 'ATOM', 'AUDIO', 'AVAX', 'BADGER', 'BIT', 'BNT', 'CEL']...
Saved 85 symbols → Regressions/crypto_regression_summary_InvSIX.csv
Kept 85 symbols; Dropped 40: ['ALCX', 'APE', 'API3', 'ATOM', 'AUDIO', 'AVAX', 'BADGER', 'BIT', 'BNT', 'CEL']...
Saved 85 symbols → Regressions/crypto_regression_summary_TwitSIX.csv
Kept 85 symbols; Dropped 40: ['ALCX', 'APE', 'API3', 'ATOM', 'AUDIO', 'AVAX', 'BADGER', 'BIT', 

In [14]:
import os
import pandas as pd
import numpy as np
from pathlib import Path

# ===== Paths =====
REGRESSIONS_DIR = "Regressions"
OUT_DIR = "Regressions/Tables"
Path(OUT_DIR).mkdir(exist_ok=True)

# ===== Classification =====
def classify_symbol(symbol):
    if symbol == "MTL_METAL":
        symbol = "MTL"
    BTC = {"BTC"}
    ALT_HIGH = {"AAVE","ADA","ALGO","ATOM","AVAX","BCH","BNB","BSV","CRO","CRV","DOT","ENS","EOS","ETC",
                "ETH","FIL","FLOW","GALA","GRT","ICP","LDO","LINK","LTC","MANA","MKR","QNT","SAND","SOL",
                "TRX","UNI","VET","XLM","XMR","XRP","XTZ","ZEC"}
    ALT_MID  = {"1INCH","ANT","AUDIO","BAT","COMP","CVC","CVX","DASH","DCR","DGB","ELF","ENJ","FTT","FXS",
                "GAS","GLM","GNO","ICX","LPT","LRC","LUNA","NEO","QTUM","RSR","SKL","SNT","SNX",
                "SUSHI","UMA","WAVES","XVG","YFI","ZIL","ZRX"}
    ALT_LOW  = {"ALCX","ALPHA","API3","BADGER","BAL","BIT","BNT","BTG","CEL","DRGN","FUN","GNT","GRIN","HEDG","HT",
                "KNC","LEND","LOOM","LSK","MAID","MTL","NMR","OGN","OMG","PAY","PERP","POLY","POWR","PPT",
                "QASH","REN","REP","ROOK","SRM","STORJ","SWRV","VTC","WNXM","WTC","XEM"}
    STABLE   = {"USDT","USDC","BUSD","DAI","GUSD","HUSD","PAX","TUSD"}
    GOLDPEG  = {"PAXG","XAUT"}
    MEME     = {"DOGE","SHIB","APE"}
    if symbol in BTC:     return "BTC"
    if symbol in ALT_HIGH:return "ALT_HIGH"
    if symbol in ALT_MID: return "ALT_MID"
    if symbol in ALT_LOW: return "ALT_LOW"
    if symbol in STABLE:  return "STABLE"
    if symbol in GOLDPEG: return "GOLDPEG"
    if symbol in MEME:    return "MEME"
    return "UNCLASSIFIED"

# ===== Helpers =====
def first_existing(df, names):
    for n in names:
        if n in df.columns:
            return n
    return None

def starify(p):
    if pd.isna(p): return ""
    return "**" if p < 0.01 else ("*" if p < 0.05 else "")

def fmt_num(x, d=3, int_ok=False):
    if pd.isna(x): return ""
    if int_ok:
        try: return f"{int(x)}"
        except: pass
    try: return f"{float(x):.{d}f}"
    except: return ""

# ===== pick 10 symbols per your rule =====
def pick_symbols(df, proxy):
    pcol = first_existing(df, [f"pval_{proxy}", f"pval_log_d_{proxy}", f"pval_log_diff_{proxy}"])
    if pcol is None:
        # fallback: use any pval_* that exists (last resort)
        pvals = [c for c in df.columns if c.startswith("pval_") and c != "pval_Intercept"]
        pcol = pvals[0] if pvals else None

    d = df.copy()
    d["group"] = d["symbol"].map(classify_symbol)

    def pick_median(group):
        sub = d[(d["group"] == group) & d[pcol].notna()].sort_values(pcol)
        if sub.empty: return None
        return sub.iloc[len(sub)//2]["symbol"]

    def pick_min(group):
        sub = d[(d["group"] == group) & d[pcol].notna()].sort_values(pcol)
        if sub.empty: return None
        return sub.iloc[0]["symbol"]

    chosen = []
    if "BTC" in set(d["symbol"]): chosen.append("BTC")

    for grp in ["ALT_HIGH","ALT_MID","ALT_LOW"]:
        s1 = pick_min(grp)
        s2 = pick_median(grp)
        picks = []
        if s1: picks.append(s1)
        if s2 and s2 not in picks: picks.append(s2)
        if len(picks) < 2:
            # deterministic pad
            pool = [x for x in sorted(d.loc[d["group"]==grp,"symbol"]) if x not in picks]
            while len(picks) < 2 and pool:
                picks.append(pool.pop(0))
        chosen += picks[:2]

    for grp in ["STABLE","GOLDPEG","MEME"]:
        s = pick_median(grp)
        if s: chosen.append(s)

    return chosen[:10]

# ===== Build one LaTeX table from a CSV =====
def build_table_from_csv(csv_path):
    df = pd.read_csv(csv_path)
    if "symbol" not in df.columns:
        raise ValueError(f"'symbol' column missing in {csv_path}")

    # detect proxy (the sentiment var): first coef_* that is not intercept/logs
    proxy_candidates = [c.replace("coef_","") for c in df.columns
                        if c.startswith("coef_")
                        and c not in ("coef_Intercept","coef_log_daily_return")]
    if not proxy_candidates:
        raise ValueError(f"No proxy coef_ column found in {csv_path}")
    proxy = proxy_candidates[0]

    # columns (LaTeX rows) you want, with flexible suffixes
    row_specs = [
        ("Intercept",           ["Intercept"]),
        (proxy,                 [proxy, f"log_d_{proxy}", f"log_diff_{proxy}"]),
        (r"$R_{i,t}$",          ["log_daily_return","log_returns_lag1","lag_return","returns_lag1"]),
        (r"$\Delta\log\ $TxCnt",           ["log_TxCnt","log_diff_TxCnt","log_d_TxCnt"]),
        (r"$\Delta\log\ $Volume",          ["log_volume_trusted_spot_usd_1d","log_diff_volume_trusted_spot_usd_1d","log_d_volume_trusted_spot_usd_1d"]),
        (r"$\Delta\log\ $AdrActCnt",     ["log_AdrActCnt","log_diff_AdrActCnt"]),
        (r"$\log\ $UnemRt",          ["log_UnemRt"]),
        (r"$\log\ $IndPro",          ["log_IndPro"]),
        (r"$\log\ $CPIPrc",          ["log_CPIPrc"]),
        (r"$\log\ $TotRes",          ["log_TotRes"]),
    ]

    # metric columns (flexible names)
    col_N   = first_existing(df, ["n","nobs","N"])
    col_DW  = first_existing(df, ["dw","durbin_watson","DurbinWatson"])
    col_F   = first_existing(df, ["fstat","F","f_stat"])
    col_R2  = first_existing(df, ["r_squared","r2"])
    col_AR2 = first_existing(df, ["adj_r_squared","adj_r2","adjR"])
    
    symbols = pick_symbols(df, proxy)
    idx = {s: df[df["symbol"]==s].iloc[0] for s in symbols if not df[df["symbol"]==s].empty}

    # resolve which exact CSV cols to use (coef/stderr/pval) for each row
    resolved = []
    for label, suffixes in row_specs:
        coef_col  = first_existing(df, [f"coef_{s}"   for s in suffixes])
        se_col    = first_existing(df, [f"stderr_{s}" for s in suffixes])  # <-- Newey–West SE expected here
        pval_col  = first_existing(df, [f"pval_{s}"   for s in suffixes])
        resolved.append((label, coef_col, se_col, pval_col))

    # ---- Assemble LaTeX lines ----
    lines = []
    lines += [
        r"\begin{table}[ht]",
        r"\centering",
        r"\scriptsize",
        r"\setlength{\tabcolsep}{4pt}",
        rf"\begin{{tabular}}{{l *{{{len(symbols)}}}{{c}}}}",
        r"\toprule",
        r"&\multicolumn{1}{c}{\textbf{Bitcoin}}&\multicolumn{2}{c}{\textbf{High-Cap}}&\multicolumn{2}{c}{\textbf{Mid-Cap}}&\multicolumn{2}{c}{\textbf{Low-Cap}}&\multicolumn{1}{c}{\textbf{Gold}}&\multicolumn{1}{c}{\textbf{Stable}}&\multicolumn{1}{c}{\textbf{Meme}}\\",
        r"\addlinespace",
        " & " + " & ".join(symbols) + r" \\",
        r"\midrule"
    ]

    def add_var(label, coef_col, se_col, pval_col):
        coefs = []
        ses   = []
        for s in symbols:
            row = idx.get(s)
            if row is None:
                coefs.append("")
                ses.append("()")
                continue
            c = row.get(coef_col, np.nan) if coef_col else np.nan
            p = row.get(pval_col, np.nan) if pval_col else np.nan
            se = row.get(se_col, np.nan)  if se_col   else np.nan
            coefs.append(f"{fmt_num(c,3)}{starify(p)}")
            ses.append(f"({fmt_num(se,3)})" if not pd.isna(se) else "()")
        lines.append(f"{label} & " + " & ".join(coefs) + r" \\")
        lines.append(" & " + " & ".join(ses) + r" \\")
        lines.append(r"\addlinespace")

    for label, cc, sc, pc in resolved:
        add_var(label, cc, sc, pc)

    # ---- Bottom metrics ----
    def metric_row(name, colname, int_flag=False, decimals=2):
        vals = []
        for s in symbols:
            row = idx.get(s)
            if row is None or colname is None:
                vals.append("")
            else:
                vals.append(fmt_num(row.get(colname, np.nan), d=decimals, int_ok=int_flag))
        lines.append(f"{name} & " + " & ".join(vals) + r" \\")

    lines.append(r"\midrule")
    metric_row("N", col_N, int_flag=True, decimals=0)
    metric_row("Durbin-Watson", col_DW, int_flag=False, decimals=2)
    metric_row("F-stat", col_F, int_flag=False, decimals=2)
    metric_row(r"\(R^2\)", col_R2, int_flag=False, decimals=3)
    metric_row(r"Adj.\ \(R^2\)", col_AR2, int_flag=False, decimals=3)
    length = len(symbols) + 1

    lines += [
        r"\addlinespace",
        r"\midrule",
        rf"\multicolumn{{{length}}}{{c}}{{Specification tested: $R_{{i,t+1}} = \alpha_i + \beta_{{sent}} S_t + \phi R_{{i,t}} + \theta B_{{i,t}} + \gamma M_t + \varepsilon_{{i,t+1}}$}}\\",
        r"\bottomrule",
        r"\end{tabular}",
        rf"\caption{{\textbf{{Estimation Results from Predictive Regressions - {proxy}}} \\",
        r"This table presents coefficient estimates and Newey--West standard errors (in parentheses) from predictive regressions of next-day returns on the selected proxy for investor sentiment. The regression specification is reported at the top of the table. Each column corresponds to a selected cryptocurrency, chosen to represent the full range of categories in the sample: Bitcoin (BTC), two high-cap altcoins, two mid-cap altcoins, two low-cap altcoins, one stablecoin, one gold-pegged token, and one meme coin. Each regression is estimated separately using the available time series data for the respective cryptocurrency and the sentiment proxy. The dependent variable is the next-day log return. Standard errors are computed using the Newey--West estimator with automatic lag selection. Statistical significance is denoted as follows: *$p<0.05$, **$p<0.01$.}",
        rf"\label{{tab:{proxy.lower()}_result_h1}}",
        r"\end{table}"
    ]
    return proxy, lines

# ===== Run over all CSVs =====
for csv in sorted(Path(REGRESSIONS_DIR).glob("*.csv")):
    try:
        proxy, lines = build_table_from_csv(csv)
    except Exception as e:
        print(f"[skip] {csv.name}: {e}")
        continue
    out = Path(OUT_DIR) / f"table_{proxy}.tex"
    with open(out, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    print(f"✅ {csv.name} -> {out}")

✅ crypto_regression_summary_ConSIX.csv -> Regressions/Tables/table_ConSIX.tex
✅ crypto_regression_summary_EPU_DUS.csv -> Regressions/Tables/table_EPU_DUS.tex
✅ crypto_regression_summary_InvSIX.csv -> Regressions/Tables/table_InvSIX.tex
✅ crypto_regression_summary_TwitSIX.csv -> Regressions/Tables/table_TwitSIX.tex
✅ crypto_regression_summary_VIX.csv -> Regressions/Tables/table_VIX.tex
✅ crypto_regression_summary_fng_value.csv -> Regressions/Tables/table_fng_value.tex
✅ crypto_regression_summary_no_sentiment.csv -> Regressions/Tables/table_log_diff_AdrActCnt.tex
[skip] h2_functional_avg_then_transform_results.csv: 'symbol' column missing in Regressions/h2_functional_avg_then_transform_results.csv
[skip] h2_wald_tests.csv: 'symbol' column missing in Regressions/h2_wald_tests.csv


In [8]:
import os
import glob
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ================== GLOBAL STYLE ==================
sns.set_style("whitegrid")
sns.set_context("paper", font_scale=1.2)
plt.rcParams.update({
    "font.family":       "serif",
    "font.serif":        ["DejaVu Serif"],
    "axes.titlesize":    22,
    "axes.labelsize":    20,
    "axes.titlecolor":   (38/255, 38/255, 38/255),
    "xtick.labelsize":   13,
    "ytick.labelsize":   13,
    "legend.fontsize":   10,
    "axes.titleweight":  "normal",
    "axes.edgecolor":    "black",
    "axes.linewidth":    0.5,
    "grid.color":        "0.85",
    "grid.linestyle":    "-",
    "grid.linewidth":    0.5,
    "figure.dpi":        300,
})

# ====== Paths ======
REG_DIR = "Regressions"
OUT_DIR = "Regressions/Figures"
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

# ====== Classification ======
def classify_symbol(symbol: str) -> str:
    if pd.isna(symbol):
        return "Unclassified"
    if symbol == "MTL_METAL":
        symbol = "MTL"

    BTC = {"BTC"}
    ALT_HIGH = {"AAVE","ADA","ALGO","ATOM","AVAX","BCH","BNB","BSV","CRO","CRV","DOT","ENS","EOS","ETC",
                "ETH","FIL","FLOW","GALA","GRT","ICP","LDO","LINK","LTC","MANA","MKR","QNT","SAND","SOL",
                "TRX","UNI","VET","XLM","XMR","XRP","XTZ","ZEC"}
    ALT_MID  = {"1INCH","ANT","AUDIO","BAT","COMP","CVC","CVX","DASH","DCR","DGB","ELF","ENJ","FTT","FXS",
                "GAS","GLM","GNO","GNT","ICX","LPT","LRC","LUNA","NEO","QTUM","RSR","SKL","SNT","SNX",
                "SUSHI","UMA","WAVES","XVG","YFI","ZIL","ZRX"}
    ALT_LOW  = {"ALCX","ALPHA","API3","BADGER","BAL","BIT","BNT","BTG","CEL","DRGN","FUN","GRIN","HEDG","HT",
                "KNC","LEND","LOOM","LSK","MAID","MTL","NMR","OGN","OMG","PAY","PERP","POLY","POWR","PPT",
                "QASH","REN","REP","ROOK","SRM","STORJ","SWRV","VTC","WNXM","WTC","XEM"}
    STABLE   = {"USDT","USDC","BUSD","DAI","GUSD","HUSD","PAX","TUSD"}
    GOLDPEG  = {"PAXG","XAUT"}
    MEME     = {"DOGE","SHIB","APE"}

    if symbol in BTC:     return "Bitcoin"
    if symbol in ALT_HIGH:return "Altcoins (High)"
    if symbol in ALT_MID: return "Altcoins (Mid)"
    if symbol in ALT_LOW: return "Altcoins (Low)"
    if symbol in STABLE:  return "Stablecoins"
    if symbol in GOLDPEG: return "Gold-Pegged"
    if symbol in MEME:    return "Meme Coins"
    return "Unclassified"

GROUP_ORDER = ["Bitcoin","Altcoins (High)","Altcoins (Mid)","Altcoins (Low)",
               "Stablecoins","Gold-Pegged","Meme Coins"]

GROUP_PALETTE = {
    "Bitcoin": "#4C72B0",
    "Altcoins (High)": "#599da2",
    "Altcoins (Mid)":  "#83a075",
    "Altcoins (Low)":  "#aca24a",
    "Stablecoins":     "#eb9681",
    "Gold-Pegged":     "#d2a022",
    "Meme Coins":      "#8C8C8C"
}

# ====== Proxy detection ======
def detect_proxy(df: pd.DataFrame):
    cands = [c for c in df.columns if c.startswith("coef_")
             and c != "coef_Intercept"
             and not c.startswith("coef_log_")]
    if not cands:
        return None, None, None
    proxy = cands[0].replace("coef_", "")
    coef_col = f"coef_{proxy}"
    pval_col = f"pval_{proxy}" if f"pval_{proxy}" in df.columns else None
    return proxy, coef_col, pval_col

# ====== Violin plot generator ======
def plot_violin(df, proxy, coef_col, pval_col):
    # classify
    df["Group"] = df["symbol"].map(classify_symbol)
    df = df[df["Group"].isin(GROUP_ORDER)]

    plt.figure(figsize=(8, 5))
    ax = sns.violinplot(
        data=df, x="Group", y=coef_col,
        order=GROUP_ORDER, palette=GROUP_PALETTE,
        inner=None, cut=2
    )
    sns.pointplot(
        data=df, x="Group", y=coef_col,
        order=GROUP_ORDER, join=False, estimator=np.median,
        color="black", markers="_", scale=1.3, errwidth=0
    )

    if pval_col and pval_col in df.columns:
        sig = df[df[pval_col] < 0.05]
        sns.stripplot(
            data=sig, x="Group", y=coef_col,
            order=GROUP_ORDER, color="red", size=4.5, jitter=True, alpha=0.85
        )

    ax.axhline(0, color="gray", lw=1)
    ax.set_title(rf"Distribution of $\beta_{{sent}}$ by Group — {proxy}")
    ax.set_ylabel(r"Estimated $\beta_{\mathrm{sent}}$")
    ax.set_xlabel("")
    ax.set_xticklabels(GROUP_ORDER, rotation=15, ha="center")

    plt.tight_layout()
    plt.savefig(Path(OUT_DIR) / f"violin_{proxy}.png", dpi=300)
    plt.close()
    print(f"[ok] Saved violin_{proxy}.png")

# ====== Run for all ======
for csv in sorted(glob.glob(os.path.join(REG_DIR, "*.csv"))):
    df = pd.read_csv(csv)
    if "symbol" not in df.columns:
        continue
    proxy, coef_col, pval_col = detect_proxy(df)
    if proxy:
        plot_violin(df, proxy, coef_col, pval_col)


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.violinplot(

The `scale` parameter is deprecated and will be removed in v0.15.0. You can now control the size of each plot element using matplotlib `Line2D` parameters (e.g., `linewidth`, `markersize`, etc.).

  sns.pointplot(

The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 0}` instead.

  sns.pointplot(
  ax.set_xticklabels(GROUP_ORDER, rotation=15, ha="center")


[ok] Saved violin_ConSIX.png
[ok] Saved violin_EPU_DUS.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.violinplot(

The `scale` parameter is deprecated and will be removed in v0.15.0. You can now control the size of each plot element using matplotlib `Line2D` parameters (e.g., `linewidth`, `markersize`, etc.).

  sns.pointplot(

The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 0}` instead.

  sns.pointplot(
  ax.set_xticklabels(GROUP_ORDER, rotation=15, ha="center")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.violinplot(

The `scale` parameter is deprecated and will be removed in v0.

[ok] Saved violin_InvSIX.png
[ok] Saved violin_TwitSIX.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.violinplot(

The `scale` parameter is deprecated and will be removed in v0.15.0. You can now control the size of each plot element using matplotlib `Line2D` parameters (e.g., `linewidth`, `markersize`, etc.).

  sns.pointplot(

The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 0}` instead.

  sns.pointplot(
  ax.set_xticklabels(GROUP_ORDER, rotation=15, ha="center")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.violinplot(

The `scale` parameter is deprecated and will be removed in v0.

[ok] Saved violin_VIX.png
[ok] Saved violin_fng_value.png


### H2

In [None]:

import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Optional, Tuple
import statsmodels.formula.api as smf
from statsmodels.tsa.ar_model import AutoReg
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.stattools import durbin_watson

# -------------------- TOGGLES --------------------
USE_JUMPS        = False  # include jump_* dummies as controls
STANDARDIZE_CONT = True   # z-score continuous regressors only (not jumps)
VERBOSE          = True
# -------------------------------------------------

# ---------------- CONFIG ----------------
DATE_COL   = "date"
SYMBOL_COL = "symbol"
RET_COL    = "log_daily_return"   # already in df_crypto
TARGET_COL = "R_g_t_plus1"        # target built below

# Raw blockchain columns to average at group level
BASE_BLOCKCHAIN = ["TxCnt", "AdrActCnt", "volume_trusted_spot_usd_1d"]

# Macro (log levels in df) and sentiments
MACRO_COLS     = ["log_UnemRt", "log_IndPro", "log_CPIPrc", "log_TotRes"]
SENTIMENT_COLS = ["VIX", "TwitSIX", "EPU_DUS", "InvSIX", "fng_value", "ConSIX"]

# Regression controls
MIN_OBS_FLOOR = 365    # minimum sample length
MARGIN_OBS    = 30     # buffer above number of parameters

# Outputs (ONLY these two)
EXPORT_DIR     = Path("Regressions")
RESULTS_CSV    = EXPORT_DIR / "h2_functional_avg_then_transform_results.csv"
FUNC_TABLE_TEX = EXPORT_DIR / "h2_effective_membership_overall_table.tex"
# ---------------------------------------------------------

# ---------- Functional groups (as per your taxonomy) ----------
UTILITY = {"1INCH","AAVE","ANT","APE","BADGER","BAL","BAT","BIT","CEL","COMP","CRV","DCR","ENS","FXS",
           "HEDG","LDO","MKR","QNT","ROOK","SUSHI","SWRV","UNI","YFI"}
ASSET   = {"ALCX","ALPHA","CVX","PAXG","XAUT"}
PAYMENT = {"BCH","BNB","BSV","BTC","BTG","CRO","DASH","DGB","DOGE","ETC","ETH","FTT","GRIN","HT","LTC",
           "MTL_METAL","PAY","SHIB","VTC","XLM","XMR","XRP","XVG","ZEC"}
HYBRID_UA = {"CVC"}
HYBRID_UP = {"ADA","ALGO","API3","ATOM","AUDIO","AVAX","BNT","DOT","DRGN","ELF","ENJ","EOS","FIL","FLOW",
             "FUN","GALA","GAS","GLM","GNO","GNT","GRT","ICP","ICX","KNC","LEND","LINK","LOOM","LPT","LRC",
             "LSK","LUNA","MAID","MANA","NEO","NMR","OGN","OMG","PERP","POLY","POWR","PPT","QASH","QTUM",
             "REN","REP","RSR","SAND","SKL","SNT","SNX","SOL","SRM","STORJ","TRX","UMA","VET","WAVES",
             "WNXM","WTC","XEM","XTZ","ZIL","ZRX"}
HYBRID_AP = {"BUSD","DAI","GUSD","HUSD","PAX","TUSD","USDC","USDT"}

GROUP_ORDER = ["Utility", "Asset", "Payment", "Hybrid (U-A)", "Hybrid (U-P)", "Hybrid (A-P)"]

def map_functional_group(sym: str) -> str:
    s = str(sym).upper()
    if s in UTILITY:   return "Utility"
    if s in ASSET:     return "Asset"
    if s in PAYMENT:   return "Payment"
    if s in HYBRID_UA: return "Hybrid (U-A)"
    if s in HYBRID_UP: return "Hybrid (U-P)"
    if s in HYBRID_AP: return "Hybrid (A-P)"
    return "Unclassified"

# ---------------- helpers ----------------
def min_required_n(regressors: List[str], floor: int = MIN_OBS_FLOOR, margin: int = MARGIN_OBS) -> int:
    p = 1 + len(regressors)  # + intercept
    return max(floor, p + margin)

def group_eligible(sub: pd.DataFrame, target: str, regressors: List[str]) -> Tuple[bool, str]:
    if any(r not in sub.columns for r in regressors):
        return False, "missing_columns"
    cc = sub.dropna(subset=[target])
    if len(cc) < min_required_n(regressors):
        return False, f"too_few_obs:{len(cc)}"
    # Require variation for continuous vars (allow jump_* to be constant)
    for r in regressors:
        if not r.startswith("jump_") and r in cc.columns and cc[r].nunique() < 2:
            return False, f"no_variation:{r}"
    return True, "ok"

def select_hac_lag_via_resid_aic(resid: pd.Series, kmax: int) -> int:
    n = resid.shape[0]
    if n < 20 or kmax <= 0:
        return 0
    best_k, best_aic = 0, np.inf
    for k in range(kmax + 1):
        try:
            if k == 0:
                e = resid - resid.mean()
                sigma2 = np.var(e, ddof=1)
                aic = n * np.log(sigma2 + 1e-12) + 2
            else:
                ar = AutoReg(resid, lags=k, old_names=False, trend="c").fit()
                aic = ar.aic
        except Exception:
            aic = np.inf
        if aic < best_aic:
            best_aic, best_k = aic, k
    return int(best_k)

def _tex_escape_ticker(s: str) -> str:
    return str(s).replace("_", r"\_")

# ---------- Build group panel (avg raw → transform) + capture daily contributors ----------
def make_group_panel_avg_then_transform_with_members(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Returns:
      grp : group panel with R_g_t, R_g_t_lag, R_g_t_plus1 and group-level log_diffs/jumps
      mem_day : long DataFrame with [date, h2_group, symbol] listing daily contributors
    """
    df = df.copy()
    df[DATE_COL]   = pd.to_datetime(df[DATE_COL], errors="coerce")
    df[SYMBOL_COL] = df[SYMBOL_COL].astype(str).str.upper()
    df["h2_group"] = df[SYMBOL_COL].map(map_functional_group)
    df = df[df["h2_group"].isin(GROUP_ORDER)].copy()

    # Eligible rows for raw group means: need raw blockchain cols present that day
    elig = df.dropna(subset=BASE_BLOCKCHAIN).copy()

    # (A) daily contributor list per (date, group)
    mem_day = (elig.loc[:, [DATE_COL, "h2_group", SYMBOL_COL]]
                    .drop_duplicates()
                    .rename(columns={SYMBOL_COL: "symbol"})
                    .sort_values(["h2_group", DATE_COL, "symbol"])
                    .reset_index(drop=True))

    # (B) group means of RAW blockchain + returns
    agg_cols = [RET_COL] + BASE_BLOCKCHAIN
    grp_raw = (elig.groupby([DATE_COL, "h2_group"], dropna=False)[agg_cols]
                    .mean(numeric_only=True)
                    .reset_index()
                    .sort_values(["h2_group", DATE_COL]))

    # (C) zero-safe log-diffs (+ optional jump dummies) on group averages
    for col in BASE_BLOCKCHAIN:
        prev = grp_raw.groupby("h2_group")[col].shift(1)
        curr = grp_raw[col]
        if USE_JUMPS:
            grp_raw[f"jump_{col}"] = ((prev == 0) & (curr > 0)).astype(int)
        grp_raw[f"log_diff_{col}"] = np.where((prev > 0) & (curr > 0), np.log(curr / prev), 0.0)

    # (D) attach date-level macro + sentiment (unique per date)
    date_cols = [DATE_COL] + [c for c in (MACRO_COLS + SENTIMENT_COLS) if c in df.columns]
    date_level = df[date_cols].drop_duplicates(subset=[DATE_COL]).sort_values(DATE_COL)
    grp = grp_raw.merge(date_level, on=DATE_COL, how="left")

    # (E) target and lags
    grp["R_g_t"]       = grp.groupby("h2_group")[RET_COL].shift(0)
    grp["R_g_t_lag"]   = grp.groupby("h2_group")[RET_COL].shift(1)
    grp["R_g_t_plus1"] = grp.groupby("h2_group")[RET_COL].shift(-1)

    # (F) drop missing edges
    grp = grp.dropna(subset=["R_g_t_lag", "R_g_t_plus1"]).reset_index(drop=True)
    return grp, mem_day

# ---------- Effective membership utilities ----------
def build_effective_membership(mem_day: pd.DataFrame,
                               used_dates: pd.Index,
                               group_name: str) -> pd.DataFrame:
    sub = mem_day[(mem_day["h2_group"] == group_name) & (mem_day[DATE_COL].isin(used_dates))].copy()
    if sub.empty:
        return pd.DataFrame(columns=["group","symbol","contrib_days"])
    out = (sub.groupby(["h2_group","symbol"], as_index=False)
               .agg(contrib_days=("symbol","size"))
               .rename(columns={"h2_group":"group"}))
    return out

def summarize_membership(df_long: pd.DataFrame,
                         group_order=GROUP_ORDER) -> pd.DataFrame:
    if df_long.empty:
        return pd.DataFrame(columns=["Token Type","Ticker","# Coins"])
    tmp = (df_long.groupby("group", as_index=False)
                 .agg(symbols=("symbol", lambda s: sorted(set(s))),
                      n_coins=("symbol", "nunique")))
    tmp["Ticker"] = tmp["symbols"].apply(lambda lst: ", ".join(_tex_escape_ticker(s) for s in lst))
    tmp = tmp.drop(columns=["symbols"]).rename(columns={"group":"Token Type", "n_coins":"# Coins"})
    tmp["Token Type"] = pd.Categorical(tmp["Token Type"], categories=group_order, ordered=True)
    return tmp.sort_values("Token Type").reset_index(drop=True)

def render_latex_functional_table(df_tbl: pd.DataFrame,
                                  caption: str,
                                  label: str,
                                  colwidths=("3cm","10cm","1.2cm")) -> str:
    lines = []
    lines += [r"\begin{table}[ht]",
              r"\centering",
              r"\scriptsize",
              r"\setlength{\tabcolsep}{4pt}",
              rf"\begin{{tabular}}{{p{{{colwidths[0]}}}p{{{colwidths[1]}}}c}}",
              r"\toprule",
              r"\textbf{Token Type} & \textbf{Ticker} & \textbf{\# Coins} \\",
              r"\midrule"]
    total = 0
    for _, row in df_tbl.iterrows():
        lines.append(f"{row['Token Type']} & {row['Ticker']} & {int(row['# Coins'])} \\\\")
        lines.append(r"\addlinespace")
        total += int(row['# Coins'])
    lines += [rf"\textbf{{Total:}} & & \textbf{{{total}}} \\",
              r"\bottomrule",
              r"\end{tabular}",
              rf"\caption{{\textbf{{{caption}}}}}",
              rf"\label{{{label}}}",
              r"\end{table}"]
    return "\n".join(lines)

# ---------- One regression (group × proxy) that returns H1-style row ----------
def run_group_proxy_regression(dfg: pd.DataFrame,
                               group_name: str,
                               proxy_label: Optional[str],
                               regressors: List[str]) -> Optional[Dict]:
    """
    dfg must contain: TARGET_COL and regressors (already cleaned & standardized if needed)
    Returns one row with H1-like columns: n, nobs, n_regs, regs_used, opt_hac_lag, r2, adj_r2, aic, bic,
    log_likelihood, fstat, f_pval, durbin_watson, and per-parameter coef_*, stderr_*, tval_*, pval_*.
    """
    rhs = " + ".join(regressors)
    fit = smf.ols(f"{TARGET_COL} ~ {rhs}", data=dfg).fit()
    n = int(fit.nobs)
    if n <= 8:
        return None

    # HAC lag selection via AR(k)-AIC
    kmax = int(n ** 0.25)
    best_aic, best_lag, best_res = np.inf, 0, None
    for lag in range(kmax + 1):
        r = fit.get_robustcov_results(cov_type="HAC", maxlags=lag, use_correction=True)
        if r.aic < best_aic:
            best_aic, best_lag, best_res = r.aic, lag, r

    # Build output row mirroring H1 style
    row = {
        "group": group_name,
        "proxy": proxy_label if proxy_label is not None else "no_sentiment",
        "n": n,                        # H1 included both n and nobs
        "nobs": int(best_res.nobs),
        "n_regs": len(regressors),
        "regs_used": ",".join(regressors),
        "opt_hac_lag": int(best_lag),
        "r2": float(best_res.rsquared),
        "adj_r2": float(best_res.rsquared_adj),
        "aic": float(best_res.aic),
        "bic": float(best_res.bic),
        "log_likelihood": float(best_res.llf),
        "fstat": float(best_res.fvalue) if best_res.fvalue is not None else np.nan,
        "f_pval": float(best_res.f_pvalue) if best_res.f_pvalue is not None else np.nan,
        "durbin_watson": float(durbin_watson(fit.resid)),
    }

    # Add per-parameter columns, as in H1
    for name, b, se, t, p in zip(best_res.model.exog_names,
                                 best_res.params, best_res.bse, best_res.tvalues, best_res.pvalues):
        row[f"coef_{name}"]   = float(b)
        row[f"stderr_{name}"] = float(se)
        row[f"tval_{name}"]   = float(t)
        row[f"pval_{name}"]   = float(p)

    return row

# ---------- Driver: run all proxies, write one CSV + one LaTeX table ----------
def run_h2_write_only_results_and_func_table(df_crypto: pd.DataFrame,
                                             export_dir: Path = EXPORT_DIR,
                                             verbose: bool = VERBOSE) -> pd.DataFrame:
    export_dir.mkdir(parents=True, exist_ok=True)

    # 1) Build group panel and daily membership (to know who actually contributes)
    grp, mem_day = make_group_panel_avg_then_transform_with_members(df_crypto)

    # 2) Regressors lists
    base_cont = ["R_g_t_lag",
                 "log_diff_AdrActCnt", "log_diff_volume_trusted_spot_usd_1d", "log_diff_TxCnt"]
    base_jumps = (["jump_AdrActCnt", "jump_volume_trusted_spot_usd_1d", "jump_TxCnt"] if USE_JUMPS else [])
    macro      = [c for c in MACRO_COLS if c in grp.columns]

    proxy_dict = {
        "no_sentiment": None,
        "EPU_DUS":      "EPU_DUS",
        "VIX":          "VIX",
        "InvSIX":       "InvSIX",
        "TwitSIX":      "TwitSIX",
        "ConSIX":       "ConSIX",
        "fng_value":    "fng_value"
    }

    results_rows = []
    overall_members = []  # collect by (group, symbol) across all successful regressions

    present_groups = [g for g in GROUP_ORDER if g in grp["h2_group"].unique()]
    if verbose:
        print(f"[INFO] Groups present: {present_groups}")

    for label, proxy in proxy_dict.items():
        for g in present_groups:
            sub = grp[grp["h2_group"] == g].copy()

            # Assemble RHS
            regressors = base_cont.copy()
            if proxy is not None and proxy in sub.columns:
                regressors.insert(1, proxy)  # sentiment after lagged return
            regressors += base_jumps + macro

            # Keep only needed cols + date for membership
            cols_needed = [TARGET_COL] + regressors
            dfg = sub[cols_needed + [DATE_COL]].copy()

            # Coerce numeric & drop non-finite
            for c in cols_needed:
                dfg[c] = pd.to_numeric(dfg[c], errors="coerce")
            dfg = dfg.replace([np.inf, -np.inf], np.nan).dropna(subset=cols_needed)

            # Eligibility
            ok, reason = group_eligible(dfg, TARGET_COL, regressors)
            if not ok:
                if verbose: print(f"[SKIP] {g} × {proxy or 'no_sentiment'} — {reason}")
                continue

            # Standardize continuous (not jumps)
            cont = [r for r in regressors if not r.startswith("jump_")]
            if STANDARDIZE_CONT and cont:
                dfg[cont] = StandardScaler().fit_transform(dfg[cont])

            # Run regression and append H1-style row
            row = run_group_proxy_regression(dfg, g, proxy, regressors)
            if row is None:
                if verbose: print(f"[SKIP] {g} × {proxy or 'no_sentiment'} — too few nobs after clean")
                continue
            results_rows.append(row)

            # Track effective membership for the dates actually used in this regression
            used_dates = dfg[DATE_COL]
            eff = build_effective_membership(mem_day, used_dates, g)
            if not eff.empty:
                overall_members.append(eff)

    # 3) Write the single results CSV
    results_df = pd.DataFrame(results_rows).sort_values(["group","proxy"]).reset_index(drop=True)
    results_df.to_csv(RESULTS_CSV, index=False)
    if verbose:
        print(f"[INFO] Wrote results to {RESULTS_CSV} (rows={len(results_df)})")

    # 4) Build and write ONLY the functional membership LaTeX table (overall union)
    if overall_members:
        overall_long = pd.concat(overall_members, ignore_index=True)
        # Union across proxies by (group, symbol)
        overall_union = overall_long.drop_duplicates(subset=["group","symbol"])
        overall_wide  = summarize_membership(overall_union)
        latex_code = render_latex_functional_table(
            overall_wide,
            caption="Functional Groups Used in the Final Regression Sample",
            label="tab:functional_groups_effective_sample"
        )
        with open(FUNC_TABLE_TEX, "w") as f:
            f.write(latex_code)
        if verbose:
            print(f"[INFO] Wrote functional membership table to {FUNC_TABLE_TEX}")
    else:
        if verbose:
            print("[WARN] No functional membership captured (no regressions passed eligibility).")

    return results_df

# ---------------- RUN ----------------
# df_crypto must already be loaded & pre-engineered as in your earlier prep
results_h2 = run_h2_write_only_results_and_func_table(df_crypto)
results_h2.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


[INFO] Groups present: ['Utility', 'Asset', 'Payment', 'Hybrid (U-A)', 'Hybrid (U-P)', 'Hybrid (A-P)']
[INFO] Wrote results to Regressions/h2_functional_avg_then_transform_results.csv (rows=42)
[INFO] Wrote functional membership table to Regressions/h2_effective_membership_overall_table.tex


Unnamed: 0,group,proxy,n,nobs,n_regs,regs_used,opt_hac_lag,r2,adj_r2,aic,...,tval_TwitSIX,pval_TwitSIX,coef_ConSIX,stderr_ConSIX,tval_ConSIX,pval_ConSIX,coef_fng_value,stderr_fng_value,tval_fng_value,pval_fng_value
0,Asset,ConSIX,1922,1922,9,"R_g_t_lag,ConSIX,log_diff_AdrActCnt,log_diff_v...",0,0.015749,0.011116,13794.078303,...,,,0.788847,0.79234,0.995592,0.319574,,,,
1,Asset,EPU_DUS,1922,1922,9,"R_g_t_lag,EPU_DUS,log_diff_AdrActCnt,log_diff_...",0,0.010152,0.005492,13804.976739,...,,,,,,,,,,
2,Asset,InvSIX,1922,1922,9,"R_g_t_lag,InvSIX,log_diff_AdrActCnt,log_diff_v...",0,0.012966,0.008319,13799.505228,...,,,,,,,,,,
3,Asset,TwitSIX,1239,1239,9,"R_g_t_lag,TwitSIX,log_diff_AdrActCnt,log_diff_...",0,0.019213,0.01203,9421.211059,...,0.715605,0.474371,,,,,,,,
4,Asset,VIX,1922,1922,9,"R_g_t_lag,VIX,log_diff_AdrActCnt,log_diff_volu...",0,0.013935,0.009293,13797.616675,...,,,,,,,,,,


In [None]:
# =========================================================
# Build LaTeX tables (functional groups as columns) from H2 CSV
# - Uses the new combined results file with coef/SE/t/p
# - Runs for ALL proxies found in the results file
# - Auto-detects optional jump_* rows if present
# - Writes one .tex per proxy to Reg./Tables/
# =========================================================
import pandas as pd
import numpy as np
from pathlib import Path

# ---- Paths ----
RESULTS_CSV = Path("Regressions/h2_functional_avg_then_transform_results.csv")
OUT_DIR     = Path("Regressions/Tables")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---- Functional group column order (match H2 runs) ----
GROUP_ORDER = ["Utility", "Asset", "Payment", "Hybrid (U-A)", "Hybrid (U-P)", "Hybrid (A-P)"]

# ---- Significance thresholds ----
# ** for p<0.01; * for p<0.05 (matches your earlier tables)
STAR_THRESHOLDS = [(0.01, "**"), (0.05, "*")]

# ---- Formatting helpers ----
def star_for_p(p):
    try:
        p = float(p)
    except Exception:
        return ""
    for thr, mark in STAR_THRESHOLDS:
        if p < thr:
            return mark
    return ""

def fmt_coef(c, p):
    if c is None or (isinstance(c, float) and (np.isnan(c) or np.isinf(c))):
        return ""
    return f"{float(c):.3f}{star_for_p(p)}"

def fmt_se(se):
    if se is None or (isinstance(se, float) and (np.isnan(se) or np.isinf(se))):
        return ""
    return f"({float(se):.3f})"

def fmt_stat(x, ints=False):
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return ""
    if ints:
        try:
            return f"{int(x)}"
        except Exception:
            return f"{x}"
    return f"{float(x):.2f}"

def header_line(num_cols:int) -> str:
    # Safer multi-part build to avoid truncation issues
    num_cols = num_cols + 1
    return (
        "\\multicolumn{" + str(num_cols) + "}{c}{Specification tested: $"
        "R_{g,t+1} = "
        "\\alpha_g + "
        "\\beta_{sent,g} S_t + "
        "\\phi R_{g,t} + "
        "\\theta B_{g,t} + "
        "\\gamma M_t + "
        "\\varepsilon_{g,t+1}$}\\\\"
    )

def build_row_specs(proxy_label: str, available_cols: set):
    """
    Build the list of (row_label, coef_key, se_key, p_key) for this proxy,
    including jump_* rows only if columns exist.
    """
    specs = [
        ("Intercept",              "coef_Intercept",                "stderr_Intercept",                "pval_Intercept"),
        (proxy_label,              f"coef_{proxy_label}",           f"stderr_{proxy_label}",           f"pval_{proxy_label}"),
        ("$R_{g,t}$",              "coef_R_g_t_lag",                "stderr_R_g_t_lag",                "pval_R_g_t_lag"),
        ("$\Delta\log\ $TxCnt",              "coef_log_diff_TxCnt",           "stderr_log_diff_TxCnt",           "pval_log_diff_TxCnt"),
        ("$\Delta\log\ $Volume",             "coef_log_diff_volume_trusted_spot_usd_1d",
                                   "stderr_log_diff_volume_trusted_spot_usd_1d",
                                   "pval_log_diff_volume_trusted_spot_usd_1d"),
        ("$\Delta\log\ $AdrActCnt",   "coef_log_diff_AdrActCnt",       "stderr_log_diff_AdrActCnt",       "pval_log_diff_AdrActCnt"),
    ]
    # Optional jump rows — include only if present in the CSV
    jump_rows = [
        ("Jump TxCnt",             "coef_jump_TxCnt",               "stderr_jump_TxCnt",               "pval_jump_TxCnt"),
        ("Jump Volume",            "coef_jump_volume_trusted_spot_usd_1d",
                                   "stderr_jump_volume_trusted_spot_usd_1d",
                                   "pval_jump_volume_trusted_spot_usd_1d"),
        ("Jump Active Addresses",  "coef_jump_AdrActCnt",           "stderr_jump_AdrActCnt",           "pval_jump_AdrActCnt"),
    ]
    for r in jump_rows:
        if (r[1] in available_cols) or (r[2] in available_cols) or (r[3] in available_cols):
            specs.append(r)

    specs += [
        ("$\log\ $UnemRt",             "coef_log_UnemRt",               "stderr_log_UnemRt",               "pval_log_UnemRt"),
        ("$\log\ $IndPro",             "coef_log_IndPro",               "stderr_log_IndPro",               "pval_log_IndPro"),
        ("$\log\ $CPIPrc",             "coef_log_CPIPrc",               "stderr_log_CPIPrc",               "pval_log_CPIPrc"),
        ("$\log\ $TotRes",             "coef_log_TotRes",               "stderr_log_TotRes",               "pval_log_TotRes"),
    ]
    return specs

def render_table_for_proxy(df_proxy: pd.DataFrame, proxy_label: str):
    """
    Render LaTeX table string for one proxy, using df_proxy rows (one per group).
    """
    present_groups = [g for g in GROUP_ORDER if g in set(df_proxy["group"].astype(str))]
    if not present_groups:
        return None

    sub_idx = df_proxy.set_index("group")
    available_cols = set(df_proxy.columns)

    row_specs = build_row_specs(proxy_label, available_cols)

    lines = []
    num_cols = len(present_groups)
    cols_spec = f"l *{{{num_cols}}}{{c}}"
    lines += [
        r"\begin{table}[ht]",
        r"\centering",
        r"\scriptsize",
        r"\setlength{\tabcolsep}{4pt}",
        rf"\begin{{tabular}}{{{cols_spec}}}",
        r"\toprule",
        " & " + " & ".join(present_groups) + r" \\",
        r"\midrule"
    ]

    # Coefficient row + SE row per item
    for label, coef_key, se_key, p_key in row_specs:
        # coef line
        coef_cells = []
        for g in present_groups:
            row = sub_idx.loc[g] if g in sub_idx.index else None
            coef_cells.append(fmt_coef(row.get(coef_key, np.nan), row.get(p_key, np.nan)) if row is not None else "")
        lines.append(label + " & " + " & ".join(coef_cells) + r" \\")
        # se line
        se_cells = []
        for g in present_groups:
            row = sub_idx.loc[g] if g in sub_idx.index else None
            se_cells.append(fmt_se(row.get(se_key, np.nan)) if row is not None else "")
        lines.append(" & " + " & ".join(se_cells) + r" \\")
        lines.append(r"\addlinespace")

    # Bottom stats
    lines.append(r"\midrule")
    bottom_specs = [
        ("N",              "nobs",           True),
        ("Durbin-Watson",  "durbin_watson",  False),
        ("F-stat",         "fstat",          False),
        (r"\(R^2\)",       "r2",             False),
        (r"Adj.\ \(R^2\)", "adj_r2",         False),
    ]
    for label, stat_key, as_int in bottom_specs:
        vals = []
        for g in present_groups:
            v = sub_idx.loc[g].get(stat_key, np.nan) if g in sub_idx.index else np.nan
            vals.append(fmt_stat(v, ints=as_int))
        lines.append(label + " & " + " & ".join(vals) + r" \\")

    # Caption + label (multi-line caption to avoid truncation issues)
    nice_proxy = proxy_label
    lines += [
        r"\addlinespace",
        r"\midrule",
        header_line(num_cols),
        r"\bottomrule",
        r"\end{tabular}",
        rf"\caption{{\textbf{{Estimation Results from Predictive Regressions — {nice_proxy}}} \\",
        r"This table presents coefficient estimates and Newey--West standard errors (in parentheses) from predictive regressions of next-day group returns on the selected proxy for investor sentiment.",
        r"The regression specification is reported at the top of the table. Each column corresponds to a functional group portfolio (equal-weight) as defined in Section~\ref{sec:groupings}.",
        r"The dependent variable is the next-day log return of the group portfolio. Standard errors are computed using the Newey--West estimator with automatic lag selection. Statistical significance is denoted as follows: *$p<0.05$, **$p<0.01$.}",
        rf"\label{{tab:h2_{proxy_label.lower()}_results}}",
        r"\end{table}"
    ]

    return "\n".join(lines)

# ---- Load results ----
df = pd.read_csv(RESULTS_CSV)

# Normalize types
df["group"] = df["group"].astype(str)
df["proxy"] = df["proxy"].astype(str)

# Get proxies present
proxies = list(df["proxy"].dropna().unique())

# Generate a table for each proxy found
written = []
for proxy in proxies:
    df_proxy = df[df["proxy"] == proxy].copy()
    tex = render_table_for_proxy(df_proxy, proxy)
    if tex is None:
        continue
    out_tex = OUT_DIR / f"h2_functional_results_{proxy}.tex"
    with open(out_tex, "w") as f:
        f.write(tex)
    written.append(str(out_tex))

print("[INFO] Wrote", len(written), "tables:")
for p in written:
    print(" -", p)

[INFO] Wrote 7 tables:
 - Regressions/Tables/h2_functional_results_ConSIX.tex
 - Regressions/Tables/h2_functional_results_EPU_DUS.tex
 - Regressions/Tables/h2_functional_results_InvSIX.tex
 - Regressions/Tables/h2_functional_results_TwitSIX.tex
 - Regressions/Tables/h2_functional_results_VIX.tex
 - Regressions/Tables/h2_functional_results_fng_value.tex
 - Regressions/Tables/h2_functional_results_no_sentiment.tex


  ("$\Delta\log\ $ TxCnt",              "coef_log_diff_TxCnt",           "stderr_log_diff_TxCnt",           "pval_log_diff_TxCnt"),
  ("$\Delta\log\ $ Volume",             "coef_log_diff_volume_trusted_spot_usd_1d",
  ("$\Delta\log\ $ AdrActCnt",   "coef_log_diff_AdrActCnt",       "stderr_log_diff_AdrActCnt",       "pval_log_diff_AdrActCnt"),
  ("$\log\ $ UnemRt",             "coef_log_UnemRt",               "stderr_log_UnemRt",               "pval_log_UnemRt"),
  ("$\log\ $ IndPro",             "coef_log_IndPro",               "stderr_log_IndPro",               "pval_log_IndPro"),
  ("$\log\ $ CPIPrc",             "coef_log_CPIPrc",               "stderr_log_CPIPrc",               "pval_log_CPIPrc"),
  ("$\log\ $ TotRes",             "coef_log_TotRes",               "stderr_log_TotRes",               "pval_log_TotRes"),


In [None]:

import pandas as pd
import numpy as np
from pathlib import Path

# ---- Paths ----
RESULTS_CSV = Path("Regressions/h2_functional_avg_then_transform_results.csv")
OUT_DIR     = Path("Regressions/Tables")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---- Functional group column order (match H2 runs) ----
GROUP_ORDER = ["Utility", "Asset", "Payment", "Hybrid (U-A)", "Hybrid (U-P)", "Hybrid (A-P)"]

# ---- Significance thresholds ----
# ** for p<0.01; * for p<0.05 (matches your earlier tables)
STAR_THRESHOLDS = [(0.01, "**"), (0.05, "*")]

# ---- Formatting helpers ----
def star_for_p(p):
    try:
        p = float(p)
    except Exception:
        return ""
    for thr, mark in STAR_THRESHOLDS:
        if p < thr:
            return mark
    return ""

def fmt_coef(c, p):
    if c is None or (isinstance(c, float) and (np.isnan(c) or np.isinf(c))):
        return ""
    return f"{float(c):.3f}{star_for_p(p)}"

def fmt_se(se):
    if se is None or (isinstance(se, float) and (np.isnan(se) or np.isinf(se))):
        return ""
    return f"({float(se):.3f})"

def fmt_stat(x, ints=False, ndigits=2):
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return ""
    if ints:
        try:
            return f"{int(x)}"
        except Exception:
            return f"{x}"
    return f"{float(x):.{ndigits}f}"

def header_line(num_cols:int) -> str:
    # Safer multi-part build to avoid truncation issues
    num_cols = num_cols + 1
    return (
        "\\multicolumn{" + str(num_cols) + "}{c}{Specification tested: $"
        "R_{g,t+1} = "
        "\\alpha_g + "
        "\\beta_{sent,g} S_t + "
        "\\phi R_{g,t} + "
        "\\theta B_{g,t} + "
        "\\gamma M_t + "
        "\\varepsilon_{g,t+1}$}\\\\"
    )

def build_row_specs(proxy_label: str, available_cols: set):
    """
    Build the list of (row_label, coef_key, se_key, p_key) for this proxy,
    including jump_* rows only if columns exist.
    """
    specs = [
        ("Intercept",              "coef_Intercept",                "stderr_Intercept",                "pval_Intercept"),
        (proxy_label,              f"coef_{proxy_label}",           f"stderr_{proxy_label}",           f"pval_{proxy_label}"),
        ("$R_{g,t}$",              "coef_R_g_t_lag",                "stderr_R_g_t_lag",                "pval_R_g_t_lag"),
        ("$\Delta\log\ $TxCnt",              "coef_log_diff_TxCnt",           "stderr_log_diff_TxCnt",           "pval_log_diff_TxCnt"),
        ("$\Delta\log\ $Volume",             "coef_log_diff_volume_trusted_spot_usd_1d",
                                   "stderr_log_diff_volume_trusted_spot_usd_1d",
                                   "pval_log_diff_volume_trusted_spot_usd_1d"),
        ("$\Delta\log\ $AdrActCnt",   "coef_log_diff_AdrActCnt",       "stderr_log_diff_AdrActCnt",       "pval_log_diff_AdrActCnt"),
    ]
    
    jump_rows = [
        ("Jump TxCnt",             "coef_jump_TxCnt",               "stderr_jump_TxCnt",               "pval_jump_TxCnt"),
        ("Jump Volume",            "coef_jump_volume_trusted_spot_usd_1d",
                                   "stderr_jump_volume_trusted_spot_usd_1d",
                                   "pval_jump_volume_trusted_spot_usd_1d"),
        ("Jump Active Addresses",  "coef_jump_AdrActCnt",           "stderr_jump_AdrActCnt",           "pval_jump_AdrActCnt"),
    ]
    for r in jump_rows:
        if (r[1] in available_cols) or (r[2] in available_cols) or (r[3] in available_cols):
            specs.append(r)

    specs += [
        ("$\log\ $UnemRt",             "coef_log_UnemRt",               "stderr_log_UnemRt",               "pval_log_UnemRt"),
        ("$\log\ $IndPro",             "coef_log_IndPro",               "stderr_log_IndPro",               "pval_log_IndPro"),
        ("$\log\ $CPIPrc",             "coef_log_CPIPrc",               "stderr_log_CPIPrc",               "pval_log_CPIPrc"),
        ("$\log\ $TotRes",             "coef_log_TotRes",               "stderr_log_TotRes",               "pval_log_TotRes"),
    ]
    return specs

def render_table_for_proxy(df_proxy: pd.DataFrame, proxy_label: str):
    """
    Render LaTeX table string for one proxy, using df_proxy rows (one per group).
    """
    present_groups = [g for g in GROUP_ORDER if g in set(df_proxy["group"].astype(str))]
    if not present_groups:
        return None

    sub_idx = df_proxy.set_index("group")
    available_cols = set(df_proxy.columns)

    row_specs = build_row_specs(proxy_label, available_cols)

    lines = []
    num_cols = len(present_groups)
    cols_spec = f"l *{{{num_cols}}}{{c}}"
    lines += [
        r"\begin{table}[ht]",
        r"\centering",
        r"\scriptsize",
        r"\setlength{\tabcolsep}{4pt}",
        rf"\begin{{tabular}}{{{cols_spec}}}",
        r"\toprule",
        " & " + " & ".join(present_groups) + r" \\",
        r"\midrule"
    ]

    # Coefficient row + SE row per item
    for label, coef_key, se_key, p_key in row_specs:
        # coef line
        coef_cells = []
        for g in present_groups:
            row = sub_idx.loc[g] if g in sub_idx.index else None
            coef_cells.append(fmt_coef(row.get(coef_key, np.nan), row.get(p_key, np.nan)) if row is not None else "")
        lines.append(label + " & " + " & ".join(coef_cells) + r" \\")
        # se line
        se_cells = []
        for g in present_groups:
            row = sub_idx.loc[g] if g in sub_idx.index else None
            se_cells.append(fmt_se(row.get(se_key, np.nan)) if row is not None else "")
        lines.append(" & " + " & ".join(se_cells) + r" \\")
        lines.append(r"\addlinespace")

    # Bottom stats
    lines.append(r"\midrule")
# label, key, as_int, ndigits
    bottom_specs = [
        ("N",              "nobs",           True,  None),
        ("Durbin-Watson",  "durbin_watson",  False, 2),
        ("F-stat",         "fstat",          False, 2),
        (r"\(R^2\)",       "r2",             False, 3),
        (r"Adj.\ \(R^2\)", "adj_r2",         False, 3),
    ]

    for label, stat_key, as_int, nd in bottom_specs:
        vals = []
        for g in present_groups:
            v = sub_idx.loc[g].get(stat_key, np.nan) if g in sub_idx.index else np.nan
            vals.append(fmt_stat(v, ints=as_int, ndigits=(nd or 2)))
        lines.append(label + " & " + " & ".join(vals) + r" \\")
    # Caption + label (multi-line caption to avoid truncation issues)
    nice_proxy = proxy_label
    lines += [
        r"\addlinespace",
        r"\midrule",
        header_line(num_cols),
        r"\bottomrule",
        r"\end{tabular}",
        rf"\caption{{\textbf{{Estimation Results from Predictive Regressions — {nice_proxy}}} \\",
        r"This table presents coefficient estimates and Newey--West standard errors (in parentheses) from predictive regressions of next-day group returns on the selected proxy for investor sentiment.",
        r"The regression specification is reported at the top of the table. Each column corresponds to a functional group portfolio (equal-weight) as defined in Section~\ref{sec:groupings}.",
        r"The dependent variable is the next-day log return of the group portfolio. Standard errors are computed using the Newey--West estimator with automatic lag selection. Statistical significance is denoted as follows: *$p<0.05$, **$p<0.01$.}",
        rf"\label{{tab:h2_{proxy_label.lower()}_results}}",
        r"\end{table}"
    ]

    return "\n".join(lines)

# ---- Load results ----
df = pd.read_csv(RESULTS_CSV)

# Normalize types
df["group"] = df["group"].astype(str)
df["proxy"] = df["proxy"].astype(str)

# Get proxies present
proxies = list(df["proxy"].dropna().unique())

# Generate a table for each proxy found
written = []
for proxy in proxies:
    df_proxy = df[df["proxy"] == proxy].copy()
    tex = render_table_for_proxy(df_proxy, proxy)
    if tex is None:
        continue
    out_tex = OUT_DIR / f"h2_functional_results_{proxy}.tex"
    with open(out_tex, "w") as f:
        f.write(tex)
    written.append(str(out_tex))

print("[INFO] Wrote", len(written), "tables:")
for p in written:
    print(" -", p)

  ("$\Delta\log\ $TxCnt",              "coef_log_diff_TxCnt",           "stderr_log_diff_TxCnt",           "pval_log_diff_TxCnt"),
  ("$\Delta\log\ $Volume",             "coef_log_diff_volume_trusted_spot_usd_1d",
  ("$\Delta\log\ $AdrActCnt",   "coef_log_diff_AdrActCnt",       "stderr_log_diff_AdrActCnt",       "pval_log_diff_AdrActCnt"),
  ("$\log\ $UnemRt",             "coef_log_UnemRt",               "stderr_log_UnemRt",               "pval_log_UnemRt"),
  ("$\log\ $IndPro",             "coef_log_IndPro",               "stderr_log_IndPro",               "pval_log_IndPro"),
  ("$\log\ $CPIPrc",             "coef_log_CPIPrc",               "stderr_log_CPIPrc",               "pval_log_CPIPrc"),
  ("$\log\ $TotRes",             "coef_log_TotRes",               "stderr_log_TotRes",               "pval_log_TotRes"),


[INFO] Wrote 7 tables:
 - Regressions/Tables/h2_functional_results_ConSIX.tex
 - Regressions/Tables/h2_functional_results_EPU_DUS.tex
 - Regressions/Tables/h2_functional_results_InvSIX.tex
 - Regressions/Tables/h2_functional_results_TwitSIX.tex
 - Regressions/Tables/h2_functional_results_VIX.tex
 - Regressions/Tables/h2_functional_results_fng_value.tex
 - Regressions/Tables/h2_functional_results_no_sentiment.tex


WALD TEST

In [None]:
# =========================================================
# H2 Wald test: equality of sentiment slopes across groups
# - Pooled OLS with group FE and group-specific sentiment slopes
# - Newey–West HAC (AIC-selected lag)
# - One χ² Wald test per sentiment proxy
# - Returns DataFrame; optionally writes Reg/h2_wald_tests.csv
# =========================================================
import numpy as np
import pandas as pd
import re
from pathlib import Path
import statsmodels.formula.api as smf
from statsmodels.tsa.ar_model import AutoReg

# -------------------- CONFIG --------------------
DATE_COL   = "date"
SYMBOL_COL = "symbol"
RET_COL    = "log_daily_return"
TARGET_COL = "R_g_t_plus1"  # constructed below

# Raw blockchain columns to be averaged (RAW → then transform)
BASE_BLOCKCHAIN = ["TxCnt", "AdrActCnt", "volume_trusted_spot_usd_1d"]

# Macros (log levels already in df_crypto) and Sentiments
MACRO_COLS     = ["log_UnemRt", "log_IndPro", "log_CPIPrc", "log_TotRes"]
SENTIMENT_COLS = ["VIX", "TwitSIX", "EPU_DUS", "InvSIX", "fng_value", "ConSIX"]

# Functional groups (as in your taxonomy)
UTILITY = {"1INCH","AAVE","ANT","APE","BADGER","BAL","BAT","BIT","CEL","COMP","CRV","DCR","ENS","FXS",
           "HEDG","LDO","MKR","QNT","ROOK","SUSHI","SWRV","UNI","YFI"}
ASSET   = {"ALCX","ALPHA","CVX","PAXG","XAUT"}
PAYMENT = {"BCH","BNB","BSV","BTC","BTG","CRO","DASH","DGB","DOGE","ETC","ETH","FTT","GRIN","HT","LTC",
           "MTL_METAL","PAY","SHIB","VTC","XLM","XMR","XRP","XVG","ZEC"}
HYBRID_UA = {"CVC"}
HYBRID_UP = {"ADA","ALGO","API3","ATOM","AUDIO","AVAX","BNT","DOT","DRGN","ELF","ENJ","EOS","FIL","FLOW",
             "FUN","GALA","GAS","GLM","GNO","GNT","GRT","ICP","ICX","KNC","LEND","LINK","LOOM","LPT","LRC",
             "LSK","LUNA","MAID","MANA","NEO","NMR","OGN","OMG","PERP","POLY","POWR","PPT","QASH","QTUM",
             "REN","REP","RSR","SAND","SKL","SNT","SNX","SOL","SRM","STORJ","TRX","UMA","VET","WAVES",
             "WNXM","WTC","XEM","XTZ","ZIL","ZRX"}
HYBRID_AP = {"BUSD","DAI","GUSD","HUSD","PAX","TUSD","USDC","USDT"}

GROUP_ORDER = ["Utility", "Asset", "Payment", "Hybrid (U-A)", "Hybrid (U-P)", "Hybrid (A-P)"]

# Output
EXPORT_CSV       = Path("Regressions/h2_wald_tests.csv")
WRITE_WALD_CSV   = True  # set False if you don't want a CSV

# -------------------- HELPERS --------------------
def map_functional_group(sym: str) -> str:
    s = str(sym).upper()
    if s in UTILITY:   return "Utility"
    if s in ASSET:     return "Asset"
    if s in PAYMENT:   return "Payment"
    if s in HYBRID_UA: return "Hybrid (U-A)"
    if s in HYBRID_UP: return "Hybrid (U-P)"
    if s in HYBRID_AP: return "Hybrid (A-P)"
    return "Unclassified"

def select_hac_lag_via_resid_aic(resid: pd.Series, kmax: int) -> int:
    """
    Choose HAC lag via AIC on AR(k) residual model (k=0..kmax).
    For k=0, use log-variance proxy objective.
    """
    n = resid.shape[0]
    if n < 20 or kmax <= 0:
        return 0
    best_k, best_aic = 0, np.inf
    for k in range(kmax + 1):
        try:
            if k == 0:
                e = resid - resid.mean()
                sigma2 = np.var(e, ddof=1)
                aic = n * np.log(sigma2 + 1e-12) + 2
            else:
                ar = AutoReg(resid, lags=k, old_names=False, trend="c").fit()
                aic = ar.aic
        except Exception:
            aic = np.inf
        if aic < best_aic:
            best_aic, best_k = aic, k
    return int(best_k)

def make_group_panel_avg_then_transform(df: pd.DataFrame) -> pd.DataFrame:
    """
    H2 panel: average RAW → zero-safe log-diffs → build lags.
    """
    d = df.copy()
    d[DATE_COL]   = pd.to_datetime(d[DATE_COL], errors="coerce")
    d[SYMBOL_COL] = d[SYMBOL_COL].astype(str).str.upper()
    d["h2_group"] = d[SYMBOL_COL].map(map_functional_group)
    d = d[d["h2_group"].isin(GROUP_ORDER)].copy()

    # days where all raw blockchain inputs are present
    elig = d.dropna(subset=BASE_BLOCKCHAIN).copy()

    # equal-weight means (RAW)
    agg_cols = [RET_COL] + BASE_BLOCKCHAIN
    grp_raw = (elig.groupby([DATE_COL, "h2_group"], dropna=False)[agg_cols]
                   .mean(numeric_only=True)
                   .reset_index()
                   .sort_values(["h2_group", DATE_COL]))

    # zero-safe log-diffs on group averages
    for col in BASE_BLOCKCHAIN:
        prev = grp_raw.groupby("h2_group")[col].shift(1)
        curr = grp_raw[col]
        grp_raw[f"log_diff_{col}"] = np.where((prev > 0) & (curr > 0), np.log(curr/prev), 0.0)

    # attach date-level macros + sentiments (unique per date)
    date_cols = [DATE_COL] + [c for c in (MACRO_COLS + SENTIMENT_COLS) if c in d.columns]
    date_level = d[date_cols].drop_duplicates(subset=[DATE_COL]).sort_values(DATE_COL)
    grp = grp_raw.merge(date_level, on=DATE_COL, how="left")

    # target and lags
    grp["R_g_t"]       = grp.groupby("h2_group")[RET_COL].shift(0)
    grp["R_g_t_lag"]   = grp.groupby("h2_group")[RET_COL].shift(1)
    grp["R_g_t_plus1"] = grp.groupby("h2_group")[RET_COL].shift(-1)

    grp = grp.dropna(subset=["R_g_t_lag", "R_g_t_plus1"]).reset_index(drop=True)
    return grp

def find_interaction_param_names(param_names, group_list, proxy):
    """
    Map each group to the parameter name of its interaction slope with the proxy.
    Handles both 'C(h2_group)[T.Group]:Proxy' and 'Proxy:C(h2_group)[T.Group]'.
    """
    mapping = {}
    for g in group_list:
        pat1 = re.compile(rf"C\(h2_group\)\[(?:T\.)?{re.escape(g)}\]\:{re.escape(proxy)}$")
        pat2 = re.compile(rf"{re.escape(proxy)}\:C\(h2_group\)\[(?:T\.)?{re.escape(g)}\]$")
        hit = next((n for n in param_names if pat1.search(n) or pat2.search(n)), None)
        mapping[g] = hit
    return mapping

def build_R_equal_betas(param_names, beta_names):
    """
    Build R for H0: beta_g1 = beta_g2 = ... = beta_gk, implemented as
    (beta_gi - beta_g1) = 0 for i=2..k.
    """
    valid = [b for b in beta_names if b is not None]
    if len(valid) < 2:
        return None, None
    p = len(param_names)
    base = valid[0]
    rows = []
    for b in valid[1:]:
        r = np.zeros(p)
        r[param_names.index(b)] = 1.0
        r[param_names.index(base)] = -1.0
        rows.append(r)
    R = np.vstack(rows)
    r = np.zeros(R.shape[0])
    return R, r

# -------------------- MAIN --------------------
def run_h2_wald_tests(df_crypto: pd.DataFrame,
                      write_csv: bool = WRITE_WALD_CSV,
                      export_csv: Path = EXPORT_CSV) -> pd.DataFrame:
    grp = make_group_panel_avg_then_transform(df_crypto)

    present_groups = [g for g in GROUP_ORDER if g in grp["h2_group"].unique()]
    if len(present_groups) < 2:
        raise ValueError("Not enough groups present for a Wald equality test.")

    # common controls (same slope across groups)
    controls = ["R_g_t_lag", "log_diff_AdrActCnt",
                "log_diff_volume_trusted_spot_usd_1d", "log_diff_TxCnt"] \
               + [c for c in MACRO_COLS if c in grp.columns]

    out_rows = []
    for proxy in [c for c in SENTIMENT_COLS if c in grp.columns]:
        # keep rows with all needed vars
        sub = grp.dropna(subset=[TARGET_COL, proxy] + controls).copy()
        if sub.empty:
            continue

        # standardize continuous controls + proxy (not FE)
        z_cols = controls + [proxy]
        for c in z_cols:
            m, s = sub[c].mean(), sub[c].std(ddof=0)
            if s and np.isfinite(s) and s > 0:
                sub[c] = (sub[c] - m) / s

        # pooled OLS: group FE + group-specific sentiment slopes + common controls
        # FE: 0 + C(h2_group)
        # Group-specific proxy slopes: 0 + C(h2_group):proxy
        rhs = " + ".join(["0 + C(h2_group)", f"0 + C(h2_group):{proxy}"] + controls)
        formula = f"{TARGET_COL} ~ {rhs}"

        fit = smf.ols(formula, data=sub).fit()
        n = int(fit.nobs)
        if n <= 8:
            continue

        # HAC lag selection by AIC
        kmax = int(n ** 0.25)
        best_aic, best_lag, best_res = np.inf, 0, None
        for lag in range(kmax + 1):
            r = fit.get_robustcov_results(cov_type="HAC", maxlags=lag, use_correction=True)
            if r.aic < best_aic:
                best_aic, best_lag, best_res = r.aic, lag, r

        param_names = list(best_res.model.exog_names)
        beta_map = find_interaction_param_names(param_names, present_groups, proxy)
        beta_names = [beta_map[g] for g in present_groups]

        # build R for equality of those slopes
        R, rvec = build_R_equal_betas(param_names, beta_names)
        if R is None:
            
            continue

        wt = best_res.wald_test((R, rvec), scalar=False)  # χ²
        chi2 = float(np.asarray(wt.statistic).ravel()[0])
        pval = float(np.asarray(wt.pvalue).ravel()[0])
        df_num = int(R.shape[0])

        out_rows.append({
            "proxy": proxy,
            "groups_tested": ",".join(present_groups),
            "nobs": n,
            "hac_maxlags": best_lag,
            "wald_test": "equal_betas_across_groups",
            "chi2": chi2,
            "df_num": df_num,
            "p_value": pval
        })

    out = pd.DataFrame(out_rows).sort_values(["proxy"]).reset_index(drop=True)
    if write_csv:
        export_csv.parent.mkdir(parents=True, exist_ok=True)
        out.to_csv(export_csv, index=False)
    return out


wald_results = run_h2_wald_tests(df_crypto)
wald_results.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,proxy,groups_tested,nobs,hac_maxlags,wald_test,chi2,df_num,p_value
0,ConSIX,"Utility,Asset,Payment,Hybrid (U-A),Hybrid (U-P...",18014,0,equal_betas_across_groups,0.649266,5,0.662071
1,EPU_DUS,"Utility,Asset,Payment,Hybrid (U-A),Hybrid (U-P...",18014,0,equal_betas_across_groups,1.741273,5,0.121426
2,InvSIX,"Utility,Asset,Payment,Hybrid (U-A),Hybrid (U-P...",18014,0,equal_betas_across_groups,0.80077,5,0.548878
3,TwitSIX,"Utility,Asset,Payment,Hybrid (U-A),Hybrid (U-P...",13916,0,equal_betas_across_groups,0.61515,5,0.68831
4,VIX,"Utility,Asset,Payment,Hybrid (U-A),Hybrid (U-P...",18014,0,equal_betas_across_groups,0.465164,5,0.802461


In [26]:
# =========================================================
# Build LaTeX table from H2 Wald tests
# - Reads from DataFrame or CSV: Regressions/h2_wald_tests.csv
# - Orders proxies consistently
# - Adds significance stars on p-values
# - Writes a single LaTeX table to Regressions/Tables/h2_wald_tests.tex
# =========================================================
import pandas as pd
import numpy as np
from pathlib import Path

# ------------ Paths ------------
IN_CSV   = Path("Regressions/h2_wald_tests.csv")
OUT_DIR  = Path("Regressions/Tables")
OUT_TEX  = OUT_DIR / "h2_wald_tests.tex"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ------------ Proxy order (consistent with H2 figures/tables) ------------
PROXY_ORDER = ["VIX", "TwitSIX", "EPU_DUS", "InvSIX", "fng_value", "ConSIX"]

# ------------ Stars for significance (on p-values) ------------
STAR_THRESHOLDS = [(0.01, "**"), (0.05, "*")]

def star_for_p(p):
    try:
        p = float(p)
    except Exception:
        return ""
    for thr, mark in STAR_THRESHOLDS:
        if p < thr:
            return mark
    return ""

def fmt_float(x, digits=3):
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return ""
    try:
        return f"{float(x):.{digits}f}"
    except Exception:
        return str(x)

def fmt_int(x):
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return ""
    try:
        return f"{int(x)}"
    except Exception:
        return str(x)

def label_for_proxy(p):
    """Nicer LaTeX-safe proxy labels if needed."""
    # Escape underscores
    return str(p).replace("_", r"\_")

def render_wald_table(df: pd.DataFrame,
                      caption: str = r"\textbf{Wald Tests for Equality of Sentiment Slopes Across Functional Groups}\\ The table presents the results of Wald $\chi^2$ tests evaluating the null hypothesis that sentiment coefficients are identical across functional groups. Test statistics are computed using HAC (Newey--West) covariance estimates, with the lag length determined by the Akaike Information Criterion (AIC). The analysis includes six functional groups: Utility, Asset, Payment, and Hybrid variants (U-A, U-P, A-P). Reported are the $\chi^2$ statistic, the corresponding degrees of freedom (df), the p-value, the number of observations, and the number of HAC lags used.",
                      label: str   = "tab:h2_wald_tests") -> str:
    """
    Renders a LaTeX table:
      columns: Proxy | χ² | df | p-value | N | HAC maxlags
    Notes: groups tested are stored in df['groups_tested'] and listed in a footnote.
    """
    # Keep only columns we need and sort by PROXY_ORDER
    need_cols = ["proxy", "chi2", "df_num", "p_value", "nobs", "hac_maxlags", "groups_tested"]
    for c in need_cols:
        if c not in df.columns:
            raise ValueError(f"Required column '{c}' not found in the Wald results DataFrame.")
    df2 = df[need_cols].copy()

    # ordering
    cat = pd.Categorical(df2["proxy"], categories=[p for p in PROXY_ORDER if p in df2["proxy"].unique()], ordered=True)
    # Put any unexpected proxies (if present) after the known ones
    other = [p for p in df2["proxy"].unique() if p not in PROXY_ORDER]
    df2["_order"] = pd.Categorical(df2["proxy"], categories=list(cat.categories)+other, ordered=True)
    df2 = df2.sort_values("_order").drop(columns=["_order"]).reset_index(drop=True)

    # Build LaTeX
    lines = []
    lines += [
        r"\begin{table}[ht]",
        r"\centering",
        r"\scriptsize",
        r"\setlength{\tabcolsep}{4pt}",
        r"\begin{tabular}{l c c c c c}",
        r"\toprule",
        r"\textbf{Proxy} & \(\boldsymbol{\chi^2}\) & \textbf{df} & \textbf{p-value} & \textbf{\# Obs} & \textbf{HAC lags} \\",
        r"\midrule"
    ]

    # Table rows
    for _, row in df2.iterrows():
        proxy = label_for_proxy(row["proxy"])
        chi2  = fmt_float(row["chi2"], 2)
        dfnum = fmt_int(row["df_num"])
        pval  = fmt_float(row["p_value"], 3)
        pstar = star_for_p(row["p_value"])
        nobs  = fmt_int(row["nobs"])
        lag   = fmt_int(row["hac_maxlags"])

        lines.append(f"{proxy} & {chi2} & {dfnum} & {pval}{pstar} & {nobs} & {lag} \\\\")
    lines += [r"\midrule"]

    # Footnote with groups tested (unique across rows)
    # If you prefer to show per-proxy groups, comment these two lines and print row-wise after each line.
    groups_sets = sorted(set(df2["groups_tested"]))
    foot_groups = "; ".join(groups_sets).replace("_", r"\_")

    lines += [
        r"\multicolumn{6}{c}{$H_0 : \beta_{sent,g_1} = \beta_{sent,g_2}=\ ...\ =\beta_{sent,g}$}\\",
        r"\bottomrule",
        r"\end{tabular}",
        rf"\caption{{{caption}}}",
        rf"\label{{{label}}}",
        r"\end{table}"
    ]
    return "\n".join(lines)

def build_wald_table_from_df_or_csv(df_wald: pd.DataFrame | None = None,
                                    in_csv: Path = IN_CSV,
                                    out_tex: Path = OUT_TEX) -> Path:
    """
    If df_wald is provided, use it; otherwise load from CSV.
    Writes LaTeX table to out_tex and returns the path.
    """
    if df_wald is None:
        if not in_csv.exists():
            raise FileNotFoundError(f"Input CSV not found: {in_csv}")
        df_wald = pd.read_csv(in_csv)

    tex = render_wald_table(df_wald)
    with open(out_tex, "w") as f:
        f.write(tex)
    print(f"[INFO] Wrote LaTeX Wald table → {out_tex}")
    return out_tex

# ---------------- Example usage ----------------
# Case A) You just ran the tests in this session:
# from your_previous_cell import run_h2_wald_tests
# wald_results = run_h2_wald_tests(df_crypto)
build_wald_table_from_df_or_csv(df_wald=wald_results)

# Case B) Load from CSV already on disk:
# build_wald_table_from_df_or_csv()

[INFO] Wrote LaTeX Wald table → Regressions/Tables/h2_wald_tests.tex


PosixPath('Regressions/Tables/h2_wald_tests.tex')

### ROBUSTNESS CHECKS

In [3]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.stattools import durbin_watson

# ---------- 1) PREPROCESSING (two-part, per-symbol, NA-safe) ----------
base_cols = ['AdrActCnt', 'volume_trusted_spot_usd_1d', 'TxCnt']

for col in base_cols:
    prev = df_crypto.groupby('symbol')[col].shift(1)   # per-symbol lag
    curr = df_crypto[col]
    df_crypto[f'jump_{col}'] = ((prev == 0) & (curr > 0)).astype(int)
    # set to 0 outside >0→>0 regime so we don't drop rows
    df_crypto[f'log_diff_{col}'] = np.where((prev > 0) & (curr > 0),
                                            np.log(curr/prev), 0.0)

# ---------- 2) ELIGIBILITY (robust) ----------
def min_required_n(regressors, floor=365, margin=30):
    p = 1 + len(regressors)  # + intercept
    return max(floor, p + margin)

def _symbol_eligible(sub, target, regressors):
    if any(r not in sub.columns for r in regressors):
        return False, "missing_columns"
    cc = sub.dropna(subset=[target])  # X have no NA after step 1
    if len(cc) < min_required_n(regressors):
        return False, f"too_few_obs:{len(cc)}"
    # require variation for continuous vars (allow jump_* to be constant)
    for r in regressors:
        if not r.startswith('jump_') and cc[r].nunique() < 2:
            return False, f"no_variation:{r}"
    return True, "ok"

# ---------- 3) REGRESSION ----------
def run_symbol_regression(df, symbol, target, regressors):
    sub = df[df['symbol'] == symbol].sort_values('date').copy()
    if target not in sub.columns:
        sub[target] = sub['log_daily_return'].shift(-1)

    ok, reason = _symbol_eligible(sub, target, regressors)
    if not ok:
        return None

    sub = sub.dropna(subset=[target])  # only target must be present

    # standardize only continuous predictors (leave jump_* as is)
    cont = [r for r in regressors if not r.startswith('jump_')]
    if cont:
        sub[cont] = StandardScaler().fit_transform(sub[cont])

    rhs = ' + '.join(regressors)
    fit = smf.ols(f"{target} ~ {rhs}", data=sub).fit()
    if fit.df_resid <= 8:   # guard for reliable HAC/AIC
        return None

    n = len(sub)
    max_hac_lag = int(n ** 0.25)

    best_aic, best_lag, best_res = np.inf, 0, None
    for lag in range(max_hac_lag + 1):
        r = fit.get_robustcov_results(cov_type='HAC', maxlags=lag, use_correction=True)
        if r.aic < best_aic:
            best_aic, best_lag, best_res = r.aic, lag, r

    dw_val = float(durbin_watson(fit.resid))

    out = {
        'symbol': symbol, 'n': n, 'nobs': n,
        'n_regs': len(regressors), 'regs_used': ','.join(regressors),
        'opt_hac_lag': best_lag, 'r2': best_res.rsquared,
        'adj_r2': best_res.rsquared_adj, 'aic': best_res.aic, 'bic': best_res.bic,
        'log_likelihood': best_res.llf, 'fstat': best_res.fvalue, 'f_pval': best_res.f_pvalue,
        'durbin_watson': dw_val
    }
    for name, b, se, t, p in zip(best_res.model.exog_names,
                                 best_res.params, best_res.bse, best_res.tvalues, best_res.pvalues):
        out[f'coef_{name}']   = b
        out[f'stderr_{name}'] = se
        out[f'tval_{name}']   = t
        out[f'pval_{name}']   = p
    return out

def run_all_symbols(df, target, regressors):
    df = df.sort_values(['symbol','date']).copy()
    # crea il target t+1 se non presente
    if target not in df.columns:
        df[target] = df.groupby('symbol')['log_daily_return'].shift(-1)

    results = []
    kept, dropped = [], []

    for sym in df['symbol'].unique():
        sub = df[df['symbol'] == sym]
        if _symbol_eligible(sub, target, regressors):
            r = run_symbol_regression(df, sym, target, regressors)
            if r is not None:
                results.append(r)
                kept.append(sym)
            else:
                dropped.append(sym)
        else:
            dropped.append(sym)

    out = pd.DataFrame(results)
    # Log sintetico (opzionale)
    print(f"Kept {len(set(kept))} symbols; Dropped {len(set(dropped))}: {sorted(set(dropped))[:10]}...")
    return out

# ---------------------- 3) SET UP YOUR LOOP ----------------------
target = 'log_daily_next'

base_regressors = [
    'log_daily_return',
    # 'jump_AdrActCnt',
    'log_diff_AdrActCnt',
    # 'jump_volume_trusted_spot_usd_1d',
    'log_diff_volume_trusted_spot_usd_1d',
    # 'jump_TxCnt',
    'log_diff_TxCnt',
    'log_UnemRt', 'log_IndPro', 'log_CPIPrc', 'log_TotRes',
]

sentiment_proxies = {
    '$\Delta$ EPU_DUS':      'diff_EPU_DUS',
    '$\Delta$ VIX':          'diff_VIX',
    '$\Delta$ InvSIX':       'diff_InvSIX',
    '$\Delta$ TwitSIX':      'diff_TwitSIX',
    '$\Delta$ ConSIX':       'diff_ConSIX',
    '$\Delta$ fng_value':    'diff_fng_value'
}

for name, proxy in sentiment_proxies.items():
    regs = base_regressors.copy()
    if proxy is not None:
        regs.insert(1, proxy)   # sentiment subito dopo il ritorno laggato
    summary_df = run_all_symbols(df_crypto, target, regs)
    fn = f"Regressions/checkrobustness_{name}.csv"
    summary_df.to_csv(fn, index=False)
    print(f"Saved {summary_df['symbol'].nunique()} symbols → {fn}")


  '$\Delta$ EPU_DUS':      'diff_EPU_DUS',
  '$\Delta$ VIX':          'diff_VIX',
  '$\Delta$ InvSIX':       'diff_InvSIX',
  '$\Delta$ TwitSIX':      'diff_TwitSIX',
  '$\Delta$ ConSIX':       'diff_ConSIX',
  '$\Delta$ fng_value':    'diff_fng_value'
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Kept 85 symbols; Dropped 40: ['ALCX', 'APE', 'API3', 'ATOM', 'AUDIO', 'AVAX', 'BADGER', 'BIT', 'BNT', 'CEL']...
Saved 85 symbols → Regressions/checkrobustness_$\Delta$ EPU_DUS.csv
Kept 85 symbols; Dropped 40: ['ALCX', 'APE', 'API3', 'ATOM', 'AUDIO', 'AVAX', 'BADGER', 'BIT', 'BNT', 'CEL']...
Saved 85 symbols → Regressions/checkrobustness_$\Delta$ VIX.csv
Kept 85 symbols; Dropped 40: ['ALCX', 'APE', 'API3', 'ATOM', 'AUDIO', 'AVAX', 'BADGER', 'BIT', 'BNT', 'CEL']...
Saved 85 symbols → Regressions/checkrobustness_$\Delta$ InvSIX.csv
Kept 85 symbols; Dropped 40: ['ALCX', 'APE', 'API3', 'ATOM', 'AUDIO', 'AVAX', 'BADGER', 'BIT', 'BNT', 'CEL']...
Saved 85 symbols → Regressions/checkrobustness_$\Delta$ TwitSIX.csv
Kept 85 symbols; Dropped 40: ['ALCX', 'APE', 'API3', 'ATOM', 'AUDIO', 'AVAX', 'BADGER', 'BIT', 'BNT', 'CEL']...
Saved 85 symbols → Regressions/checkrobustness_$\Delta$ ConSIX.csv
Kept 85 symbols; Dropped 40: ['ALCX', 'APE', 'API3', 'ATOM', 'AUDIO', 'AVAX', 'BADGER', 'BIT', 'BNT', 'CEL

In [4]:
import os
import pandas as pd
import numpy as np
from pathlib import Path

# ===== Paths =====
REGRESSIONS_DIR = "Regressions"
OUT_DIR = "Regressions/Tables"
Path(OUT_DIR).mkdir(exist_ok=True)

# ===== Classification =====
def classify_symbol(symbol):
    if symbol == "MTL_METAL":
        symbol = "MTL"
    BTC = {"BTC"}
    ALT_HIGH = {"AAVE","ADA","ALGO","ATOM","AVAX","BCH","BNB","BSV","CRO","CRV","DOT","ENS","EOS","ETC",
                "ETH","FIL","FLOW","GALA","GRT","ICP","LDO","LINK","LTC","MANA","MKR","QNT","SAND","SOL",
                "TRX","UNI","VET","XLM","XMR","XRP","XTZ","ZEC"}
    ALT_MID  = {"1INCH","ANT","AUDIO","BAT","COMP","CVC","CVX","DASH","DCR","DGB","ELF","ENJ","FTT","FXS",
                "GAS","GLM","GNO","ICX","LPT","LRC","LUNA","NEO","QTUM","RSR","SKL","SNT","SNX",
                "SUSHI","UMA","WAVES","XVG","YFI","ZIL","ZRX"}
    ALT_LOW  = {"ALCX","ALPHA","API3","BADGER","BAL","BIT","BNT","BTG","CEL","DRGN","FUN","GNT","GRIN","HEDG","HT",
                "KNC","LEND","LOOM","LSK","MAID","MTL","NMR","OGN","OMG","PAY","PERP","POLY","POWR","PPT",
                "QASH","REN","REP","ROOK","SRM","STORJ","SWRV","VTC","WNXM","WTC","XEM"}
    STABLE   = {"USDT","USDC","BUSD","DAI","GUSD","HUSD","PAX","TUSD"}
    GOLDPEG  = {"PAXG","XAUT"}
    MEME     = {"DOGE","SHIB","APE"}
    if symbol in BTC:     return "BTC"
    if symbol in ALT_HIGH:return "ALT_HIGH"
    if symbol in ALT_MID: return "ALT_MID"
    if symbol in ALT_LOW: return "ALT_LOW"
    if symbol in STABLE:  return "STABLE"
    if symbol in GOLDPEG: return "GOLDPEG"
    if symbol in MEME:    return "MEME"
    return "UNCLASSIFIED"

# ===== Helpers =====
def first_existing(df, names):
    for n in names:
        if n in df.columns:
            return n
    return None

def starify(p):
    if pd.isna(p): return ""
    return "**" if p < 0.01 else ("*" if p < 0.05 else "")

def fmt_num(x, d=3, int_ok=False):
    if pd.isna(x): return ""
    if int_ok:
        try: return f"{int(x)}"
        except: pass
    try: return f"{float(x):.{d}f}"
    except: return ""

# ===== pick 10 symbols per your rule =====
def pick_symbols(df, proxy):
    pcol = first_existing(df, [f"pval_{proxy}", f"pval_log_d_{proxy}", f"pval_log_diff_{proxy}"])
    if pcol is None:
        # fallback: use any pval_* that exists (last resort)
        pvals = [c for c in df.columns if c.startswith("pval_") and c != "pval_Intercept"]
        pcol = pvals[0] if pvals else None

    d = df.copy()
    d["group"] = d["symbol"].map(classify_symbol)

    def pick_median(group):
        sub = d[(d["group"] == group) & d[pcol].notna()].sort_values(pcol)
        if sub.empty: return None
        return sub.iloc[len(sub)//2]["symbol"]

    def pick_min(group):
        sub = d[(d["group"] == group) & d[pcol].notna()].sort_values(pcol)
        if sub.empty: return None
        return sub.iloc[0]["symbol"]

    chosen = []
    if "BTC" in set(d["symbol"]): chosen.append("BTC")

    for grp in ["ALT_HIGH","ALT_MID","ALT_LOW"]:
        s1 = pick_min(grp)
        s2 = pick_median(grp)
        picks = []
        if s1: picks.append(s1)
        if s2 and s2 not in picks: picks.append(s2)
        if len(picks) < 2:
            # deterministic pad
            pool = [x for x in sorted(d.loc[d["group"]==grp,"symbol"]) if x not in picks]
            while len(picks) < 2 and pool:
                picks.append(pool.pop(0))
        chosen += picks[:2]

    for grp in ["STABLE","GOLDPEG","MEME"]:
        s = pick_median(grp)
        if s: chosen.append(s)

    return chosen[:10]

# ===== Build one LaTeX table from a CSV =====
def build_table_from_csv(csv_path):
    df = pd.read_csv(csv_path)
    if "symbol" not in df.columns:
        raise ValueError(f"'symbol' column missing in {csv_path}")

    # detect proxy (the sentiment var): first coef_* that is not intercept/logs
    proxy_candidates = [c.replace("coef_","") for c in df.columns
                        if c.startswith("coef_")
                        and c not in ("coef_Intercept","coef_log_daily_return")]
    if not proxy_candidates:
        raise ValueError(f"No proxy coef_ column found in {csv_path}")
    proxy = proxy_candidates[0]

    # columns (LaTeX rows) you want, with flexible suffixes
    row_specs = [
        ("Intercept",           ["Intercept"]),
        (proxy,                 [proxy, f"log_d_{proxy}", f"log_diff_{proxy}"]),
        (r"$R_{i,t}$",          ["log_daily_return","log_returns_lag1","lag_return","returns_lag1"]),
        (r"$\Delta\log\ $TxCnt",           ["log_TxCnt","log_diff_TxCnt","log_d_TxCnt"]),
        (r"$\Delta\log\ $Volume",          ["log_volume_trusted_spot_usd_1d","log_diff_volume_trusted_spot_usd_1d","log_d_volume_trusted_spot_usd_1d"]),
        (r"$\Delta\log\ $AdrActCnt",     ["log_AdrActCnt","log_diff_AdrActCnt"]),
        (r"$\log\ $UnemRt",          ["log_UnemRt"]),
        (r"$\log\ $IndPro",          ["log_IndPro"]),
        (r"$\log\ $CPIPrc",          ["log_CPIPrc"]),
        (r"$\log\ $TotRes",          ["log_TotRes"]),
    ]

    # metric columns (flexible names)
    col_N   = first_existing(df, ["n","nobs","N"])
    col_DW  = first_existing(df, ["dw","durbin_watson","DurbinWatson"])
    col_F   = first_existing(df, ["fstat","F","f_stat"])
    col_R2  = first_existing(df, ["r_squared","r2"])
    col_AR2 = first_existing(df, ["adj_r_squared","adj_r2","adjR"])
    
    symbols = pick_symbols(df, proxy)
    idx = {s: df[df["symbol"]==s].iloc[0] for s in symbols if not df[df["symbol"]==s].empty}

    # resolve which exact CSV cols to use (coef/stderr/pval) for each row
    resolved = []
    for label, suffixes in row_specs:
        coef_col  = first_existing(df, [f"coef_{s}"   for s in suffixes])
        se_col    = first_existing(df, [f"stderr_{s}" for s in suffixes])  # <-- Newey–West SE expected here
        pval_col  = first_existing(df, [f"pval_{s}"   for s in suffixes])
        resolved.append((label, coef_col, se_col, pval_col))

    # ---- Assemble LaTeX lines ----
    lines = []
    lines += [
        r"\begin{table}[ht]",
        r"\centering",
        r"\scriptsize",
        r"\setlength{\tabcolsep}{4pt}",
        rf"\begin{{tabular}}{{l *{{{len(symbols)}}}{{c}}}}",
        r"\toprule",
        r"&\multicolumn{1}{c}{\textbf{Bitcoin}}&\multicolumn{2}{c}{\textbf{High-Cap}}&\multicolumn{2}{c}{\textbf{Mid-Cap}}&\multicolumn{2}{c}{\textbf{Low-Cap}}&\multicolumn{1}{c}{\textbf{Gold}}&\multicolumn{1}{c}{\textbf{Stable}}&\multicolumn{1}{c}{\textbf{Meme}}\\",
        r"\addlinespace",
        " & " + " & ".join(symbols) + r" \\",
        r"\midrule"
    ]

    def add_var(label, coef_col, se_col, pval_col):
        coefs = []
        ses   = []
        for s in symbols:
            row = idx.get(s)
            if row is None:
                coefs.append("")
                ses.append("()")
                continue
            c = row.get(coef_col, np.nan) if coef_col else np.nan
            p = row.get(pval_col, np.nan) if pval_col else np.nan
            se = row.get(se_col, np.nan)  if se_col   else np.nan
            coefs.append(f"{fmt_num(c,3)}{starify(p)}")
            ses.append(f"({fmt_num(se,3)})" if not pd.isna(se) else "()")
        lines.append(f"{label} & " + " & ".join(coefs) + r" \\")
        lines.append(" & " + " & ".join(ses) + r" \\")
        lines.append(r"\addlinespace")

    for label, cc, sc, pc in resolved:
        add_var(label, cc, sc, pc)

    # ---- Bottom metrics ----
    def metric_row(name, colname, int_flag=False, decimals=2):
        vals = []
        for s in symbols:
            row = idx.get(s)
            if row is None or colname is None:
                vals.append("")
            else:
                vals.append(fmt_num(row.get(colname, np.nan), d=decimals, int_ok=int_flag))
        lines.append(f"{name} & " + " & ".join(vals) + r" \\")

    lines.append(r"\midrule")
    metric_row("N", col_N, int_flag=True, decimals=0)
    metric_row("Durbin-Watson", col_DW, int_flag=False, decimals=2)
    metric_row("F-stat", col_F, int_flag=False, decimals=2)
    metric_row(r"\(R^2\)", col_R2, int_flag=False, decimals=3)
    metric_row(r"Adj.\ \(R^2\)", col_AR2, int_flag=False, decimals=3)
    length = len(symbols) + 1

    lines += [
        r"\addlinespace",
        r"\midrule",
        rf"\multicolumn{{{length}}}{{c}}{{Specification tested: $R_{{i,t+1}} = \alpha_i + \beta_{{sent}} S_t + \phi R_{{i,t}} + \theta B_{{i,t}} + \gamma M_t + \varepsilon_{{i,t+1}}$}}\\",
        r"\bottomrule",
        r"\end{tabular}",
        rf"\caption{{\textbf{{Estimation Results from Predictive Regressions - {proxy}}} \\",
        r"This table presents coefficient estimates and Newey--West standard errors (in parentheses) from predictive regressions of next-day returns on the selected proxy for investor sentiment. The regression specification is reported at the top of the table. Each column corresponds to a selected cryptocurrency, chosen to represent the full range of categories in the sample: Bitcoin (BTC), two high-cap altcoins, two mid-cap altcoins, two low-cap altcoins, one stablecoin, one gold-pegged token, and one meme coin. Each regression is estimated separately using the available time series data for the respective cryptocurrency and the sentiment proxy. The dependent variable is the next-day log return. Standard errors are computed using the Newey--West estimator with automatic lag selection. Statistical significance is denoted as follows: *$p<0.05$, **$p<0.01$.}",
        rf"\label{{tab:{proxy.lower()}_result_h1}}",
        r"\end{table}"
    ]
    return proxy, lines

# ===== Run over all CSVs =====
for csv in sorted(Path(REGRESSIONS_DIR).glob("*.csv")):
    try:
        proxy, lines = build_table_from_csv(csv)
    except Exception as e:
        print(f"[skip] {csv.name}: {e}")
        continue
    out = Path(OUT_DIR) / f"rob_tab_{proxy}.tex"
    with open(out, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    print(f"✅ {csv.name} -> {out}")

✅ checkrobustness_$\Delta$ ConSIX.csv -> Regressions/Tables/rob_tab_diff_ConSIX.tex
✅ checkrobustness_$\Delta$ EPU_DUS.csv -> Regressions/Tables/rob_tab_diff_EPU_DUS.tex
✅ checkrobustness_$\Delta$ InvSIX.csv -> Regressions/Tables/rob_tab_diff_InvSIX.tex
✅ checkrobustness_$\Delta$ TwitSIX.csv -> Regressions/Tables/rob_tab_diff_TwitSIX.tex
✅ checkrobustness_$\Delta$ VIX.csv -> Regressions/Tables/rob_tab_diff_VIX.tex
✅ checkrobustness_$\Delta$ fng_value.csv -> Regressions/Tables/rob_tab_diff_fng_value.tex
✅ crypto_regression_summary_ConSIX.csv -> Regressions/Tables/rob_tab_ConSIX.tex
✅ crypto_regression_summary_EPU_DUS.csv -> Regressions/Tables/rob_tab_EPU_DUS.tex
✅ crypto_regression_summary_InvSIX.csv -> Regressions/Tables/rob_tab_InvSIX.tex
✅ crypto_regression_summary_TwitSIX.csv -> Regressions/Tables/rob_tab_TwitSIX.tex
✅ crypto_regression_summary_VIX.csv -> Regressions/Tables/rob_tab_VIX.tex
✅ crypto_regression_summary_fng_value.csv -> Regressions/Tables/rob_tab_fng_value.tex
✅ crypto

In [14]:
import os, glob
import numpy as np
import pandas as pd
from pathlib import Path
from statistics import NormalDist

REG_DIR = "Regressions"
TAB_DIR = Path("Regressions/Tables")
TAB_DIR.mkdir(parents=True, exist_ok=True)

# --- sentiment proxies to include (adjust to your set) ---
SENTIMENT_PROXIES = ["ConSIX","VIX","TwitSIX","InvSIX","EPU_DUS","fng_value"]

def format_proxy_label_level(p):   # Level row label
    return p

def format_proxy_label_delta(p):   # Δ only in math mode
    return rf"$\Delta$ {p}"

def _find_cols(df: pd.DataFrame, proxy_base: str, scope: str):
    """
    Return (coef_col, pval_col, se_col_or_None, t_col_or_None)
    """
    if scope == "Level":
        c = f"coef_{proxy_base}"
        p = f"pval_{proxy_base}" if f"pval_{proxy_base}" in df.columns else f"p_{proxy_base}"
        s = next((col for col in [f"se_{proxy_base}", f"stderr_{proxy_base}", f"std_{proxy_base}"] if col in df.columns), None)
        t = next((col for col in [f"t_{proxy_base}", f"tval_{proxy_base}", f"tstat_{proxy_base}"] if col in df.columns), None)
    else:
        c = f"coef_diff_{proxy_base}"
        p = f"pval_diff_{proxy_base}" if f"pval_diff_{proxy_base}" in df.columns else f"p_diff_{proxy_base}"
        s = next((col for col in [f"se_diff_{proxy_base}", f"stderr_diff_{proxy_base}", f"std_diff_{proxy_base}"] if col in df.columns), None)
        t = next((col for col in [f"t_diff_{proxy_base}", f"tval_diff_{proxy_base}", f"tstat_diff_{proxy_base}"] if col in df.columns), None)

    if c not in df.columns or p not in df.columns:
        raise KeyError(f"Expected coef/pval for {scope} {proxy_base}: {c}, {p}")
    return c, p, s, t

def _compute_se_series(beta: pd.Series,
                       se: pd.Series | None,
                       t: pd.Series | None,
                       p: pd.Series | None) -> pd.Series:
    """
    Fill SE using (in order of preference):
      1) provided SE
      2) SE = |beta| / |t|
      3) SE = |beta| / z, where z = Phi^{-1}(1 - p/2)   (two-sided normal)
    """
    out = pd.Series(np.nan, index=beta.index, dtype="float64")

    # 1) direct SE
    if se is not None:
        se = pd.to_numeric(se, errors="coerce")
        if se.notna().any():
            return se

    # 2) from t-stat
    if t is not None:
        tt = pd.to_numeric(t, errors="coerce").abs().replace(0, np.nan)
        cand = pd.to_numeric(beta, errors="coerce").abs() / tt
        if cand.notna().any():
            out = cand

    # 3) from two-sided p-value (normal approx)
    if p is not None:
        pp = pd.to_numeric(p, errors="coerce")
        q = (1 - pp/2.0).clip(lower=1e-16, upper=1-1e-16)
        z = q.apply(NormalDist().inv_cdf).abs().replace(0, np.nan)
        cand = pd.to_numeric(beta, errors="coerce").abs() / z
        out = out.where(out.notna(), cand)

    return out

def _read_level_file(proxy_base: str) -> pd.DataFrame:
    # crypto_regression_summary_<proxy>.csv  (allow minor name variations)
    patt = os.path.join(REG_DIR, f"crypto_regression_summary*{proxy_base}*.csv")
    files = glob.glob(patt)
    if not files:
        return pd.DataFrame()
    # take the first best match
    df = pd.read_csv(files[0])
    if "symbol" not in df.columns:
        return pd.DataFrame()
    try:
        coef_col, pval_col, se_col, t_col = _find_cols(df, proxy_base, "Level")
    except KeyError:
        return pd.DataFrame()

    beta = df[coef_col]
    pval = df[pval_col]
    se_series = _compute_se_series(beta,
                                df[se_col] if se_col else None,
                                df[t_col] if t_col else None,
                                pval)

    out = pd.DataFrame({
        "proxy_base": proxy_base,
        "scope": "Level",
        "symbol": df["symbol"],
        "beta": beta,
        "pval": pval,
        "se": se_series,  # <-- now filled
        "r2": df["r2"] if "r2" in df.columns else np.nan,
        "adj_r2": df["adj_r2"] if "adj_r2" in df.columns else np.nan,
    })
    return out

def _read_delta_file(proxy_base: str) -> pd.DataFrame:
    # checkrobustness_*<proxy>*.csv (robust to "Δ", spaces, hyphens)
    patt = os.path.join(REG_DIR, f"checkrobustness_$\Delta$ {proxy_base}*.csv")
    files = glob.glob(patt)
    if not files:
        return pd.DataFrame()
    df = pd.read_csv(files[0])
    if "symbol" not in df.columns:
        return pd.DataFrame()
    try:
        coef_col, pval_col, se_col, t_col = _find_cols(df, proxy_base, "Delta")
    except KeyError:
        # fallback auto-detect like you already had...
        cand = [c for c in df.columns if c.startswith("coef_diff_")]
        if len(cand) == 1:
            proxy_base_auto = cand[0].replace("coef_diff_", "")
            if proxy_base_auto != proxy_base:
                proxy_base = proxy_base_auto
            coef_col = cand[0]
            pval_col = f"pval_diff_{proxy_base}" if f"pval_diff_{proxy_base}" in df.columns else (f"p_diff_{proxy_base}" if f"p_diff_{proxy_base}" in df.columns else None)
            se_col   = f"se_diff_{proxy_base}" if f"se_diff_{proxy_base}" in df.columns else None
            t_col    = f"t_diff_{proxy_base}" if f"t_diff_{proxy_base}" in df.columns else None
            if pval_col is None:
                return pd.DataFrame()
        else:
            return pd.DataFrame()

    beta = df[coef_col]
    pval = df[pval_col] if pval_col in df.columns else pd.Series(np.nan, index=df.index)
    se_series = _compute_se_series(beta,
                                df[se_col] if (se_col and se_col in df.columns) else None,
                                df[t_col] if (t_col and t_col in df.columns) else None,
                                pval)

    out = pd.DataFrame({
        "proxy_base": proxy_base,
        "scope": "Delta",
        "symbol": df["symbol"],
        "beta": beta,
        "pval": pval,
        "se": se_series,  # <-- now filled
        "r2": df["r2"] if "r2" in df.columns else np.nan,
        "adj_r2": df["adj_r2"] if "adj_r2" in df.columns else np.nan,
    })
    return out

def collect_long_from_separate_files():
    rows = []
    for prox in SENTIMENT_PROXIES:
        lvl = _read_level_file(prox)
        if not lvl.empty:
            rows.append(lvl)
        dlt = _read_delta_file(prox)
        if not dlt.empty:
            rows.append(dlt)
    if not rows:
        return pd.DataFrame(columns=["proxy_base","scope","symbol","beta","pval","se","r2","adj_r2"])
    return pd.concat(rows, ignore_index=True)

def build_h1_summary(df_long: pd.DataFrame):
    df = df_long.dropna(subset=["beta"]).copy()

    # aggregates across assets
    agg = (df.groupby(["proxy_base","scope"], as_index=False)
             .agg(
                 N_assets=("beta","count"),
                 mean_beta=("beta","mean"),
                 median_beta=("beta","median"),
                 median_se=("se", lambda s: np.nanmedian(s.values) if len(s) else np.nan),
                 median_r2=("r2", lambda s: np.nanmedian(s.values) if len(s) else np.nan),
                 median_adj_r2=("adj_r2", lambda s: np.nanmedian(s.values) if len(s) else np.nan),
             ))

    # pretty labels: Level vs Δ
    agg["Proxy"] = agg.apply(
        lambda r: (format_proxy_label_level(r["proxy_base"]) if r["scope"]=="Level"
                   else format_proxy_label_delta(r["proxy_base"])),
        axis=1
    )
    # order by proxy, Level first then Δ
    agg["__ord__"] = agg["proxy_base"] + agg["scope"].map({"Level":"_0","Delta":"_1"})
    agg = agg.sort_values("__ord__").drop(columns="__ord__")

    # format numbers
    def fmt(x): return "" if pd.isna(x) else f"{x:.3f}"
    out = agg[["Proxy","N_assets","mean_beta","median_beta","median_se","median_r2","median_adj_r2"]].copy()
    for c in ["mean_beta","median_beta","median_se","median_r2","median_adj_r2"]:
        out[c] = out[c].map(fmt)
    return out

# ---- run
long_df = collect_long_from_separate_files()
h1_summary = build_h1_summary(long_df)

# rename columns to nice LaTeX labels
h1_summary = h1_summary.rename(columns={
    "N_assets": r"\# Coins",
    "mean_beta": r"Mean $\beta_{\mathrm{sent}}$",
    "median_beta": r"Median $\beta_{\mathrm{sent}}$",
    "median_se": r"Median SE",
    "median_r2": r"Median $R^2$",
    "median_adj_r2": r"Median Adj.\ $R^2$"
})

latex = h1_summary.to_latex(
    index=False, escape=False,
    column_format="lcccc|cc",
    caption=(r"\textbf{H1 robustness summary: Level versus $\Delta$ proxies.}\\"
             r"For each sentiment proxy the table reports the number of coins used, "
             r"the mean and median of $\hat{\beta}_{\mathrm{sent}}$ across coins, "
             r"the median standard error, and the median $R^2$ and adjusted $R^2$."),
    label="tab:h1_level_vs_delta_summary"
)

(TAB_DIR / "h1_level_vs_delta_summary.tex").write_text(latex)
print("[ok] wrote:", TAB_DIR / "h1_level_vs_delta_summary.tex")

[ok] wrote: Regressions/Tables/h1_level_vs_delta_summary.tex


  patt = os.path.join(REG_DIR, f"checkrobustness_$\Delta$ {proxy_base}*.csv")


patt = os.path.join(REG_DIR, f"checkrobustness_$\Delta$ {proxy_base}*.csv")

In [8]:
import os
import glob
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ================== GLOBAL STYLE ==================
sns.set_style("whitegrid")
sns.set_context("paper", font_scale=1.2)
plt.rcParams.update({
    "font.family":       "serif",
    "font.serif":        ["DejaVu Serif"],
    "axes.titlesize":    22,
    "axes.labelsize":    20,
    "axes.titlecolor":   (38/255, 38/255, 38/255),
    "xtick.labelsize":   13,
    "ytick.labelsize":   13,
    "legend.fontsize":   10,
    "axes.titleweight":  "normal",
    "axes.edgecolor":    "black",
    "axes.linewidth":    0.5,
    "grid.color":        "0.85",
    "grid.linestyle":    "-",
    "grid.linewidth":    0.5,
    "figure.dpi":        300,
})

# ====== Paths ======
REG_DIR = "Regressions"
OUT_DIR = "Regressions/Figures"
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

# ====== Proxy label formatter ======
# ====== Proxy label formatter ======
def format_proxy_label(proxy: str) -> str:
    if proxy.startswith("diff_"):
        # Δ inside math mode, rest as plain italic text
        return rf"$\Delta$ {proxy.replace('diff_', '')}"
    return proxy

# ====== Classification ======
def classify_symbol(symbol: str) -> str:
    if pd.isna(symbol):
        return "Unclassified"
    if symbol == "MTL_METAL":
        symbol = "MTL"

    BTC = {"BTC"}
    ALT_HIGH = {"AAVE","ADA","ALGO","ATOM","AVAX","BCH","BNB","BSV","CRO","CRV","DOT","ENS","EOS","ETC",
                "ETH","FIL","FLOW","GALA","GRT","ICP","LDO","LINK","LTC","MANA","MKR","QNT","SAND","SOL",
                "TRX","UNI","VET","XLM","XMR","XRP","XTZ","ZEC"}
    ALT_MID  = {"1INCH","ANT","AUDIO","BAT","COMP","CVC","CVX","DASH","DCR","DGB","ELF","ENJ","FTT","FXS",
                "GAS","GLM","GNO","GNT","ICX","LPT","LRC","LUNA","NEO","QTUM","RSR","SKL","SNT","SNX",
                "SUSHI","UMA","WAVES","XVG","YFI","ZIL","ZRX"}
    ALT_LOW  = {"ALCX","ALPHA","API3","BADGER","BAL","BIT","BNT","BTG","CEL","DRGN","FUN","GRIN","HEDG","HT",
                "KNC","LEND","LOOM","LSK","MAID","MTL","NMR","OGN","OMG","PAY","PERP","POLY","POWR","PPT",
                "QASH","REN","REP","ROOK","SRM","STORJ","SWRV","VTC","WNXM","WTC","XEM"}
    STABLE   = {"USDT","USDC","BUSD","DAI","GUSD","HUSD","PAX","TUSD"}
    GOLDPEG  = {"PAXG","XAUT"}
    MEME     = {"DOGE","SHIB","APE"}

    if symbol in BTC:     return "Bitcoin"
    if symbol in ALT_HIGH:return "Altcoins (High)"
    if symbol in ALT_MID: return "Altcoins (Mid)"
    if symbol in ALT_LOW: return "Altcoins (Low)"
    if symbol in STABLE:  return "Stablecoins"
    if symbol in GOLDPEG: return "Gold-Pegged"
    if symbol in MEME:    return "Meme Coins"
    return "Unclassified"

GROUP_ORDER = ["Bitcoin","Altcoins (High)","Altcoins (Mid)","Altcoins (Low)",
               "Stablecoins","Gold-Pegged","Meme Coins"]

GROUP_PALETTE = {
    "Bitcoin": "#4C72B0",
    "Altcoins (High)": "#599da2",
    "Altcoins (Mid)":  "#83a075",
    "Altcoins (Low)":  "#aca24a",
    "Stablecoins":     "#eb9681",
    "Gold-Pegged":     "#d2a022",
    "Meme Coins":      "#8C8C8C"
}

# ====== Proxy detection (only diff proxies) ======
def detect_proxy(df: pd.DataFrame):
    # only keep coefficients for diff_* proxies
    cands = [c for c in df.columns if c.startswith("coef_diff_")]
    if not cands:
        return None, None, None
    proxy = cands[0].replace("coef_", "")
    coef_col = f"coef_{proxy}"
    pval_col = f"pval_{proxy}" if f"pval_{proxy}" in df.columns else None
    return proxy, coef_col, pval_col

# ====== Violin plot generator ======
def plot_violin(df, proxy, coef_col, pval_col):
    # classify
    df["Group"] = df["symbol"].map(classify_symbol)
    df = df[df["Group"].isin(GROUP_ORDER)]

    plt.figure(figsize=(8, 5))
    ax = sns.violinplot(
        data=df, x="Group", y=coef_col,
        order=GROUP_ORDER, palette=GROUP_PALETTE,
        inner=None, cut=2
    )
    sns.pointplot(
        data=df, x="Group", y=coef_col,
        order=GROUP_ORDER, join=False, estimator=np.median,
        color="black", markers="_", scale=1.3, errwidth=0
    )

    if pval_col and pval_col in df.columns:
        sig = df[df[pval_col] < 0.05]
        sns.stripplot(
            data=sig, x="Group", y=coef_col,
            order=GROUP_ORDER, color="red", size=4.5, jitter=True, alpha=0.85
        )

    ax.axhline(0, color="gray", lw=1)
    ax.set_title(rf"Distribution of $\beta_{{sent}}$ by Group — {format_proxy_label(proxy)}")
    ax.set_ylabel(r"Estimated $\beta_{\mathrm{sent}}$")
    ax.set_xlabel("")
    ax.set_xticklabels(GROUP_ORDER, rotation=15, ha="center")

    plt.tight_layout()
    plt.savefig(Path(OUT_DIR) / f"robustness_violin_{proxy}.png", dpi=300)
    plt.close()
    print(f"[ok] Saved robustness_violin_{proxy}.png")


# ====== Run for all ======
for csv in sorted(glob.glob(os.path.join(REG_DIR, "*.csv"))):
    df = pd.read_csv(csv)
    if "symbol" not in df.columns:
        continue
    proxy, coef_col, pval_col = detect_proxy(df)
    if proxy:
        plot_violin(df, proxy, coef_col, pval_col)


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.violinplot(

The `scale` parameter is deprecated and will be removed in v0.15.0. You can now control the size of each plot element using matplotlib `Line2D` parameters (e.g., `linewidth`, `markersize`, etc.).

  sns.pointplot(

The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 0}` instead.

  sns.pointplot(
  ax.set_xticklabels(GROUP_ORDER, rotation=15, ha="center")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.violinplot(

The `scale` parameter is deprecated and will be removed in v0.

[ok] Saved robustness_violin_diff_ConSIX.png


  ax.set_xticklabels(GROUP_ORDER, rotation=15, ha="center")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.violinplot(

The `scale` parameter is deprecated and will be removed in v0.15.0. You can now control the size of each plot element using matplotlib `Line2D` parameters (e.g., `linewidth`, `markersize`, etc.).

  sns.pointplot(

The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 0}` instead.

  sns.pointplot(


[ok] Saved robustness_violin_diff_EPU_DUS.png
[ok] Saved robustness_violin_diff_InvSIX.png


  ax.set_xticklabels(GROUP_ORDER, rotation=15, ha="center")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.violinplot(

The `scale` parameter is deprecated and will be removed in v0.15.0. You can now control the size of each plot element using matplotlib `Line2D` parameters (e.g., `linewidth`, `markersize`, etc.).

  sns.pointplot(

The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 0}` instead.

  sns.pointplot(
  ax.set_xticklabels(GROUP_ORDER, rotation=15, ha="center")


[ok] Saved robustness_violin_diff_TwitSIX.png
[ok] Saved robustness_violin_diff_VIX.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.violinplot(

The `scale` parameter is deprecated and will be removed in v0.15.0. You can now control the size of each plot element using matplotlib `Line2D` parameters (e.g., `linewidth`, `markersize`, etc.).

  sns.pointplot(

The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 0}` instead.

  sns.pointplot(
  ax.set_xticklabels(GROUP_ORDER, rotation=15, ha="center")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.violinplot(

The `scale` parameter is deprecated and will be removed in v0.

[ok] Saved robustness_violin_diff_fng_value.png


  ax.set_xticklabels(GROUP_ORDER, rotation=15, ha="center")


In [17]:
# =========================================================
# H2 — Functional groups (Average → then Transform)
# Only two outputs:
#   1) Reg/h2_functional_avg_then_transform_results.csv  (full results: coef/SE/t/p + diagnostics)
#   2) Reg/h2_effective_membership_overall_table.tex     (functional groups table)
# Mirrors H1 column naming: coef_*, stderr_*, tval_*, pval_*
# =========================================================
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Optional, Tuple
import statsmodels.formula.api as smf
from statsmodels.tsa.ar_model import AutoReg
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.stattools import durbin_watson

# -------------------- TOGGLES --------------------
USE_JUMPS        = False  # include jump_* dummies as controls
STANDARDIZE_CONT = True   # z-score continuous regressors only (not jumps)
VERBOSE          = True
# -------------------------------------------------

# ---------------- CONFIG ----------------
DATE_COL   = "date"
SYMBOL_COL = "symbol"
RET_COL    = "log_daily_return"   # already in df_crypto
TARGET_COL = "R_g_t_plus1"        # target built below

# Raw blockchain columns to average at group level
BASE_BLOCKCHAIN = ["TxCnt", "AdrActCnt", "volume_trusted_spot_usd_1d"]

# Macro (log levels in df) and sentiments
MACRO_COLS     = ["log_UnemRt", "log_IndPro", "log_CPIPrc", "log_TotRes"]
SENTIMENT_COLS = ["diff_VIX", "diff_TwitSIX", "diff_EPU_DUS", "diff_InvSIX", "diff_fng_value", "diff_ConSIX"]

# Regression controls
MIN_OBS_FLOOR = 365    # minimum sample length
MARGIN_OBS    = 30     # buffer above number of parameters

# Outputs (ONLY these two)
EXPORT_DIR     = Path("Regressions")
RESULTS_CSV    = EXPORT_DIR / "rob_h2_functional_avg_then_transform_results.csv"
FUNC_TABLE_TEX = EXPORT_DIR / "trash.tex"
# ---------------------------------------------------------

# ---------- Functional groups (as per your taxonomy) ----------
UTILITY = {"1INCH","AAVE","ANT","APE","BADGER","BAL","BAT","BIT","CEL","COMP","CRV","DCR","ENS","FXS",
           "HEDG","LDO","MKR","QNT","ROOK","SUSHI","SWRV","UNI","YFI"}
ASSET   = {"ALCX","ALPHA","CVX","PAXG","XAUT"}
PAYMENT = {"BCH","BNB","BSV","BTC","BTG","CRO","DASH","DGB","DOGE","ETC","ETH","FTT","GRIN","HT","LTC",
           "MTL_METAL","PAY","SHIB","VTC","XLM","XMR","XRP","XVG","ZEC"}
HYBRID_UA = {"CVC"}
HYBRID_UP = {"ADA","ALGO","API3","ATOM","AUDIO","AVAX","BNT","DOT","DRGN","ELF","ENJ","EOS","FIL","FLOW",
             "FUN","GALA","GAS","GLM","GNO","GNT","GRT","ICP","ICX","KNC","LEND","LINK","LOOM","LPT","LRC",
             "LSK","LUNA","MAID","MANA","NEO","NMR","OGN","OMG","PERP","POLY","POWR","PPT","QASH","QTUM",
             "REN","REP","RSR","SAND","SKL","SNT","SNX","SOL","SRM","STORJ","TRX","UMA","VET","WAVES",
             "WNXM","WTC","XEM","XTZ","ZIL","ZRX"}
HYBRID_AP = {"BUSD","DAI","GUSD","HUSD","PAX","TUSD","USDC","USDT"}

GROUP_ORDER = ["Utility", "Asset", "Payment", "Hybrid (U-A)", "Hybrid (U-P)", "Hybrid (A-P)"]

def map_functional_group(sym: str) -> str:
    s = str(sym).upper()
    if s in UTILITY:   return "Utility"
    if s in ASSET:     return "Asset"
    if s in PAYMENT:   return "Payment"
    if s in HYBRID_UA: return "Hybrid (U-A)"
    if s in HYBRID_UP: return "Hybrid (U-P)"
    if s in HYBRID_AP: return "Hybrid (A-P)"
    return "Unclassified"

# ---------------- helpers ----------------
def min_required_n(regressors: List[str], floor: int = MIN_OBS_FLOOR, margin: int = MARGIN_OBS) -> int:
    p = 1 + len(regressors)  # + intercept
    return max(floor, p + margin)

def group_eligible(sub: pd.DataFrame, target: str, regressors: List[str]) -> Tuple[bool, str]:
    if any(r not in sub.columns for r in regressors):
        return False, "missing_columns"
    cc = sub.dropna(subset=[target])
    if len(cc) < min_required_n(regressors):
        return False, f"too_few_obs:{len(cc)}"
    # Require variation for continuous vars (allow jump_* to be constant)
    for r in regressors:
        if not r.startswith("jump_") and r in cc.columns and cc[r].nunique() < 2:
            return False, f"no_variation:{r}"
    return True, "ok"

def select_hac_lag_via_resid_aic(resid: pd.Series, kmax: int) -> int:
    n = resid.shape[0]
    if n < 20 or kmax <= 0:
        return 0
    best_k, best_aic = 0, np.inf
    for k in range(kmax + 1):
        try:
            if k == 0:
                e = resid - resid.mean()
                sigma2 = np.var(e, ddof=1)
                aic = n * np.log(sigma2 + 1e-12) + 2
            else:
                ar = AutoReg(resid, lags=k, old_names=False, trend="c").fit()
                aic = ar.aic
        except Exception:
            aic = np.inf
        if aic < best_aic:
            best_aic, best_k = aic, k
    return int(best_k)

def _tex_escape_ticker(s: str) -> str:
    return str(s).replace("_", r"\_")

# ---------- Build group panel (avg raw → transform) + capture daily contributors ----------
def make_group_panel_avg_then_transform_with_members(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Returns:
      grp : group panel with R_g_t, R_g_t_lag, R_g_t_plus1 and group-level log_diffs/jumps
      mem_day : long DataFrame with [date, h2_group, symbol] listing daily contributors
    """
    df = df.copy()
    df[DATE_COL]   = pd.to_datetime(df[DATE_COL], errors="coerce")
    df[SYMBOL_COL] = df[SYMBOL_COL].astype(str).str.upper()
    df["h2_group"] = df[SYMBOL_COL].map(map_functional_group)
    df = df[df["h2_group"].isin(GROUP_ORDER)].copy()

    # Eligible rows for raw group means: need raw blockchain cols present that day
    elig = df.dropna(subset=BASE_BLOCKCHAIN).copy()

    # (A) daily contributor list per (date, group)
    mem_day = (elig.loc[:, [DATE_COL, "h2_group", SYMBOL_COL]]
                    .drop_duplicates()
                    .rename(columns={SYMBOL_COL: "symbol"})
                    .sort_values(["h2_group", DATE_COL, "symbol"])
                    .reset_index(drop=True))

    # (B) group means of RAW blockchain + returns
    agg_cols = [RET_COL] + BASE_BLOCKCHAIN
    grp_raw = (elig.groupby([DATE_COL, "h2_group"], dropna=False)[agg_cols]
                    .mean(numeric_only=True)
                    .reset_index()
                    .sort_values(["h2_group", DATE_COL]))

    # (C) zero-safe log-diffs (+ optional jump dummies) on group averages
    for col in BASE_BLOCKCHAIN:
        prev = grp_raw.groupby("h2_group")[col].shift(1)
        curr = grp_raw[col]
        if USE_JUMPS:
            grp_raw[f"jump_{col}"] = ((prev == 0) & (curr > 0)).astype(int)
        grp_raw[f"log_diff_{col}"] = np.where((prev > 0) & (curr > 0), np.log(curr / prev), 0.0)

    # (D) attach date-level macro + sentiment (unique per date)
    date_cols = [DATE_COL] + [c for c in (MACRO_COLS + SENTIMENT_COLS) if c in df.columns]
    date_level = df[date_cols].drop_duplicates(subset=[DATE_COL]).sort_values(DATE_COL)
    grp = grp_raw.merge(date_level, on=DATE_COL, how="left")

    # (E) target and lags
    grp["R_g_t"]       = grp.groupby("h2_group")[RET_COL].shift(0)
    grp["R_g_t_lag"]   = grp.groupby("h2_group")[RET_COL].shift(1)
    grp["R_g_t_plus1"] = grp.groupby("h2_group")[RET_COL].shift(-1)

    # (F) drop missing edges
    grp = grp.dropna(subset=["R_g_t_lag", "R_g_t_plus1"]).reset_index(drop=True)
    return grp, mem_day

# ---------- Effective membership utilities ----------
def build_effective_membership(mem_day: pd.DataFrame,
                               used_dates: pd.Index,
                               group_name: str) -> pd.DataFrame:
    sub = mem_day[(mem_day["h2_group"] == group_name) & (mem_day[DATE_COL].isin(used_dates))].copy()
    if sub.empty:
        return pd.DataFrame(columns=["group","symbol","contrib_days"])
    out = (sub.groupby(["h2_group","symbol"], as_index=False)
               .agg(contrib_days=("symbol","size"))
               .rename(columns={"h2_group":"group"}))
    return out

def summarize_membership(df_long: pd.DataFrame,
                         group_order=GROUP_ORDER) -> pd.DataFrame:
    if df_long.empty:
        return pd.DataFrame(columns=["Token Type","Ticker","# Coins"])
    tmp = (df_long.groupby("group", as_index=False)
                 .agg(symbols=("symbol", lambda s: sorted(set(s))),
                      n_coins=("symbol", "nunique")))
    tmp["Ticker"] = tmp["symbols"].apply(lambda lst: ", ".join(_tex_escape_ticker(s) for s in lst))
    tmp = tmp.drop(columns=["symbols"]).rename(columns={"group":"Token Type", "n_coins":"# Coins"})
    tmp["Token Type"] = pd.Categorical(tmp["Token Type"], categories=group_order, ordered=True)
    return tmp.sort_values("Token Type").reset_index(drop=True)

def render_latex_functional_table(df_tbl: pd.DataFrame,
                                  caption: str,
                                  label: str,
                                  colwidths=("3cm","10cm","1.2cm")) -> str:
    lines = []
    lines += [r"\begin{table}[ht]",
              r"\centering",
              r"\scriptsize",
              r"\setlength{\tabcolsep}{4pt}",
              rf"\begin{{tabular}}{{p{{{colwidths[0]}}}p{{{colwidths[1]}}}c}}",
              r"\toprule",
              r"\textbf{Token Type} & \textbf{Ticker} & \textbf{\# Coins} \\",
              r"\midrule"]
    total = 0
    for _, row in df_tbl.iterrows():
        lines.append(f"{row['Token Type']} & {row['Ticker']} & {int(row['# Coins'])} \\\\")
        lines.append(r"\addlinespace")
        total += int(row['# Coins'])
    lines += [rf"\textbf{{Total:}} & & \textbf{{{total}}} \\",
              r"\bottomrule",
              r"\end{tabular}",
              rf"\caption{{\textbf{{{caption}}}}}",
              rf"\label{{{label}}}",
              r"\end{table}"]
    return "\n".join(lines)

# ---------- One regression (group × proxy) that returns H1-style row ----------
def run_group_proxy_regression(dfg: pd.DataFrame,
                               group_name: str,
                               proxy_label: Optional[str],
                               regressors: List[str]) -> Optional[Dict]:
    """
    dfg must contain: TARGET_COL and regressors (already cleaned & standardized if needed)
    Returns one row with H1-like columns: n, nobs, n_regs, regs_used, opt_hac_lag, r2, adj_r2, aic, bic,
    log_likelihood, fstat, f_pval, durbin_watson, and per-parameter coef_*, stderr_*, tval_*, pval_*.
    """
    rhs = " + ".join(regressors)
    fit = smf.ols(f"{TARGET_COL} ~ {rhs}", data=dfg).fit()
    n = int(fit.nobs)
    if n <= 8:
        return None

    # HAC lag selection via AR(k)-AIC
    kmax = int(n ** 0.25)
    best_aic, best_lag, best_res = np.inf, 0, None
    for lag in range(kmax + 1):
        r = fit.get_robustcov_results(cov_type="HAC", maxlags=lag, use_correction=True)
        if r.aic < best_aic:
            best_aic, best_lag, best_res = r.aic, lag, r

    # Build output row mirroring H1 style
    row = {
        "group": group_name,
        "proxy": proxy_label if proxy_label is not None else "no_sentiment",
        "n": n,                        # H1 included both n and nobs
        "nobs": int(best_res.nobs),
        "n_regs": len(regressors),
        "regs_used": ",".join(regressors),
        "opt_hac_lag": int(best_lag),
        "r2": float(best_res.rsquared),
        "adj_r2": float(best_res.rsquared_adj),
        "aic": float(best_res.aic),
        "bic": float(best_res.bic),
        "log_likelihood": float(best_res.llf),
        "fstat": float(best_res.fvalue) if best_res.fvalue is not None else np.nan,
        "f_pval": float(best_res.f_pvalue) if best_res.f_pvalue is not None else np.nan,
        "durbin_watson": float(durbin_watson(fit.resid)),
    }

    # Add per-parameter columns, as in H1
    for name, b, se, t, p in zip(best_res.model.exog_names,
                                 best_res.params, best_res.bse, best_res.tvalues, best_res.pvalues):
        row[f"coef_{name}"]   = float(b)
        row[f"stderr_{name}"] = float(se)
        row[f"tval_{name}"]   = float(t)
        row[f"pval_{name}"]   = float(p)

    return row

# ---------- Driver: run all proxies, write one CSV + one LaTeX table ----------
def run_h2_write_only_results_and_func_table(df_crypto: pd.DataFrame,
                                             export_dir: Path = EXPORT_DIR,
                                             verbose: bool = VERBOSE) -> pd.DataFrame:
    export_dir.mkdir(parents=True, exist_ok=True)

    # 1) Build group panel and daily membership (to know who actually contributes)
    grp, mem_day = make_group_panel_avg_then_transform_with_members(df_crypto)

    # 2) Regressors lists
    base_cont = ["R_g_t_lag",
                 "log_diff_AdrActCnt", "log_diff_volume_trusted_spot_usd_1d", "log_diff_TxCnt"]
    base_jumps = (["jump_AdrActCnt", "jump_volume_trusted_spot_usd_1d", "jump_TxCnt"] if USE_JUMPS else [])
    macro      = [c for c in MACRO_COLS if c in grp.columns]

    proxy_dict = {
        "no_sentiment": None,
        "EPU_DUS":      "EPU_DUS",
        "VIX":          "VIX",
        "InvSIX":       "InvSIX",
        "TwitSIX":      "TwitSIX",
        "ConSIX":       "ConSIX",
        "fng_value":    "fng_value"
    }

    results_rows = []
    overall_members = []  # collect by (group, symbol) across all successful regressions

    present_groups = [g for g in GROUP_ORDER if g in grp["h2_group"].unique()]
    if verbose:
        print(f"[INFO] Groups present: {present_groups}")

    for label, proxy in proxy_dict.items():
        for g in present_groups:
            sub = grp[grp["h2_group"] == g].copy()

            # Assemble RHS
            regressors = base_cont.copy()
            if proxy is not None and proxy in sub.columns:
                regressors.insert(1, proxy)  # sentiment after lagged return
            regressors += base_jumps + macro

            # Keep only needed cols + date for membership
            cols_needed = [TARGET_COL] + regressors
            dfg = sub[cols_needed + [DATE_COL]].copy()

            # Coerce numeric & drop non-finite
            for c in cols_needed:
                dfg[c] = pd.to_numeric(dfg[c], errors="coerce")
            dfg = dfg.replace([np.inf, -np.inf], np.nan).dropna(subset=cols_needed)

            # Eligibility
            ok, reason = group_eligible(dfg, TARGET_COL, regressors)
            if not ok:
                if verbose: print(f"[SKIP] {g} × {proxy or 'no_sentiment'} — {reason}")
                continue

            # Standardize continuous (not jumps)
            cont = [r for r in regressors if not r.startswith("jump_")]
            if STANDARDIZE_CONT and cont:
                dfg[cont] = StandardScaler().fit_transform(dfg[cont])

            # Run regression and append H1-style row
            row = run_group_proxy_regression(dfg, g, proxy, regressors)
            if row is None:
                if verbose: print(f"[SKIP] {g} × {proxy or 'no_sentiment'} — too few nobs after clean")
                continue
            results_rows.append(row)

            # Track effective membership for the dates actually used in this regression
            used_dates = dfg[DATE_COL]
            eff = build_effective_membership(mem_day, used_dates, g)
            if not eff.empty:
                overall_members.append(eff)

    # 3) Write the single results CSV
    results_df = pd.DataFrame(results_rows).sort_values(["group","proxy"]).reset_index(drop=True)
    results_df.to_csv(RESULTS_CSV, index=False)
    if verbose:
        print(f"[INFO] Wrote results to {RESULTS_CSV} (rows={len(results_df)})")

    # 4) Build and write ONLY the functional membership LaTeX table (overall union)
    if overall_members:
        overall_long = pd.concat(overall_members, ignore_index=True)
        # Union across proxies by (group, symbol)
        overall_union = overall_long.drop_duplicates(subset=["group","symbol"])
        overall_wide  = summarize_membership(overall_union)
        latex_code = render_latex_functional_table(
            overall_wide,
            caption="Functional Groups Used in the Final Regression Sample",
            label="tab:functional_groups_effective_sample"
        )
        with open(FUNC_TABLE_TEX, "w") as f:
            f.write(latex_code)
        if verbose:
            print(f"[INFO] Wrote functional membership table to {FUNC_TABLE_TEX}")
    else:
        if verbose:
            print("[WARN] No functional membership captured (no regressions passed eligibility).")

    return results_df

# ---------------- RUN ----------------
# df_crypto must already be loaded & pre-engineered as in your earlier prep
results_h2 = run_h2_write_only_results_and_func_table(df_crypto)
results_h2.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


[INFO] Groups present: ['Utility', 'Asset', 'Payment', 'Hybrid (U-A)', 'Hybrid (U-P)', 'Hybrid (A-P)']
[INFO] Wrote results to Regressions/rob_h2_functional_avg_then_transform_results.csv (rows=42)
[INFO] Wrote functional membership table to Regressions/trash.tex


Unnamed: 0,group,proxy,n,nobs,n_regs,regs_used,opt_hac_lag,r2,adj_r2,aic,...,tval_log_IndPro,pval_log_IndPro,coef_log_CPIPrc,stderr_log_CPIPrc,tval_log_CPIPrc,pval_log_CPIPrc,coef_log_TotRes,stderr_log_TotRes,tval_log_TotRes,pval_log_TotRes
0,Asset,ConSIX,1922,1922,8,"R_g_t_lag,log_diff_AdrActCnt,log_diff_volume_t...",0,0.009822,0.005681,13803.617586,...,-0.968898,0.332718,-0.751439,0.834519,-0.900446,0.367996,0.027805,0.0824,0.337446,0.735818
1,Asset,EPU_DUS,1922,1922,8,"R_g_t_lag,log_diff_AdrActCnt,log_diff_volume_t...",0,0.009822,0.005681,13803.617586,...,-0.968898,0.332718,-0.751439,0.834519,-0.900446,0.367996,0.027805,0.0824,0.337446,0.735818
2,Asset,InvSIX,1922,1922,8,"R_g_t_lag,log_diff_AdrActCnt,log_diff_volume_t...",0,0.009822,0.005681,13803.617586,...,-0.968898,0.332718,-0.751439,0.834519,-0.900446,0.367996,0.027805,0.0824,0.337446,0.735818
3,Asset,TwitSIX,1922,1922,8,"R_g_t_lag,log_diff_AdrActCnt,log_diff_volume_t...",0,0.009822,0.005681,13803.617586,...,-0.968898,0.332718,-0.751439,0.834519,-0.900446,0.367996,0.027805,0.0824,0.337446,0.735818
4,Asset,VIX,1922,1922,8,"R_g_t_lag,log_diff_AdrActCnt,log_diff_volume_t...",0,0.009822,0.005681,13803.617586,...,-0.968898,0.332718,-0.751439,0.834519,-0.900446,0.367996,0.027805,0.0824,0.337446,0.735818


In [19]:
# =========================================================
# H2 Wald test: equality of sentiment slopes across groups
# - Pooled OLS with group FE and group-specific sentiment slopes
# - Newey–West HAC (AIC-selected lag)
# - One χ² Wald test per sentiment proxy
# - Returns DataFrame; optionally writes Reg/h2_wald_tests.csv
# =========================================================
import numpy as np
import pandas as pd
import re
from pathlib import Path
import statsmodels.formula.api as smf
from statsmodels.tsa.ar_model import AutoReg

# -------------------- CONFIG --------------------
DATE_COL   = "date"
SYMBOL_COL = "symbol"
RET_COL    = "log_daily_return"
TARGET_COL = "R_g_t_plus1"  # constructed below

# Raw blockchain columns to be averaged (RAW → then transform)
BASE_BLOCKCHAIN = ["TxCnt", "AdrActCnt", "volume_trusted_spot_usd_1d"]

# Macros (log levels already in df_crypto) and Sentiments
MACRO_COLS     = ["log_UnemRt", "log_IndPro", "log_CPIPrc", "log_TotRes"]
SENTIMENT_COLS = ["diff_VIX", "diff_TwitSIX", "diff_EPU_DUS", "diff_InvSIX", "diff_fng_value", "diff_ConSIX"]

# Functional groups (as in your taxonomy)
UTILITY = {"1INCH","AAVE","ANT","APE","BADGER","BAL","BAT","BIT","CEL","COMP","CRV","DCR","ENS","FXS",
           "HEDG","LDO","MKR","QNT","ROOK","SUSHI","SWRV","UNI","YFI"}
ASSET   = {"ALCX","ALPHA","CVX","PAXG","XAUT"}
PAYMENT = {"BCH","BNB","BSV","BTC","BTG","CRO","DASH","DGB","DOGE","ETC","ETH","FTT","GRIN","HT","LTC",
           "MTL_METAL","PAY","SHIB","VTC","XLM","XMR","XRP","XVG","ZEC"}
HYBRID_UA = {"CVC"}
HYBRID_UP = {"ADA","ALGO","API3","ATOM","AUDIO","AVAX","BNT","DOT","DRGN","ELF","ENJ","EOS","FIL","FLOW",
             "FUN","GALA","GAS","GLM","GNO","GNT","GRT","ICP","ICX","KNC","LEND","LINK","LOOM","LPT","LRC",
             "LSK","LUNA","MAID","MANA","NEO","NMR","OGN","OMG","PERP","POLY","POWR","PPT","QASH","QTUM",
             "REN","REP","RSR","SAND","SKL","SNT","SNX","SOL","SRM","STORJ","TRX","UMA","VET","WAVES",
             "WNXM","WTC","XEM","XTZ","ZIL","ZRX"}
HYBRID_AP = {"BUSD","DAI","GUSD","HUSD","PAX","TUSD","USDC","USDT"}

GROUP_ORDER = ["Utility", "Asset", "Payment", "Hybrid (U-A)", "Hybrid (U-P)", "Hybrid (A-P)"]

# Output
EXPORT_CSV       = Path("Regressions/rob_h2_wald_tests.csv")
WRITE_WALD_CSV   = True  # set False if you don't want a CSV

# -------------------- HELPERS --------------------
def map_functional_group(sym: str) -> str:
    s = str(sym).upper()
    if s in UTILITY:   return "Utility"
    if s in ASSET:     return "Asset"
    if s in PAYMENT:   return "Payment"
    if s in HYBRID_UA: return "Hybrid (U-A)"
    if s in HYBRID_UP: return "Hybrid (U-P)"
    if s in HYBRID_AP: return "Hybrid (A-P)"
    return "Unclassified"

def select_hac_lag_via_resid_aic(resid: pd.Series, kmax: int) -> int:
    """
    Choose HAC lag via AIC on AR(k) residual model (k=0..kmax).
    For k=0, use log-variance proxy objective.
    """
    n = resid.shape[0]
    if n < 20 or kmax <= 0:
        return 0
    best_k, best_aic = 0, np.inf
    for k in range(kmax + 1):
        try:
            if k == 0:
                e = resid - resid.mean()
                sigma2 = np.var(e, ddof=1)
                aic = n * np.log(sigma2 + 1e-12) + 2
            else:
                ar = AutoReg(resid, lags=k, old_names=False, trend="c").fit()
                aic = ar.aic
        except Exception:
            aic = np.inf
        if aic < best_aic:
            best_aic, best_k = aic, k
    return int(best_k)

def make_group_panel_avg_then_transform(df: pd.DataFrame) -> pd.DataFrame:
    """
    H2 panel: average RAW → zero-safe log-diffs → build lags.
    """
    d = df.copy()
    d[DATE_COL]   = pd.to_datetime(d[DATE_COL], errors="coerce")
    d[SYMBOL_COL] = d[SYMBOL_COL].astype(str).str.upper()
    d["h2_group"] = d[SYMBOL_COL].map(map_functional_group)
    d = d[d["h2_group"].isin(GROUP_ORDER)].copy()

    # days where all raw blockchain inputs are present
    elig = d.dropna(subset=BASE_BLOCKCHAIN).copy()

    # equal-weight means (RAW)
    agg_cols = [RET_COL] + BASE_BLOCKCHAIN
    grp_raw = (elig.groupby([DATE_COL, "h2_group"], dropna=False)[agg_cols]
                   .mean(numeric_only=True)
                   .reset_index()
                   .sort_values(["h2_group", DATE_COL]))

    # zero-safe log-diffs on group averages
    for col in BASE_BLOCKCHAIN:
        prev = grp_raw.groupby("h2_group")[col].shift(1)
        curr = grp_raw[col]
        grp_raw[f"log_diff_{col}"] = np.where((prev > 0) & (curr > 0), np.log(curr/prev), 0.0)

    # attach date-level macros + sentiments (unique per date)
    date_cols = [DATE_COL] + [c for c in (MACRO_COLS + SENTIMENT_COLS) if c in d.columns]
    date_level = d[date_cols].drop_duplicates(subset=[DATE_COL]).sort_values(DATE_COL)
    grp = grp_raw.merge(date_level, on=DATE_COL, how="left")

    # target and lags
    grp["R_g_t"]       = grp.groupby("h2_group")[RET_COL].shift(0)
    grp["R_g_t_lag"]   = grp.groupby("h2_group")[RET_COL].shift(1)
    grp["R_g_t_plus1"] = grp.groupby("h2_group")[RET_COL].shift(-1)

    grp = grp.dropna(subset=["R_g_t_lag", "R_g_t_plus1"]).reset_index(drop=True)
    return grp

def find_interaction_param_names(param_names, group_list, proxy):
    """
    Map each group to the parameter name of its interaction slope with the proxy.
    Handles both 'C(h2_group)[T.Group]:Proxy' and 'Proxy:C(h2_group)[T.Group]'.
    """
    mapping = {}
    for g in group_list:
        pat1 = re.compile(rf"C\(h2_group\)\[(?:T\.)?{re.escape(g)}\]\:{re.escape(proxy)}$")
        pat2 = re.compile(rf"{re.escape(proxy)}\:C\(h2_group\)\[(?:T\.)?{re.escape(g)}\]$")
        hit = next((n for n in param_names if pat1.search(n) or pat2.search(n)), None)
        mapping[g] = hit
    return mapping

def build_R_equal_betas(param_names, beta_names):
    """
    Build R for H0: beta_g1 = beta_g2 = ... = beta_gk, implemented as
    (beta_gi - beta_g1) = 0 for i=2..k.
    """
    valid = [b for b in beta_names if b is not None]
    if len(valid) < 2:
        return None, None
    p = len(param_names)
    base = valid[0]
    rows = []
    for b in valid[1:]:
        r = np.zeros(p)
        r[param_names.index(b)] = 1.0
        r[param_names.index(base)] = -1.0
        rows.append(r)
    R = np.vstack(rows)
    r = np.zeros(R.shape[0])
    return R, r

# -------------------- MAIN --------------------
def run_h2_wald_tests(df_crypto: pd.DataFrame,
                      write_csv: bool = WRITE_WALD_CSV,
                      export_csv: Path = EXPORT_CSV) -> pd.DataFrame:
    grp = make_group_panel_avg_then_transform(df_crypto)

    present_groups = [g for g in GROUP_ORDER if g in grp["h2_group"].unique()]
    if len(present_groups) < 2:
        raise ValueError("Not enough groups present for a Wald equality test.")

    # common controls (same slope across groups)
    controls = ["R_g_t_lag", "log_diff_AdrActCnt",
                "log_diff_volume_trusted_spot_usd_1d", "log_diff_TxCnt"] \
               + [c for c in MACRO_COLS if c in grp.columns]

    out_rows = []
    for proxy in [c for c in SENTIMENT_COLS if c in grp.columns]:
        # keep rows with all needed vars
        sub = grp.dropna(subset=[TARGET_COL, proxy] + controls).copy()
        if sub.empty:
            continue

        # standardize continuous controls + proxy (not FE)
        z_cols = controls + [proxy]
        for c in z_cols:
            m, s = sub[c].mean(), sub[c].std(ddof=0)
            if s and np.isfinite(s) and s > 0:
                sub[c] = (sub[c] - m) / s

        # pooled OLS: group FE + group-specific sentiment slopes + common controls
        # FE: 0 + C(h2_group)
        # Group-specific proxy slopes: 0 + C(h2_group):proxy
        rhs = " + ".join(["0 + C(h2_group)", f"0 + C(h2_group):{proxy}"] + controls)
        formula = f"{TARGET_COL} ~ {rhs}"

        fit = smf.ols(formula, data=sub).fit()
        n = int(fit.nobs)
        if n <= 8:
            continue

        # HAC lag selection by AIC
        kmax = int(n ** 0.25)
        best_aic, best_lag, best_res = np.inf, 0, None
        for lag in range(kmax + 1):
            r = fit.get_robustcov_results(cov_type="HAC", maxlags=lag, use_correction=True)
            if r.aic < best_aic:
                best_aic, best_lag, best_res = r.aic, lag, r

        # parameter names (order) from the robust model's design
        param_names = list(best_res.model.exog_names)

        # pull the names of the per-group proxy slopes
        beta_map = find_interaction_param_names(param_names, present_groups, proxy)
        beta_names = [beta_map[g] for g in present_groups]

        # build R for equality of those slopes
        R, rvec = build_R_equal_betas(param_names, beta_names)
        if R is None:
            # likely some groups dropped in design (no variation)
            continue

        wt = best_res.wald_test((R, rvec), scalar=False)  # χ²
        chi2 = float(np.asarray(wt.statistic).ravel()[0])
        pval = float(np.asarray(wt.pvalue).ravel()[0])
        df_num = int(R.shape[0])

        out_rows.append({
            "proxy": proxy,
            "groups_tested": ",".join(present_groups),
            "nobs": n,
            "hac_maxlags": best_lag,
            "wald_test": "equal_betas_across_groups",
            "chi2": chi2,
            "df_num": df_num,
            "p_value": pval
        })

    out = pd.DataFrame(out_rows).sort_values(["proxy"]).reset_index(drop=True)
    if write_csv:
        export_csv.parent.mkdir(parents=True, exist_ok=True)
        out.to_csv(export_csv, index=False)
    return out

# -------------------- RUN (example) --------------------
wald_results = run_h2_wald_tests(df_crypto)
wald_results.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,proxy,groups_tested,nobs,hac_maxlags,wald_test,chi2,df_num,p_value
0,diff_ConSIX,"Utility,Asset,Payment,Hybrid (U-A),Hybrid (U-P...",18008,0,equal_betas_across_groups,0.619952,5,0.68461
1,diff_EPU_DUS,"Utility,Asset,Payment,Hybrid (U-A),Hybrid (U-P...",18008,0,equal_betas_across_groups,2.108094,5,0.061344
2,diff_InvSIX,"Utility,Asset,Payment,Hybrid (U-A),Hybrid (U-P...",18008,0,equal_betas_across_groups,0.230229,5,0.949472
3,diff_TwitSIX,"Utility,Asset,Payment,Hybrid (U-A),Hybrid (U-P...",13888,0,equal_betas_across_groups,1.113876,5,0.350454
4,diff_VIX,"Utility,Asset,Payment,Hybrid (U-A),Hybrid (U-P...",18008,0,equal_betas_across_groups,0.473229,5,0.796498


In [12]:
import os, glob
import numpy as np
import pandas as pd
from pathlib import Path

# ---------------- Paths ----------------
REG_DIR = "Regressions"
TAB_DIR = Path("Regressions/Tables")
TAB_DIR.mkdir(parents=True, exist_ok=True)

# Per-asset files
LEVEL_PATTERN = "crypto_regression_summary*{proxy}*.csv"
DELTA_PATTERN = "checkrobustness_$\Delta$ {proxy}*.csv"

# Wald files
LEVEL_WALD = os.path.join(REG_DIR, "h2_wald_tests.csv")
DELTA_WALD = os.path.join(REG_DIR, "rob_h2_wald_tests.csv")  # proxies named diff_<proxy>

# Mapping file (same folder as notebook)
CRYPTO_MAP = "cryptomap.csv"   # needs columns: symbol, Final_Category

# Proxies to include (adjust if needed)
SENTIMENT_PROXIES = ["ConSIX","VIX","TwitSIX","InvSIX","EPU_DUS","fng_value"]

# ---------------- Load category map ----------------
def load_category_map(path=CRYPTO_MAP):
    m = pd.read_csv(path, dtype=str)
    # normalize
    cols = {c.lower(): c for c in m.columns}
    sym_col = cols.get("symbol") or cols.get("ticker") or "symbol"
    cat_col = cols.get("final_category") or "Final_Category"
    m[sym_col] = m[sym_col].str.upper().str.strip()
    m[cat_col] = m[cat_col].str.strip()
    return dict(zip(m[sym_col], m[cat_col]))

SYM2CAT = load_category_map()

# Determine group (category) order from mapping; prefer a nice order if present
preferred = ["Utility Tokens", "Asset Tokens", "Payment Tokens",  "Hybrid_UA", "Hybrid_UP", "Hybrid_AP"]
all_groups = pd.Series(list(SYM2CAT.values())).dropna().unique().tolist()
GROUP_ORDER = [g for g in preferred if g in all_groups] + [g for g in all_groups if g not in preferred]

# ---------------- Helpers ----------------
def _attach_category(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["symbol"] = df["symbol"].astype(str).str.upper()
    df["Category"] = df["symbol"].map(SYM2CAT).fillna("Other")
    return df

def _read_level_proxy(proxy: str) -> pd.DataFrame:
    patt = os.path.join(REG_DIR, LEVEL_PATTERN.format(proxy=proxy))
    files = glob.glob(patt)
    if not files: return pd.DataFrame(columns=["symbol","Category","beta"])
    df = pd.read_csv(files[0])
    if "symbol" not in df.columns: return pd.DataFrame(columns=["symbol","Category","beta"])
    coef_col = f"coef_{proxy}"
    if coef_col not in df.columns:
        # Fallback: if exactly one non-control coef exists, use it
        cand = [c for c in df.columns if c.startswith("coef_") and not c.startswith(("coef_log_","coef_R_","coef_diff_"))]
        if len(cand) == 1:
            coef_col = cand[0]
        else:
            return pd.DataFrame(columns=["symbol","Category","beta"])
    out = df[["symbol", coef_col]].rename(columns={coef_col: "beta"})
    out = _attach_category(out)
    return out

def _read_delta_proxy(proxy: str) -> pd.DataFrame:
    patt = os.path.join(REG_DIR, DELTA_PATTERN.format(proxy=proxy))
    files = glob.glob(patt)
    if not files: return pd.DataFrame(columns=["symbol","Category","beta"])
    df = pd.read_csv(files[0])
    if "symbol" not in df.columns: return pd.DataFrame(columns=["symbol","Category","beta"])
    coef_col = f"coef_diff_{proxy}"
    if coef_col not in df.columns:
        cand = [c for c in df.columns if c.startswith("coef_diff_")]
        if len(cand) == 1:
            coef_col = cand[0]
        else:
            return pd.DataFrame(columns=["symbol","Category","beta"])
    out = df[["symbol", coef_col]].rename(columns={coef_col: "beta"})
    out = _attach_category(out)
    return out

def _load_wald(path: str, proxies: list[str], is_delta: bool) -> pd.Series:
    """Return Series index=proxy_base -> p_value; Δ file uses 'diff_<proxy>' rows."""
    if not os.path.exists(path): return pd.Series(dtype=float)
    df = pd.read_csv(path)
    cols = {c.lower(): c for c in df.columns}
    proxy_col = cols.get("proxy") or list(df.columns)[0]
    p_col = cols.get("p_value") or cols.get("wald_p") or cols.get("p") or list(df.columns)[-1]
    ser = {}
    for p in proxies:
        key = f"diff_{p}" if is_delta else p
        r = df[df[proxy_col] == key]
        if not r.empty:
            try:
                ser[p] = float(r.iloc[0][p_col])
            except Exception:
                pass
    return pd.Series(ser)

# ---------------- Build table ----------------
def build_h2_table() -> pd.DataFrame:
    rows = []
    wald_L = _load_wald(LEVEL_WALD, SENTIMENT_PROXIES, is_delta=False)
    wald_D = _load_wald(DELTA_WALD, SENTIMENT_PROXIES, is_delta=True)

    for prox in SENTIMENT_PROXIES:
        # Level row
        lvl = _read_level_proxy(prox)
        if not lvl.empty:
            med = lvl.groupby("Category")["beta"].median()
            rowL = {"Proxy": prox}
            for g in GROUP_ORDER:
                v = med.get(g, np.nan)
                rowL[g] = "" if pd.isna(v) else f"{v:.3f}"
            rowL["Wald $p$-value"] = "" if prox not in wald_L.index else f"{wald_L.loc[prox]:.3f}"
            rows.append(rowL)

        # Δ row
        dlt = _read_delta_proxy(prox)
        if not dlt.empty:
            med = dlt.groupby("Category")["beta"].median()
            rowD = {"Proxy": rf"$\Delta$ {prox}"}
            for g in GROUP_ORDER:
                v = med.get(g, np.nan)
                rowD[g] = "" if pd.isna(v) else f"{v:.3f}"
            rowD["Wald $p$-value"] = "" if prox not in wald_D.index else f"{wald_D.loc[prox]:.3f}"
            rows.append(rowD)

    out = pd.DataFrame(rows)
    cols = ["Proxy"] + GROUP_ORDER + ["Wald $p$-value"]
    for c in cols:
        if c not in out.columns:
            out[c] = ""
    return out[cols]

# ---- Run & export
h2 = build_h2_table()

# After you run h2 = build_h2_table()

RENAME_GROUPS = {
    "Utility Tokens": "Utility",
    "Asset Tokens": "Asset",
    "Payment Tokens": "Payment",
    "Hybrid_UA": "Hybrid (U-A)",
    "Hybrid_UP": "Hybrid (U-P)",
    "Hybrid_AP": "Hybrid (A-P)",
}

h2 = h2.rename(columns=RENAME_GROUPS)


# Now enforce the correct order
GROUP_ORDER = ["Utility", "Asset", "Payment", "Hybrid (U-A)", "Hybrid (U-P)", "Hybrid (A-P)"]
cols = ["Proxy"] + GROUP_ORDER + ["Wald $p$-value"]
h2 = h2[cols]

latex = h2.to_latex(
    index=False, escape=False,
    column_format="l" + "c"*len(GROUP_ORDER) + "c",
    caption=(r"\textbf{H2 robustness summary (functional groups): Level vs.\ $\Delta$ proxies.} "
             r"Cells report the median $\hat{\beta}_{\mathrm{sent}}$ within each functional group; "
             r"Wald $p$ refers to the cross-group equality test for the corresponding specification."),
    label="tab:h2_level_vs_delta_summary"
)

(TAB_DIR / "h2_level_vs_delta_summary.tex").write_text(latex)
print("[ok] wrote:", TAB_DIR / "h2_level_vs_delta_summary.tex")

[ok] wrote: Regressions/Tables/h2_level_vs_delta_summary.tex


  DELTA_PATTERN = "checkrobustness_$\Delta$ {proxy}*.csv"
