In [2]:
from pathlib import Path
import sys, time
import numpy as np
import pandas as pd

CWD = Path.cwd()

# Locate the project root: the first of (CWD, its parent, its grandparent)
# that contains a "src" folder, so the notebook runs whether the kernel was
# started in the root or in a nested folder up to two levels down.
for _candidate in (CWD, CWD.parent, CWD.parent.parent):
    if (_candidate / "src").is_dir():
        PROJECT_ROOT = _candidate
        break
else:
    raise RuntimeError("Could not find a 'src' folder above the current working directory.")

# Make the modules under src/ importable. Guarded so that re-running this cell
# in a live kernel does not keep appending the same entry to sys.path.
_SRC_DIR = str(PROJECT_ROOT / "src")
if _SRC_DIR not in sys.path:
    sys.path.append(_SRC_DIR)
print("Using PROJECT_ROOT:", PROJECT_ROOT)

try:
    from paths import p
except Exception:
    # Deliberate best-effort: if the project's `paths` helper cannot be
    # imported for any reason, fall back to resolving against PROJECT_ROOT.
    def p(rel: str | Path) -> Path:
        """Resolve a project-relative path to an absolute path under PROJECT_ROOT."""
        return (PROJECT_ROOT / Path(rel)).resolve()

from ksg_tools import redundancy_matrix, ksg_mi_estimator_I, mi_percentile_ci_mbb

# parquet location
CRSP_PARQUET = p("data/processed/crsp_monthly_core.parquet")
print("Parquet:", CRSP_PARQUET)

Using PROJECT_ROOT: C:\Users\bseis\ABthesis
Parquet: C:\Users\bseis\ABthesis\data\processed\crsp_monthly_core.parquet


In [3]:
def _coerce_to_datetime_month_end(s: pd.Series) -> pd.DatetimeIndex:
    idx = pd.to_datetime(s).tz_localize(None)
    return idx.to_period("M").to_timestamp("M")

def _build_prices_from_returns(df, id_col: str, date_col: str, ret_col: str) -> pd.DataFrame:
    """Rebuild a wide table of synthetic price levels by compounding returns.

    Parameters
    ----------
    df : DataFrame in long format containing `id_col`, `date_col` and `ret_col`.
    id_col : column identifying the security; becomes the output columns.
    date_col : column holding the observation date; becomes the output index.
    ret_col : column of simple returns; values that cannot be parsed as
        numbers are treated as missing and their rows are dropped.

    Returns
    -------
    DataFrame indexed by month-end timestamp with one column per security.
    Each security's level is 100 times the cumulative product of (1 + return),
    taken in date order. Rows where every security is missing are removed.
    """
    long = df[[id_col, date_col, ret_col]].copy()
    long[ret_col] = pd.to_numeric(long[ret_col], errors="coerce")
    long = long.sort_values([id_col, date_col])

    # Gross return per row; rows without a usable return are discarded before
    # compounding so they do not appear in the pivot at all.
    long = long.assign(rel=1.0 + long[ret_col])
    long = long.dropna(subset=["rel"])
    long["px"] = 100.0 * long.groupby(id_col)["rel"].cumprod()

    wide = long.pivot(index=date_col, columns=id_col, values="px")
    wide.index = _coerce_to_datetime_month_end(wide.index)
    wide = wide.sort_index()
    return wide.dropna(how="all")

def _wide_prices_from_price_col(df, id_col: str, date_col: str, price_col: str) -> pd.DataFrame:
    """Pivot a long price table into a wide date-by-security table.

    Parameters
    ----------
    df : DataFrame in long format containing `id_col`, `date_col` and `price_col`.
    id_col : column identifying the security; becomes the output columns.
    date_col : column holding the observation date; becomes the output index.
    price_col : column whose values fill the table (used as-is).

    Returns
    -------
    DataFrame indexed by month-end timestamp, sorted by date, with rows where
    every security is missing removed.
    """
    wanted = [id_col, date_col, price_col]
    long = df[wanted].copy()
    long = long.sort_values([id_col, date_col])

    wide = long.pivot(index=date_col, columns=id_col, values=price_col)
    wide.index = _coerce_to_datetime_month_end(wide.index)
    wide = wide.sort_index()
    return wide.dropna(how="all")

def load_crsp_monthly_prices(parquet_path: Path, tickers=None, start=None, end=None) -> pd.DataFrame:
    """Load a wide (month-end x ticker) price table from the parquet file.

    Parameters
    ----------
    parquet_path : Path
        Parquet file with at least 'date' and 'ticker' columns plus either a
        price column or a return column ('ret' / 'retx').
    tickers : str or iterable of str, optional
        Tickers to keep. A single string is treated as one ticker; any other
        iterable (including a generator) is materialised once. The output
        columns follow this order, and requested tickers without data in the
        filtered rows appear as all-NaN columns.
    start, end : date-like, optional
        Inclusive bounds applied to the raw 'date' column.

    Returns
    -------
    pd.DataFrame
        Prices indexed by month-end timestamp, one column per ticker, with
        all-NaN rows dropped. If no price column exists, levels are rebuilt
        from returns via `_build_prices_from_returns`.

    Raises
    ------
    KeyError
        If 'date'/'ticker' are missing, or neither a price nor a return
        column is found.
    ValueError
        If filtering leaves no rows. `DataFrame.pivot` in the helpers also
        raises ValueError when a (date, ticker) pair occurs more than once.
    """
    if tickers is not None:
        # Materialise once: the list is needed both for filtering and for the
        # final column reindex, and a generator would be exhausted after the
        # first use. A bare string must be wrapped, otherwise list("ABC")
        # would be interpreted as the three tickers "A", "B", "C".
        tickers = [tickers] if isinstance(tickers, str) else list(tickers)

    df = pd.read_parquet(parquet_path)
    if "date" not in df.columns or "ticker" not in df.columns:
        raise KeyError("Parquet must include 'date' and 'ticker'")
    if tickers is not None:
        df = df[df["ticker"].isin(tickers)].copy()
        if df.empty:
            raise ValueError("None of requested tickers present.")

    if start is not None or end is not None:
        # Parse the date column a single time and combine both bounds into one
        # positional mask (positional so a non-unique row index cannot matter).
        dates = pd.to_datetime(df["date"])
        keep_rows = np.ones(len(df), dtype=bool)
        if start is not None:
            keep_rows &= (dates >= pd.to_datetime(start)).to_numpy()
        if end is not None:
            keep_rows &= (dates <= pd.to_datetime(end)).to_numpy()
        df = df[keep_rows]
    if df.empty:
        raise ValueError("No rows after date/ticker filtering.")

    # First matching price column wins; adjusted variants are listed first.
    # NOTE(review): raw CRSP `prc` is, as far as I know, stored negative when it
    # is a bid/ask average and is not split-adjusted — confirm the parquet's
    # price columns were cleaned upstream before relying on the "prc" fallback.
    price_col = next((c for c in ["adj_prc", "adj_price", "px", "price", "prc_adj", "prc"] if c in df.columns), None)
    if price_col is not None:
        px = _wide_prices_from_price_col(df, "ticker", "date", price_col)
    else:
        ret_col = "ret" if "ret" in df.columns else ("retx" if "retx" in df.columns else None)
        if ret_col is None:
            raise KeyError("Found neither price nor ('ret'/'retx') columns.")
        px = _build_prices_from_returns(df, "ticker", "date", ret_col)

    if tickers is not None:
        px = px.reindex(columns=tickers)
    return px.dropna(how="all")

def subset_by_coverage(prices: pd.DataFrame, start: str, end: str, min_obs: int = 60):
    """Keep tickers with >= min_obs finite log-return observations in [start, end].

    Parameters
    ----------
    prices : pd.DataFrame
        Wide price table (dates on the index, one column per ticker).
    start, end : date-like
        Inclusive window bounds.
    min_obs : int, default 60
        Minimum number of usable log returns a ticker needs to be kept.

    Returns
    -------
    (pd.DataFrame, list)
        The windowed prices restricted to the kept tickers, and the list of
        kept ticker labels.

    Raises
    ------
    ValueError
        If no price rows fall inside the window, or fewer than two tickers
        meet the coverage requirement.

    Notes
    -----
    Returns are computed after windowing, so the first row inside the window
    never contributes a return.
    """
    start_ts, end_ts = pd.to_datetime(start), pd.to_datetime(end)
    in_window = (prices.index >= start_ts) & (prices.index <= end_ts)
    px = prices[in_window]
    if px.empty:
        raise ValueError("No prices in requested window.")

    # Zero or negative prices make the log return +/-inf or NaN. Silence the
    # expected numpy warnings and treat infinities as missing, so degenerate
    # series are not counted as valid observations.
    with np.errstate(divide="ignore", invalid="ignore"):
        rets = np.log(px / px.shift(1))
    rets = rets.replace([np.inf, -np.inf], np.nan)

    counts = rets.notna().sum()
    keep = counts[counts >= min_obs].index.tolist()
    if len(keep) < 2:
        failing = counts[counts < min_obs].index.tolist()
        raise ValueError(f"Not enough tickers with >= {min_obs} obs. Failing: {failing}")
    return px[keep], keep

In [4]:
# Universe & sampling
def get_all_tickers(parquet_path: Path) -> list[str]:
    """Return the sorted, de-duplicated, non-null tickers stored in the parquet file.

    Only the 'ticker' column is read from disk.
    """
    ticker_col = pd.read_parquet(parquet_path, columns=["ticker"])["ticker"]
    distinct = pd.Index(ticker_col).dropna().unique()
    tickers = distinct.tolist()
    tickers.sort()
    return tickers

ALL_TICKERS = get_all_tickers(CRSP_PARQUET)
print("Universe size:", len(ALL_TICKERS))

# Draw a fixed-size random subset without replacement. The generator is seeded
# so the same tickers are selected on every Restart & Run All.
SAMPLE_SIZE = 80
rng = np.random.default_rng(123)
SAMPLE = rng.choice(ALL_TICKERS, size=SAMPLE_SIZE, replace=False).tolist()
SAMPLE.sort()

Universe size: 13900


In [5]:
# One big MI matrix on a long window
START = "2010-01-01"
END = "2019-12-31"
MIN_OBS = 60  # minimum log-return observations a ticker needs to be kept
K = 4         # forwarded to redundancy_matrix as `k` (presumably the KSG neighbour count — confirm in ksg_tools)

# Load the sampled tickers, then keep only those with enough history in the window.
prices = load_crsp_monthly_prices(CRSP_PARQUET, tickers=SAMPLE, start=START, end=END)
px_win, keep = subset_by_coverage(prices, START, END, min_obs=MIN_OBS)
print(f"Window {START} → {END} | kept {len(keep)} tickers")

# Time only the matrix computation, not the loading/filtering above.
t0 = time.perf_counter()
R_ksg, used = redundancy_matrix(
    px_win,
    k=K,
    min_obs=MIN_OBS,
    start=START,
    end=END,
    use_log_returns=True,
    standardize=True,
    return_bits=True,
)
t1 = time.perf_counter()
print(f"Computed MI matrix {R_ksg.shape} in {t1 - t0:.2f}s")

# show a corner
display(R_ksg.iloc[:12, :12].round(3))

Window 2010-01-01 → 2019-12-31 | kept 24 tickers
Computed MI matrix (24, 24) in 0.48s


Unnamed: 0,ADES,AGCO,CERS,CMA,CSGP,CSPI,CVU,DDS,DLA,DMRC,HGSH,HLT
ADES,1.0,0.093,-0.078,0.013,0.254,0.066,0.087,-0.206,0.06,-0.127,0.065,
AGCO,0.093,1.0,0.162,0.401,0.009,-0.151,-0.012,0.006,0.013,-0.039,0.074,0.161
CERS,-0.078,0.162,1.0,0.248,0.044,-0.008,0.361,0.213,-0.03,0.139,0.178,0.22
CMA,0.013,0.401,0.248,1.0,0.261,0.112,0.294,0.088,0.161,-0.061,0.106,0.197
CSGP,0.254,0.009,0.044,0.261,1.0,-0.002,-0.011,0.203,-0.068,-0.058,-0.008,0.193
CSPI,0.066,-0.151,-0.008,0.112,-0.002,1.0,-0.138,0.011,0.093,0.088,0.041,0.065
CVU,0.087,-0.012,0.361,0.294,-0.011,-0.138,1.0,0.051,0.091,-0.176,-0.011,-0.023
DDS,-0.206,0.006,0.213,0.088,0.203,0.011,0.051,1.0,0.104,0.08,0.201,0.117
DLA,0.06,0.013,-0.03,0.161,-0.068,0.093,0.091,0.104,1.0,-0.026,-0.076,-0.1
DMRC,-0.127,-0.039,0.139,-0.061,-0.058,0.088,-0.176,0.08,-0.026,1.0,0.061,-0.032
