In [260]:
import os
import re
from typing import List, Optional, Dict, Tuple

import numpy as np
import pandas as pd
from feature_blocks import add_feature_blocks


# --------------------- CONFIG ---------------------
# Input / output locations (relative paths).
DATA_DIR = "data"
OUT_DIR = "data/master"

# Feature-engineering knobs.
LAG_MAX = 12                 # add y_lag1..y_lag12 and dy_lag1..dy_lag12
ROLL_WINDOWS = (3, 6, 12)    # simple MA-like proxies on y

# Candidate file names for each source; load_first() tries them in list order.
FRED_FILES = ["fred_monthly_raw.csv"]
TRACE_FILES = ["MONTHLY_MARKET_WRPT.csv"]

# --------------------- IO HELPERS ---------------------
def load_first(dirpath: str, candidates: List[str]) -> Optional[pd.DataFrame]:
    """Read the first candidate CSV that exists inside ``dirpath``.

    Args:
        dirpath: Directory that the candidate file names are joined onto.
        candidates: File names to try, in priority order.

    Returns:
        The frame read with ``pd.read_csv`` from the first existing path, or
        ``None`` (after printing a warning) when no candidate exists.
    """
    existing_paths = (
        path
        for path in (os.path.join(dirpath, fname) for fname in candidates)
        if os.path.exists(path)
    )
    # The generator is lazy, so paths are probed in order and probing stops
    # at the first hit.
    hit = next(existing_paths, None)
    if hit is None:
        print(f"[WARN] None of {candidates} found under {dirpath}.")
        return None

    frame = pd.read_csv(hit)
    print(f"[INFO] Loaded {hit} (shape={frame.shape})")
    return frame

# Load the raw inputs. FRED is mandatory: later cells call methods on it without
# a None check. TRACE is optional and is None-checked before it is joined.
fred_df = load_first(DATA_DIR, FRED_FILES)
if fred_df is None:
    # Fail here with a clear message instead of an AttributeError further down.
    raise FileNotFoundError(
        f"No FRED input found: expected one of {FRED_FILES} under {DATA_DIR!r}."
    )
trace_df = load_first(DATA_DIR, TRACE_FILES)

[INFO] Loaded data/fred_monthly_raw.csv (shape=(312, 19))
[INFO] Loaded data/MONTHLY_MARKET_WRPT.csv (shape=(116, 5))


In [262]:
# NOTE(review): in the visible notebook `data` is first assigned in a later cell
# (execution count 246), so this cell raises NameError on Restart & Run All
# unless `data` is defined in a part of the notebook not shown here.
data.columns

Index(['BAMLC0A0CMEY', 'BAMLH0A0HYM2EY', 'BAMLC0A0CM', 'BAMLH0A0HYM2',
       'VIXCLS', 'DGS10', 'DGS5', 'DGS2', 'DGS30', 'CPIAUCSL', 'CPILFESL',
       'UNRATE', 'CCSA', 'T10YIE', 'T5YIE', 'T5YIFR', 'INDPRO', 'PCEPI',
       'mkt_par_lb', 'mkt_n_trades', 'mkt_share_rpt', 'mkt_px_std_daily',
       'IG_spread', 'HY_spread'],
      dtype='object')

In [None]:
# Plot the two index series that the IG and HY spread targets are built from.
# pandas' own plotting is used so the cell does not rely on a `plt` name, which
# the visible import cell does not define.
ax = data[["BAMLC0A0CMEY", "BAMLH0A0HYM2EY"]].plot(
    title="BAMLC0A0CMEY (IG) vs BAMLH0A0HYM2EY (HY)"
)
ax.set_xlabel("Date");
# TODO: the original cell ended in an unfinished `plt.ploy(data.` call; add the
# intended third series to the column list above once it is known.

In [None]:
# Reference copy of the column labels shown by the `data.columns` cells.
# The original bare multi-line tuple did not parse (its continuation lines were
# indented outside any bracket), so it is wrapped in a list and given a name.
DATA_COLUMNS_REFERENCE = [
    'BAMLC0A0CMEY', 'BAMLH0A0HYM2EY', 'BAMLC0A0CM', 'BAMLH0A0HYM2',
    'VIXCLS', 'DGS10', 'DGS5', 'DGS2', 'DGS30', 'CPIAUCSL', 'CPILFESL',
    'UNRATE', 'CCSA', 'T10YIE', 'T5YIE', 'T5YIFR', 'INDPRO', 'PCEPI',
    'mkt_par_lb', 'mkt_n_trades', 'mkt_share_rpt', 'mkt_px_std_daily',
    'IG_spread', 'HY_spread',
]

In [246]:
def parse_month_yyyymm(series: pd.Series) -> pd.Series:
    """Convert TRACE-style ``YYYYMM`` month codes (e.g. 200207) to month-end timestamps.

    Each value is turned into a string and its first run of six digits is
    parsed as year + month. Values without such a run, or that fail to parse,
    become ``NaT``.
    """
    yyyymm = series.astype(str).str.extract(r"(\d{6})", expand=False)
    month_start = pd.to_datetime(yyyymm, format="%Y%m", errors="coerce")
    # MonthEnd(0) rolls every date forward to the last day of its own month.
    month_end = month_start + pd.offsets.MonthEnd(0)
    return month_end

def to_monthly_index(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize a frame to a monthly (month-end) DatetimeIndex.

    The date source is chosen in this order:

    1. a ``Date`` column, else a ``date`` column: parsed with
       ``pd.to_datetime(errors="coerce")``, set as the index, sorted, and
       resampled to month end keeping the last row of each month;
    2. a ``month`` column of YYYYMM codes: parsed with ``parse_month_yyyymm``,
       set as the index (named ``Date``) and sorted, without resampling;
    3. otherwise the existing index is parsed as dates and resampled as in 1.

    The input frame is never modified. If step 3 raises, a warning is printed
    and the input frame is returned as-is.
    """
    # An offset object is used instead of the "ME"/"M" alias strings, whose
    # spelling depends on the installed pandas version.
    month_end = pd.offsets.MonthEnd()

    for col in ("Date", "date"):
        if col in df.columns:
            parsed = pd.to_datetime(df[col], errors="coerce")
            # assign() builds a new frame, leaving the caller's column untouched.
            indexed = df.assign(**{col: parsed}).set_index(col).sort_index()
            return indexed.resample(month_end).last()

    if "month" in df.columns:
        dt = parse_month_yyyymm(df["month"])
        indexed = df.set_index(dt).drop(columns=["month"])
        indexed.index.name = "Date"
        return indexed.sort_index()  # already monthly

    # Fallback: try to interpret the existing index as dates (on a copy).
    candidate = df.copy()
    try:
        candidate.index = pd.to_datetime(candidate.index, errors="coerce")
        return candidate.sort_index().resample(month_end).last()
    except Exception as exc:
        # Best effort is kept, but the failure is no longer silent.
        print(
            "[WARN] to_monthly_index: could not build a monthly DatetimeIndex "
            f"({exc!r}); returning the frame unchanged."
        )
        return df

# Put both sources on a month-end index; TRACE may be absent.
fred_m = to_monthly_index(fred_df)
trace_m = None if trace_df is None else to_monthly_index(trace_df)

# FRED is the left side of the join; clashing TRACE column names get a
# "_TRACE" suffix.
if trace_m is None:
    data = fred_m
else:
    data = fred_m.join(trace_m, how="left", rsuffix="_TRACE")

# Numeric coercion: anything that cannot be parsed as a number becomes NaN.
for col_name in data.columns:
    data[col_name] = pd.to_numeric(data[col_name], errors="coerce")

# Targets
# NOTE(review): define_targets_in_bps is neither defined nor imported in the
# visible cells - confirm where it comes from.
data, notes = define_targets_in_bps(data)

In [247]:
# No gap filling is applied here; missing values stay NaN.

# Spread targets: index series minus a Treasury series, scaled by 100
# (percent -> basis points, assuming the inputs are quoted in percent).
# IG is measured against DGS10, HY against DGS5.
data["IG_spread"] = data["BAMLC0A0CMEY"].sub(data["DGS10"]).mul(100.0)
data["HY_spread"] = data["BAMLH0A0HYM2EY"].sub(data["DGS5"]).mul(100.0)

In [248]:
# Inspect the column labels of `data` after the spread columns were added.
data.columns

Index(['BAMLC0A0CMEY', 'BAMLH0A0HYM2EY', 'BAMLC0A0CM', 'BAMLH0A0HYM2',
       'VIXCLS', 'DGS10', 'DGS5', 'DGS2', 'DGS30', 'CPIAUCSL', 'CPILFESL',
       'UNRATE', 'CCSA', 'T10YIE', 'T5YIE', 'T5YIFR', 'INDPRO', 'PCEPI',
       'mkt_par_lb', 'mkt_n_trades', 'mkt_share_rpt', 'mkt_px_std_daily',
       'IG_spread', 'HY_spread'],
      dtype='object')

In [249]:
# Column groups; their concatenation order below is the CSV column order.
macro_cols = [
    'VIXCLS', 'CPIAUCSL', 'CPILFESL',
    'UNRATE', 'CCSA', 'T10YIE', 'T5YIE', 'T5YIFR', 'INDPRO', 'PCEPI',
    'IG_spread', 'HY_spread',
]
liquidity_cols = ['mkt_par_lb', 'mkt_n_trades', 'mkt_px_std_daily']
treasury_cols = ['DGS10', 'DGS5', 'DGS2', 'DGS30']

# Frame without the TRACE liquidity columns, over every row of `data`.
# No gap filling is applied; missing values stay NaN.
data_without_lqd = data[macro_cols + treasury_cols].copy()

# Frame with the liquidity columns, restricted to a fixed date window
# (both bounds inclusive).
start_date = pd.Timestamp('2002-07-31')
end_date = pd.Timestamp('2012-02-29')
in_window = (data.index >= start_date) & (data.index <= end_date)
data_w_lqd = data.loc[in_window, macro_cols + liquidity_cols + treasury_cols]

# Persist both variants before any feature engineering.
data_without_lqd.to_csv('data/data_wo_lqd.csv')
data_w_lqd.to_csv('data/data_w_lqd.csv')

In [250]:
def add_macro_transforms(df):
    """Return a chronologically sorted copy of ``df`` with macro features added.

    Expects the columns ``IG_spread``, ``HY_spread``, ``VIXCLS``, ``CPIAUCSL``,
    ``CPILFESL``, ``PCEPI``, ``INDPRO``, ``CCSA``, ``UNRATE``, ``T5YIFR``,
    ``T5YIE`` and ``T10YIE``. The input frame is not modified.

    Added columns:
      * ``dIG_spread`` / ``dHY_spread`` and their ``_lag1`` versions;
      * ``VIXCLS_level`` (copy of ``VIXCLS``);
      * ``cpi_mom``, ``core_cpi_mom``, ``pce_mom``, ``indpro_mom``,
        ``ccsa_growth`` (log first differences) and their ``_lag1`` versions;
      * ``UNRATE_lag1``;
      * ``dT5YIFR``, ``dT5YIE``, ``dT10YIE`` and their ``_lag1`` versions.

    Rows are ordered by a ``Date`` column when one exists, otherwise by the
    index, so frames whose date index is unnamed or named differently work too.
    """
    if "Date" in df.columns:
        df = df.sort_values("Date").copy()
    else:
        # Equivalent to sorting by an index level named "Date", but does not
        # depend on how the index is named.
        df = df.sort_index().copy()

    # --- Target: changes in spreads + AR(1) ---
    for spread in ("IG_spread", "HY_spread"):
        df[f"d{spread}"] = df[spread].diff()
    for spread in ("IG_spread", "HY_spread"):
        df[f"d{spread}_lag1"] = df[f"d{spread}"].shift(1)

    # --- VIX: contemporaneous level ---
    df["VIXCLS_level"] = df["VIXCLS"]          # (optionally: np.log(df["VIXCLS"]))

    # --- Price indices & activity: log-diff (growth), lagged ---
    log_growth_names = {
        "CPIAUCSL": "cpi_mom",
        "CPILFESL": "core_cpi_mom",
        "PCEPI": "pce_mom",
        "INDPRO": "indpro_mom",
        "CCSA": "ccsa_growth",
    }
    for col, new in log_growth_names.items():
        # log() is undefined for values <= 0: mask them to NaN so no -inf/inf
        # leaks into the features (dropna() would not remove infinities).
        positive = df[col].where(df[col] > 0)
        df[new] = np.log(positive).diff()
        df[new + "_lag1"] = df[new].shift(1)

    # --- Unemployment rate: lagged level ---
    df["UNRATE_lag1"] = df["UNRATE"].shift(1)

    # --- Breakeven / inflation forwards: first diff, lagged ---
    for col in ["T5YIFR", "T5YIE", "T10YIE"]:
        df[f"d{col}"] = df[col].diff()
        df[f"d{col}_lag1"] = df[f"d{col}"].shift(1)

    return df


def add_liquidity_transforms(df):
    """Return a copy of ``df`` with liquidity features added.

    Each group is only built when its source column is present:
      * ``mkt_par_lb``   -> ``ln_mkt_par_lb``, ``ln_mkt_par_lb_lag1``
      * ``mkt_n_trades`` -> ``ln_mkt_n_trades``, ``ln_mkt_n_trades_lag1``
      * ``mkt_px_std_daily`` -> ``mkt_px_std_daily_lag1``,
        ``dmkt_px_std_daily``, ``dmkt_px_std_daily_lag1``

    Rows are used in their existing order (no sorting happens here), so the
    frame should already be in chronological order for the lags and
    differences to be meaningful. The input frame is not modified.
    """
    df = df.copy()

    # Log-level + lag for size / trading activity
    for col in ("mkt_par_lb", "mkt_n_trades"):
        if col in df.columns:
            # log() is undefined for values <= 0 (e.g. a month with no trades):
            # mask them to NaN so no -inf ends up in the features.
            positive = df[col].where(df[col] > 0)
            df[f"ln_{col}"] = np.log(positive)
            df[f"ln_{col}_lag1"] = df[f"ln_{col}"].shift(1)

    # Volatility of prices: lagged level, change, and lagged change
    if "mkt_px_std_daily" in df.columns:
        df["mkt_px_std_daily_lag1"] = df["mkt_px_std_daily"].shift(1)
        df["dmkt_px_std_daily"] = df["mkt_px_std_daily"].diff()
        df["dmkt_px_std_daily_lag1"] = df["dmkt_px_std_daily"].shift(1)

    return df


# Apply to both datasets: macro features everywhere, liquidity features only on
# the TRACE-window frame. dropna() then removes every row with a missing value,
# including the first rows left incomplete by diff()/shift().
# NOTE(review): both frames are rebound to their own names, so re-running this
# cell works on already-trimmed data rather than on the frames built earlier.
data_without_lqd = data_without_lqd.pipe(add_macro_transforms).dropna()
data_w_lqd = (
    data_w_lqd
    .pipe(add_macro_transforms)
    .pipe(add_liquidity_transforms)
    .dropna()
)


In [255]:
# Engineered columns kept in both frames: targets, their lags, macro features.
feature_cols = [
    'dIG_spread', 'dHY_spread', 'dIG_spread_lag1', 'dHY_spread_lag1',
    'VIXCLS_level', 'cpi_mom', 'cpi_mom_lag1', 'core_cpi_mom',
    'core_cpi_mom_lag1', 'pce_mom', 'pce_mom_lag1', 'indpro_mom',
    'indpro_mom_lag1', 'ccsa_growth', 'ccsa_growth_lag1', 'UNRATE_lag1',
    'dT5YIFR', 'dT5YIFR_lag1', 'dT5YIE', 'dT5YIE_lag1', 'dT10YIE',
    'dT10YIE_lag1',
]
# Raw liquidity levels go first, liquidity-derived features go last.
lqd_level_cols = ['mkt_par_lb', 'mkt_n_trades', 'mkt_px_std_daily']
lqd_feature_cols = [
    'ln_mkt_par_lb', 'ln_mkt_par_lb_lag1',
    'ln_mkt_n_trades', 'ln_mkt_n_trades_lag1', 'mkt_px_std_daily_lag1',
    'dmkt_px_std_daily', 'dmkt_px_std_daily_lag1',
]

data_w_lqd = data_w_lqd[lqd_level_cols + feature_cols + lqd_feature_cols].copy()

data_without_lqd = data_without_lqd[feature_cols].copy()

In [256]:
# Features shared by every master table (everything except the target pair).
_shared_feature_cols = [
    'VIXCLS_level',
    'cpi_mom', 'cpi_mom_lag1', 'core_cpi_mom', 'core_cpi_mom_lag1',
    'pce_mom', 'pce_mom_lag1', 'indpro_mom', 'indpro_mom_lag1',
    'ccsa_growth', 'ccsa_growth_lag1', 'UNRATE_lag1', 'dT5YIFR',
    'dT5YIFR_lag1', 'dT5YIE', 'dT5YIE_lag1', 'dT10YIE', 'dT10YIE_lag1',
]
_lqd_raw_cols = ['mkt_par_lb', 'mkt_n_trades', 'mkt_px_std_daily']
_lqd_derived_cols = [
    'ln_mkt_par_lb', 'ln_mkt_par_lb_lag1', 'ln_mkt_n_trades',
    'ln_mkt_n_trades_lag1', 'mkt_px_std_daily_lag1', 'dmkt_px_std_daily',
    'dmkt_px_std_daily_lag1',
]


def _master_columns(target, with_liquidity):
    """Column order of one master table: target, its lag, shared features,
    wrapped by the raw / derived liquidity columns when requested."""
    cols = [target, f"{target}_lag1"] + _shared_feature_cols
    if with_liquidity:
        cols = _lqd_raw_cols + cols + _lqd_derived_cols
    return cols


# One table per (target, dataset) pair; each keeps only its own target columns.
ig_master_w_lqd = data_w_lqd[_master_columns('dIG_spread', True)].copy()

hy_master_w_lqd = data_w_lqd[_master_columns('dHY_spread', True)].copy()

ig_master_wo_lqd = data_without_lqd[_master_columns('dIG_spread', False)].copy()
hy_master_wo_lqd = data_without_lqd[_master_columns('dHY_spread', False)].copy()

In [257]:
# to_csv does not create missing folders, so make sure OUT_DIR exists first.
os.makedirs(OUT_DIR, exist_ok=True)

# Masters without the liquidity features.
ig_path = os.path.join(OUT_DIR, "master_IG_wo_lqd.csv")
hy_path = os.path.join(OUT_DIR, "master_HY_wo_lqd.csv")
ig_master_wo_lqd.to_csv(ig_path)
hy_master_wo_lqd.to_csv(hy_path)

# Masters with the liquidity features. ig_path / hy_path are rebound here, so
# after this cell they point at the *_w_lqd files.
ig_path = os.path.join(OUT_DIR, "master_IG_w_lqd.csv")
hy_path = os.path.join(OUT_DIR, "master_HY_w_lqd.csv")
ig_master_w_lqd.to_csv(ig_path)
hy_master_w_lqd.to_csv(hy_path)
