
# Factors: Construction and Diagnostics

This notebook analyzes the factor dataset produced by `build_factors.py`. The dataset contains daily proxies for systematic influences such as market, value, and momentum exposures. The objective is to check the factors' stability, distributional properties, and cross-correlations, so that residual mean-reversion signals are not confounded by latent factor tilts.


In [None]:

import os, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Suppress all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/numpy — confirm that is intended.
warnings.filterwarnings("ignore")
# Render matplotlib figures at 120 DPI.
plt.rcParams["figure.dpi"] = 120

# Directory the notebook reads its input files from (e.g. factors.csv).
DATA_DIR = "/mnt/data"
print("Using DATA_DIR =", DATA_DIR)

def annualize_sharpe(returns, periods_per_year=252):
    """Return the annualized Sharpe ratio of a series of periodic returns.

    Missing observations are dropped first. The mean is scaled by
    ``periods_per_year`` and the population standard deviation (``ddof=0``)
    by its square root. No risk-free rate is subtracted.

    Returns ``np.nan`` when there are no observations left after dropping
    missing values, or when the volatility is exactly zero.
    """
    clean = pd.Series(returns).dropna()
    if clean.empty:
        return np.nan
    annual_vol = clean.std(ddof=0) * np.sqrt(periods_per_year)
    if annual_vol == 0:
        # A flat return stream has no defined Sharpe ratio.
        return np.nan
    annual_mean = clean.mean() * periods_per_year
    return annual_mean / annual_vol

def equity_curve(returns):
    """Compound periodic returns into a cumulative growth curve.

    Missing returns are treated as 0.0 (no change for that period). The
    result is the running product of ``1 + return``, so it represents the
    growth of one unit of capital.
    """
    growth_factors = pd.Series(returns).fillna(0.0).add(1.0)
    return growth_factors.cumprod()

def drawdown_curve(equity):
    """Return the fractional drawdown of an equity curve from its running peak.

    Each value is ``equity / running_max - 1``: 0.0 at a new high and
    negative while the curve is below its previous peak.
    """
    curve = pd.Series(equity).astype(float)
    running_peak = curve.cummax()
    return curve.div(running_peak) - 1.0

def infer_returns(daily_df):
    """Extract a per-row return series from a daily results frame.

    Lookup order:

    1. The first column whose lower-cased name is one of ``ret``,
       ``return``, ``strategy_ret``, ``pnl_ret`` or ``daily_ret`` is
       coerced to numeric and used directly.
    2. Otherwise the first column whose lower-cased name contains
       ``equity`` or ``cum`` is coerced to numeric and converted to
       returns with ``pct_change``.

    Values that cannot be parsed, and the undefined first ``pct_change``
    value, become 0.0.

    Raises:
        ValueError: if no column matches either rule.
    """
    return_names = ["ret", "return", "strategy_ret", "pnl_ret", "daily_ret"]
    for col in daily_df.columns:
        if col.lower() in return_names:
            return pd.to_numeric(daily_df[col], errors="coerce").fillna(0.0)

    # No explicit return column: fall back to differencing a level series.
    for col in daily_df.columns:
        lowered = col.lower()
        if "equity" in lowered or "cum" in lowered:
            levels = pd.to_numeric(daily_df[col], errors="coerce")
            return levels.pct_change().fillna(0.0)

    raise ValueError("Could not infer returns column.")

def infer_active_positions_per_day(daily_df, trades_df):
    pos_col = next((c for c in daily_df.columns if c.lower() in ["n_positions","positions","positions_live","breadth"]), None)
    if pos_col is not None:
        s = pd.to_numeric(daily_df[pos_col], errors="coerce").fillna(0.0)
        return s, "daily"
    if trades_df is not None and not trades_df.empty:
        if "entry_date" in trades_df.columns and "exit_date" in trades_df.columns:
            idx = pd.date_range(daily_df["date"].min(), daily_df["date"].max(), freq="D")
            active = pd.Series(0, index=idx)
            for _, row in trades_df.iterrows():
                try:
                    start = pd.to_datetime(row["entry_date"]).normalize()
                    end = pd.to_datetime(row["exit_date"]).normalize() if not pd.isna(row["exit_date"]) else idx.max()
                    if start > idx.max():
                        continue
                    s = max(start, idx.min())
                    e = min(end, idx.max())
                    if s <= e:
                        active.loc[s:e] += 1
                except Exception:
                    continue
            act = active.reindex(pd.to_datetime(daily_df["date"]).dt.normalize()).fillna(0).astype(int)
            act.index = daily_df["date"].values
            return act, "trades"
    return pd.Series(0, index=daily_df["date"]), "none"

def _daily_event_counts(trades_df, column, all_days):
    """Count how many trades have ``column`` falling on each day in ``all_days``."""
    if column not in trades_df.columns:
        return pd.Series(dtype=int).reindex(all_days).fillna(0)
    # Convert explicitly so string/object date columns work; unparseable
    # values become NaT and are dropped by value_counts.
    stamps = pd.to_datetime(trades_df[column], errors="coerce")
    counts = stamps.dt.normalize().value_counts()
    return counts.reindex(all_days).fillna(0)


def infer_turnover(daily_df, trades_df, active_positions):
    """Estimate the average daily turnover.

    Lookup order:

    1. If ``daily_df`` has a column whose name contains ``turnover``
       (case-insensitive), its numeric mean is returned (unparseable values
       count as 0.0) with the source tag ``"daily"``.
    2. Otherwise, if ``trades_df`` is non-empty and ``active_positions`` is
       non-empty, turnover is approximated per day as
       ``(entries + exits) / (2 * avg_active)``, where ``avg_active`` is the
       mean of the non-zero values of ``active_positions`` floored at 1.0.
       The mean over all daily dates is returned with the source tag
       ``"approx_trades"``.
    3. Otherwise ``(nan, "none")`` is returned.

    Args:
        daily_df: frame with a ``date`` column (required for rule 2).
        trades_df: frame of trades with optional ``entry_date`` /
            ``exit_date`` columns, or ``None``.
        active_positions: pandas Series of open-position counts per day.

    Returns:
        ``(turnover, source)`` with ``turnover`` as a Python float.
    """
    turn_col = next((c for c in daily_df.columns if "turnover" in c.lower()), None)
    if turn_col is not None:
        t = pd.to_numeric(daily_df[turn_col], errors="coerce").fillna(0.0)
        return float(t.mean()), "daily"

    if trades_df is not None and not trades_df.empty and len(active_positions) > 0:
        all_days = pd.to_datetime(daily_df["date"]).dt.normalize()
        ent = _daily_event_counts(trades_df, "entry_date", all_days)
        exi = _daily_event_counts(trades_df, "exit_date", all_days)
        chg = ent.add(exi, fill_value=0)

        mean_active = active_positions.replace(0, np.nan).mean(skipna=True)
        # Floor at 1.0. Written as a comparison rather than max(): the mean is
        # NaN when no day has an open position, and max(nan, 1.0) returns nan,
        # which would zero out the whole estimate.
        avg_active = mean_active if mean_active >= 1.0 else 1.0

        approx_daily_turnover = (
            (chg / (2.0 * avg_active)).replace([np.inf, -np.inf], 0).fillna(0.0)
        )
        return float(approx_daily_turnover.mean()), "approx_trades"

    return float("nan"), "none"


## Data Inspection

In [None]:

# Notebook-environment helper; called below with a title and a DataFrame
# (presumably renders it as an interactive table — confirm against the environment docs).
from caas_jupyter_tools import display_dataframe_to_user
# Load the factor dataset from DATA_DIR, parsing the "date" column as datetimes.
fac_path = os.path.join(DATA_DIR, "factors.csv")
factors = pd.read_csv(fac_path, parse_dates=["date"])
print("factors.csv shape:", factors.shape)
# Show the first 25 rows for a quick visual sanity check.
display_dataframe_to_user("factors_head", factors.head(25))


## Statistical Properties

In [None]:

# Restrict to numeric columns so describe() and corr() operate on factor values only.
num = factors.select_dtypes(include=["number"])
# Transpose so each row is one column of `num` and each column is a summary statistic.
summary = num.describe().T
# Re-imported here so this cell does not depend on the import in an earlier cell.
from caas_jupyter_tools import display_dataframe_to_user
display_dataframe_to_user("factor_summary", summary.round(4))
# Pairwise correlation between the numeric columns (pandas default method).
corr = num.corr()
print("Correlation matrix (top-left 10×10):")
# Only the first 10 rows/columns are printed to keep the output compact.
print(corr.iloc[:10,:10].round(3))



Monitoring factor dynamics provides assurance that mean-reversion profits arise from statistical arbitrage rather than hidden beta exposures.
