# 05_agentic_evaluation_loop

05 — Agentic Evaluation Loop (Routing, Workers, Evaluator–Optimizer, Self-Reflection)
Goal: Treat quant research as an agent workflow:

Route question → pick workflow

Generate multiple strategy variants in parallel

Evaluate and select best candidate under risk constraints

Produce an audit-friendly research memo

### Setup

In [None]:
import os
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Show up to 300 columns and 160 characters per line when printing frames.
pd.set_option("display.max_columns", 300)
pd.set_option("display.width", 160)

# All paths hang off the parent of the current working directory.
REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
DATA_PROCESSED = os.path.join(REPO_ROOT, "data", "processed")
REPORTS = os.path.join(REPO_ROOT, "reports")
os.makedirs(REPORTS, exist_ok=True)

signals_path = os.path.join(DATA_PROCESSED, "signals.parquet")
# Raise explicitly instead of using `assert`: assertions are removed when
# Python runs with -O, which would turn a missing input into a confusing
# reader error further down.
if not os.path.exists(signals_path):
    raise FileNotFoundError(f"Missing {signals_path}. Run Notebook 03 first.")

df = pd.read_parquet(signals_path)
df["date"] = pd.to_datetime(df["date"])
# Sort by (ticker, date) and renumber rows 0..n-1.
df = df.sort_values(["ticker","date"]).reset_index(drop=True)

# Add returns if not present
if "ret" not in df.columns:
    df["ret"] = df.groupby("ticker")["adj_close"].pct_change().fillna(0.0)

df.head()


### Core “backtest engine” helpers

In [None]:
# Periods per year used by perf_stats to annualise daily return statistics.
TRADING_DAYS = 252

def portfolio_returns_from_weights(df, weight_col, ret_col="ret"):
    """Return one portfolio return per date.

    For every distinct value of ``df["date"]`` the result holds the sum of
    ``weight * return`` over that date's rows.

    Args:
        df: Long-format frame with a ``date`` column plus the two columns below.
        weight_col: Name of the column holding per-row portfolio weights.
        ret_col: Name of the column holding per-row returns.

    Returns:
        A Series indexed by date.
    """

    def _weighted_sum(day):
        contributions = day[weight_col].values * day[ret_col].values
        return np.sum(contributions)

    return df.groupby("date").apply(_weighted_sum)

def daily_turnover_from_weights(df, weight_col):
    """Return one-way turnover per date.

    Weights are reshaped to a date x ticker grid (missing cells become 0) and
    turnover is half the sum of absolute day-over-day weight changes. The
    first date has no prior row to diff against, so its turnover is 0.

    Args:
        df: Long-format frame with ``date`` and ``ticker`` columns.
        weight_col: Name of the column holding per-row portfolio weights.

    Returns:
        A Series indexed by date.
    """
    grid = df.pivot(index="date", columns="ticker", values=weight_col).fillna(0.0)
    abs_changes = grid.diff().abs()
    return 0.5 * abs_changes.sum(axis=1)

def drawdown_series(returns):
    """Return the drawdown path of a return series.

    Returns are compounded into an equity curve; each point is then expressed
    as its fractional distance below the running maximum of that curve
    (0 at a new high, negative otherwise).
    """
    equity = (1 + returns).cumprod()
    return equity / equity.cummax() - 1

def perf_stats(returns):
    """Summarise a daily return series.

    Missing returns are treated as 0. Annualisation uses ``TRADING_DAYS``.

    Args:
        returns: pandas Series of periodic (daily) returns.

    Returns:
        dict with keys ``total_return``, ``cagr``, ``volatility``, ``sharpe``,
        ``sortino``, ``max_drawdown``, ``calmar``, ``win_rate_daily`` and
        ``days``. Ratios whose denominator is exactly 0 are reported as NaN.
        For an empty series every statistic is NaN and ``days`` is 0.
    """
    returns = returns.fillna(0.0)
    n_days = len(returns)

    # An empty series has no final equity value; report NaNs instead of
    # failing on eq.iloc[-1].
    if n_days == 0:
        return {
            "total_return": np.nan,
            "cagr": np.nan,
            "volatility": np.nan,
            "sharpe": np.nan,
            "sortino": np.nan,
            "max_drawdown": np.nan,
            "calmar": np.nan,
            "win_rate_daily": np.nan,
            "days": 0,
        }

    eq = (1 + returns).cumprod()
    final_equity = eq.iloc[-1]
    total_return = final_equity - 1.0
    cagr = final_equity ** (TRADING_DAYS / n_days) - 1.0

    annualiser = np.sqrt(TRADING_DAYS)
    mean_ret = returns.mean()
    daily_std = returns.std()
    vol = daily_std * annualiser
    sharpe = (mean_ret / daily_std) * annualiser if daily_std != 0 else np.nan

    # Sortino uses the standard deviation of the strictly negative returns.
    # With fewer than two negative days that std is NaN, and so is the ratio.
    downside_std = returns[returns < 0].std()
    sortino = (mean_ret / downside_std) * annualiser if downside_std != 0 else np.nan

    mdd = drawdown_series(returns).min()
    calmar = cagr / abs(mdd) if mdd != 0 else np.nan

    return {
        "total_return": total_return,
        "cagr": cagr,
        "volatility": vol,
        "sharpe": sharpe,
        "sortino": sortino,
        "max_drawdown": mdd,
        "calmar": calmar,
        "win_rate_daily": (returns > 0).mean(),
        "days": n_days
    }


### Router

In [None]:
def route_intent(user_question: str) -> str:
    """Map a free-text research question to a workflow name.

    Keyword groups are tested in a fixed priority order (risk, fundamentals,
    technical, hybrid); the first group with a hit wins. Matching is
    case-insensitive.

    Keywords of three characters or fewer ("pe", "ema") must appear as whole
    words, so that ordinary words such as "performance", "period" or
    "remains" do not trigger a route. Longer keywords match as substrings,
    which keeps plurals and inflections ("drawdowns", "costs") working.

    Args:
        user_question: The question to classify.

    Returns:
        One of "risk_review", "fundamentals_workflow", "technical_workflow"
        or "hybrid_workflow" (also the default when nothing matches).
    """
    import re  # local import keeps this block self-contained

    q = user_question.lower()

    def mentions(keyword):
        if len(keyword) <= 3:
            return re.search(rf"\b{re.escape(keyword)}\b", q) is not None
        return keyword in q

    routes = (
        ("risk_review", ("drawdown", "risk", "volatility", "max dd", "turnover", "cost", "slippage")),
        ("fundamentals_workflow", ("fundamental", "valuation", "quality", "growth", "pe", "margin")),
        ("technical_workflow", ("candlestick", "stochastic", "ema", "technical", "momentum", "oversold")),
        ("hybrid_workflow", ("hybrid", "combine", "timing", "select then time")),
    )
    for workflow, keywords in routes:
        if any(mentions(k) for k in keywords):
            return workflow
    return "hybrid_workflow"  # sensible default

# Example: the risk keywords are tested first, so this question (it mentions
# "drawdown" and "turnover") is routed to "risk_review" even though it also
# says "hybrid".
question = "Find a robust hybrid strategy with low drawdown and reasonable turnover."
intent = route_intent(user_question=question)
intent


### Worker: Strategy variant generator (pure TA + optional candle confirm)

Instead of re-running the Notebook 03 logic, we regenerate the TA weights here in Notebook 05 so that the signal parameters can be swept.

In [None]:
def generate_ta_weights(df, oversold_th=20, use_candle_confirm=True, use_trend_filter=True):
    """Build equal-weight long-only weights from a stochastic-crossover rule.

    Entry (per ticker): %K crosses above %D, with the previous %K below
    ``oversold_th``; optionally also price above EMA20 and a bullish candle
    (hammer or bullish engulfing) on the same row.
    Exit: price below EMA20 or %K above 80.
    A flat ticker enters on an entry row even if that row also satisfies the
    exit rule; a held ticker exits on an exit row and cannot re-enter on it.

    Each date's weight is split equally among the tickers held on that date.

    Args:
        df: Long-format frame with ``ticker``, ``date`` and the indicator
            columns listed in ``needed``. Row order and index are arbitrary;
            rows are processed in (ticker, date) order internally.
        oversold_th: Threshold the previous %K must be below to allow entry.
        use_candle_confirm: Require a bullish candle flag on the entry row.
        use_trend_filter: Require ``adj_close > ema20`` on the entry row.

    Returns:
        Series of weights aligned row-for-row with ``df`` (same index).

    Raises:
        ValueError: If any required indicator column is missing.
    """
    # required columns
    needed = ["stoch_k_14","stoch_d_14_3","ema20","adj_close","is_hammer","is_bull_engulf"]
    missing = [c for c in needed if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns for TA generation: {missing}")

    # Work on a copy whose index is the original row position, sorted so each
    # ticker's rows are contiguous and chronological. This makes the
    # per-ticker shift and the position loop independent of caller row order.
    work = df[["ticker", "date"] + needed].reset_index(drop=True)
    work = work.sort_values(["ticker", "date"])

    by_ticker = work.groupby("ticker", sort=False)
    k = work["stoch_k_14"]
    d = work["stoch_d_14_3"]
    k_prev = by_ticker["stoch_k_14"].shift(1)
    d_prev = by_ticker["stoch_d_14_3"].shift(1)

    # Comparisons against NaN (first row of each ticker) evaluate to False.
    entry = (k_prev < d_prev) & (k > d) & (k_prev < oversold_th)
    if use_trend_filter:
        entry = entry & (work["adj_close"] > work["ema20"])
    if use_candle_confirm:
        entry = entry & ((work["is_hammer"] == 1) | (work["is_bull_engulf"] == 1))

    # exit: trend break or stoch overbought
    exit_sig = (work["adj_close"] < work["ema20"]) | (k > 80)

    entry_flags = entry.to_numpy(dtype=bool)
    exit_flags = exit_sig.to_numpy(dtype=bool)

    # Per-ticker state machine over positional arrays (no per-row label lookup).
    held = np.zeros(len(work), dtype=int)
    for rows in by_ticker.indices.values():
        in_pos = 0
        for i in rows:
            if in_pos == 0 and entry_flags[i]:
                in_pos = 1
            elif in_pos == 1 and exit_flags[i]:
                in_pos = 0
            held[i] = in_pos

    # equal-weight among active TA positions
    member = pd.Series(held, index=work.index)
    active = member.groupby(work["date"]).transform("sum").replace(0, np.nan)
    w = np.where(member == 1, 1.0 / active, 0.0)
    w = np.nan_to_num(w, nan=0.0)

    # work.index holds original row positions; sorting by it restores the
    # caller's row order before re-attaching the caller's index.
    in_input_order = pd.Series(w, index=work.index).sort_index()
    return pd.Series(in_input_order.to_numpy(), index=df.index)


### Worker: Fundamentals selection variants (Top N)

We’ll sweep `top_n`, the number of top-ranked names selected each month.

In [None]:
def generate_fund_weights(df, top_n=10):
    """Build equal-weight weights for the monthly top-``top_n`` names by score.

    For each (ticker, month) the earliest row with a non-null
    ``fundamental_score`` supplies that ticker's score for the month. Tickers
    are ranked within the month (highest score first, ties broken by order of
    appearance) and the top ``top_n`` are selected for every row of that
    month. Each date's weight is split equally among that date's selected rows.

    Args:
        df: Long-format frame with ``ticker``, ``date`` (datetime) and
            ``fundamental_score`` columns.
        top_n: Number of names to select per month.

    Returns:
        Series of weights aligned row-for-row with ``df`` (same index).

    Raises:
        ValueError: If ``fundamental_score`` is missing.
    """
    if "fundamental_score" not in df.columns:
        raise ValueError("Missing fundamental_score")

    # Use monthly selection logic (first scored row of each month)
    frame = df.copy()
    frame["month"] = frame["date"].dt.to_period("M")
    frame["has_fund"] = frame["fundamental_score"].notna().astype(int)

    scored = frame.loc[frame["has_fund"] == 1].sort_values(["ticker","date"])
    monthly = scored.groupby(["ticker","month"], as_index=False).first()

    monthly["rank"] = monthly.groupby("month")["fundamental_score"].rank(ascending=False, method="first")
    monthly["sel"] = (monthly["rank"] <= top_n).astype(int)

    # Broadcast the monthly pick back onto every daily row; unmatched rows are unselected.
    picks = monthly[["ticker","month","sel"]]
    frame = frame.merge(picks, on=["ticker","month"], how="left")
    frame["sel"] = frame["sel"].fillna(0).astype(int)

    selected_per_day = frame.groupby("date")["sel"].transform("sum").replace(0, np.nan)
    raw = np.where(frame["sel"] == 1, 1.0 / selected_per_day, 0.0)
    return pd.Series(np.nan_to_num(raw, nan=0.0), index=df.index)


### Hybrid worker: selection + TA timing

In [None]:
def generate_hybrid_weights(df, top_n=10, oversold_th=20, use_candle_confirm=True, use_trend_filter=True):
    """Combine fundamental selection with TA timing into equal weights.

    A row is held only when it has a positive weight in BOTH the fundamental
    top-N portfolio and the TA portfolio. Each date's weight is split equally
    among the rows held on that date.

    Args:
        df: Long-format frame accepted by both underlying generators.
        top_n: Passed to ``generate_fund_weights``.
        oversold_th, use_candle_confirm, use_trend_filter: Passed to
            ``generate_ta_weights``.

    Returns:
        Series of weights aligned row-for-row with ``df`` (same index).
    """
    fund_w = generate_fund_weights(df, top_n=top_n)
    ta_w = generate_ta_weights(
        df,
        oversold_th=oversold_th,
        use_candle_confirm=use_candle_confirm,
        use_trend_filter=use_trend_filter,
    )

    # Reduce both weight vectors to 0/1 membership flags and intersect them.
    in_fund = fund_w.gt(0).astype(int)
    in_ta = ta_w.gt(0).astype(int)
    in_both = (in_fund & in_ta).astype(int)

    held_per_day = in_both.groupby(df["date"]).transform("sum").replace(0, np.nan)
    raw = np.where(in_both == 1, 1.0 / held_per_day, 0.0)
    return pd.Series(np.nan_to_num(raw, nan=0.0), index=df.index)


### Evaluator: scoring function

Score candidates with a balanced objective:

- reward Sharpe & CAGR
- penalize Max Drawdown and high Turnover

In [None]:
def evaluate_candidate(df, weights, cost_bps=5, execution_lag=0):
    """Backtest one weight vector net of costs and score it.

    Args:
        df: Long-format frame with ``date``, ``ticker`` and ``ret`` columns.
        weights: Per-row weights, positionally aligned with ``df``
            (a Series or any array-like of the same length).
        cost_bps: Transaction cost in basis points charged on daily turnover.
        execution_lag: Number of rows to delay each ticker's weights before
            applying returns. 0 (the default) applies same-row weights, which
            is look-ahead if the signals use close-of-day data; 1 trades on
            the next observation. Assumes rows are chronological within each
            ticker.

    Returns:
        ``(net, stats)``: the net daily return Series and a dict of
        ``perf_stats`` fields plus ``avg_turnover``, ``median_turnover`` and
        the composite ``score`` (higher is better).

    Raises:
        ValueError: If ``weights`` does not match ``df`` in length or
            ``execution_lag`` is negative.
    """
    if execution_lag < 0:
        raise ValueError(f"execution_lag must be >= 0, got {execution_lag}")
    if len(weights) != len(df):
        raise ValueError(f"Expected weights length {len(df)}, got {len(weights)}")

    cost_rate = cost_bps / 10000.0

    tmp = df.copy()
    # Positional assignment: any index on `weights` is deliberately ignored.
    tmp["w"] = np.asarray(weights)
    if execution_lag > 0:
        # Delay within each ticker; the first rows have no prior signal -> flat.
        tmp["w"] = tmp.groupby("ticker")["w"].shift(execution_lag).fillna(0.0)

    port = portfolio_returns_from_weights(tmp, "w")
    turnover = daily_turnover_from_weights(tmp, "w")
    net = port - cost_rate * turnover

    stats = perf_stats(net)
    stats["avg_turnover"] = turnover.mean()
    stats["median_turnover"] = turnover.median()

    # Composite score (tunable)
    # Higher is better:
    # - Sharpe and CAGR positive contribution (an undefined value is scored as -5)
    # - Drawdown and turnover negative
    score = (
        2.0 * (stats["sharpe"] if not np.isnan(stats["sharpe"]) else -5) +
        1.0 * (stats["cagr"] if not np.isnan(stats["cagr"]) else -5) +
        1.5 * (stats["max_drawdown"]) -   # max_drawdown is negative
        0.5 * (stats["avg_turnover"])
    )
    stats["score"] = score
    return net, stats


### Parallel workers: run a small grid search

This is your worker orchestration + parallelization section. (We keep it sequential for simplicity, but conceptually it’s parallel workers.)

In [None]:
# Parameter grid: every combination below becomes one hybrid candidate.
grid = {
    "top_n": [5, 10, 15],
    "oversold_th": [10, 20, 30],
    "use_candle_confirm": [True, False],
    "use_trend_filter": [True, False]
}

candidates = []      # one stats dict per candidate
returns_store = {}   # candidate id -> net daily return series

combos = itertools.product(
    grid["top_n"], grid["oversold_th"], grid["use_candle_confirm"], grid["use_trend_filter"]
)
for top_n, th, candle, trend in combos:
    w = generate_hybrid_weights(
        df,
        top_n=top_n,
        oversold_th=th,
        use_candle_confirm=candle,
        use_trend_filter=trend,
    )
    net, stats = evaluate_candidate(df, w, cost_bps=5)

    # The id encodes the parameters so the winner can be read off directly.
    key = f"hyb_top{top_n}_th{th}_c{int(candle)}_t{int(trend)}"
    stats.update(
        candidate=key,
        top_n=top_n,
        oversold_th=th,
        candle_confirm=candle,
        trend_filter=trend,
    )
    returns_store[key] = net
    candidates.append(stats)

# Best score first.
results = pd.DataFrame(candidates)
results = results.sort_values("score", ascending=False).reset_index(drop=True)
results.head(10)


### Optimizer: pick best + explain

In [None]:
# `results` is sorted by score descending, so the first row is the winner.
top_row = results.iloc[0]
best = top_row.to_dict()
best_key = best["candidate"]
best_returns = returns_store[best_key]

print("BEST CANDIDATE:", best_key)
best


Plot best equity curve + drawdown

In [None]:
# Compound the winner's net daily returns into an equity curve and derive
# its drawdown path.
eq = (1 + best_returns).cumprod()
dd = drawdown_series(best_returns)

equity_title = f"Best Candidate Equity Curve (net) — {best_key}"
eq.plot(figsize=(12,4), title=equity_title)
plt.show()

drawdown_title = f"Best Candidate Drawdown — {best_key}"
dd.plot(figsize=(12,3), title=drawdown_title)
plt.show()


### Self-reflection checks

These are lightweight but important. Recruiters love seeing this.

In [None]:
def self_reflection_checks(df, weights_row_level: pd.Series):
    """Produce short sanity-check notes about a weight vector.

    weights_row_level must be aligned to df rows: len(weights) == len(df),
    representing per-(date,ticker) portfolio weights.

    Concentration and holdings are averaged over invested days only (days on
    which the weights sum to more than 0). Averaging over all days would count
    every flat day as "max weight 0 / 0 names" and make a sparse, concentrated
    strategy look diversified.

    Args:
        df: Long-format frame with ``date`` and ``ticker`` columns.
        weights_row_level: Per-row weights, positionally aligned with ``df``.

    Returns:
        List of five human-readable note strings (execution realism, sparsity,
        concentration, turnover, holdings).

    Raises:
        ValueError: If the weights length differs from ``len(df)``.
    """
    if len(weights_row_level) != len(df):
        raise ValueError(f"Expected weights length {len(df)}, got {len(weights_row_level)}")

    notes = []

    # Pivot weights to (date x ticker)
    W = df.assign(w=np.asarray(weights_row_level)).pivot(index="date", columns="ticker", values="w").fillna(0.0)
    invested = W.sum(axis=1) > 0
    W_invested = W[invested]
    ever_invested = len(W_invested) > 0

    # Check 1: Execution realism (look-ahead hint)
    notes.append("Execution realism: if signals use close-of-day data, shift weights by +1 day before applying returns.")

    # Check 2: Invested days (sparsity)
    notes.append(f"Sparsity: portfolio invested on {invested.mean()*100:.1f}% of days.")

    # Check 3: Concentration (invested days only)
    if ever_invested:
        max_weight = W_invested.max(axis=1).mean()
        notes.append(f"Concentration: average max single-name weight on invested days = {max_weight:.2f} (lower is better).")
    else:
        notes.append("Concentration: n/a (portfolio is never invested).")

    # Check 4: Turnover (all days, matching how costs are charged)
    turnover = 0.5 * W.diff().abs().sum(axis=1)
    notes.append(f"Turnover: avg daily turnover = {turnover.mean():.3f}.")

    # Check 5: Universe stability (how many names held, invested days only)
    if ever_invested:
        names_held = (W_invested > 0).sum(axis=1)
        notes.append(f"Holdings: avg names held on invested days = {names_held.mean():.1f}, median = {names_held.median():.0f}.")
    else:
        notes.append("Holdings: n/a (portfolio is never invested).")

    return notes


In [None]:
# Rebuild the winner's row-level weights from its stored parameters. The
# values come back from a DataFrame row, so cast them to plain Python types.
best_params = {
    "top_n": int(best["top_n"]),
    "oversold_th": int(best["oversold_th"]),
    "use_candle_confirm": bool(best["candle_confirm"]),
    "use_trend_filter": bool(best["trend_filter"]),
}
best_weights = generate_hybrid_weights(df, **best_params)

reflection_notes = self_reflection_checks(df, best_weights)

for n in reflection_notes:
    print("-", n)


### Produce a “Research Memo” (Markdown + save)

In [None]:
# Assemble the memo as Markdown in three pieces:
#   1. an f-string with the question, router decision, winning parameters and
#      net performance figures taken from `best`;
#   2. the self-reflection notes rendered as a bullet list;
#   3. a plain (non-f) string with a static list of next steps.
# The two trailing spaces after the first two bold fields are Markdown hard
# line breaks.
memo = f"""# Agentic Research Memo — Best Hybrid Candidate

**Question / Intent:** {question}  
**Router Decision:** {intent}  

## Selected Candidate
- **ID:** {best_key}
- **Top N (fundamental selection):** {best['top_n']}
- **Stochastic oversold threshold:** {best['oversold_th']}
- **Candle confirmation:** {best['candle_confirm']}
- **Trend filter (price > EMA20):** {best['trend_filter']}

## Performance (Net of Costs)
- CAGR: {best['cagr']:.2%}
- Sharpe: {best['sharpe']:.2f}
- Sortino: {best['sortino']:.2f}
- Max Drawdown: {best['max_drawdown']:.2%}
- Calmar: {best['calmar']:.2f}
- Avg Turnover: {best['avg_turnover']:.3f}

## Why this candidate won (Evaluator Summary)
- Optimizes risk-adjusted return (Sharpe) while controlling drawdown and turnover.
- Uses fundamentals for selection and technical signals for timing (reduces overtrading).

## Self-Reflection / Caveats
""" + "\n".join([f"- {n}" for n in reflection_notes]) + """

## Next Improvements
- Shift weights by 1 day for strict execution realism.
- Add walk-forward splits and evaluate stability across regimes.
- Add sector constraints to reduce hidden concentration risk.
"""

# Persist the memo under REPORTS (overwrites any previous memo).
memo_path = os.path.join(REPORTS, "agentic_research_memo.md")
with open(memo_path, "w", encoding="utf-8") as f:
    f.write(memo)

print("Saved memo:", memo_path)
print("\n---\n")
# Echo only the first 1200 characters as a preview.
print(memo[:1200], "...")
