# Experiment 1 - Prompt Strategy Test

This notebook benchmarks LLM-generated trading strategies across prompt versions and risk profiles.
Each configuration runs `ITERATION_COUNT` stochastic iterations per ticker (currently 5; see the Parameter Grid section) while persisting outputs for later analysis.

## Environment Setup
Load environment variables, configure project paths, and import utilities used during bulk execution.

In [1]:
import json
import os
import sys
from itertools import product
from pathlib import Path
from typing import Any

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from scipy import stats
from tqdm.auto import tqdm

# Pull settings from a local .env file; override=False is passed so the call
# does not ask dotenv to replace variables that are already set.
load_dotenv(override=False)

# The notebook resolves everything relative to the directory the kernel was
# started in, so it must be launched from the project root.
PROJECT_ROOT = Path.cwd()
UTILS_PATH = PROJECT_ROOT.joinpath("utils").resolve()

# Make the project's helper modules importable, without adding the entry twice
# when the cell is re-run.
_utils_entry = str(UTILS_PATH)
if _utils_entry not in sys.path:
    sys.path.insert(0, _utils_entry)

In [2]:
from data_utils import (
    HIGH_OBJECTIVES,
    HIGH_RISK_PROFILE,
    LOW_OBJECTIVES,
    LOW_RISK_PROFILE,
    PERSONA,
    evaluate_trading_metrics,
    generate_strategy_for_ticker,
)

# Data root: taken from the DATA_PATH environment variable when set, otherwise
# the "data" folder under the project root.
DATA_PATH = Path(os.environ.get("DATA_PATH", PROJECT_ROOT / "data"))
HISTORIC_PATH = DATA_PATH / "historic"
LLM_PROMPTS_PATH = DATA_PATH / "prompts"
# NOTE(review): outputs are written into the same directory as the prompt
# templates; results/ and response/ are created beneath it. Confirm this is
# intended before changing it, since existing results already live there.
LLM_OUTPUT_PATH = DATA_PATH / "prompts"
LOGS_PATH = PROJECT_ROOT / "logs"

# Only the directories this notebook writes to are created up front.
LLM_OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
LOGS_PATH.mkdir(parents=True, exist_ok=True)

# Fail fast when the API credentials or model name are missing or empty.
OPENAI_MODEL = os.getenv("OPENAI_MODEL")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not (OPENAI_API_KEY and OPENAI_MODEL):
    raise EnvironmentError("OPENAI_API_KEY and OPENAI_MODEL are required for this experiment.")
OPENAI_CLIENT = OpenAI(api_key=OPENAI_API_KEY)

# Experiment window used as the default for data loading and strategy runs.
START_DATE, END_DATE = "2018-01-01", "2020-01-01"

## Helper Functions
Load and cache engineered signals, then execute a single LLM-driven trade simulation while persisting artefacts.

In [3]:
# In-memory cache of engineered frames keyed by ticker symbol; filled lazily by
# get_ticker_frame so each parquet file is read at most once per kernel session.
TICKER_CACHE: dict[str, pd.DataFrame] = {}


def load_stock_data(ticker: str, start: str = START_DATE, end: str = END_DATE) -> pd.DataFrame:
    """Load engineered features for the ticker across the experiment window.

    Reads ``engineered_{ticker}_data.parquet`` from ``HISTORIC_PATH``, indexes the
    frame by its ``Date`` column converted to UTC timestamps, and returns the rows
    between ``start`` and ``end`` (pandas label slicing, so both ends are inclusive).

    Args:
        ticker: Symbol used to build the parquet file name.
        start: First date of the window; parsed with ``pd.to_datetime(..., utc=True)``.
        end: Last date of the window; parsed the same way as ``start``.

    Returns:
        A copy of the selected rows, ordered by timestamp.

    Raises:
        FileNotFoundError: If the engineered parquet file for ``ticker`` is missing.
    """
    input_file = HISTORIC_PATH / f"engineered_{ticker}_data.parquet"
    if not input_file.exists():
        raise FileNotFoundError(f"Engineered data not found for {ticker}: {input_file}")

    start_ts = pd.to_datetime(start, utc=True)
    end_ts = pd.to_datetime(end, utc=True)

    engineered_df = pd.read_parquet(input_file).set_index("Date")
    engineered_df.index = pd.to_datetime(engineered_df.index, utc=True)
    # Label slicing on a DatetimeIndex is only well-defined when the index is
    # monotonic; an unsorted file could raise KeyError for window bounds that are
    # not exact index labels. A stable sort leaves already-ordered data unchanged.
    engineered_df = engineered_df.sort_index(kind="mergesort")
    return engineered_df.loc[start_ts:end_ts].copy()


def get_ticker_frame(ticker: str) -> pd.DataFrame:
    """Return the engineered dataset for ``ticker``, loading it on first request.

    The frame is stored in ``TICKER_CACHE`` and the same object is handed back on
    every later call, so callers should not mutate it in place.
    """
    cached_frame = TICKER_CACHE.get(ticker)
    if cached_frame is None:
        cached_frame = load_stock_data(ticker)
        TICKER_CACHE[ticker] = cached_frame
    return cached_frame


def run_llm_trade_iteration(
    *,
    ticker: str,
    ticker_df: pd.DataFrame,
    prompt_version: str,
    risk_version: str,
    iteration: int,
    model: str | None = OPENAI_MODEL,
    prompt_path: Path = LLM_PROMPTS_PATH,
    output_path: Path = LLM_OUTPUT_PATH,
    client: Any = OPENAI_CLIENT,
    start_date: str = START_DATE,
    end_date: str = END_DATE,
    news_yaml_file: str | None = None,
) -> tuple[dict[str, Any], pd.DataFrame]:
    """Run one LLM strategy iteration, or reload it from disk if already persisted.

    Artefacts are stored under
    ``<output_path>/results/<risk_version>/<prompt_version>/<ticker>/iter_NN`` as
    ``metrics.json`` and ``trades.csv``. When both files exist they are read back
    and returned without calling the model. Otherwise the strategy is generated,
    evaluated, written to those two files, and returned.

    ``risk_version == "r"`` selects the high-risk profile and objectives; any other
    value selects the low-risk ones. When ``news_yaml_file`` is given it is resolved
    against ``prompt_path`` and ``max_news`` is 5, otherwise no news prompt is used
    and ``max_news`` is 0.

    Returns:
        ``(metrics, trades_df)``. On a cache hit ``trades_df`` comes from
        ``pd.read_csv``; on a fresh run it comes from ``evaluate_trading_metrics``.
    """
    run_parts = (risk_version, prompt_version, ticker, f"iter_{iteration:02d}")
    iteration_dir = (output_path / "results").joinpath(*run_parts)
    response_dir = (output_path / "response").joinpath(*run_parts)
    for directory in (iteration_dir, response_dir):
        directory.mkdir(parents=True, exist_ok=True)

    metrics_path = iteration_dir / "metrics.json"
    trades_path = iteration_dir / "trades.csv"

    # Both artefacts must be present for the run to count as completed.
    already_persisted = metrics_path.exists() and trades_path.exists()
    if already_persisted:
        cached_metrics = json.loads(metrics_path.read_text(encoding="utf-8"))
        return cached_metrics, pd.read_csv(trades_path)

    is_high_risk = risk_version == "r"
    risk_profile = HIGH_RISK_PROFILE if is_high_risk else LOW_RISK_PROFILE
    objectives = HIGH_OBJECTIVES if is_high_risk else LOW_OBJECTIVES

    if news_yaml_file:
        news_prompt_file = prompt_path / news_yaml_file
        news_limit = 5
    else:
        news_prompt_file = None
        news_limit = 0

    # The helper's keyword names are upper-case regardless of which profile is passed.
    llm_trades_df = generate_strategy_for_ticker(
        ticker_df=ticker_df,
        ticker=ticker,
        LLM_OUTPUT_PATH=response_dir,
        persona=PERSONA,
        HIGH_RISK_PROFILE=risk_profile,
        HIGH_OBJECTIVES=objectives,
        client=client,
        model=model,
        strategy_yaml_file=prompt_path / f"strat_prompt_{prompt_version}.yml",
        news_yaml_file=news_prompt_file,
        start_date=start_date,
        end_date=end_date,
        max_news=news_limit,
        time_horizon="monthly",
    )

    metrics, trades_df = evaluate_trading_metrics(llm_trades_df)
    metrics_path.write_text(json.dumps(metrics, indent=2), encoding="utf-8")
    trades_df.to_csv(trades_path, index=False)
    return metrics, trades_df

## Parameter Grid
Define the tickers, prompt versions, risk configurations, and metric keys that drive the experiment.

In [4]:
# Universe and prompt variants covered by the experiment.
TICKERS = ("AAPL", "MSFT", "GOOGL", "TSLA", "AMZN", "META")
PROMPT_VERSIONS = ("v0", "v1", "v2", "v3", "v4")

# Risk configurations keyed by the short code used in output directory names.
RISK_CONFIGS = {
    "r": dict(label="High Risk", profile=HIGH_RISK_PROFILE, objectives=HIGH_OBJECTIVES),
    "nr": dict(label="Low Risk", profile=LOW_RISK_PROFILE, objectives=LOW_OBJECTIVES),
}

# Prompt versions that also receive a news/analyst prompt file.
PROMPT_NEWS = {"v4": "analyst_prompt_v1.yml"}

# Number of stochastic repetitions per (prompt, risk, ticker) combination.
ITERATION_COUNT = 5

# Metric names as they appear in metrics.json, mapped to short column aliases.
METRIC_ALIASES = {
    "Sharpe Ratio (Annualized SR)": "sr",
    "Maximum Drawdown (MDD)": "mdd",
    "Mean Entropy": "entropy",
    "Mean Perplexity": "ppl",
}
METRIC_KEYS = tuple(METRIC_ALIASES)

TOTAL_RUNS = ITERATION_COUNT * len(RISK_CONFIGS) * len(PROMPT_VERSIONS) * len(TICKERS)
print(f"Planned executions: {TOTAL_RUNS}")

Planned executions: 300


## Execute Bulk Iterations
Run every prompt, risk, ticker, and iteration combination, capturing metrics to disk and logging outcomes.

In [5]:
execution_log: list[dict[str, Any]] = []

# Full Cartesian grid; iterations are numbered from 1 to match the iter_NN folders.
run_grid = product(PROMPT_VERSIONS, RISK_CONFIGS.keys(), TICKERS, range(1, ITERATION_COUNT + 1))

for prompt_version, risk_version, ticker, iteration in tqdm(
    run_grid,
    total=TOTAL_RUNS,
    desc="Executing LLM strategies",
):
    # Runs already persisted on disk are reloaded by the helper rather than re-executed.
    metrics, _ = run_llm_trade_iteration(
        ticker=ticker,
        ticker_df=get_ticker_frame(ticker),
        prompt_version=prompt_version,
        risk_version=risk_version,
        iteration=iteration,
        model=OPENAI_MODEL,
        prompt_path=LLM_PROMPTS_PATH,
        output_path=LLM_OUTPUT_PATH,
        client=OPENAI_CLIENT,
        start_date=START_DATE,
        end_date=END_DATE,
        news_yaml_file=PROMPT_NEWS.get(prompt_version),
    )

    # One log row per run: identifying keys first, then the tracked metrics
    # (None when a metric is absent from the run's output).
    execution_log.append(
        {
            "prompt_version": prompt_version,
            "risk_version": risk_version,
            "ticker": ticker,
            "iteration": iteration,
            **{metric: metrics.get(metric) for metric in METRIC_KEYS},
        }
    )

execution_df = pd.DataFrame(execution_log)
log_path = LOGS_PATH / "prompt_strategy_execution_log.csv"
if execution_df.empty:
    raise RuntimeError("Execution log is empty; verify that iterations executed successfully.")

execution_df.to_csv(log_path, index=False)

Executing LLM strategies:   0%|          | 0/300 [00:00<?, ?it/s]

## Collect Persisted Results
Assemble every iteration's metrics from disk and write a consolidated dataset for downstream analysis.

In [6]:
results_records: list[dict[str, Any]] = []
results_root = LLM_OUTPUT_PATH / "results"

if not results_root.exists():
    raise RuntimeError("Results directory not found. Run the execution cell first.")

# Walk results/<risk>/<prompt>/<ticker>/iter_NN/metrics.json. Plain files at any
# level (such as the CSV summaries this notebook writes into results/) are skipped.
# Every run found on disk is collected, not only those from the current grid.
for risk_dir in filter(Path.is_dir, results_root.iterdir()):
    for prompt_dir in filter(Path.is_dir, risk_dir.iterdir()):
        for ticker_dir in filter(Path.is_dir, prompt_dir.iterdir()):
            for iteration_dir in sorted(ticker_dir.glob("iter_*")):
                metrics_path = iteration_dir / "metrics.json"
                if metrics_path.exists():
                    metrics = json.loads(metrics_path.read_text(encoding="utf-8"))
                    results_records.append(
                        {
                            "risk_version": risk_dir.name,
                            "prompt_version": prompt_dir.name,
                            "ticker": ticker_dir.name,
                            # Folder names look like iter_03; keep the numeric suffix.
                            "iteration": int(iteration_dir.name.split("_")[-1]),
                            **{metric: metrics.get(metric) for metric in METRIC_KEYS},
                        }
                    )

if not results_records:
    raise RuntimeError("No metrics found under the results directory.")

raw_results_df = pd.DataFrame(results_records)
results_df = (
    raw_results_df
    # Non-numeric or missing metric values become NaN rather than raising.
    .assign(**{metric: pd.to_numeric(raw_results_df[metric], errors="coerce") for metric in METRIC_KEYS})
    .sort_values(["risk_version", "prompt_version", "ticker", "iteration"])
    .reset_index(drop=True)
)

iteration_metrics_path = results_root / "iteration_metrics.csv"
results_df.to_csv(iteration_metrics_path, index=False)
print(f"Loaded {len(results_df)} iteration rows.")
print(f"Iteration-level metrics saved to {iteration_metrics_path}")

results_df.head()

Loaded 380 iteration rows.
Iteration-level metrics saved to data\prompts\results\iteration_metrics.csv


Unnamed: 0,risk_version,prompt_version,ticker,iteration,Sharpe Ratio (Annualized SR),Maximum Drawdown (MDD),Mean Entropy,Mean Perplexity
0,nr,v0,AAPL,1,0.112957,0.42534,0.333531,1.333213
1,nr,v0,AAPL,2,1.098554,0.339691,0.345406,1.285684
2,nr,v0,AAPL,3,1.098554,0.339691,0.347905,1.643105
3,nr,v0,AAPL,4,1.448693,0.307559,0.346889,1.308549
4,nr,v0,AAPL,5,1.119874,0.331806,0.333454,1.434558


## Summary Statistics
Compute the mean, standard deviation, and sample count of Sharpe ratio, drawdown, entropy, and perplexity for each prompt version and risk profile, both per ticker and pooled across tickers.

In [7]:
# Human-readable label for each risk code, attached to every result row.
risk_labels = {risk_key: settings["label"] for risk_key, settings in RISK_CONFIGS.items()}
results_df["risk_label"] = results_df["risk_version"].map(risk_labels)

metric_list = list(METRIC_KEYS)
group_cols = ["risk_label", "prompt_version", "ticker"]


def _summarise_metrics(frame: pd.DataFrame, keys: list[str]) -> pd.DataFrame:
    """Return mean/std/count of every tracked metric per group, with flat ``<alias>_<stat>`` columns."""
    stats_table = frame.groupby(keys)[metric_list].agg(["mean", "std", "count"])
    stats_table.columns = [f"{METRIC_ALIASES[name]}_{stat}" for name, stat in stats_table.columns]
    return stats_table.reset_index().sort_values(keys)


summary_by_ticker = _summarise_metrics(results_df, group_cols)
# Pooled across tickers: same statistics grouped by risk profile and prompt only.
summary_prompt_level = _summarise_metrics(results_df, ["risk_label", "prompt_version"])

summary_output_path = results_root / "summary_metrics_by_prompt.csv"
summary_ticker_output_path = results_root / "summary_metrics_by_ticker.csv"
summary_prompt_level.to_csv(summary_output_path, index=False)
summary_by_ticker.to_csv(summary_ticker_output_path, index=False)
print(f"Prompt-level summary saved to {summary_output_path}")
print(f"Ticker-level summary saved to {summary_ticker_output_path}")

summary_by_ticker

Prompt-level summary saved to data\prompts\results\summary_metrics_by_prompt.csv
Ticker-level summary saved to data\prompts\results\summary_metrics_by_ticker.csv


Unnamed: 0,risk_label,prompt_version,ticker,sr_mean,sr_std,sr_count,mdd_mean,mdd_std,mdd_count,entropy_mean,entropy_std,entropy_count,ppl_mean,ppl_std,ppl_count
0,High Risk,v0,AAPL,0.466552,0.326351,5,0.337664,0.05961426,5,0.329645,0.021414,5,1.350096,0.115996,5
1,High Risk,v0,AMZN,-0.063439,0.260761,5,0.459678,0.03186273,5,0.344083,0.023925,5,1.399849,0.166695,5
2,High Risk,v0,GOOGL,0.098164,0.507158,5,0.352952,0.08265246,5,0.325518,0.030607,5,1.292904,0.058936,5
3,High Risk,v0,META,-0.664533,0.447803,5,0.503079,0.1232264,5,0.323662,0.010124,5,1.376652,0.130652,5
4,High Risk,v0,MSFT,0.564139,0.153956,5,0.274763,0.05405411,5,0.362938,0.05543,5,1.475329,0.415023,5
5,High Risk,v0,TSLA,0.222735,0.208178,5,0.574786,0.08472618,5,0.341143,0.018399,5,1.44869,0.192289,5
6,High Risk,v1,AAPL,0.824822,0.310257,20,0.451643,0.02484988,20,0.440196,0.02696,20,1.654529,0.295909,20
7,High Risk,v1,AMZN,0.778162,0.121928,10,0.34112,0.0001542871,10,0.43156,0.053674,10,1.581936,0.386761,10
8,High Risk,v1,GOOGL,0.552481,0.289613,10,0.247511,0.04989555,10,0.438707,0.033393,10,1.640222,0.372713,10
9,High Risk,v1,META,1.094164,0.387933,10,0.426166,0.01088987,10,0.433008,0.029507,10,1.690047,0.427098,10


## Sequential Prompt T-Tests
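Two-tailed Welch t-tests compare each prompt version against its immediate predecessor within each risk profile, both pooled across tickers and per ticker. Wilcoxon signed-rank tests are run on the same comparisons, pairing runs by ticker and iteration number.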

In [8]:
ttest_rows_overall: list[dict[str, Any]] = []
ttest_rows_by_ticker: list[dict[str, Any]] = []
wilcoxon_rows_overall: list[dict[str, Any]] = []
wilcoxon_rows_by_ticker: list[dict[str, Any]] = []
ordered_versions = list(PROMPT_VERSIONS)


def _welch_p_value(current: pd.Series, baseline: pd.Series) -> float:
    """Welch (unequal-variance) t-test p-value; NaN when either sample has fewer than two values."""
    if min(len(baseline), len(current)) < 2:
        return float("nan")
    return stats.ttest_ind(current, baseline, equal_var=False).pvalue


def _wilcoxon_p_value(pairs: pd.DataFrame) -> float:
    """Wilcoxon signed-rank p-value for the ``current``/``baseline`` columns; NaN if empty or rejected by scipy."""
    if pairs.empty:
        return float("nan")
    try:
        return stats.wilcoxon(pairs["current"], pairs["baseline"]).pvalue
    except ValueError:
        return float("nan")


def _sorted_results(rows: list[dict[str, Any]], keys: list[str]) -> pd.DataFrame:
    """Build a frame from the collected rows, ordered by ``keys`` with a fresh index."""
    return pd.DataFrame(rows).sort_values(keys).reset_index(drop=True)


for risk_version, risk_label in risk_labels.items():
    # Each prompt version is compared with the one immediately before it.
    for baseline_version, current_version in zip(ordered_versions, ordered_versions[1:]):
        baseline_mask = (results_df["risk_version"] == risk_version) & (results_df["prompt_version"] == baseline_version)
        current_mask = (results_df["risk_version"] == risk_version) & (results_df["prompt_version"] == current_version)
        baseline_rows = results_df.loc[baseline_mask]
        current_rows = results_df.loc[current_mask]

        for metric, alias in METRIC_ALIASES.items():
            comparison = {
                "metric": alias,
                "baseline_prompt": baseline_version,
                "current_prompt": current_version,
            }

            # Pooled across tickers: unpaired Welch test on all available runs.
            ttest_rows_overall.append(
                {
                    "risk_label": risk_label,
                    **comparison,
                    "p_value": _welch_p_value(current_rows[metric].dropna(), baseline_rows[metric].dropna()),
                }
            )

            # Pair runs that share ticker and iteration number; unmatched or NaN rows are dropped.
            paired_df = (
                baseline_rows[["ticker", "iteration", metric]]
                .rename(columns={metric: "baseline"})
                .merge(
                    current_rows[["ticker", "iteration", metric]].rename(columns={metric: "current"}),
                    on=["ticker", "iteration"],
                )
                .dropna()
            )
            wilcoxon_rows_overall.append(
                {
                    "risk_label": risk_label,
                    **comparison,
                    "p_value": _wilcoxon_p_value(paired_df),
                }
            )

            for ticker in TICKERS:
                ticker_p_value = _welch_p_value(
                    current_rows.loc[current_rows["ticker"] == ticker, metric].dropna(),
                    baseline_rows.loc[baseline_rows["ticker"] == ticker, metric].dropna(),
                )
                ttest_rows_by_ticker.append(
                    {
                        "risk_label": risk_label,
                        "ticker": ticker,
                        **comparison,
                        "p_value": ticker_p_value,
                    }
                )

                ticker_wilcoxon_p = _wilcoxon_p_value(paired_df.loc[paired_df["ticker"] == ticker])
                wilcoxon_rows_by_ticker.append(
                    {
                        "risk_label": risk_label,
                        "ticker": ticker,
                        **comparison,
                        "p_value": ticker_wilcoxon_p,
                    }
                )

overall_sort_keys = ["risk_label", "metric", "current_prompt"]
ticker_sort_keys = ["risk_label", "ticker", "metric", "current_prompt"]
ttest_overall_df = _sorted_results(ttest_rows_overall, overall_sort_keys)
ttest_by_ticker_df = _sorted_results(ttest_rows_by_ticker, ticker_sort_keys)
wilcoxon_overall_df = _sorted_results(wilcoxon_rows_overall, overall_sort_keys)
wilcoxon_by_ticker_df = _sorted_results(wilcoxon_rows_by_ticker, ticker_sort_keys)

ttest_overall_output = results_root / "sequential_ttests_overall.csv"
ttest_ticker_output = results_root / "sequential_ttests_by_ticker.csv"
wilcoxon_overall_output = results_root / "sequential_wilcoxon_overall.csv"
wilcoxon_ticker_output = results_root / "sequential_wilcoxon_by_ticker.csv"
for frame, destination in (
    (ttest_overall_df, ttest_overall_output),
    (ttest_by_ticker_df, ttest_ticker_output),
    (wilcoxon_overall_df, wilcoxon_overall_output),
    (wilcoxon_by_ticker_df, wilcoxon_ticker_output),
):
    frame.to_csv(destination, index=False)
print(f"Overall t-test results saved to {ttest_overall_output}")
print(f"Ticker-level t-test results saved to {ttest_ticker_output}")
print(f"Overall Wilcoxon results saved to {wilcoxon_overall_output}")
print(f"Ticker-level Wilcoxon results saved to {wilcoxon_ticker_output}")

ttest_overall_df

  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)


Overall t-test results saved to data\prompts\results\sequential_ttests_overall.csv
Ticker-level t-test results saved to data\prompts\results\sequential_ttests_by_ticker.csv
Overall Wilcoxon results saved to data\prompts\results\sequential_wilcoxon_overall.csv
Ticker-level Wilcoxon results saved to data\prompts\results\sequential_wilcoxon_by_ticker.csv


  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  z = (r_plus - mn) / se
  res = hypotest_fun_out(*samples, **kwds)
  z = (r_plus - mn) / se


Unnamed: 0,risk_label,metric,baseline_prompt,current_prompt,p_value
0,High Risk,entropy,v0,v1,5.6435009999999996e-21
1,High Risk,entropy,v1,v2,0.009695419
2,High Risk,entropy,v2,v3,0.3260846
3,High Risk,entropy,v3,v4,0.01633161
4,High Risk,mdd,v0,v1,0.2133505
5,High Risk,mdd,v1,v2,0.2671314
6,High Risk,mdd,v2,v3,0.4990228
7,High Risk,mdd,v3,v4,0.424956
8,High Risk,ppl,v0,v1,3.533761e-05
9,High Risk,ppl,v1,v2,0.2096819


# Paper Summary

In [9]:
metrics_of_interest = ("sr", "ppl", "entropy")
alias_to_metric = {alias: metric for metric, alias in METRIC_ALIASES.items()}
progression_rows: list[dict[str, Any]] = []


def _mean_and_std(values: pd.Series) -> tuple[float, float]:
    """Return (mean, sample std) of ``values``; NaN for an empty sample / fewer than two values."""
    mean_value = values.mean() if not values.empty else float("nan")
    std_value = values.std(ddof=1) if len(values) > 1 else float("nan")
    return mean_value, std_value


def _lookup_p_value(
    table: pd.DataFrame,
    risk_label: str,
    alias: str,
    baseline_version: str,
    current_version: str,
    ticker: str | None = None,
) -> float:
    """Fetch the stored p-value for one comparison from a test-result table.

    The comparison is identified by risk label, metric alias, and both prompt
    versions (plus the ticker for per-ticker tables). Returns NaN when the table
    holds no matching row.
    """
    selector = (
        (table["risk_label"] == risk_label)
        & (table["metric"] == alias)
        & (table["baseline_prompt"] == baseline_version)
        & (table["current_prompt"] == current_version)
    )
    if ticker is not None:
        selector &= table["ticker"] == ticker
    matches = table.loc[selector, "p_value"]
    return matches.iloc[0] if not matches.empty else float("nan")


for risk_version, risk_label in risk_labels.items():
    risk_rows = results_df.loc[results_df["risk_version"] == risk_version]

    # Walk consecutive prompt pairs; the statistics describe the *current* version,
    # the p-values come from the comparison against its predecessor.
    for baseline_version, current_version in zip(ordered_versions, ordered_versions[1:]):
        current_rows = risk_rows.loc[risk_rows["prompt_version"] == current_version]

        for alias in metrics_of_interest:
            metric_name = alias_to_metric[alias]
            comparison = {
                "metric": alias,
                "baseline_prompt": baseline_version,
                "current_prompt": current_version,
            }
            lookup_key = (risk_label, alias, baseline_version, current_version)

            for ticker in TICKERS:
                ticker_values = current_rows.loc[current_rows["ticker"] == ticker, metric_name].dropna()
                ticker_mean, ticker_std = _mean_and_std(ticker_values)
                progression_rows.append(
                    {
                        "risk_label": risk_label,
                        "ticker": ticker,
                        **comparison,
                        "mean": ticker_mean,
                        "std": ticker_std,
                        "ttest_p_value": _lookup_p_value(ttest_by_ticker_df, *lookup_key, ticker=ticker),
                        "wilcoxon_p_value": _lookup_p_value(wilcoxon_by_ticker_df, *lookup_key, ticker=ticker),
                    }
                )

            # Pooled row across all tickers for the same comparison.
            overall_mean, overall_std = _mean_and_std(current_rows[metric_name].dropna())
            progression_rows.append(
                {
                    "risk_label": risk_label,
                    "ticker": "Overall",
                    **comparison,
                    "mean": overall_mean,
                    "std": overall_std,
                    "ttest_p_value": _lookup_p_value(ttest_overall_df, *lookup_key),
                    "wilcoxon_p_value": _lookup_p_value(wilcoxon_overall_df, *lookup_key),
                }
            )

progression_summary_df = pd.DataFrame(progression_rows)
if not progression_summary_df.empty:
    # Ordered categorical keeps tickers in TICKERS order with "Overall" last when sorting.
    ticker_order = list(TICKERS) + ["Overall"]
    progression_summary_df["ticker"] = pd.Categorical(progression_summary_df["ticker"], categories=ticker_order, ordered=True)
    progression_summary_df = progression_summary_df.sort_values(
        ["risk_label", "metric", "current_prompt", "ticker"]
    ).reset_index(drop=True)

progression_output = results_root / "sequential_ttests_progression_summary.csv"
progression_summary_df.to_csv(progression_output, index=False)
print(f"Progression summary saved to {progression_output}")

progression_summary_df


Progression summary saved to data\prompts\results\sequential_ttests_progression_summary.csv


Unnamed: 0,risk_label,ticker,metric,baseline_prompt,current_prompt,mean,std,ttest_p_value,wilcoxon_p_value
0,High Risk,AAPL,entropy,v0,v1,0.440196,0.026960,0.000015,0.062500
1,High Risk,MSFT,entropy,v0,v1,0.423887,0.034822,0.068551,0.125000
2,High Risk,GOOGL,entropy,v0,v1,0.438707,0.033393,0.000117,0.062500
3,High Risk,TSLA,entropy,v0,v1,0.446512,0.038798,0.000008,0.062500
4,High Risk,AMZN,entropy,v0,v1,0.431560,0.053674,0.000777,0.062500
...,...,...,...,...,...,...,...,...,...
163,Low Risk,GOOGL,sr,v3,v4,0.426826,0.300621,0.822048,0.750000
164,Low Risk,TSLA,sr,v3,v4,0.290123,0.302314,0.915962,1.000000
165,Low Risk,AMZN,sr,v3,v4,0.868558,0.493611,0.380590,0.625000
166,Low Risk,META,sr,v3,v4,0.288389,0.639744,0.727713,1.000000


In [10]:
# Pooled ("Overall") rows for the high-risk profile only.
is_overall_row = progression_summary_df["ticker"] == "Overall"
is_high_risk_row = progression_summary_df["risk_label"] == "High Risk"
overall = progression_summary_df.loc[is_overall_row & is_high_risk_row]

# One row per metric, one column per prompt version, showing the mean and the
# t-test p-value against the previous version.
overall.pivot(
    index=["risk_label", "metric"],
    columns="current_prompt",
    values=["mean", "ttest_p_value"],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,mean,mean,mean,ttest_p_value,ttest_p_value,ttest_p_value,ttest_p_value
Unnamed: 0_level_1,current_prompt,v1,v2,v3,v4,v1,v2,v3,v4
risk_label,metric,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
High Risk,entropy,0.436295,0.463079,0.451135,0.485423,5.6435009999999996e-21,0.009695,0.326085,0.016332
High Risk,ppl,1.628944,1.755012,1.570891,1.82708,3.533761e-05,0.209682,0.105477,0.036653
High Risk,sr,0.789129,0.221504,0.285961,0.091234,5.0189e-08,0.000436,0.757692,0.341947


In [11]:
# Overall Welch t-test between two prompt versions (baseline_v vs current_v)
# for SR, PPL, and Entropy, computed separately for each risk profile.
baseline_v = "v1"
current_v = "v4"
rows = []
alias_to_metric = {alias: metric for metric, alias in METRIC_ALIASES.items()}
for risk_version, risk_label in risk_labels.items():
    in_risk_profile = results_df["risk_version"] == risk_version
    baseline_mask = in_risk_profile & (results_df["prompt_version"] == baseline_v)
    current_mask = in_risk_profile & (results_df["prompt_version"] == current_v)

    for alias in ("sr", "ppl", "entropy"):
        metric_name = alias_to_metric[alias]
        base_vals = results_df.loc[baseline_mask, metric_name].dropna()
        curr_vals = results_df.loc[current_mask, metric_name].dropna()
        n_base = len(base_vals)
        n_curr = len(curr_vals)

        # A t-test needs at least two observations per sample; report NaN otherwise.
        if n_base < 2 or n_curr < 2:
            t_stat = float("nan")
            t_p = float("nan")
        else:
            t_stat, t_p = stats.ttest_ind(curr_vals, base_vals, equal_var=False)

        rows.append(
            {
                "risk_label": risk_label,
                "risk_version": risk_version,
                "metric": alias,
                "baseline": baseline_v,
                "current": current_v,
                "mean_baseline": base_vals.mean() if not base_vals.empty else float("nan"),
                "mean_current": curr_vals.mean() if not curr_vals.empty else float("nan"),
                "std_baseline": base_vals.std(ddof=1) if n_base > 1 else float("nan"),
                "std_current": curr_vals.std(ddof=1) if n_curr > 1 else float("nan"),
                "n_baseline": n_base,
                "n_current": n_curr,
                "ttest_stat": t_stat,
                "ttest_p": t_p,
            }
        )

overall_ttest_df = pd.DataFrame(rows)
out = results_root / f"overall_ttest_{baseline_v}_vs_{current_v}.csv"
overall_ttest_df.to_csv(out, index=False)
# The message follows the configured versions so it cannot drift from the file name.
print(f"Overall {baseline_v} vs {current_v} t-test saved to {out}")
overall_ttest_df


Overall v1 vs v4 t-test saved to data\prompts\results\overall_ttest_v1_vs_v4.csv


Unnamed: 0,risk_label,risk_version,metric,baseline,current,mean_baseline,mean_current,std_baseline,std_current,n_baseline,n_current,ttest_stat,ttest_p
0,High Risk,r,sr,v1,v4,0.789129,0.091234,0.355338,0.747353,70,30,-4.883653,2.3e-05
1,High Risk,r,ppl,v1,v4,1.628944,1.82708,0.336073,0.545137,70,30,1.846072,0.072524
2,High Risk,r,entropy,v1,v4,0.436295,0.485423,0.03517,0.061826,70,30,4.078582,0.000229
3,Low Risk,nr,sr,v1,v4,0.829134,0.660146,0.366008,0.587904,60,30,-1.440942,0.157273
4,Low Risk,nr,ppl,v1,v4,1.561608,1.716489,0.284072,0.451406,60,30,1.71696,0.093564
5,Low Risk,nr,entropy,v1,v4,0.429534,0.456539,0.035963,0.080068,60,30,1.760661,0.08704


In [12]:
from pprint import pprint

# Plain-text dump of the overall comparison table (`overall_ttest_df`); wide frames
# wrap across several column groups in this form.
# NOTE(review): ending the cell with a bare `overall_ttest_df` would render the same
# data as a rich table and would not need this mid-notebook import.
pprint(overall_ttest_df)

  risk_label risk_version   metric baseline current  mean_baseline  \
0  High Risk            r       sr       v1      v4       0.789129   
1  High Risk            r      ppl       v1      v4       1.628944   
2  High Risk            r  entropy       v1      v4       0.436295   
3   Low Risk           nr       sr       v1      v4       0.829134   
4   Low Risk           nr      ppl       v1      v4       1.561608   
5   Low Risk           nr  entropy       v1      v4       0.429534   

   mean_current  std_baseline  std_current  n_baseline  n_current  ttest_stat  \
0      0.091234      0.355338     0.747353          70         30   -4.883653   
1      1.827080      0.336073     0.545137          70         30    1.846072   
2      0.485423      0.035170     0.061826          70         30    4.078582   
3      0.660146      0.366008     0.587904          60         30   -1.440942   
4      1.716489      0.284072     0.451406          60         30    1.716960   
5      0.456539      0.