# Strategy 01 Result Analysis Notebook
This notebook loads pickled backtest outputs, computes descriptive statistics, runs simple hypothesis tests, and visualises the performance of Strategy 01. Update the configuration cells as needed before executing the analysis.

## 1. Set Up Environment
Configure paths and import the libraries required for analysis and visualisation.

In [None]:
from __future__ import annotations

import json
import pickle
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Iterable

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats

# Plot theme and table-display limits shared by every later cell.
sns.set_theme(style="whitegrid")
pd.options.display.max_rows = 50
pd.options.display.max_columns = 50

# Input side: backtest pickles are read from `results_dir`. Rebind it here if
# the files live somewhere other than the user's Downloads folder.
DEFAULT_RESULTS_DIR = Path.home().joinpath("Downloads")
results_dir = DEFAULT_RESULTS_DIR

# Output side: exports go to an `analysis_outputs` folder under the current
# working directory; it is created up front so later cells can write freely.
NOTEBOOK_DIR = Path.cwd()
analysis_output_dir = NOTEBOOK_DIR.joinpath("analysis_outputs").resolve()
analysis_output_dir.mkdir(parents=True, exist_ok=True)

print(f"Notebook directory: {NOTEBOOK_DIR}")
print(f"Results directory: {results_dir}")
print(f"Analysis outputs will be stored in: {analysis_output_dir}")

## 2. Discover Result Files
Locate pickled result files (simulation, holdings, revenue) under the results directory to decide which dataset to analyse.

In [None]:
from collections import defaultdict

# Glob pattern for each kind of pickled backtest output.
RESULT_FILE_PATTERNS = {
    "simulation": "simulation_results_*.pkl",
    "holdings": "shares_owned_*.pkl",
    "revenue": "revenue_records_*.pkl",
}


def discover_result_files(root: Path) -> pd.DataFrame:
    """Catalogue the result pickles found directly under *root*.

    Args:
        root: Directory to search (non-recursive) with each pattern in
            ``RESULT_FILE_PATTERNS``.

    Returns:
        A DataFrame with columns ``kind``, ``prefix``, ``path``, ``modified``
        and ``size_kb``, sorted by ``prefix`` then ``kind``. When nothing
        matches, the frame is empty but still carries those columns.
    """
    columns = ["kind", "prefix", "path", "modified", "size_kb"]
    records: list[dict[str, Any]] = []
    for kind, pattern in RESULT_FILE_PATTERNS.items():
        for path in root.glob(pattern):
            # One stat() call supplies both the timestamp and the size.
            file_stat = path.stat()
            records.append({
                "kind": kind,
                # The prefix is the last underscore-separated token of the stem.
                # NOTE(review): a run label that itself contains "_" is
                # truncated to its final token — confirm the naming scheme.
                "prefix": path.stem.split("_")[-1],
                "path": path.resolve(),
                "modified": pd.Timestamp(file_stat.st_mtime, unit="s"),
                "size_kb": file_stat.st_size / 1024,
            })
    # Pin the schema explicitly: without it an empty result has no columns and
    # sort_values() raises KeyError instead of returning an empty frame.
    return pd.DataFrame(records, columns=columns).sort_values(["prefix", "kind"])

# Scan the configured results directory and show what was found, or tell the
# user how to recover when nothing matched.
discovered_files = discover_result_files(results_dir)
if not discovered_files.empty:
    display(discovered_files.reset_index(drop=True))
else:
    print("No result files were found. Update `results_dir` above and rerun this cell.")

## 3. Load and Combine Result Data
Select a prefix to analyse, read the relevant pickles, and convert them into tabular pandas structures for downstream processing.

In [None]:
def load_pickle(path: Path) -> Any:
    """Read one pickled object from *path* and return it.

    NOTE: unpickling can execute arbitrary code, so only point this at result
    files produced by a trusted backtest run.
    """
    with open(path, mode="rb") as stream:
        payload = pickle.load(stream)
    return payload

def _flatten_by_ticker(payload: dict[str, list[dict[str, Any]]]) -> pd.DataFrame:
    """Turn a ``{ticker: [row, ...]}`` mapping into one flat DataFrame.

    Each row dict is copied (the input is never mutated) and tagged with a
    ``ticker`` key holding the mapping key it came from. A pre-existing
    ``ticker`` entry in a row is overwritten.
    """
    records = [
        {**row, "ticker": ticker}
        for ticker, rows in payload.items()
        for row in rows
    ]
    return pd.DataFrame(records)


def flatten_simulation(simulation: dict[str, list[dict[str, Any]]]) -> pd.DataFrame:
    """Flatten per-ticker simulation rows into one DataFrame with a ``ticker`` column."""
    return _flatten_by_ticker(simulation)


def flatten_holdings(holdings: dict[str, list[dict[str, Any]]]) -> pd.DataFrame:
    """Flatten per-ticker holdings rows into one DataFrame with a ``ticker`` column."""
    return _flatten_by_ticker(holdings)


def flatten_revenue(revenue: dict[str, list[dict[str, Any]]]) -> pd.DataFrame:
    """Flatten per-ticker revenue rows into one DataFrame with a ``ticker`` column."""
    return _flatten_by_ticker(revenue)

# Load the three raw frames for one run prefix. Any kind whose pickle is
# missing for that prefix falls back to an empty DataFrame so later cells can
# simply test `.empty`.
if not discovered_files.empty:
    available_prefixes = discovered_files["prefix"].unique().tolist()
    # The first prefix in the (sorted) catalogue is analysed by default.
    target_prefix = available_prefixes[0]
    print(f"Available prefixes: {available_prefixes}")
    print(f"Using prefix: {target_prefix}")

    # Map each result kind to its file path for the chosen prefix.
    paths = {
        entry.kind: entry.path
        for entry in discovered_files.itertuples()
        if entry.prefix == target_prefix
    }

    if "simulation" in paths:
        simulation_df = flatten_simulation(load_pickle(paths["simulation"]))
    else:
        simulation_df = pd.DataFrame()

    if "holdings" in paths:
        holdings_df = flatten_holdings(load_pickle(paths["holdings"]))
    else:
        holdings_df = pd.DataFrame()

    if "revenue" in paths:
        revenue_df = flatten_revenue(load_pickle(paths["revenue"]))
    else:
        revenue_df = pd.DataFrame()

    print("Loaded frames:")
    print({
        "simulation": simulation_df.shape,
        "holdings": holdings_df.shape,
        "revenue": revenue_df.shape,
    })
else:
    print("Populate `results_dir` with pickled backtest outputs before continuing.")
    simulation_df = pd.DataFrame()
    holdings_df = pd.DataFrame()
    revenue_df = pd.DataFrame()

## 4. Clean and Prepare Data
Enforce data types, drop malformed rows, and compute convenience fields for downstream analysis.

In [None]:
def prepare_simulation(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the flattened simulation frame.

    Parses ``simuldate`` as datetime, coerces the known numeric fields, drops
    rows lacking a valid ``simuldate`` or ``ticker``, sorts by ticker and date,
    and fills missing ``invest_amount`` / ``shares_bought`` with 0 and missing
    ``invest_flag`` with ``"no_action"`` (each only when the column exists).

    Args:
        df: Flattened simulation records; must contain ``simuldate`` and
            ``ticker`` unless empty.

    Returns:
        A cleaned copy of *df*. An empty input is returned unchanged.
    """
    if df.empty:
        return df

    df = df.copy()
    df["simuldate"] = pd.to_datetime(df["simuldate"], errors="coerce")
    for field in ("invest_amount", "shares_bought", "vwap_stability", "popularity"):
        if field in df.columns:
            df[field] = pd.to_numeric(df[field], errors="coerce")

    df = df.dropna(subset=["simuldate", "ticker"]).sort_values(["ticker", "simuldate"])

    # Assign the filled column back rather than using `df[col].fillna(...,
    # inplace=True)`: that chained in-place form acts on a temporary and is a
    # silent no-op once pandas Copy-on-Write is active.
    fill_defaults = {"invest_amount": 0, "shares_bought": 0, "invest_flag": "no_action"}
    for field, default in fill_defaults.items():
        if field in df.columns:
            df[field] = df[field].fillna(default)
    return df

def prepare_holdings(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the flattened holdings frame and derive ``holding_days``.

    Parses ``buy_date`` / ``sold_date`` as datetimes and ``shares`` /
    ``buy_price`` / ``sold_price`` as numbers (each only when present), then
    drops rows lacking ``ticker``, ``buy_date`` or ``shares``.

    ``holding_days`` is the whole-day difference between ``sold_date`` and
    ``buy_date``. It is NaN for rows without a sold date, and for every row
    when the ``sold_date`` column is absent altogether.

    Args:
        df: Flattened holdings records; must contain ``ticker``, ``buy_date``
            and ``shares`` unless empty.

    Returns:
        A cleaned copy of *df*. An empty input is returned unchanged.
    """
    if df.empty:
        return df

    df = df.copy()
    for col in ("buy_date", "sold_date"):
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce")
    for col in ("shares", "buy_price", "sold_price"):
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    df = df.dropna(subset=["ticker", "buy_date", "shares"])

    # `sold_date` is optional above, so it must be optional here too; indexing
    # it unconditionally raised KeyError when the column was missing.
    if "sold_date" in df.columns:
        df["holding_days"] = (df["sold_date"] - df["buy_date"]).dt.days
    else:
        df["holding_days"] = np.nan
    return df

def prepare_revenue(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the flattened revenue frame and derive ``profit_pct``.

    Parses ``sell_date`` as datetime and the price/quantity fields as numbers
    (each only when present), then drops rows lacking ``ticker``,
    ``sell_date`` or ``revenue``.

    ``profit_pct`` is the fractional gain
    ``(sold_price - bought_price) / bought_price`` where ``bought_price`` is
    positive and NaN elsewhere; it is NaN for every row when either price
    column is missing.

    Returns:
        A cleaned copy of *df*. An empty input is returned unchanged.
    """
    if df.empty:
        return df

    cleaned = df.copy()
    if "sell_date" in cleaned.columns:
        cleaned["sell_date"] = pd.to_datetime(cleaned["sell_date"], errors="coerce")

    numeric_present = [
        name
        for name in ("revenue", "shares_sold", "sold_price", "bought_price")
        if name in cleaned.columns
    ]
    for name in numeric_present:
        cleaned[name] = pd.to_numeric(cleaned[name], errors="coerce")

    cleaned = cleaned.dropna(subset=["ticker", "sell_date", "revenue"])

    has_both_prices = "bought_price" in cleaned.columns and "sold_price" in cleaned.columns
    if not has_both_prices:
        cleaned["profit_pct"] = np.nan
        return cleaned

    cost = cleaned["bought_price"]
    gain_ratio = (cleaned["sold_price"] - cost) / cost
    # Non-positive or missing cost makes the ratio meaningless, so mask it out.
    cleaned["profit_pct"] = np.where(cost > 0, gain_ratio, np.nan)
    return cleaned

# Replace each raw frame with its cleaned counterpart.
simulation_df = prepare_simulation(simulation_df)
holdings_df = prepare_holdings(holdings_df)
revenue_df = prepare_revenue(revenue_df)

# Report the post-cleaning (rows, columns) of every frame.
print({
    label: frame.shape
    for label, frame in (
        ("simulation", simulation_df),
        ("holdings", holdings_df),
        ("revenue", revenue_df),
    )
})

## 5. Compute Descriptive Statistics
Produce overview tables and correlation heatmaps to understand the distribution of key strategy metrics.

In [None]:
# --- Simulation overview: describe(), per-ticker investment totals, and a
# correlation heatmap over the numeric columns.
if not simulation_df.empty:
    display(simulation_df.describe(include="all").T)

    # Named aggregation yields the columns trades / invest_total / invest_mean.
    grouped_summary = simulation_df.groupby("ticker")["invest_amount"].agg(
        trades="count",
        invest_total="sum",
        invest_mean="mean",
    )
    display(grouped_summary.sort_values("invest_total", ascending=False))

    numeric_cols = simulation_df.select_dtypes(include=[np.number])
    if not numeric_cols.empty:
        corr = numeric_cols.corr()
        plt.figure(figsize=(8, 6))
        sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", square=True)
        plt.title("Correlation Matrix (Simulation Metrics)")
        plt.tight_layout()
        plt.show()
else:
    print("Simulation DataFrame empty; skip descriptive statistics.")

# --- Revenue overview: per-ticker revenue summary and profit_pct quartiles.
if not revenue_df.empty:
    revenue_by_ticker = revenue_df.groupby("ticker")["revenue"]
    revenue_summary = revenue_by_ticker.agg(["count", "sum", "mean", "median"])
    display(revenue_summary.sort_values("sum", ascending=False))

    # Only describe profit_pct when at least one value is available.
    if "profit_pct" in revenue_df.columns and revenue_df["profit_pct"].notna().any():
        profit_stats = revenue_df["profit_pct"].describe(percentiles=[0.25, 0.5, 0.75])
        display(profit_stats.to_frame(name="profit_pct"))
else:
    print("Revenue DataFrame empty; skipping revenue stats.")

## 6. Run Statistical Tests
Evaluate whether revenues differ significantly from zero and compare profitability between tickers using SciPy hypothesis tests.

In [None]:
# Collected test outcomes; later cells display and export this list.
stat_test_results: list[dict[str, Any]] = []

if not revenue_df.empty:
    # Test 1: is the mean trade revenue different from zero?
    revenues = revenue_df["revenue"].dropna()
    if len(revenues) <= 1:
        print("Not enough revenue observations for t-test.")
    else:
        t_stat, p_value = stats.ttest_1samp(revenues, popmean=0)
        stat_test_results.append({
            "test": "one-sample t-test",
            "statistic": t_stat,
            "p_value": p_value,
            "n": len(revenues),
        })
        print(f"One-sample t-test vs zero revenue: t={t_stat:.3f}, p={p_value:.4f}, n={len(revenues)}")

    # Test 2: does profit_pct differ between tickers? Only tickers with at
    # least two non-missing observations take part.
    if "profit_pct" in revenue_df.columns:
        groups = [
            sample
            for sample in (
                grp.dropna().values
                for _, grp in revenue_df.groupby("ticker")["profit_pct"]
            )
            if sample.shape[0] > 1
        ]
        if len(groups) < 2:
            print("Not enough ticker groups for ANOVA.")
        else:
            f_stat, p_value = stats.f_oneway(*groups)
            stat_test_results.append({
                "test": "one-way ANOVA (profit_pct by ticker)",
                "statistic": f_stat,
                "p_value": p_value,
                "groups": len(groups),
            })
            print(f"ANOVA across tickers (profit_pct): F={f_stat:.3f}, p={p_value:.4f}, groups={len(groups)}")
else:
    print("Revenue data unavailable; skipping hypothesis tests.")

if stat_test_results:
    display(pd.DataFrame(stat_test_results))

## 7. Generate Distribution Charts
Visualise the distributions of trade revenues and holding durations with histograms, KDE curves, and box plots.

In [None]:
# Revenue-side distributions: overall histogram, profit-percentage KDE, and a
# per-ticker box plot.
if revenue_df.empty:
    print("Revenue data unavailable; skipping distribution plots.")
else:
    plt.figure(figsize=(8, 5))
    sns.histplot(revenue_df["revenue"], kde=True, bins=30)
    # Dashed red line marks zero revenue.
    plt.axvline(0, color="red", linestyle="--", linewidth=1)
    plt.title("Revenue Distribution")
    plt.xlabel("Revenue (KRW)")
    plt.tight_layout()
    plt.show()

    if "profit_pct" in revenue_df.columns and revenue_df["profit_pct"].notna().any():
        plt.figure(figsize=(8, 5))
        # `fill=True` is the supported spelling; `shade=True` has been
        # deprecated since seaborn 0.11 and is slated to become an error.
        sns.kdeplot(revenue_df["profit_pct"].dropna(), fill=True)
        plt.title("Profit Percentage KDE")
        # NOTE(review): prepare_revenue computes profit_pct as a fraction
        # (0.10 == 10%), so this axis label may overstate the unit — confirm.
        plt.xlabel("Profit %")
        plt.tight_layout()
        plt.show()

    plt.figure(figsize=(10, 5))
    sns.boxplot(data=revenue_df, x="ticker", y="revenue")
    plt.title("Revenue by Ticker")
    plt.xlabel("Ticker")
    plt.ylabel("Revenue (KRW)")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Holdings-side distribution: how long positions were held.
if holdings_df.empty:
    print("Holdings data unavailable; skipping holding duration plots.")
else:
    # Skip the plot when no row has a computed holding period.
    if "holding_days" in holdings_df.columns and holdings_df["holding_days"].notna().any():
        plt.figure(figsize=(8, 5))
        sns.histplot(holdings_df["holding_days"].dropna(), kde=True, bins=20)
        plt.title("Holding Duration Distribution")
        plt.xlabel("Holding Days")
        plt.tight_layout()
        plt.show()

## 8. Create Comparative Visualisations
Build time-series and cross-sectional plots to compare performance across tickers and simulation dates.

In [None]:
# Revenue comparisons: cumulative revenue over time and total revenue per ticker.
# Each plot opens its own figure and ends with plt.show(), so the pyplot calls
# below rely on the implicit "current figure" and must stay in this order.
if revenue_df.empty:
    print("Revenue data unavailable; skipping comparative visuals.")
else:
    # Sum revenue per sell date, order chronologically, then accumulate.
    revenue_by_date = (
        revenue_df.groupby("sell_date")["revenue"].sum().sort_index().cumsum().rename("cumulative_revenue")
    )
    plt.figure(figsize=(10, 5))
    plt.plot(revenue_by_date.index, revenue_by_date.values)
    plt.title("Cumulative Revenue Over Time")
    plt.xlabel("Sell Date")
    plt.ylabel("Cumulative Revenue (KRW)")
    plt.tight_layout()
    plt.show()

    # Total revenue per ticker, largest first, so bars read left-to-right by size.
    revenue_per_ticker = revenue_df.groupby("ticker")["revenue"].sum().sort_values(ascending=False)
    plt.figure(figsize=(10, 5))
    sns.barplot(x=revenue_per_ticker.index, y=revenue_per_ticker.values)
    plt.title("Total Revenue by Ticker")
    plt.xlabel("Ticker")
    plt.ylabel("Total Revenue (KRW)")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Simulation comparisons: investment size against the popularity signal, and
# aggregate investment per simulation date.
if simulation_df.empty:
    print("Simulation data unavailable; skipping scatter plots.")
else:
    # Both columns are optional in the simulation frame, so check before plotting.
    if {"invest_amount", "popularity"}.issubset(simulation_df.columns):
        plt.figure(figsize=(8, 5))
        sns.scatterplot(data=simulation_df, x="popularity", y="invest_amount", hue="ticker", alpha=0.6)
        plt.title("Investment Size vs Popularity Signal")
        plt.xlabel("Volume Popularity Signal")
        plt.ylabel("Invest Amount")
        plt.tight_layout()
        plt.show()

    if {"simuldate", "invest_amount"}.issubset(simulation_df.columns):
        # Total invested across all tickers on each simulation date.
        invest_over_time = simulation_df.groupby("simuldate")["invest_amount"].sum().sort_index()
        plt.figure(figsize=(10, 4))
        plt.plot(invest_over_time.index, invest_over_time.values)
        plt.title("Aggregate Invest Amount Over Simulation Dates")
        plt.xlabel("Simulation Date")
        plt.ylabel("Invest Amount (KRW)")
        plt.tight_layout()
        plt.show()

## 9. Export Analysis Artifacts
Persist processed datasets, statistics, and test outcomes so they can be reused outside this notebook.

In [None]:
# File-name stem for every export. Falls back to a fixed label when no run
# prefix was selected (i.e. the loading cell found no result files).
export_prefix = (f"{target_prefix}_analysis" if "target_prefix" in locals() else "strategy01_analysis")
# Label -> written path, filled in as each artifact is saved.
export_paths: dict[str, Path] = {}

if not simulation_df.empty:
    simulation_csv = analysis_output_dir / f"{export_prefix}_simulation.csv"
    simulation_df.to_csv(simulation_csv, index=False)
    export_paths["simulation_csv"] = simulation_csv

# The cleaned holdings frame (including the derived holding_days column) was
# previously never persisted; export it alongside the other datasets.
if not holdings_df.empty:
    holdings_csv = analysis_output_dir / f"{export_prefix}_holdings.csv"
    holdings_df.to_csv(holdings_csv, index=False)
    export_paths["holdings_csv"] = holdings_csv

if not revenue_df.empty:
    revenue_csv = analysis_output_dir / f"{export_prefix}_revenue.csv"
    revenue_df.to_csv(revenue_csv, index=False)
    export_paths["revenue_csv"] = revenue_csv

    # Per-ticker summary; the ticker index is kept as the first CSV column.
    revenue_summary = revenue_df.groupby("ticker")["revenue"].agg(["count", "sum", "mean", "median"])
    summary_csv = analysis_output_dir / f"{export_prefix}_revenue_summary.csv"
    revenue_summary.to_csv(summary_csv)
    export_paths["revenue_summary_csv"] = summary_csv

if stat_test_results:
    stats_json = analysis_output_dir / f"{export_prefix}_stat_tests.json"
    with open(stats_json, "w", encoding="utf-8") as fh:
        # default=str stringifies any value json cannot encode natively.
        json.dump(stat_test_results, fh, indent=2, default=str)
    export_paths["stat_tests_json"] = stats_json

if export_paths:
    print("Exported the following artifacts:")
    for label, path in export_paths.items():
        print(f"- {label}: {path}")
else:
    print("No artifacts were exported. Ensure data frames are populated before running this cell.")