In [1]:
# =========================
# Cell 0 — Imports & setup
# =========================

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Sequence

import numpy as np
import polars as pl

pl.enable_string_cache()

# Opcional: para ver tablas grandes sin truncado
pl.Config.set_tbl_rows(20)
pl.Config.set_tbl_cols(20)
pl.Config.set_fmt_str_lengths(120)

class PreprocessError(RuntimeError):
    pass


In [None]:
# =========================
# Cell 1 — Config
# =========================

@dataclass(frozen=True)
class PreprocessConfig:
    # Carpetas
    root_dir: Path                        # datos_competicion/
    algos_subdir: str = "algoritmos"
    cache_dir: Path = Path("data/cache")       # donde escribir parquets
    
    # Columnas esperadas (robusto a variaciones)
    dt_candidates: tuple[str, ...] = ("datetime", "date", "timestamp", "time")
    close_candidates: tuple[str, ...] = ("close", "Close", "c", "price", "last")
    
    # Parsing datetime (para algos)
    dt_format_candidates: tuple[str, ...] = (
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%d %H:%M:%S%.f",
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%dT%H:%M:%S%.f",
    )
    
    # Calidad / filtros
    min_obs: int = 60                    # mínimo de días con dato
    min_coverage: float = 0.70           # n_obs / (end-start+1)
    constant_close_std_eps: float = 1e-8 # close_std muy pequeño => constante
    max_abs_ret_clip: float = 0.50       # clip robusto de returns diarios (+/- 50%)
    
    # Features RL
    feature_windows: tuple[int, ...] = (20, 60, 120)
    annualization_factor: int = 252      # trading days
    
    # Output names
    panel_name: str = "algos_panel.parquet"
    meta_name: str = "algos_meta.parquet"
    meta_good_name: str = "algos_meta_good.parquet"
    features_name: str = "algos_features.parquet"
    features_good_name: str = "algos_features_good.parquet"
    alive_intervals_name: str = "alive_intervals.parquet"
    
    benchmark_trades_name: str = "benchmark_trades_clean.parquet"
    benchmark_monthly_name: str = "benchmark_monthly_clean.parquet"
    benchmark_yearly_name: str = "benchmark_yearly_clean.parquet"
    benchmark_monthly_stats_name: str = "benchmark_monthly_stats.parquet"

def ensure_dirs(cfg: PreprocessConfig) -> None:
    cfg.cache_dir.mkdir(parents=True, exist_ok=True)
    (cfg.cache_dir / "checks").mkdir(parents=True, exist_ok=True)

def algos_dir(cfg: PreprocessConfig) -> Path:
    d = cfg.root_dir / cfg.algos_subdir
    if not d.exists():
        raise PreprocessError(f"No existe la carpeta de algoritmos: {d}")
    return d


In [3]:
# =========================
# Cell 2 — Helpers (robustos)
# =========================

def pick_first_existing(cols: Iterable[str], candidates: Sequence[str]) -> str:
    s = set(cols)
    for c in candidates:
        if c in s:
            return c
    raise PreprocessError(
        f"No encontré ninguna columna entre {candidates}. Columnas disponibles: {sorted(cols)[:30]}..."
    )

def build_dt_expr(colname: str, cfg: PreprocessConfig) -> pl.Expr:
    """
    Intenta parsear datetime con varios formatos.
    Si falla, devuelve Null -> luego filtramos.
    """
    exprs = [
        pl.col(colname).str.strptime(pl.Datetime, format=fmt, strict=False)
        for fmt in cfg.dt_format_candidates
    ]
    # coalesce => primer parseo válido
    return pl.coalesce(exprs).alias("dt")

def safe_write_parquet(lf: pl.LazyFrame, path: Path, overwrite: bool = False) -> None:
    if path.exists() and not overwrite:
        return
    lf.sink_parquet(str(path), compression="zstd", statistics=True)

def require_columns(df_or_lf, required: Sequence[str], name: str) -> None:
    cols = df_or_lf.columns if hasattr(df_or_lf, "columns") else []
    missing = [c for c in required if c not in cols]
    if missing:
        raise PreprocessError(f"[{name}] faltan columnas: {missing}. Hay: {cols}")


In [4]:
# =========================
# Cell 3 — Step A: Build panel (algo_id, date, close, ret_1d, logret_1d)
# =========================

def build_algos_panel(cfg: PreprocessConfig, overwrite: bool = False) -> Path:
    ensure_dirs(cfg)
    out_path = cfg.cache_dir / cfg.panel_name
    if out_path.exists() and not overwrite:
        return out_path

    pattern = str(algos_dir(cfg) / "*.csv")

    # Leemos una muestra mínima para detectar columnas reales (robusto)
    sample_files = sorted(algos_dir(cfg).glob("*.csv"))
    if not sample_files:
        raise PreprocessError(f"No hay CSV en {algos_dir(cfg)}")
    sample = pl.read_csv(sample_files[0], n_rows=5)
    dt_col = pick_first_existing(sample.columns, cfg.dt_candidates)
    close_col = pick_first_existing(sample.columns, cfg.close_candidates)

    # Lazy scan masivo + ruta para extraer algo_id
    lf = pl.scan_csv(
        pattern,
        glob=True,
        ignore_errors=True,              # robusto ante algún csv corrupto
        infer_schema_length=0,           # no gastes tiempo infiriendo demasiado
        include_file_paths="path",
    )

    # Parseo dt robusto + casting close
    # Regex mejorada para Windows y Unix: captura el nombre del archivo sin extensión
    lf = (
        lf
        .with_columns([
            # Extraer solo el nombre del archivo (funciona en Windows y Unix)
            pl.col("path").str.replace_all(r".*[/\\]", "").str.replace(r"\.csv$", "").alias("algo_id"),
            build_dt_expr(dt_col, cfg),
            pl.col(close_col).cast(pl.Float64, strict=False).alias("close_raw"),
        ])
        .select(["algo_id", "dt", "close_raw"])
        .filter(pl.col("algo_id").is_not_null())
        .filter(pl.col("algo_id") != "")
        .filter(pl.col("dt").is_not_null())
        .filter(pl.col("close_raw").is_not_null())
        .with_columns([
            pl.col("dt").dt.date().alias("date"),
            pl.col("close_raw").alias("close"),
        ])
        .select(["algo_id", "date", "dt", "close"])
    )

    # Normalización diaria (si hubiera intradía): último close del día
    daily = (
        lf
        .sort(["algo_id", "date", "dt"])
        .group_by(["algo_id", "date"], maintain_order=True)
        .agg(pl.last("close").alias("close"))
        .sort(["algo_id", "date"])
        .with_columns([
            pl.col("close").pct_change().over("algo_id").alias("ret_1d"),
            (pl.col("close") / pl.col("close").shift(1)).log().over("algo_id").alias("logret_1d"),
        ])
        # clipping robusto (protege features/RL de spikes absurdos)
        .with_columns([
            pl.col("ret_1d").clip(-cfg.max_abs_ret_clip, cfg.max_abs_ret_clip).alias("ret_1d"),
            pl.col("logret_1d").clip(-cfg.max_abs_ret_clip, cfg.max_abs_ret_clip).alias("logret_1d"),
        ])
        .select(["algo_id", "date", "close", "ret_1d", "logret_1d"])
    )

    safe_write_parquet(daily, out_path, overwrite=True)
    return out_path

In [5]:
# =========================
# Cell 4 — Step B: Meta por algoritmo + alive_intervals
# =========================

def build_algos_meta(cfg: PreprocessConfig, panel_path: Path, overwrite: bool = False) -> tuple[Path, Path]:
    ensure_dirs(cfg)
    out_meta = cfg.cache_dir / cfg.meta_name
    out_alive = cfg.cache_dir / cfg.alive_intervals_name

    if out_meta.exists() and out_alive.exists() and not overwrite:
        return out_meta, out_alive

    lf = pl.scan_parquet(str(panel_path))

    ann = float(cfg.annualization_factor)

    meta = (
        lf
        .group_by("algo_id")
        .agg([
            pl.min("date").alias("start_date"),
            pl.max("date").alias("end_date"),
            pl.len().alias("n_obs"),
            pl.col("close").n_unique().alias("n_unique_close"),
            pl.col("close").std().alias("close_std"),
            pl.col("ret_1d").mean().alias("ret_mean"),
            pl.col("ret_1d").std().alias("ret_std"),
            pl.col("ret_1d").median().alias("ret_median"),
            pl.col("ret_1d").quantile(0.01).alias("ret_q01"),
            pl.col("ret_1d").quantile(0.99).alias("ret_q99"),
            # Max drawdown (sobre close)
            (pl.col("close") / pl.col("close").cum_max() - 1.0).min().alias("max_drawdown"),
        ])
        .with_columns([
            # días totales entre start y end (incl.)
            (pl.col("end_date") - pl.col("start_date")).dt.total_days().cast(pl.Int64).alias("span_days"),
        ])
        .with_columns([
            (pl.col("span_days") + 1).alias("n_days"),
            (pl.col("n_obs") / (pl.col("span_days") + 1)).alias("coverage_ratio"),
            (pl.col("close_std") <= cfg.constant_close_std_eps).alias("is_constant_std"),
            (pl.col("n_unique_close") <= 2).alias("is_constant_unique"),
        ])
        .with_columns([
            (pl.col("is_constant_std") | pl.col("is_constant_unique")).alias("is_constant"),
            # Sharpe anualizado (simple)
            pl.when(pl.col("ret_std") > 0)
              .then((pl.col("ret_mean") / pl.col("ret_std")) * np.sqrt(ann))
              .otherwise(None)
              .alias("sharpe_ann"),
            (pl.col("ret_std") * np.sqrt(ann)).alias("vol_ann"),
        ])
        .select([
            "algo_id", "start_date", "end_date",
            "n_obs", "n_days", "coverage_ratio",
            "n_unique_close", "close_std", "is_constant",
            "ret_mean", "ret_std", "ret_median", "ret_q01", "ret_q99",
            "vol_ann", "sharpe_ann", "max_drawdown",
        ])
    )

    safe_write_parquet(meta, out_meta, overwrite=True)

    # Alive intervals (lo más eficiente para action-masking)
    alive = (
        pl.scan_parquet(str(out_meta))
        .select(["algo_id", "start_date", "end_date"])
    )
    safe_write_parquet(alive, out_alive, overwrite=True)

    return out_meta, out_alive


def build_good_universe(cfg: PreprocessConfig, meta_path: Path, overwrite: bool = False) -> Path:
    ensure_dirs(cfg)
    out_good = cfg.cache_dir / cfg.meta_good_name
    if out_good.exists() and not overwrite:
        return out_good

    good = (
        pl.scan_parquet(str(meta_path))
        .filter(pl.col("n_obs") >= cfg.min_obs)
        .filter(pl.col("coverage_ratio") >= cfg.min_coverage)
        .filter(~pl.col("is_constant"))
    )
    safe_write_parquet(good, out_good, overwrite=True)
    return out_good


In [6]:
# =========================
# Cell 5 — Step C: Features RL (rolling) por día y algoritmo
# =========================

def build_features(cfg: PreprocessConfig, panel_path: Path, meta_good_path: Path | None = None, overwrite: bool = False) -> tuple[Path, Path]:
    """
    Genera:
      - algos_features.parquet (todos)
      - algos_features_good.parquet (solo good universe)
    """
    ensure_dirs(cfg)
    out_all = cfg.cache_dir / cfg.features_name
    out_good = cfg.cache_dir / cfg.features_good_name

    if out_all.exists() and out_good.exists() and not overwrite:
        return out_all, out_good

    ann_sqrt = float(np.sqrt(cfg.annualization_factor))

    base = (
        pl.scan_parquet(str(panel_path))
        .sort(["algo_id", "date"])
    )

    # Rolling features (solo con ret_1d y close)
    feats = base
    for w in cfg.feature_windows:
        feats = feats.with_columns([
            pl.col("ret_1d").rolling_mean(window_size=w, min_samples=w).over("algo_id").alias(f"ret_mean_{w}"),
            pl.col("ret_1d").rolling_std(window_size=w, min_samples=w).over("algo_id").alias(f"ret_std_{w}"),
            pl.col("ret_1d").rolling_sum(window_size=w, min_samples=w).over("algo_id").alias(f"ret_sum_{w}"),
        ]).with_columns([
            (pl.col(f"ret_std_{w}") * ann_sqrt).alias(f"vol_ann_{w}"),
            pl.when(pl.col(f"ret_std_{w}") > 0)
              .then((pl.col(f"ret_mean_{w}") / pl.col(f"ret_std_{w}")) * ann_sqrt)
              .otherwise(None)
              .alias(f"sharpe_ann_{w}"),
        ])

    # Extra features útiles y baratas:
    feats = feats.with_columns([
        # "price momentum" (log close - log close lag w) para el mayor window
        (pl.col("close").log() - pl.col("close").shift(cfg.feature_windows[-1]).log())
        .over("algo_id")
        .alias(f"log_mom_{cfg.feature_windows[-1]}"),
    ])

    # Selección final - logret_1d viene del panel base, aseguramos que esté disponible
    keep_cols = ["algo_id", "date", "close", "ret_1d", "logret_1d"] + \
               [f"ret_mean_{w}" for w in cfg.feature_windows] + \
               [f"ret_std_{w}" for w in cfg.feature_windows] + \
               [f"ret_sum_{w}" for w in cfg.feature_windows] + \
               [f"vol_ann_{w}" for w in cfg.feature_windows] + \
               [f"sharpe_ann_{w}" for w in cfg.feature_windows] + \
               [f"log_mom_{cfg.feature_windows[-1]}"]

    # Verificamos qué columnas realmente existen antes de seleccionar
    available_cols = feats.collect_schema().names()
    keep_cols = [c for c in keep_cols if c in available_cols]
    
    feats_all = feats.select(keep_cols)
    safe_write_parquet(feats_all, out_all, overwrite=True)

    # Filtrado a universo "good"
    if meta_good_path is None:
        raise PreprocessError("meta_good_path es None, pero se necesita para features_good.")
    good_ids = pl.scan_parquet(str(meta_good_path)).select(["algo_id"])
    feats_good = feats_all.join(good_ids, on="algo_id", how="inner")
    safe_write_parquet(feats_good, out_good, overwrite=True)

    return out_all, out_good


In [7]:
# =========================
# Cell 6 — Step D: Benchmark preprocessing (trades + returns)
# =========================

def preprocess_benchmark(cfg: PreprocessConfig, overwrite: bool = False) -> dict[str, Path]:
    ensure_dirs(cfg)

    trades_csv = cfg.root_dir / "trades_benchmark.csv"
    monthly_csv = cfg.root_dir / "benchmark_monthly_returns.csv"
    yearly_csv = cfg.root_dir / "benchmark_yearly_returns.csv"

    if not trades_csv.exists():
        raise PreprocessError(f"No existe {trades_csv}")
    if not monthly_csv.exists():
        raise PreprocessError(f"No existe {monthly_csv}")
    if not yearly_csv.exists():
        raise PreprocessError(f"No existe {yearly_csv}")

    out_trades = cfg.cache_dir / cfg.benchmark_trades_name
    out_monthly = cfg.cache_dir / cfg.benchmark_monthly_name
    out_yearly = cfg.cache_dir / cfg.benchmark_yearly_name
    out_monthly_stats = cfg.cache_dir / cfg.benchmark_monthly_stats_name

    # -------- trades_benchmark (limpio) --------
    if not out_trades.exists() or overwrite:
        t = pl.scan_csv(str(trades_csv), infer_schema_length=0)

        # Fechas robustas: extraemos YYYY-MM-DD directamente (evita problemas de tz)
        t = (
            t
            .with_columns([
                pl.col("dateOpen").cast(pl.Utf8).str.slice(0, 10).alias("open_date_str"),
                pl.col("dateClose").cast(pl.Utf8).str.slice(0, 10).alias("close_date_str"),
            ])
            .with_columns([
                pl.col("open_date_str").str.strptime(pl.Date, "%Y-%m-%d", strict=False).alias("open_date"),
                pl.col("close_date_str").str.strptime(pl.Date, "%Y-%m-%d", strict=False).alias("close_date"),
            ])
            .with_columns([
                (pl.col("close_date") - pl.col("open_date")).dt.total_days().cast(pl.Int64).alias("holding_days"),
            ])
            .with_columns([
                pl.col("volume").cast(pl.Float64, strict=False),
                pl.col("AUM").cast(pl.Float64, strict=False),
                pl.col("equity_EOD").cast(pl.Float64, strict=False),
                pl.col("equity_normalized").cast(pl.Float64, strict=False),
                pl.col("productname").cast(pl.Utf8),
            ])
            .select([
                "productname",
                "volume",
                "open_date",
                "close_date",
                "holding_days",
                "total_invested_amount_EOD",
                "equity_EOD",
                "AUM",
                "equity_normalized",
                "dateOpen",
                "dateClose",
            ])
            .filter(pl.col("open_date").is_not_null())
            .filter(pl.col("close_date").is_not_null())
        )

        safe_write_parquet(t, out_trades, overwrite=True)

    # -------- monthly returns (clean + consistency check) --------
    if not out_monthly.exists() or overwrite:
        m = (
            pl.scan_csv(str(monthly_csv), infer_schema_length=0)
            .with_columns([
                pl.col("month").cast(pl.Utf8).str.strptime(pl.Date, "%Y-%m", strict=False).alias("month_date"),
                pl.col("start_equity").cast(pl.Float64, strict=False),
                pl.col("end_equity").cast(pl.Float64, strict=False),
                pl.col("monthly_return").cast(pl.Float64, strict=False),
            ])
            .with_columns([
                (pl.col("end_equity") / pl.col("start_equity") - 1.0).alias("monthly_return_calc"),
                (pl.col("monthly_return") - (pl.col("end_equity") / pl.col("start_equity") - 1.0)).abs().alias("monthly_return_abs_err"),
            ])
            .select(["month_date", "start_equity", "end_equity", "monthly_return", "monthly_return_calc", "monthly_return_abs_err"])
        )
        safe_write_parquet(m, out_monthly, overwrite=True)

    # -------- yearly returns (clean + consistency check) --------
    if not out_yearly.exists() or overwrite:
        y = pl.scan_csv(str(yearly_csv), infer_schema_length=0)

        # Intento robusto: si viene columna "year" o similar, lo convertimos a date 1-ene
        cols = y.columns
        year_col = None
        for c in ("year", "Year", "anio", "año"):
            if c in cols:
                year_col = c
                break

        if year_col is None:
            # Si no hay, lo dejamos tal cual y casteamos numéricos si existen
            y2 = y
        else:
            y2 = (
                y.with_columns([
                    pl.col(year_col).cast(pl.Int32, strict=False).alias("year_int"),
                    pl.date(pl.col(year_col).cast(pl.Int32, strict=False), 1, 1).alias("year_date"),
                ])
            )

        # Casteos típicos si están
        for c in ("start_equity", "end_equity", "yearly_return"):
            if c in y2.columns:
                y2 = y2.with_columns(pl.col(c).cast(pl.Float64, strict=False))

        if "start_equity" in y2.columns and "end_equity" in y2.columns and "yearly_return" in y2.columns:
            y2 = y2.with_columns([
                (pl.col("end_equity") / pl.col("start_equity") - 1.0).alias("yearly_return_calc"),
                (pl.col("yearly_return") - (pl.col("end_equity") / pl.col("start_equity") - 1.0)).abs().alias("yearly_return_abs_err"),
            ])

        safe_write_parquet(y2, out_yearly, overwrite=True)

    # -------- stats mensuales del benchmark (desde trades) --------
    if not out_monthly_stats.exists() or overwrite:
        trades = pl.scan_parquet(str(out_trades))

        stats = (
            trades
            .with_columns([
                pl.col("open_date").dt.truncate("1mo").alias("month"),
                pl.col("close_date").dt.truncate("1mo").alias("month_close"),
            ])
            .group_by("month")
            .agg([
                pl.len().alias("n_trades_opened"),
                pl.col("productname").n_unique().alias("unique_algos_opened"),
                pl.col("volume").sum().alias("sum_volume_opened"),
                pl.col("holding_days").mean().alias("avg_holding_days_opened"),
                pl.col("AUM").mean().alias("avg_AUM"),
                pl.col("equity_normalized").mean().alias("avg_equity_norm"),
            ])
            .sort("month")
        )

        safe_write_parquet(stats, out_monthly_stats, overwrite=True)

    return {
        "benchmark_trades_clean": out_trades,
        "benchmark_monthly_clean": out_monthly,
        "benchmark_yearly_clean": out_yearly,
        "benchmark_monthly_stats": out_monthly_stats,
    }


In [8]:
# =========================
# Cell 7 — Sanity checks (rápidos y útiles)
# =========================

def sanity_checks(cfg: PreprocessConfig, panel_path: Path, meta_path: Path, meta_good_path: Path) -> None:
    panel = pl.scan_parquet(str(panel_path))
    meta = pl.scan_parquet(str(meta_path))
    good = pl.scan_parquet(str(meta_good_path))

    # Chequeos básicos
    panel_cols = panel.collect_schema().names()
    needed_panel = ["algo_id", "date", "close", "ret_1d"]  # logret_1d es opcional
    for c in needed_panel:
        if c not in panel_cols:
            raise PreprocessError(f"Panel no tiene columna requerida: {c}. Tiene: {panel_cols}")

    # Resumen rápido (collect pequeño)
    summary = meta.select([
        pl.len().alias("n_algos"),
        pl.col("n_obs").mean().alias("avg_n_obs"),
        pl.col("coverage_ratio").mean().alias("avg_coverage"),
        pl.col("is_constant").mean().alias("pct_constant"),
        pl.col("vol_ann").median().alias("median_vol_ann"),
        pl.col("sharpe_ann").median().alias("median_sharpe_ann"),
    ]).collect()

    good_summary = good.select([
        pl.len().alias("n_good_algos"),
        pl.col("n_obs").mean().alias("avg_n_obs_good"),
        pl.col("coverage_ratio").mean().alias("avg_coverage_good"),
    ]).collect()

    print("=== META SUMMARY ===")
    print(summary)
    print("\n=== GOOD UNIVERSE SUMMARY ===")
    print(good_summary)


In [9]:
# =========================
# Cell 8 — Orquestación (run)
# =========================

def run_preprocessing(cfg: PreprocessConfig, overwrite: bool = False) -> dict[str, Path]:
    ensure_dirs(cfg)

    # A) Panel
    panel_path = build_algos_panel(cfg, overwrite=overwrite)

    # B) Meta + alive intervals
    meta_path, alive_path = build_algos_meta(cfg, panel_path, overwrite=overwrite)
    meta_good_path = build_good_universe(cfg, meta_path, overwrite=overwrite)

    # C) Features RL
    feats_all_path, feats_good_path = build_features(
        cfg,
        panel_path=panel_path,
        meta_good_path=meta_good_path,
        overwrite=overwrite
    )

    # D) Benchmark
    bench_paths = preprocess_benchmark(cfg, overwrite=overwrite)

    # Checks
    sanity_checks(cfg, panel_path, meta_path, meta_good_path)

    return {
        "algos_panel": panel_path,
        "algos_meta": meta_path,
        "algos_meta_good": meta_good_path,
        "alive_intervals": alive_path,
        "algos_features": feats_all_path,
        "algos_features_good": feats_good_path,
        **bench_paths,
    }


In [None]:
# =========================
# Cell 9 — Ejecuta (ajusta root_dir)
# =========================

cfg = PreprocessConfig(
    root_dir=Path("datos_competicion"),
    cache_dir=Path("data/cache"),
    min_obs=60,
    min_coverage=0.70,
    feature_windows=(20, 60, 120),
)

paths = run_preprocessing(cfg, overwrite=True)  # overwrite=True para regenerar
paths

=== META SUMMARY ===
shape: (1, 6)
┌─────────┬────────────┬──────────────┬──────────────┬────────────────┬───────────────────┐
│ n_algos ┆ avg_n_obs  ┆ avg_coverage ┆ pct_constant ┆ median_vol_ann ┆ median_sharpe_ann │
│ ---     ┆ ---        ┆ ---          ┆ ---          ┆ ---            ┆ ---               │
│ u32     ┆ f64        ┆ f64          ┆ f64          ┆ f64            ┆ f64               │
╞═════════╪════════════╪══════════════╪══════════════╪════════════════╪═══════════════════╡
│ 13663   ┆ 649.866062 ┆ 0.839531     ┆ 0.247896     ┆ 0.055688       ┆ -0.398133         │
└─────────┴────────────┴──────────────┴──────────────┴────────────────┴───────────────────┘

=== GOOD UNIVERSE SUMMARY ===
shape: (1, 3)
┌──────────────┬────────────────┬───────────────────┐
│ n_good_algos ┆ avg_n_obs_good ┆ avg_coverage_good │
│ ---          ┆ ---            ┆ ---               │
│ u32          ┆ f64            ┆ f64               │
╞══════════════╪════════════════╪═══════════════════╡
│ 9491

  cols = y.columns
  if c in y2.columns:
  if "start_equity" in y2.columns and "end_equity" in y2.columns and "yearly_return" in y2.columns:


{'algos_panel': WindowsPath('cache/algos_panel.parquet'),
 'algos_meta': WindowsPath('cache/algos_meta.parquet'),
 'algos_meta_good': WindowsPath('cache/algos_meta_good.parquet'),
 'alive_intervals': WindowsPath('cache/alive_intervals.parquet'),
 'algos_features': WindowsPath('cache/algos_features.parquet'),
 'algos_features_good': WindowsPath('cache/algos_features_good.parquet'),
 'benchmark_trades_clean': WindowsPath('cache/benchmark_trades_clean.parquet'),
 'benchmark_monthly_clean': WindowsPath('cache/benchmark_monthly_clean.parquet'),
 'benchmark_yearly_clean': WindowsPath('cache/benchmark_yearly_clean.parquet'),
 'benchmark_monthly_stats': WindowsPath('cache/benchmark_monthly_stats.parquet')}

In [11]:
# # =========================
# # Cell 9 — Ejecuta (ajusta root_dir)
# # =========================

# cfg = PreprocessConfig(
#     root_dir=Path("datos_competicion"),
#     cache_dir=Path("cache"),
#     min_obs=60,
#     min_coverage=0.70,
#     feature_windows=(20, 60, 120),
# )

# paths = run_preprocessing(cfg, overwrite=False)
# paths
