In [None]:
# Parameters
# Notebook-level settings. Per the original note, run_date is meant to be
# replaced by papermill at execution time, so keep these as plain assignments.
# NOTE(review): run_date, config_path and dry_run are not referenced anywhere
# in the visible part of this notebook - confirm they are still needed.
run_date = "2026-01-01"  # papermill replacement
import os
# Both locations can be overridden through environment variables; the second
# argument is the fallback used when the variable is not set.
output_dir = os.environ.get("ORION_SIGNALS_DIR", "../signals")
config_path = os.environ.get("DATUM_API_CONFIG_PATH", "../ops/datum_api_config.json")
dry_run = False

# Create the output directory up front (no error if it already exists).
os.makedirs(output_dir, exist_ok=True)


In [1]:
# Import basic modules
import pandas as pd
from datum_api_client import DatumApi
import datetime
from datetime import timedelta
from typing import Optional, List, Dict, Any


# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below - consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")
# Optional packages to install manually (presumably Excel engines for pandas;
# nothing in the visible cells reads Excel files - TODO confirm still needed):
# pip install xlrd
# pip install openpyxl

In [None]:
import os
import json
import gzip
from pathlib import Path
from datetime import time, datetime, date, timedelta
from typing import Optional, List, Dict, Any, Tuple

import numpy as np
import pandas as pd


# =========================
# ====== HELPERS ==========
# =========================

def _add_minutes_to_time(t: time, minutes: int) -> time:
    """Return the time of day that lies ``minutes`` after ``t``.

    The shift is computed on a throwaway calendar date (2000-01-01) and only
    the time component of the result is returned, so crossing midnight wraps
    around instead of failing.  Negative ``minutes`` shift backwards.
    """
    anchor = datetime.combine(date(2000, 1, 1), t)
    shifted = anchor + timedelta(minutes=minutes)
    return shifted.time()


def _time_bucket_str(t: time, bucket_minutes: int) -> str:
    """Format ``t`` as ``HH:MM`` with the minute floored to its bucket.

    With ``bucket_minutes`` of 1 or less the minute is used as is; otherwise
    it is rounded down to the nearest multiple of ``bucket_minutes``.
    Seconds are always dropped.
    """
    floored_minute = (
        t.minute
        if bucket_minutes <= 1
        else (t.minute // bucket_minutes) * bucket_minutes
    )
    return f"{t.hour:02d}:{floored_minute:02d}"


def _linspace_edges(x: np.ndarray, n_bins: int) -> Optional[np.ndarray]:
    """Return ``n_bins + 1`` equally spaced bin edges spanning the finite values of ``x``.

    Args:
        x: Numeric array; NaN and +/-inf entries are ignored.
        n_bins: Number of bins; the result holds ``n_bins + 1`` edges.

    Returns:
        A float array of edges from the smallest to the largest finite value,
        or ``None`` when ``x`` contains no finite value.
    """
    x = x[np.isfinite(x)]
    if x.size == 0:
        return None

    # Both extremes are finite here because non-finite values were dropped.
    mn = float(x.min())
    mx = float(x.max())

    if mn == mx:
        # All values are identical: widen the interval a little so the edges
        # stay strictly increasing.  A fixed 1e-12 is lost to float64 rounding
        # once |mn| is large (e.g. 1e6 + 1e-12 == 1e6), which would make every
        # edge identical, so the pad also scales with the magnitude of mn.
        mx = mn + max(1e-12, abs(mn) * 1e-9)

    return np.linspace(mn, mx, n_bins + 1, dtype=float)


def _bin_index(x: np.ndarray, edges: np.ndarray) -> np.ndarray:
    """Map each value of ``x`` to the zero-based index of its bin.

    Bins are half-open ``[edges[i], edges[i + 1])``.  Values that fall outside
    the edges (including a value equal to the last edge) are clamped into the
    first or the last bin.
    """
    last_bin = edges.size - 2
    position = np.digitize(x, edges, right=False)
    return np.clip(position - 1, 0, last_bin)


def _merge_adjacent_ranges(ranges: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Merge touching or overlapping ranges into single entries.

    Every input dict carries ``from``, ``to``, ``rate`` and ``total``.  Ranges
    are ordered by ``(from, to)``; a range whose ``from`` does not exceed the
    current ``to`` is folded into the current one.  The merged ``total`` is the
    sum of the totals and the merged ``rate`` is the total-weighted mean of the
    rates (``0.0`` when the merged total is zero).

    The input dicts are not modified.  In the result ``from``, ``to`` and
    ``rate`` are floats and ``total`` is an int.
    """
    if not ranges:
        return []

    ordered = sorted(ranges, key=lambda item: (item["from"], item["to"]))

    merged = []
    active = None
    weighted_sum = None  # running sum of rate * total for the active range

    for item in ordered:
        starts_new_range = active is None or not (item["from"] <= active["to"])
        if starts_new_range:
            if active is not None:
                merged.append(active)
            active = dict(item)  # copy so the caller's dict stays untouched
            weighted_sum = active["rate"] * active["total"]
            continue

        active["to"] = max(active["to"], item["to"])
        weighted_sum += item["rate"] * item["total"]
        active["total"] += item["total"]
        active["rate"] = (weighted_sum / active["total"]) if active["total"] else 0.0

    merged.append(active)

    # Normalise value types so the result is plain-Python and JSON friendly.
    for entry in merged:
        for key in ("from", "to", "rate"):
            entry[key] = float(entry[key])
        entry["total"] = int(entry["total"])
    return merged


# =========================
# ====== MAIN FUNC ========
# =========================

def analyze_open_strategy_fast(
    input_parquet: str,
    output_dir: str,
    *,
    time_col: str = "dt",
    ticker_col: str = "ticker",
    price_col: str = "c",
    stack_col: str = "Stack%",
    bench_col: str = "Bench%",
    devsig_col: str = "dev_sig",
    pre_from: time = time(9, 25),
    pre_to: time = time(9, 30),
    open_from: time = time(9, 30),
    open_to: time = time(10, 0),
    class_minutes: Tuple[int, ...] = (5, 10, 15, 20, 30),
    end_tolerance_minutes: int = 3,
    min_move_abs: float = 0.3,
    n_bins_1d: int = 20,
    n_bins_2d: int = 20,
    minRateTop: float = 0.6,
    minTotalTop: int = 4,
    peak_time_bin_minutes: int = 1,
) -> None:
    """Build open-window price-move events and write per-ticker statistics.

    For every (ticker, session date) the earliest row inside
    ``[pre_from, pre_to]`` that has price, stack, bench and dev-sig values is
    the start point.  For the ``glob`` class (target ``open_to``) and for each
    ``<m>m`` class (target ``open_from`` + m minutes) the earliest row inside
    ``[target, target + end_tolerance_minutes]`` is the end point.  A pair
    becomes an event when the end is strictly later than the start and the
    absolute percentage price move is at least ``min_move_abs``.

    Three files are (re)written inside ``output_dir``:

    * ``summary.csv`` - one row per ticker with the ``open_<class>_*`` columns;
    * ``onefile.jsonl`` - one JSON record per ticker with per-class stats;
    * ``best_params.jsonl`` - a meta line followed by per-ticker ratings.

    Args:
        input_parquet: Path of the parquet file to read.
        output_dir: Directory for the output files (created if missing).
        time_col, ticker_col, price_col, stack_col, bench_col, devsig_col:
            Names of the columns read from the parquet file.
        pre_from, pre_to: Inclusive time-of-day window for the start point.
        open_from, open_to: Open window; ``open_to`` is the ``glob`` target.
        class_minutes: Offsets (minutes after ``open_from``) of the ``<m>m``
            classes.
        end_tolerance_minutes: How far after a target an end row may lie.
        min_move_abs: Minimum absolute move (percent) for an event.
        n_bins_1d, n_bins_2d, minRateTop, minTotalTop, peak_time_bin_minutes:
            Accepted for interface compatibility; not used by this function.

    Raises:
        ValueError: If no priced rows, no pre-window rows or no events exist.
    """
    # Local import: only needed for the timezone-aware "generated_at" stamp.
    from datetime import timezone

    # =========================
    # ===== PATH SETUP ========
    # =========================

    out_dir = Path(output_dir).resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    summary_path = out_dir / "summary.csv"
    onefile_path = out_dir / "onefile.jsonl"
    best_params_path = out_dir / "best_params.jsonl"

    def _open_text(path: Path, mode: str = "wt"):
        # Open a UTF-8 text file; paths ending in ".gz" are gzip-compressed.
        if str(path).lower().endswith(".gz"):
            return gzip.open(path, mode, encoding="utf-8", newline="\n", compresslevel=6)
        return open(path, mode.replace("t", ""), encoding="utf-8", newline="\n")

    # =========================
    # ===== INIT SUMMARY ======
    # =========================

    stat_names = (
        "total",
        "up_rate",
        "down_rate",
        "mean_move",
        "median_move",
        "mean_stack_delta",
        "median_stack_delta",
    )
    # int(m) keeps the header names identical to the event class names below.
    class_names = ["glob"] + [f"{int(m)}m" for m in class_minutes]
    summary_cols = [ticker_col] + [
        f"open_{cls}_{stat}" for cls in class_names for stat in stat_names
    ]

    # Write the header first so the file exists even if a later step raises.
    pd.DataFrame(columns=summary_cols).to_csv(summary_path, index=False, mode="w")

    # =========================
    # ===== INIT BEST PARAMS ==
    # =========================

    # Same naive-UTC ISO string + "Z" as before, without the deprecated utcnow().
    generated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    with _open_text(best_params_path, "wt") as best_out:
        best_out.write(json.dumps({
            "meta": {
                "version": "open_v2",
                "generated_at": generated_at
            }
        }, ensure_ascii=False) + "\n")

    # =========================
    # ===== ORIGINAL LOGIC ====
    # =========================

    cols = [time_col, ticker_col, price_col, stack_col, bench_col, devsig_col]
    df = pd.read_parquet(input_parquet, columns=cols)

    df = df[df[time_col].notna() & df[ticker_col].notna()].copy()
    df[time_col] = pd.to_datetime(df[time_col])
    df["session_date"] = df[time_col].dt.date
    df["time_only"] = df[time_col].dt.time

    for c in (price_col, stack_col, bench_col, devsig_col):
        df[c] = pd.to_numeric(df[c], errors="coerce")

    df = df[df[price_col].notna()].copy()
    if df.empty:
        raise ValueError("No rows with price present.")

    # --- PRE window
    pre_mask = (df["time_only"] >= pre_from) & (df["time_only"] <= pre_to)
    df_pre = df.loc[pre_mask].copy()
    df_pre = df_pre[
        df_pre[stack_col].notna() &
        df_pre[bench_col].notna() &
        df_pre[devsig_col].notna()
    ].copy()

    if df_pre.empty:
        raise ValueError("No pre-window rows.")

    # Earliest pre-window row per (ticker, session) is the start point.
    df_pre = df_pre.sort_values(time_col)
    start_idx = df_pre.groupby([ticker_col, "session_date"])[time_col].idxmin()
    starts = df_pre.loc[start_idx].copy()

    starts["__key"] = starts[ticker_col].astype(str) + "|" + starts["session_date"].astype(str)
    starts = starts.set_index("__key", drop=True)

    # --- OPEN window
    open_end_to = _add_minutes_to_time(open_to, end_tolerance_minutes)
    df_open = df[(df["time_only"] >= open_from) & (df["time_only"] <= open_end_to)].copy()
    df_open["__key"] = df_open[ticker_col].astype(str) + "|" + df_open["session_date"].astype(str)

    class_defs = [("glob", open_to)]
    for m in class_minutes:
        class_defs.append((f"{int(m)}m", _add_minutes_to_time(open_from, int(m))))

    events_all = []

    for cls, t_target in class_defs:
        t_end_to = _add_minutes_to_time(t_target, end_tolerance_minutes)
        m_end = (df_open["time_only"] >= t_target) & (df_open["time_only"] <= t_end_to)
        cand = df_open.loc[m_end].copy()
        if cand.empty:
            continue

        # Earliest row at or after the target per (ticker, session) is the end point.
        cand = cand.sort_values(time_col)
        end_idx = cand.groupby([ticker_col, "session_date"])[time_col].idxmin()
        ends = cand.loc[end_idx].copy()

        ends["__key"] = ends[ticker_col].astype(str) + "|" + ends["session_date"].astype(str)
        ends = ends.set_index("__key", drop=True)

        common_keys = starts.index.intersection(ends.index)
        if common_keys.empty:
            continue

        s = starts.loc[common_keys]
        e = ends.loc[common_keys]

        # Keep only pairs whose end is strictly after the start.
        s_dt = s[time_col].to_numpy()
        e_dt = e[time_col].to_numpy()
        ok = e_dt > s_dt
        if not ok.any():
            continue

        s = s.iloc[ok]
        e = e.iloc[ok]

        start_price = s[price_col].to_numpy(dtype=float)
        end_price = e[price_col].to_numpy(dtype=float)
        move_pct = 100.0 * (end_price / start_price - 1.0)

        start_stack = s[stack_col].to_numpy(dtype=float)
        end_stack = e[stack_col].to_numpy(dtype=float)
        stack_delta = end_stack - start_stack

        # Drop non-finite moves (e.g. zero start price) and moves that are too small.
        good = np.isfinite(move_pct) & (np.abs(move_pct) >= float(min_move_abs))
        if not good.any():
            continue

        s = s.iloc[good]
        e = e.iloc[good]

        move_pct = move_pct[good]
        stack_delta = stack_delta[good]

        direction = np.where(move_pct > 0.0, "up", "down")

        ev = pd.DataFrame({
            ticker_col: s[ticker_col].to_numpy(),
            "session_date": s["session_date"].to_numpy(),
            "class": cls,
            "start_dt": s[time_col].to_numpy(),
            "end_dt": e[time_col].to_numpy(),
            "x_stack": s[stack_col].to_numpy(dtype=float),
            "x_bench": s[bench_col].to_numpy(dtype=float),
            "x_dev": s[devsig_col].to_numpy(dtype=float),
            "move_pct": move_pct,
            "stack_delta": stack_delta,
            "dir": direction,
        })

        if not ev.empty:
            events_all.append(ev)

    if not events_all:
        raise ValueError("No events built.")

    events = pd.concat(events_all, ignore_index=True)

    # =========================
    # ===== SUMMARY APPEND ====
    # =========================

    rows = []
    for cls, _ in class_defs:
        evc = events[events["class"] == cls]
        if evc.empty:
            continue

        g = evc.groupby(ticker_col)
        tmp = g.agg(
            total=("dir", "count"),
            up=("dir", lambda s: int((s == "up").sum())),
            down=("dir", lambda s: int((s == "down").sum())),
            mean_move=("move_pct", "mean"),
            median_move=("move_pct", "median"),
            mean_stack_delta=("stack_delta", "mean"),
            median_stack_delta=("stack_delta", "median"),
        ).reset_index()

        # Keep only the class-prefixed columns.  Carrying the raw aggregate
        # columns along made every class frame share the same column names,
        # which collides in the merges below and no longer matches the header.
        per_cls = pd.DataFrame({
            ticker_col: tmp[ticker_col],
            f"open_{cls}_total": tmp["total"],
            f"open_{cls}_up_rate": tmp["up"] / tmp["total"],
            f"open_{cls}_down_rate": tmp["down"] / tmp["total"],
            f"open_{cls}_mean_move": tmp["mean_move"],
            f"open_{cls}_median_move": tmp["median_move"],
            f"open_{cls}_mean_stack_delta": tmp["mean_stack_delta"],
            f"open_{cls}_median_stack_delta": tmp["median_stack_delta"],
        })
        rows.append(per_cls)

    if rows:
        summary = rows[0]
        for r in rows[1:]:
            summary = summary.merge(r, on=ticker_col, how="outer")

        # Align with the header written above; classes without any event
        # become empty columns instead of shifting the remaining values.
        summary = summary.reindex(columns=summary_cols)
        summary.to_csv(summary_path, index=False, mode="a", header=False)

    # =========================
    # ===== JSONL WRITE =======
    # =========================

    with _open_text(onefile_path, "wt") as onefile_out, \
         _open_text(best_params_path, "at") as best_out:

        for ticker, ev_t in events.groupby(ticker_col):

            rec = {"ticker": ticker, "classes": {}}
            best = {"ticker": ticker, "ratings": {}, "best_ranges": {}}

            for cls, evc in ev_t.groupby("class"):
                mv = evc["move_pct"].to_numpy(dtype=float)
                sd = evc["stack_delta"].to_numpy(dtype=float)
                is_up = (evc["dir"].to_numpy() == "up")

                total_cls = int(evc.shape[0])
                up_rate_cls = float(is_up.mean()) if total_cls else 0.0
                down_rate_cls = 1.0 - up_rate_cls if total_cls else 0.0

                best["ratings"][cls] = {
                    "any": {"rate": float(max(up_rate_cls, down_rate_cls)), "total": total_cls},
                    "up": {"rate": up_rate_cls, "total": total_cls},
                    "down": {"rate": down_rate_cls, "total": total_cls},
                }

                rec["classes"][cls] = {
                    "stats": {
                        "total": total_cls,
                        "up_rate": up_rate_cls,
                        "down_rate": down_rate_cls,
                        "mean_move": float(np.mean(mv)) if mv.size else None,
                        "median_move": float(np.median(mv)) if mv.size else None,
                        "mean_stack_delta": float(np.mean(sd)) if sd.size else None,
                        "median_stack_delta": float(np.median(sd)) if sd.size else None,
                    }
                }

            onefile_out.write(json.dumps(rec, ensure_ascii=False) + "\n")
            best_out.write(json.dumps(best, ensure_ascii=False) + "\n")

    print("GOTOVO:")
    print(" ", summary_path)
    print(" ", onefile_path)
    print(" ", best_params_path)

In [None]:
from pathlib import Path
import os


def _resolve_orion_paths(strategy_code: str):
    """Locate the FINAL parquet file and the output directory for a strategy.

    Lookup order for each of the two locations:

    1. ``FINAL_PARQUET_PATH`` / ``SIGNALS_DIR`` environment variables;
    2. ``<ORION_HOME>/CRACEN/final.parquet`` / ``<ORION_HOME>/signals`` when
       ``ORION_HOME`` is set;
    3. the same layout under an OriON folder discovered by walking up from the
       current working directory (a folder named ``orion`` in any letter case,
       or a folder containing an ``OriON`` directory).

    The output directory ``<signals>/<strategy_code lower-cased>`` is created
    before the parquet file is checked.

    Returns:
        ``(final_path, out_dir)`` as resolved ``Path`` objects.

    Raises:
        RuntimeError: If no OriON home can be determined.
        FileNotFoundError: If the FINAL parquet file does not exist.
    """

    def _path_from_env(variable: str):
        # An unset or empty variable counts as "not provided".
        raw = os.environ.get(variable)
        return Path(raw).expanduser().resolve() if raw else None

    final_path = _path_from_env("FINAL_PARQUET_PATH")
    signals_base = _path_from_env("SIGNALS_DIR")
    orion_env = os.environ.get("ORION_HOME")

    if final_path is None or signals_base is None:
        if orion_env:
            orion_home = Path(orion_env).expanduser().resolve()
        else:
            # No explicit home: search the working directory and its parents.
            orion_home = None
            start = Path.cwd().resolve()
            for folder in (start, *start.parents):
                if folder.name.lower() == "orion":
                    orion_home = folder
                    break
                nested = folder / "OriON"
                if nested.exists() and nested.is_dir():
                    orion_home = nested.resolve()
                    break

            if orion_home is None:
                raise RuntimeError("Cannot locate OriON. Set ORION_HOME (recommended).")

        # Fill in only what the environment did not provide explicitly.
        if final_path is None:
            final_path = (orion_home / "CRACEN" / "final.parquet").resolve()
        if signals_base is None:
            signals_base = (orion_home / "signals").resolve()

    out_dir = (signals_base / strategy_code.lower()).resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    if not final_path.exists():
        raise FileNotFoundError(f"FINAL parquet not found: {final_path}")

    return final_path, out_dir


# ------------------------------------------------------------------
# MAIN (NO SORTING)
# ------------------------------------------------------------------

# Resolve the input parquet and the output directory for the "open" strategy.
# _resolve_orion_paths creates the output directory and raises if the parquet
# file is missing, so everything below runs only with a valid input path.
FINAL_PATH, OUT_DIR = _resolve_orion_paths("open")

print("Using FINAL parquet (no pre-sorting):", FINAL_PATH)

# Run the analysis; it writes summary.csv, onefile.jsonl and best_params.jsonl
# into OUT_DIR and returns nothing.
# NOTE(review): n_bins_1d, n_bins_2d, minRateTop, minTotalTop and
# peak_time_bin_minutes are accepted by analyze_open_strategy_fast, but its
# body never reads them, so only min_move_abs affects the result here.
analyze_open_strategy_fast(
    input_parquet=str(FINAL_PATH),
    output_dir=str(OUT_DIR),

    n_bins_1d=10,
    n_bins_2d=10,

    min_move_abs=0.3,
    minRateTop=0.6,
    minTotalTop=4,

    peak_time_bin_minutes=1,
)

print("OPEN analysis completed (no sorting).")

GOTOVO:
  ARBITRAGE/open_analysis_fast\summary.csv
  ARBITRAGE/open_analysis_fast\onefile.jsonl
  ARBITRAGE/open_analysis_fast\best_params.jsonl
