In [None]:
# Parameters
import os

# Run-level switches (the run_date line is marked as a papermill replacement target).
run_date = "2026-01-01"  # papermill replacement
dry_run = False

# Locations come from the environment, falling back to repo-relative defaults.
output_dir = os.getenv("ORION_SIGNALS_DIR", "../signals")
config_path = os.getenv("DATUM_API_CONFIG_PATH", "../ops/datum_api_config.json")

# Create the output directory up front; a no-op when it already exists.
os.makedirs(output_dir, exist_ok=True)


In [1]:
# Import basic modules
import pandas as pd
from datum_api_client import DatumApi
import datetime
from datetime import timedelta
from typing import Optional, List, Dict, Any


# Warning configuration
import warnings
# NOTE(review): the call below silences every warning category for the rest of
# the kernel session, including any numeric or deprecation warnings raised by
# later cells. Confirm this is intended before relying on a "clean" run.
warnings.filterwarnings("ignore")
# Manual install hints left by the author (presumably optional Excel engines
# for pandas; nothing in the visible cells imports them):
# pip install xlrd
# pip install openpyxl

In [None]:
import os
import json
from datetime import time, datetime, date, timedelta
from typing import Optional, List, Dict, Any, Tuple

import numpy as np
import pandas as pd


def _add_minutes_to_time(t: time, minutes: int) -> time:
    base = datetime.combine(date(2000, 1, 1), t)
    return (base + timedelta(minutes=minutes)).time()


def _time_bucket_str(t: time, bucket_minutes: int) -> str:
    if bucket_minutes <= 1:
        return f"{t.hour:02d}:{t.minute:02d}"
    m = (t.minute // bucket_minutes) * bucket_minutes
    return f"{t.hour:02d}:{m:02d}"


def _linspace_edges(x: np.ndarray, n_bins: int) -> Optional[np.ndarray]:
    x = x[np.isfinite(x)]
    if x.size == 0:
        return None
    mn = float(x.min())
    mx = float(x.max())
    if not np.isfinite(mn) or not np.isfinite(mx):
        return None
    if mn == mx:
        mx = mn + 1e-12
    return np.linspace(mn, mx, n_bins + 1, dtype=float)


def _bin_index(x: np.ndarray, edges: np.ndarray) -> np.ndarray:
    idx = np.digitize(x, edges, right=False) - 1
    n_bins = edges.size - 1
    return np.clip(idx, 0, n_bins - 1)


def _merge_adjacent_ranges(ranges: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    if not ranges:
        return []
    ranges = sorted(ranges, key=lambda r: (r["from"], r["to"]))
    out = []
    cur = dict(ranges[0])
    cur_sum = cur["rate"] * cur["total"]

    for r in ranges[1:]:
        if r["from"] <= cur["to"]:
            cur["to"] = max(cur["to"], r["to"])
            cur_sum += r["rate"] * r["total"]
            cur["total"] += r["total"]
            cur["rate"] = (cur_sum / cur["total"]) if cur["total"] else 0.0
        else:
            out.append(cur)
            cur = dict(r)
            cur_sum = cur["rate"] * cur["total"]

    out.append(cur)
    for r in out:
        r["from"] = float(r["from"])
        r["to"] = float(r["to"])
        r["rate"] = float(r["rate"])
        r["total"] = int(r["total"])
    return out


def _bins_1d_fast(
    x: np.ndarray,
    move: np.ndarray,
    stack_delta: np.ndarray,
    is_up: np.ndarray,
    edges: np.ndarray,
) -> dict:
    """
    Aggregate events into the 1D bins described by ``edges``.

    ``x`` selects the bin of each event; ``move`` and ``stack_delta`` are the
    per-event values being summarised and ``is_up`` is a boolean mask marking
    the "up" events (everything else counts as "down"). Bins that received no
    event are left out of the result.

    Returns:
      {
        "edges": [...],
        "bins": [
          {
            "from","to","total","up_count","down_count","up_rate","down_rate",
            "up":   {"n","mean","median"},
            "down": {"n","mean","median"},
            "stack_delta": {
              "up":   {"n","sum","mean","median"},
              "down": {"n","sum","mean","median"}
            }
          }, ...
        ]
      }
    Means and medians are ``None`` for a side with no events in the bin.
    """
    n_bins = edges.size - 1
    bin_of = _bin_index(x, edges)
    is_down = ~is_up

    total = np.bincount(bin_of, minlength=n_bins).astype(int)
    n_up = np.bincount(bin_of[is_up], minlength=n_bins).astype(int)
    n_down = (total - n_up).astype(int)

    def _sum_and_mean(values, side, side_count):
        # Per-bin sum of `values` over one side, and its mean (NaN where the side is empty).
        sums = np.bincount(bin_of[side], weights=values[side], minlength=n_bins)
        means = np.divide(sums, side_count, out=np.full(n_bins, np.nan), where=side_count > 0)
        return sums, means

    def _median_or_none(values):
        return float(np.median(values)) if values.size else None

    def _finite_or_none(value):
        return float(value) if np.isfinite(value) else None

    _, move_mean_up = _sum_and_mean(move, is_up, n_up)
    _, move_mean_down = _sum_and_mean(move, is_down, n_down)
    sd_sum_up, sd_mean_up = _sum_and_mean(stack_delta, is_up, n_up)
    sd_sum_down, sd_mean_down = _sum_and_mean(stack_delta, is_down, n_down)

    bins_out = []
    # Only populated bins are reported; medians are computed bin by bin.
    for i in np.flatnonzero(total):
        in_bin = bin_of == i
        up_here = in_bin & is_up
        down_here = in_bin & is_down

        bins_out.append({
            "from": float(edges[i]),
            "to": float(edges[i + 1]),
            "total": int(total[i]),
            "up_count": int(n_up[i]),
            "down_count": int(n_down[i]),
            "up_rate": float(n_up[i] / total[i]),
            "down_rate": float(n_down[i] / total[i]),
            "up": {
                "n": int(n_up[i]),
                "mean": _finite_or_none(move_mean_up[i]),
                "median": _median_or_none(move[up_here]),
            },
            "down": {
                "n": int(n_down[i]),
                "mean": _finite_or_none(move_mean_down[i]),
                "median": _median_or_none(move[down_here]),
            },
            "stack_delta": {
                "up": {
                    "n": int(n_up[i]),
                    "sum": float(sd_sum_up[i]) if n_up[i] else 0.0,
                    "mean": _finite_or_none(sd_mean_up[i]),
                    "median": _median_or_none(stack_delta[up_here]),
                },
                "down": {
                    "n": int(n_down[i]),
                    "sum": float(sd_sum_down[i]) if n_down[i] else 0.0,
                    "mean": _finite_or_none(sd_mean_down[i]),
                    "median": _median_or_none(stack_delta[down_here]),
                },
            },
        })

    return {"edges": edges.tolist(), "bins": bins_out}


def _heatmap_avg_fast(
    x: np.ndarray,
    y: np.ndarray,
    move: np.ndarray,
    stack_delta: np.ndarray,
    n_bins: int,
) -> dict:
    """
    Average ``move`` and ``stack_delta`` over an ``n_bins`` x ``n_bins`` grid of (x, y).

    Both axes use equally spaced edges derived from the data. Only cells that
    contain at least one event are emitted. When either axis has no finite
    value, the edges are ``None`` and the cell list is empty.

    Returns:
      {
        "x_edges":[...], "y_edges":[...],
        "cells":[
          {"x_from","x_to","y_from","y_to","n","avg_move","avg_stack_delta"}
        ]
      }
    """
    x_edges = _linspace_edges(x, n_bins)
    y_edges = _linspace_edges(y, n_bins)
    if x_edges is None or y_edges is None:
        return {"x_edges": None, "y_edges": None, "cells": []}

    # Flatten the (x bin, y bin) pair into one cell id so bincount can aggregate.
    n_cells = n_bins * n_bins
    cell_id = _bin_index(x, x_edges) * n_bins + _bin_index(y, y_edges)

    counts = np.bincount(cell_id, minlength=n_cells).astype(int)
    move_sums = np.bincount(cell_id, weights=move, minlength=n_cells)
    sd_sums = np.bincount(cell_id, weights=stack_delta, minlength=n_cells)

    cells = []
    for k in np.flatnonzero(counts):
        n = int(counts[k])
        bx, by = divmod(int(k), n_bins)
        cells.append({
            "x_from": float(x_edges[bx]),
            "x_to": float(x_edges[bx + 1]),
            "y_from": float(y_edges[by]),
            "y_to": float(y_edges[by + 1]),
            "n": n,
            "avg_move": float(move_sums[k] / n),
            "avg_stack_delta": float(sd_sums[k] / n),
        })

    return {"x_edges": x_edges.tolist(), "y_edges": y_edges.tolist(), "cells": cells}


def analyze_open_strategy_fast(
    input_parquet: str,
    output_dir: str,
    *,
    time_col: str = "dt",
    ticker_col: str = "ticker",
    price_col: str = "c",
    stack_col: str = "Stack%",
    bench_col: str = "Bench%",
    devsig_col: str = "dev_sig",
    pre_from: time = time(9, 25),
    pre_to: time = time(9, 30),
    open_from: time = time(9, 30),
    open_to: time = time(10, 0),
    class_minutes: Tuple[int, ...] = (5, 10, 15, 20, 30),
    end_tolerance_minutes: int = 3,
    min_move_abs: float = 0.3,
    n_bins_1d: int = 20,
    n_bins_2d: int = 20,
    minRateTop: float = 0.6,
    minTotalTop: int = 4,
    peak_time_bin_minutes: int = 1,
) -> None:
    """
    Build open-window move events per ticker and export their statistics.

    For every (ticker, session date) a "start" row is taken from the pre-window
    and an "end" row is taken at each horizon ("class"); the percentage price
    move and the change of ``stack_col`` between the two rows form one event.
    The original notes describe the exports as the data needed for UI rendering.

    Event construction:
      - start: earliest row in ``[pre_from, pre_to]`` whose ``stack_col``,
        ``bench_col`` and ``devsig_col`` are all non-NaN.
      - classes: ``"glob"`` targets ``open_to``; each ``N`` in ``class_minutes``
        adds class ``"Nm"`` targeting ``open_from + N`` minutes.
      - end: earliest row in ``[target, target + end_tolerance_minutes]``. End
        candidates are limited to ``[open_from, open_to + end_tolerance_minutes]``,
        so a horizon past that window produces no events.
      - an event is kept when the end is strictly later than the start,
        ``abs(move_pct) >= min_move_abs`` and all features/outcomes are finite.

    Args:
        input_parquet: Path of the parquet file to read.
        output_dir: Directory for the exports (created if missing).
        time_col, ticker_col, price_col, stack_col, bench_col, devsig_col:
            Names of the input columns.
        pre_from, pre_to: Inclusive pre-window used to pick the start row.
        open_from, open_to: Inclusive open window.
        class_minutes: Horizons, in minutes after ``open_from``.
        end_tolerance_minutes: How late after its target an end row may be.
        min_move_abs: Minimum absolute move (in percent) for an event.
        n_bins_1d, n_bins_2d: Bin counts for the 1D bins and the 2D heatmaps.
        minRateTop, minTotalTop: A 1D bin enters ``best_ranges`` when it holds
            at least ``minTotalTop`` events and its up (or down) rate is at
            least ``minRateTop``.
        peak_time_bin_minutes: Bucket size of the peak-time histogram.

    Outputs (written into ``output_dir``):
      - summary.csv: one row per ticker with ``open_{class}_*`` columns.
      - onefile.jsonl: one JSON object per ticker with, per class, ``stats``,
        ``bins_1d`` (stack/bench/dev_sig) and ``heatmaps``; the ``glob`` class
        also carries ``glob_peak_time``, a histogram of the times at which
        ``stack_col`` reached its daily max/min inside ``[open_from, open_to]``.
      - best_params.jsonl: one JSON object per ticker with ``ratings`` and
        ``best_ranges`` (the original notes say this layout matches the
        OpenDoorBestParamsPath / StrategySignalService.LoadOpenDoorBestParams
        reader; that reader is not visible here).

    Raises:
        ValueError: if required columns are missing, no priced rows remain,
            the pre-window or open window is empty, or no event survives the
            filters.
    """
    os.makedirs(output_dir, exist_ok=True)
    summary_path = os.path.join(output_dir, "summary.csv")
    onefile_path = os.path.join(output_dir, "onefile.jsonl")
    best_params_path = os.path.join(output_dir, "best_params.jsonl")

    # 1) Read minimal columns
    cols = [time_col, ticker_col, price_col, stack_col, bench_col, devsig_col]
    df = pd.read_parquet(input_parquet, columns=cols)

    missing = [c for c in cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns in input: {missing}")

    df = df[df[time_col].notna() & df[ticker_col].notna()].copy()
    df[time_col] = pd.to_datetime(df[time_col])
    df["session_date"] = df[time_col].dt.date
    df["time_only"] = df[time_col].dt.time

    for c in (price_col, stack_col, bench_col, devsig_col):
        df[c] = pd.to_numeric(df[c], errors="coerce")

    # Start/end/peak rows below are selected by label (idxmin/idxmax + .loc).
    # Resetting to a fresh unique index keeps those lookups one-row-per-label
    # even when the parquet file carries a non-unique index.
    df = df[df[price_col].notna()].reset_index(drop=True)
    if df.empty:
        raise ValueError("No rows with price present after cleaning.")

    # 2) Start (pre-window): first available per (ticker,date) WITH non-NaN features
    pre_mask = (df["time_only"] >= pre_from) & (df["time_only"] <= pre_to)
    df_pre = df.loc[pre_mask].copy()
    df_pre = df_pre[df_pre[stack_col].notna() & df_pre[bench_col].notna() & df_pre[devsig_col].notna()].copy()
    if df_pre.empty:
        # Message is built from the actual arguments (identical text with defaults).
        raise ValueError(
            f"No pre-window rows with non-NaN {stack_col}/{bench_col}/{devsig_col} "
            f"in {pre_from:%H:%M}–{pre_to:%H:%M}."
        )

    df_pre = df_pre.sort_values(time_col)
    start_idx = df_pre.groupby([ticker_col, "session_date"])[time_col].idxmin()
    starts = df_pre.loc[start_idx].copy()

    # "<ticker>|<date>" string key used to align starts with ends.
    starts["__key"] = starts[ticker_col].astype(str) + "|" + starts["session_date"].astype(str)
    starts = starts.set_index("__key", drop=True)

    # 3) Prepare end candidates once: only open window + tolerance
    open_end_to = _add_minutes_to_time(open_to, end_tolerance_minutes)
    df_open = df[(df["time_only"] >= open_from) & (df["time_only"] <= open_end_to)].copy()
    if df_open.empty:
        raise ValueError(f"No rows in open window {open_from:%H:%M}–{open_to:%H:%M}(+tolerance).")

    df_open["__key"] = df_open[ticker_col].astype(str) + "|" + df_open["session_date"].astype(str)

    # Define classes: glob + Nm
    class_defs: List[Tuple[str, time]] = [("glob", open_to)]
    for m in class_minutes:
        class_defs.append((f"{int(m)}m", _add_minutes_to_time(open_from, int(m))))

    events_all = []

    # 4) Build events per class
    for cls, t_target in class_defs:
        t_end_to = _add_minutes_to_time(t_target, end_tolerance_minutes)
        m_end = (df_open["time_only"] >= t_target) & (df_open["time_only"] <= t_end_to)
        cand = df_open.loc[m_end].copy()
        if cand.empty:
            continue

        # End row = earliest candidate per (ticker, date) inside the tolerance window.
        cand = cand.sort_values(time_col)
        end_idx = cand.groupby([ticker_col, "session_date"])[time_col].idxmin()
        ends = cand.loc[end_idx].copy()

        ends["__key"] = ends[ticker_col].astype(str) + "|" + ends["session_date"].astype(str)
        ends = ends.set_index("__key", drop=True)

        common_keys = starts.index.intersection(ends.index)
        if common_keys.empty:
            continue

        # Same key order on both sides, so row i of `s` pairs with row i of `e`.
        s = starts.loc[common_keys]
        e = ends.loc[common_keys]

        # chronological check
        s_dt = s[time_col].to_numpy()
        e_dt = e[time_col].to_numpy()
        ok = e_dt > s_dt
        if not ok.any():
            continue

        idx_ok = np.where(ok)[0]
        s = s.iloc[idx_ok]
        e = e.iloc[idx_ok]

        start_price = s[price_col].to_numpy(dtype=float)
        end_price = e[price_col].to_numpy(dtype=float)
        # A zero start price yields inf/NaN here; the isfinite filter below drops it.
        move_pct = 100.0 * (end_price / start_price - 1.0)

        # stack_delta (end_stack - start_stack)
        start_stack = s[stack_col].to_numpy(dtype=float)
        end_stack = e[stack_col].to_numpy(dtype=float)
        stack_delta = end_stack - start_stack

        # filter by min_move_abs
        good = np.isfinite(move_pct) & (np.abs(move_pct) >= float(min_move_abs))
        if not good.any():
            continue

        idx_good = np.where(good)[0]
        s = s.iloc[idx_good]
        e = e.iloc[idx_good]

        move_pct = move_pct[good]
        stack_delta = stack_delta[good]
        start_price = start_price[good]
        end_price = end_price[good]

        dir_up = move_pct > 0.0
        direction = np.where(dir_up, "up", "down")

        ev = pd.DataFrame({
            ticker_col: s[ticker_col].to_numpy(),
            "session_date": s["session_date"].to_numpy(),
            "class": cls,
            "start_dt": s[time_col].to_numpy(),
            "end_dt": e[time_col].to_numpy(),
            "x_stack": s[stack_col].to_numpy(dtype=float),
            "x_bench": s[bench_col].to_numpy(dtype=float),
            "x_dev": s[devsig_col].to_numpy(dtype=float),
            "start_price": start_price,
            "end_price": end_price,
            "move_pct": move_pct,
            "stack_delta": stack_delta,
            "dir": direction,
        })

        # drop rows where any needed feature/outcome is NaN
        # (in particular events whose end row has no stack value)
        ev = ev[
            np.isfinite(ev["x_stack"])
            & np.isfinite(ev["x_bench"])
            & np.isfinite(ev["x_dev"])
            & np.isfinite(ev["move_pct"])
            & np.isfinite(ev["stack_delta"])
        ].copy()

        if not ev.empty:
            events_all.append(ev)

    if not events_all:
        raise ValueError("No events built for any class after filters. Check min_move_abs / data coverage.")

    events = pd.concat(events_all, ignore_index=True)

    # 5) summary.csv (wide, one row per ticker)
    rows = []
    for cls, _ in class_defs:
        evc = events[events["class"] == cls]
        if evc.empty:
            continue

        g = evc.groupby(ticker_col)
        tmp = g.agg(
            total=("dir", "count"),
            up=("dir", lambda s: int((s == "up").sum())),
            down=("dir", lambda s: int((s == "down").sum())),
            mean_move=("move_pct", "mean"),
            median_move=("move_pct", "median"),
            mean_stack_delta=("stack_delta", "mean"),
            median_stack_delta=("stack_delta", "median"),
        ).reset_index()

        tmp[f"open_{cls}_total"] = tmp["total"]
        tmp[f"open_{cls}_up_rate"] = tmp["up"] / tmp["total"]
        tmp[f"open_{cls}_down_rate"] = tmp["down"] / tmp["total"]
        tmp[f"open_{cls}_mean_move"] = tmp["mean_move"]
        tmp[f"open_{cls}_median_move"] = tmp["median_move"]
        tmp[f"open_{cls}_mean_stack_delta"] = tmp["mean_stack_delta"]
        tmp[f"open_{cls}_median_stack_delta"] = tmp["median_stack_delta"]

        tmp = tmp[
            [
                ticker_col,
                f"open_{cls}_total",
                f"open_{cls}_up_rate",
                f"open_{cls}_down_rate",
                f"open_{cls}_mean_move",
                f"open_{cls}_median_move",
                f"open_{cls}_mean_stack_delta",
                f"open_{cls}_median_stack_delta",
            ]
        ]
        rows.append(tmp)

    # `events` is non-empty here, so at least one class contributed a frame.
    summary = rows[0]
    for r in rows[1:]:
        summary = summary.merge(r, on=ticker_col, how="outer")

    summary = summary.sort_values(ticker_col)
    summary.to_csv(summary_path, index=False)

    # 6) onefile.jsonl + best_params.jsonl
    with open(onefile_path, "w", encoding="utf-8") as onefile_out, open(best_params_path, "w", encoding="utf-8") as best_out:

        # peak histogram source (strict open window) with stack value present
        df_peaks = df[
            (df["time_only"] >= open_from)
            & (df["time_only"] <= open_to)
            & df[stack_col].notna()
        ].copy()

        # Group the peak source once instead of re-filtering the whole frame
        # for every ticker inside the loop below.
        peaks_by_ticker = df_peaks.groupby(ticker_col, sort=False, observed=True)
        tickers_with_peaks = peaks_by_ticker.groups

        for ticker, ev_t in events.groupby(ticker_col):
            rec = {"ticker": ticker, "classes": {}}

            # best_params row (original note: matches StrategySignalService.LoadOpenDoorBestParams)
            best = {
                "ticker": ticker,
                "ratings": {},     # cls -> { any:{rate,total}, up:{rate,total}, down:{rate,total} }
                "best_ranges": {}, # cls -> { stack:{up/down}, bench:{...}, dev_sig:{...} }
            }

            # glob peak-time histogram per ticker: for each session, count the
            # time bucket of the stack maximum ("up") and minimum ("down").
            up_counts: Dict[str, int] = {}
            down_counts: Dict[str, int] = {}
            if ticker in tickers_with_peaks:
                w = peaks_by_ticker.get_group(ticker)
                for _, gd in w.groupby("session_date"):
                    i_max = gd[stack_col].idxmax()
                    i_min = gd[stack_col].idxmin()
                    t_max = gd.loc[i_max, "time_only"]
                    t_min = gd.loc[i_min, "time_only"]
                    k_up = _time_bucket_str(t_max, peak_time_bin_minutes)
                    k_dn = _time_bucket_str(t_min, peak_time_bin_minutes)
                    up_counts[k_up] = up_counts.get(k_up, 0) + 1
                    down_counts[k_dn] = down_counts.get(k_dn, 0) + 1

            peak_hist = {
                "bin_minutes": int(peak_time_bin_minutes),
                "up_peak": up_counts,
                "down_peak": down_counts
            }

            for cls, evc in ev_t.groupby("class"):
                mv = evc["move_pct"].to_numpy(dtype=float)
                sd = evc["stack_delta"].to_numpy(dtype=float)

                x_stack = evc["x_stack"].to_numpy(dtype=float)
                x_bench = evc["x_bench"].to_numpy(dtype=float)
                x_dev = evc["x_dev"].to_numpy(dtype=float)

                is_up = (evc["dir"].to_numpy() == "up")

                cls_obj = {
                    "stats": {
                        "total": int(evc.shape[0]),
                        "up_count": int(is_up.sum()),
                        "down_count": int((~is_up).sum()),
                        "up_rate": float(is_up.mean()) if evc.shape[0] else None,
                        "down_rate": float((~is_up).mean()) if evc.shape[0] else None,
                        "mean_move": float(np.mean(mv)) if mv.size else None,
                        "median_move": float(np.median(mv)) if mv.size else None,
                        "mean_stack_delta": float(np.mean(sd)) if sd.size else None,
                        "median_stack_delta": float(np.median(sd)) if sd.size else None,
                    },
                    "bins_1d": {},
                    "heatmaps": {},
                }

                # 1D bins (edges are derived per ticker and class from the event features)
                e_stack = _linspace_edges(x_stack, n_bins_1d)
                e_bench = _linspace_edges(x_bench, n_bins_1d)
                e_dev = _linspace_edges(x_dev, n_bins_1d)

                cls_obj["bins_1d"]["stack"] = {"edges": None, "bins": []} if e_stack is None else _bins_1d_fast(x_stack, mv, sd, is_up, e_stack)
                cls_obj["bins_1d"]["bench"] = {"edges": None, "bins": []} if e_bench is None else _bins_1d_fast(x_bench, mv, sd, is_up, e_bench)
                cls_obj["bins_1d"]["dev_sig"] = {"edges": None, "bins": []} if e_dev is None else _bins_1d_fast(x_dev, mv, sd, is_up, e_dev)

                # 2D heatmaps
                cls_obj["heatmaps"]["stack_vs_bench"] = _heatmap_avg_fast(x_stack, x_bench, mv, sd, n_bins_2d)
                cls_obj["heatmaps"]["stack_vs_dev"] = _heatmap_avg_fast(x_stack, x_dev, mv, sd, n_bins_2d)
                cls_obj["heatmaps"]["bench_vs_dev"] = _heatmap_avg_fast(x_bench, x_dev, mv, sd, n_bins_2d)

                # ============================
                # best_params.jsonl payload
                # ============================

                total_cls = int(evc.shape[0])
                up_rate_cls = float(is_up.mean()) if total_cls else 0.0
                down_rate_cls = 1.0 - up_rate_cls if total_cls else 0.0

                # "any" is the rate of the dominant direction.
                best["ratings"][cls] = {
                    "any":  {"rate": float(max(up_rate_cls, down_rate_cls)), "total": int(total_cls)},
                    "up":   {"rate": float(up_rate_cls), "total": int(total_cls)},
                    "down": {"rate": float(down_rate_cls), "total": int(total_cls)},
                }

                best_cls_ranges: Dict[str, Any] = {}
                for feat in ("stack", "bench", "dev_sig"):
                    bins_list = cls_obj["bins_1d"][feat]["bins"]

                    up_ranges = []
                    dn_ranges = []

                    for b in bins_list:
                        total = int(b.get("total", 0) or 0)
                        if total < int(minTotalTop):
                            continue

                        upc = int(b.get("up_count", 0) or 0)
                        dnc = int(b.get("down_count", 0) or 0)
                        # Guards the divisions below when minTotalTop <= 0.
                        if total <= 0:
                            continue

                        ur = upc / total
                        dr = dnc / total

                        if ur >= float(minRateTop):
                            up_ranges.append({"from": b["from"], "to": b["to"], "rate": float(ur), "total": int(total)})
                        if dr >= float(minRateTop):
                            dn_ranges.append({"from": b["from"], "to": b["to"], "rate": float(dr), "total": int(total)})

                    # Neighbouring qualifying bins share an edge and collapse into one range.
                    best_cls_ranges[feat] = {
                        "up": _merge_adjacent_ranges(up_ranges),
                        "down": _merge_adjacent_ranges(dn_ranges),
                    }

                best["best_ranges"][cls] = best_cls_ranges

                if cls == "glob":
                    cls_obj["glob_peak_time"] = peak_hist

                rec["classes"][cls] = cls_obj

            onefile_out.write(json.dumps(rec, ensure_ascii=False) + "\n")
            best_out.write(json.dumps(best, ensure_ascii=False) + "\n")

    # Completion banner listing the written files (runtime text kept verbatim).
    print("GOTOVO:")
    print(" ", summary_path)
    print(" ", onefile_path)
    print(" ", best_params_path)



In [None]:
# from datetime import time, datetime, date, timedelta

# analyze_open_strategy_fast(
#     input_parquet="ARBITRAGE/final_filtered.parquet",
#     output_dir="ARBITRAGE/open_analysis_fast",
#     n_bins_1d=10,
#     n_bins_2d=10,
#     min_move_abs=0.3,
#     minRateTop=0.6,
#     minTotalTop=4,
#     peak_time_bin_minutes=1,
# )


GOTOVO:
  ARBITRAGE/open_analysis_fast\summary.csv
  ARBITRAGE/open_analysis_fast\onefile.jsonl
  ARBITRAGE/open_analysis_fast\best_params.jsonl
