In [10]:
from dotenv import load_dotenv
import os
from pathlib import Path
from datetime import datetime, timedelta, date
import pandas as pd

# --- Load your API key from .env ---
# Explicitly look for .env in the parent directory (project root)
env_path = Path.cwd().parent / ".env"
if not env_path.exists():
    # If not found, try current directory
    env_path = Path.cwd() / ".env"
    
print(f"Looking for .env at: {env_path}")
print(f".env exists: {env_path.exists()}")

load_dotenv(dotenv_path=env_path)
api_key = os.getenv("DATABENTO_API_KEY")

if not api_key:
    raise RuntimeError(f"No API key found. Checked: {env_path}")

print(f"API key loaded successfully (first 10 chars): {api_key[:10]}")

# --- Initialize DataBento client ---
import databento as db
client = db.Historical(api_key=api_key)

# --- Timezone setup ---
try:
    # Preferred: Python 3.9+ zoneinfo
    from zoneinfo import ZoneInfo
    CHI = ZoneInfo("America/Chicago")
    UTC = ZoneInfo("UTC")
except Exception:
    # Fallback: pytz if zoneinfo not available
    import pytz
    CHI = pytz.timezone("America/Chicago")
    UTC = pytz.UTC



# ---------- Config you'll touch most often ----------
DATASET = "GLBX.MDP3"
SCHEMA  = "bbo-1m"          # 1-minute top-of-book snapshots
WINDOW_MIN = 5              # last N minutes of RTH to pull
RTH_END_CT = (15, 0, 0)     # 3:00pm CT (15:00:00) is CME equity-index futures pit/settlement window end
OUT_DIR = Path("data/raw")  # where DBN files are written
# Example symbols: keep this tight – one expiry + ATM ± N strikes.
SYMBOLS = ["ES."]

# Date range (inclusive)
START = date(2025, 10, 15)
END   = date(2025, 10, 21)
# ----------------------------------------------------


def ensure_outdir():
    OUT_DIR.mkdir(parents=True, exist_ok=True)


def day_iter(d0: date, d1: date):
    d = d0
    while d <= d1:
        yield d
        d += timedelta(days=1)


def close_window_utc(d: date, minutes=5):
    """Return (start_utc, end_utc) for the last N minutes of RTH on day d in UTC."""
    # Build end time in Chicago
    end_ct = datetime(d.year, d.month, d.day, *RTH_END_CT, tzinfo=CHI)
    start_ct = end_ct - timedelta(minutes=minutes)

    return start_ct.astimezone(UTC), end_ct.astimezone(UTC)


def estimate_cost(client: db.Historical, symbols, start_d: date, end_d: date, minutes=5):
    """Use metadata.get_cost to estimate cost for last N minutes per day."""
    total_bytes = 0
    total_usd = 0.0
    rows = []

    for d in day_iter(start_d, end_d):
        st_utc, en_utc = close_window_utc(d, minutes)
        # Skip weekends by checking weekday() (Mon=0 ... Sun=6). CME trades Sunday evening, but for EOD marks we keep weekdays.
        if d.weekday() >= 5:
            continue

        info = client.metadata.get_cost(
            dataset=DATASET,
            schema=SCHEMA,
            symbols=symbols,
            start=st_utc.isoformat(),
            end=en_utc.isoformat(),
        )
        # info contains estimated size_bytes and cost_usd for this request
        size_b = getattr(info, "size_bytes", None) or info.get("size_bytes", 0)
        cost   = getattr(info, "cost_usd",   None) or info.get("cost_usd",   0.0)

        rows.append({"date": d.isoformat(), "size_bytes": size_b, "cost_usd": cost})
        total_bytes += size_b
        total_usd   += float(cost)

    est = pd.DataFrame(rows)
    return est, total_bytes, total_usd


def download_bbo_last_window(client: db.Historical, symbols, start_d: date, end_d: date, minutes=5):
    """Download last N minutes per day and write one DBN per day (small, tidy files)."""
    ensure_outdir()
    manifest = []

    for d in day_iter(start_d, end_d):
        st_utc, en_utc = close_window_utc(d, minutes)
        if d.weekday() >= 5:
            continue

        print(f"Downloading {d.isoformat()} {st_utc.isoformat()} → {en_utc.isoformat()} ({minutes}m) …")
        data = client.timeseries.get_range(
            dataset=DATASET,
            schema=SCHEMA,
            symbols=symbols,
            start=st_utc,
            end=en_utc,
        )
        # Write per-day DBN to keep things modular
        out_file = OUT_DIR / f"glbx-mdp3-{d.isoformat()}.{SCHEMA}.last{minutes}m.dbn.zst"
        data.to_file(str(out_file))
        manifest.append({"date": d.isoformat(), "file": str(out_file)})

    return pd.DataFrame(manifest)


def dbn_to_parquet_mid(closed_dbn_path: Path) -> Path:
    """
    Optional helper: read a per-day DBN file and write a parquet with mids and an aggregated 'quote close'.
    """
    store = db.DBNStore.from_file(str(closed_dbn_path))
    df = store.to_df()
    # Expect columns like: ts_event, symbol, bid_px, ask_px, bid_sz, ask_sz
    
    # Optional: Filter for ES options only (excludes ES futures)
    # ES options symbols contain a space then C/P then digits, e.g. "ESZ5 C7000"
    # Uncomment the line below to filter:
    # df = df[df["symbol"].str.contains(r"\s[CP]\d+$", regex=True, na=False)]
    
    df = df.sort_values(["symbol", "ts_event"])
    df["mid"] = (df["bid_px"] + df["ask_px"]) / 2

    # Aggregate to a single 'close' per symbol per day using median(mid)
    df["date"] = pd.to_datetime(df["ts_event"]).dt.date
    qclose = (
        df.groupby(["date", "symbol"], as_index=False)
          .agg(mid_close=("mid", "median"),
               spread_close=("ask_px", lambda x: (x - df.loc[x.index, "bid_px"]).median()))
    )
    out_pq = closed_dbn_path.with_suffix("").with_suffix(".parquet")
    qclose.to_parquet(out_pq, index=False)
    return out_pq


def pretty_cost(est_df, tot_bytes, tot_usd):
    if est_df.empty:
        print("No trading days in range (weekends/filters).")
        return
    print(est_df.to_string(index=False))
    print(f"\nEstimated total size: {tot_bytes/1e6:.3f} MB")
    print(f"Estimated total cost: ${tot_usd:.2f} USD\n")


def confirm_or_abort(tot_usd, max_budget_usd=None):
    if max_budget_usd is not None and tot_usd > max_budget_usd:
        raise RuntimeError(f"Estimated cost ${tot_usd:.2f} exceeds budget ${max_budget_usd:.2f}. Aborting.")
    ans = input("Proceed with download? [y/N]: ").strip().lower()
    if ans != "y":
        raise SystemExit("Aborted by user before download.")


def main():
    api_key = os.getenv("DATABENTO_API_KEY")
    if not api_key:
        raise RuntimeError("Set DATABENTO_API_KEY in your environment.")

    client = db.Historical(api_key=api_key)

    print("Estimating cost …")
    est_df, tot_bytes, tot_usd = estimate_cost(client, SYMBOLS, START, END, minutes=WINDOW_MIN)
    pretty_cost(est_df, tot_bytes, tot_usd)

    # OPTIONAL: set a soft budget (e.g., $5); set to None to disable
    MAX_BUDGET_USD = 5.00
    confirm_or_abort(tot_usd, max_budget_usd=MAX_BUDGET_USD)

    # If the estimate looks good, download:
    print("Downloading data …")
    manifest = download_bbo_last_window(client, SYMBOLS, START, END, minutes=WINDOW_MIN)
    print("\nWrote DBN files:")
    print(manifest)

    # Optional: convert each DBN to a daily parquet of quote-based closes
    print("\nConverting to quote-based daily close parquet …")
    out_parquets = []
    for _, row in manifest.iterrows():
        pq = dbn_to_parquet_mid(Path(row["file"]))
        out_parquets.append(pq)
        print(f"  {pq}")

    print("\nDone.")


Looking for .env at: c:\Users\alexp\OneDrive\Gdrive\Trading\GitHub Projects\databento-es-options\.env
.env exists: True
API key loaded successfully (first 10 chars): db-bqK4ip9


TypeError: Historical.__init__() got an unexpected keyword argument 'api_key'

In [8]:
# Test: Check if API key is loaded
print(f"API key loaded: {api_key is not None}")
print(f"API key (first 10 chars): {api_key[:10] if api_key else 'NOT FOUND'}")
print(f"Working directory: {os.getcwd()}")
print(f"Looking for .env in: {Path.cwd()}")
print(f".env exists: {Path('.env').exists()}")


API key loaded: True
API key (first 10 chars): db-bqK4ip9
Working directory: c:\Users\alexp\OneDrive\Gdrive\Trading\GitHub Projects\databento-es-options\close_quotes
Looking for .env in: c:\Users\alexp\OneDrive\Gdrive\Trading\GitHub Projects\databento-es-options\close_quotes
.env exists: False


In [9]:
# Run the main function
main()


NameError: name 'main' is not defined