# In [0]:
# Databricks Python Notebook: Bronze ingest from Binance Kline REST (append-only)
import time, json, random, datetime as dt
import requests
from typing import List, Dict, Tuple, Optional
from pyspark.sql import Row
from pyspark.sql.functions import col, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType

# ===== (0) Recommended cluster settings =====
# Turn on the Delta optimizeWrite and autoCompact options for this Spark session.
# `spark` is not defined in this file; it is expected to be provided by the notebook runtime.
spark.conf.set("spark.databricks.delta.optimizeWrite","true")
spark.conf.set("spark.databricks.delta.autoCompact","true")

# =========================
# (A) Run settings
# =========================
# MODE selects which branch of the MAIN section at the bottom of the file runs.
MODE           = "backfill"                     # once | poll | forever | backfill
# Alternative symbol list, kept for reference:
# SYMBOLS        = ["BTCUSDT","ETHUSDT"]
# Symbols and kline intervals to ingest; MAIN iterates over every (interval, symbol) pair.
SYMBOLS        = ["BTCUSDT"]
INTERVALS      = ["15m"]
# `limit` passed to poll_once() by the once / poll / forever modes.
LIMIT_ONCE     = 1000
# Seconds slept between polling rounds (poll and forever modes).
POLL_SECONDS   = 10
# Number of polling rounds executed in poll mode.
MAX_POLLS      = 60
# Look-back window in hours used by backfill when no saved state exists (168 h = 7 days).
BACKFILL_HOURS = 168

# =========================
# (B) Project settings
# =========================
# Catalog and schema under which both tables are created.
CATALOG = "demo_catalog"
SCHEMA  = "demo_schema"
TABLE        = f"{CATALOG}.{SCHEMA}.bronze_charts"       # Bronze Delta (append-only)
STATE_TABLE  = f"{CATALOG}.{SCHEMA}.bronze_ingest_state" # stores the last ingested open_time (ms)

# Binance REST (Spot)
BASE_URL = "https://api.binance.com"
KLINES   = "/api/v3/klines"
LIMIT_DEFAULT = 500  # Spot default limit (maximum 1000)

# =========================
# (C) Table preparation
# =========================
# Every statement uses IF NOT EXISTS, so re-running this cell leaves existing objects untouched.
spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG}")
spark.sql(f"CREATE SCHEMA  IF NOT EXISTS {CATALOG}.{SCHEMA}")

# Bronze table: one row per ingested kline, partitioned by the event date `dt`.
# Column notes (the SQL text below carries the same notes, partly in Korean):
#   event_time      - kline open_time (UTC)
#   ingest_time     - load time (UTC)
#   unique_key      - "symbol|interval|open_time(ms)"
#   raw_json        - the original kline array as a JSON string
#   api_endpoint    - REST endpoint that was called
#   api_params_hash - hash of the request parameters (debugging / reproducibility)
#   dt              - event date (partition key)
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {TABLE} (
  source            STRING,
  event_time        TIMESTAMP,  -- open_time(UTC)
  ingest_time       TIMESTAMP,  -- 적재 시각(UTC)
  unique_key        STRING,     -- "symbol|interval|open_time(ms)"
  raw_json          STRING,     -- 원본 배열 JSON 문자열
  api_endpoint      STRING,     -- 호출한 REST endpoint
  api_params_hash   STRING,     -- 요청 파라미터 해시(디버깅/재현)
  dt                DATE        -- event_date(partition key)
) USING DELTA
PARTITIONED BY (dt)
""")

# State table: checkpoint per (symbol, interval) holding the last ingested open_time in ms.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {STATE_TABLE} (
  symbol     STRING,
  interval   STRING,
  last_open_time_ms LONG,
  updated_at TIMESTAMP
) USING DELTA
""")

# =========================
# (D) Utilities
# =========================
def _params_hash(params: Dict) -> str:
    """Return the SHA-256 hex digest of the request parameters.

    The dict is serialised as compact JSON with sorted keys, so two dicts
    with equal content produce the same digest regardless of key order.
    """
    canonical = json.dumps(params, sort_keys=True, separators=(",", ":"))
    # Function-scope import: hashlib is not imported at file level.
    from hashlib import sha256
    hasher = sha256()
    hasher.update(canonical.encode("utf-8"))
    return hasher.hexdigest()

def _to_ms(ts: dt.datetime) -> int:
    """Convert a datetime to whole milliseconds since the Unix epoch.

    Naive datetimes are interpreted as UTC; aware datetimes are converted
    from their own offset.

    The conversion uses exact integer timedelta arithmetic. Going through
    ``ts.timestamp() * 1000`` involves a float whose rounding can, for some
    ranges of timestamps, end up just below the true value and truncate one
    millisecond low. Sub-millisecond precision is floored.
    """
    if ts.tzinfo is None:
        ts = ts.replace(tzinfo=dt.timezone.utc)
    epoch = dt.datetime(1970, 1, 1, tzinfo=dt.timezone.utc)
    # timedelta // timedelta yields an int with no float intermediate.
    return (ts - epoch) // dt.timedelta(milliseconds=1)

def _from_ms(ms: int) -> dt.datetime:
    """Build a timezone-aware UTC datetime from epoch milliseconds."""
    utc = dt.timezone.utc
    epoch_seconds = ms/1000
    return dt.datetime.fromtimestamp(epoch_seconds, tz=utc)

def binance_klines(symbol: str, interval: str, start_ms: Optional[int]=None,
                   end_ms: Optional[int]=None, limit: int=LIMIT_DEFAULT,
                   max_retries: int=5) -> Tuple[List[list], Dict[str,str]]:
    """Fetch klines from the Binance Spot REST endpoint with retries.

    Args:
        symbol: Trading pair, sent as the ``symbol`` query parameter.
        interval: Kline interval, sent as the ``interval`` query parameter.
        start_ms: Optional ``startTime`` in epoch milliseconds.
        end_ms: Optional ``endTime`` in epoch milliseconds.
        limit: Maximum number of klines requested.
        max_retries: Total number of attempts (must be at least 1).

    Returns:
        ``(klines, headers)``: the decoded JSON body and the response headers
        of the successful attempt as a plain dict. Header names keep the case
        the server sent, so look them up case-insensitively.

    Raises:
        ValueError: if ``max_retries`` is smaller than 1.
        requests.HTTPError: on a non-retryable 4xx response, or when every
            attempt ended in an error status.
        requests.ConnectionError, requests.Timeout: when the final attempt
            failed at the network level.

    Retry policy:
        * 429: wait with exponential backoff (1s doubling up to 16s), or the
          server's ``Retry-After`` seconds when that is longer.
        * other 4xx (including 418): raised immediately, because repeating the
          same request cannot succeed and may extend a ban.
        * 5xx and network errors: retried after about one second.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be >= 1")

    url = BASE_URL + KLINES
    q = {"symbol": symbol, "interval": interval, "limit": limit}
    if start_ms is not None:
        q["startTime"] = start_ms
    if end_ms is not None:
        q["endTime"] = end_ms

    delay = 1
    last_headers: Dict[str, str] = {}
    r = None
    for attempt in range(1, max_retries + 1):
        final_attempt = attempt == max_retries
        try:
            r = requests.get(url, params=q, timeout=30)
        except (requests.ConnectionError, requests.Timeout):
            # Transient network failure: retry unless this was the last attempt.
            if final_attempt:
                raise
            time.sleep(1 + random.uniform(0, 0.3))
            continue

        last_headers = dict(r.headers)
        status = r.status_code
        if status == 200:
            return r.json(), last_headers

        if status == 429:
            wait = delay
            try:
                # r.headers is case-insensitive; the value is expected in seconds.
                wait = max(wait, float(r.headers.get("Retry-After")))
            except (TypeError, ValueError):
                pass  # header missing or not numeric: keep the backoff delay
            delay = min(delay * 2, 16)
        elif 400 <= status < 500:
            # Bad request, ban (418), WAF block, ...: fail fast.
            r.raise_for_status()
            wait = 1  # not reached for 4xx; keeps `wait` defined for linters
        else:
            wait = 1

        # No sleep after the last attempt; the error is raised below.
        if not final_attempt:
            time.sleep(wait + random.uniform(0, 0.3))

    # Every attempt returned a non-200 status: surface the last one.
    r.raise_for_status()
    return [], last_headers

def _get_last_state(symbol: str, interval: str) -> Optional[int]:
    """Return the saved last open_time (ms) for a symbol/interval pair.

    Reads the most recently updated matching row of the state table and
    returns its ``last_open_time_ms``; returns None when no row matches.
    """
    # NOTE(review): symbol and interval are interpolated into the SQL text as-is;
    # the callers in this file pass values from the configuration constants.
    query = f"""
      SELECT last_open_time_ms
      FROM {STATE_TABLE}
      WHERE symbol = '{symbol}' AND interval = '{interval}'
      ORDER BY updated_at DESC
      LIMIT 1
    """
    matches = spark.sql(query).collect()
    if not matches:
        return None
    newest = matches[0]
    return newest[0]

def _upsert_state(symbol: str, interval: str, last_ms: int):
    """Insert or update the checkpoint row for a symbol/interval pair.

    Runs a MERGE on the state table keyed by (symbol, interval), storing
    ``last_ms`` together with the current UTC time as ``updated_at``.
    """
    # Current UTC time rendered without an offset suffix.
    # NOTE(review): the TIMESTAMP('...') literal is presumably parsed in the Spark
    # session time zone - confirm that the session runs in UTC.
    utc_now = dt.datetime.now(dt.timezone.utc)
    stamp = utc_now.strftime("%Y-%m-%d %H:%M:%S")
    merge_sql = f"""
      MERGE INTO {STATE_TABLE} t
      USING (SELECT '{symbol}' AS symbol, '{interval}' AS interval,
                    {last_ms} AS last_open_time_ms, TIMESTAMP('{stamp}') AS updated_at) s
      ON t.symbol = s.symbol AND t.interval = s.interval
      WHEN MATCHED THEN UPDATE SET last_open_time_ms = s.last_open_time_ms, updated_at = s.updated_at
      WHEN NOT MATCHED THEN INSERT (symbol, interval, last_open_time_ms, updated_at)
      VALUES (s.symbol, s.interval, s.last_open_time_ms, s.updated_at)
    """
    spark.sql(merge_sql)

# ===================================
# (E) Bronze append-only write
# ===================================
def _append_to_bronze(symbol: str, interval: str, rows: List[list], endpoint: str,
                      params: Dict) -> Tuple[int, Optional[int]]:
    """Append one batch of raw klines to the Bronze table.

    Each kline becomes one row keyed by "symbol|interval|open_time(ms)" and
    carries the kline array as compact JSON. Keys repeated inside the batch
    are written once (first occurrence wins); rows already stored by earlier
    batches are not checked, so the table stays append-only.

    De-duplication and counting happen locally on the Python list, so the
    batch triggers a single Spark job (the append). Counting the DataFrame
    first would execute the plan, including the repartition shuffle, twice.

    Args:
        symbol: Trading pair the klines belong to.
        interval: Kline interval the klines belong to.
        rows: Kline arrays; element 0 of each is the open time in epoch ms.
        endpoint: Endpoint path recorded in ``api_endpoint``.
        params: Request parameters; only their hash is stored.

    Returns:
        ``(rows_written, max_open_ms)``; ``(0, None)`` when ``rows`` is empty.
    """
    if not rows:
        return 0, None

    param_hash = _params_hash(params)
    now_s = dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%d %H:%M:%S")

    # Keyed by unique_key so duplicates within the batch collapse locally.
    recs: Dict[str, dict] = {}
    max_open_ms = None

    for item in rows:
        open_ms = int(item[0])
        if (max_open_ms is None) or (open_ms > max_open_ms):
            max_open_ms = open_ms

        unique = f"{symbol}|{interval}|{open_ms}"
        if unique in recs:
            continue

        event_time = _from_ms(open_ms)
        recs[unique] = {
          "source": "binance.spot.klines",
          "event_time": event_time.strftime("%Y-%m-%d %H:%M:%S"),
          "ingest_time": now_s,
          "unique_key": unique,
          "raw_json": json.dumps(item, separators=(",",":")),
          "api_endpoint": endpoint,
          "api_params_hash": param_hash,
          "dt": event_time.date().isoformat()
        }

    # Everything is shipped as strings and cast on the Spark side.
    # NOTE(review): the timestamp strings are UTC without an offset and are
    # presumably parsed in the Spark session time zone - confirm it is UTC.
    schema = StructType([
        StructField("source",           StringType(), True),
        StructField("event_time",       StringType(), True),
        StructField("ingest_time",      StringType(), True),
        StructField("unique_key",       StringType(), True),
        StructField("raw_json",         StringType(), True),
        StructField("api_endpoint",     StringType(), True),
        StructField("api_params_hash",  StringType(), True),
        StructField("dt",               StringType(), True),
    ])
    df = (spark.createDataFrame([Row(**r) for r in recs.values()], schema)
            .withColumn("event_time",  to_timestamp(col("event_time")))
            .withColumn("ingest_time", to_timestamp(col("ingest_time")))
            .withColumn("dt",          col("dt").cast("date"))
            .repartition("dt"))                  # keep the number of files per partition low

    df.writeTo(TABLE).append()
    return len(recs), max_open_ms

# =========================
# (F) Per-mode behavior
# =========================
def _last_closed_open_ms(rows: List[list], cutoff_ms: int) -> Optional[int]:
    """Return the newest open time among klines whose close time is before cutoff_ms."""
    # Kline layout per the Binance API: index 0 = open time, index 6 = close time (ms).
    closed = [int(k[0]) for k in rows if len(k) > 6 and int(k[6]) < cutoff_ms]
    return max(closed, default=None)


def backfill_symbol(symbol: str, interval: str,
                    hours: int = BACKFILL_HOURS, limit: int = LIMIT_DEFAULT):
    """Backfill klines for one symbol/interval up to the current time.

    Starts ``hours`` before now, or right after the saved checkpoint when
    one exists, and walks forward in 6-hour request windows. Every
    non-empty batch is appended to Bronze.

    The checkpoint is advanced only to the newest kline that had already
    closed when the run started. The still-forming candle is appended as
    received, but it is fetched again by the next run so that its final
    values also reach Bronze.

    Args:
        symbol: Trading pair to backfill.
        interval: Kline interval to backfill.
        hours: Look-back window used when no checkpoint exists.
        limit: ``limit`` sent with every request.
    """
    now_utc = dt.datetime.now(dt.timezone.utc)
    start_utc = now_utc - dt.timedelta(hours=hours)

    last_ms = _get_last_state(symbol, interval)
    # `is not None` so a stored value of 0 still counts as a checkpoint.
    if last_ms is not None:
        start_utc = _from_ms(last_ms) + dt.timedelta(milliseconds=1)

    start_ms = _to_ms(start_utc)
    end_ms   = _to_ms(now_utc)
    window_ms = 1000 * 60 * 60 * 6  # 6-hour request window

    total_rows = 0
    cursor_ms = start_ms

    print(f"[BACKFILL] {symbol} {interval} {_from_ms(start_ms)} → {_from_ms(end_ms)}")
    while cursor_ms < end_ms:
        batch_end = min(end_ms, cursor_ms + window_ms)
        params = {"symbol": symbol, "interval": interval,
                  "startTime": cursor_ms, "endTime": batch_end, "limit": limit}
        rows, headers = binance_klines(symbol, interval,
                                       start_ms=cursor_ms, end_ms=batch_end, limit=limit)
        count, max_open_ms = _append_to_bronze(symbol, interval, rows, KLINES, params)

        # HTTP header names are case-insensitive, but `headers` is a plain dict.
        lowered = {name.lower(): value for name, value in headers.items()}
        used_weight = lowered.get("x-mbx-used-weight-1m") or lowered.get("x-mbx-used-weight")
        if used_weight:
            print(f"  used_weight(1m): {used_weight}")

        total_rows += count
        if max_open_ms is None:
            cursor_ms = batch_end + 1
        else:
            # max() guarantees forward progress even if the API returned
            # klines that open before the requested start.
            cursor_ms = max(max_open_ms, cursor_ms) + 1
            checkpoint_ms = _last_closed_open_ms(rows, end_ms)
            if checkpoint_ms is not None:
                _upsert_state(symbol, interval, checkpoint_ms)

        time.sleep(0.2)

    print(f"[BACKFILL DONE] {symbol} {interval}: {total_rows} rows")

def poll_once(symbol: str, interval: str, limit: int = 50):
    """Fetch the latest klines once, append them to Bronze and save the checkpoint.

    Args:
        symbol: Trading pair to poll.
        interval: Kline interval to poll.
        limit: Number of most recent klines requested.
    """
    # NOTE(review): no startTime is sent, so every call appends the latest
    # `limit` klines again; de-duplication across calls is left to downstream.
    params = {"symbol": symbol, "interval": interval, "limit": limit}
    rows, headers = binance_klines(symbol, interval, limit=limit)
    count, max_open_ms = _append_to_bronze(symbol, interval, rows, KLINES, params)
    if max_open_ms is not None:
        _upsert_state(symbol, interval, max_open_ms)
    # HTTP header names are case-insensitive, but `headers` is a plain dict,
    # so normalise the names before looking up the rate-limit weight.
    lowered = {name.lower(): value for name, value in headers.items()}
    used_weight = lowered.get("x-mbx-used-weight-1m") or lowered.get("x-mbx-used-weight")
    print(f"[POLL] {symbol} {interval}: +{count} rows, last_open_ms={max_open_ms}, used_weight={used_weight}")

# =========================
# (G) MAIN
# =========================
# Dispatch on MODE. Every finite mode ends with dbutils.notebook.exit(<message>).
if MODE == "backfill":
    for iv in INTERVALS:
        for sym in SYMBOLS:
            backfill_symbol(sym, iv, hours=BACKFILL_HOURS, limit=LIMIT_DEFAULT)
    dbutils.notebook.exit("backfill done")

elif MODE == "poll":
    # Fixed number of polling rounds, POLL_SECONDS apart.
    for i in range(MAX_POLLS):
        for iv in INTERVALS:
            for sym in SYMBOLS:
                poll_once(sym, iv, limit=LIMIT_ONCE)
        # No need to wait after the final round.
        if i < MAX_POLLS - 1:
            time.sleep(POLL_SECONDS)
    dbutils.notebook.exit("poll done")

elif MODE == "forever":
    print(f"[LIVE] start polling every {POLL_SECONDS}s")
    while True:
        had_error = False
        for iv in INTERVALS:
            for sym in SYMBOLS:
                # Best effort per pair: one failing symbol/interval must not
                # prevent the remaining pairs from being polled this round.
                try:
                    poll_once(sym, iv, limit=LIMIT_ONCE)
                except Exception as e:
                    print(f"[WARN] {e}")
                    had_error = True
        if had_error:
            # Extra pause after a failed round before polling again.
            time.sleep(5)
        time.sleep(POLL_SECONDS)

elif MODE == "once":
    for iv in INTERVALS:
        for sym in SYMBOLS:
            poll_once(sym, iv, limit=LIMIT_ONCE)
    dbutils.notebook.exit("once done")

else:
    # A mistyped MODE used to fall through to the "once" branch silently.
    raise ValueError(f"Unknown MODE {MODE!r}; expected one of: once | poll | forever | backfill")
