In [0]:
# Databricks Notebook: Transform Bronze (Binance) -> Silver (improved)
from pyspark.sql.functions import col, from_json, split
from pyspark.sql.types import ArrayType, StringType
from delta.tables import DeltaTable

# ===== (0) Optional: incrementally process only the most recent N days =====
DAYS_BACK = 14  # set to None to process everything

# =========================
# (A) Environment / target table settings
# =========================
CATALOG = "demo_catalog"
SCHEMA = "demo_schema"
# Fully qualified table names in the form <catalog>.<schema>.<table>.
BRONZE = ".".join((CATALOG, SCHEMA, "bronze_charts"))
SILVER = ".".join((CATALOG, SCHEMA, "silver_charts"))

# =========================
# (B) Create the Silver table (if it does not exist yet)
# =========================
# Delta table partitioned by dt. IF NOT EXISTS leaves an existing table
# untouched, so this statement can be re-run on every execution.
create_silver_ddl = f"""
CREATE TABLE IF NOT EXISTS {SILVER} (
  symbol       STRING,
  interval     STRING,
  open_time    TIMESTAMP,
  open         DOUBLE,
  high         DOUBLE,
  low          DOUBLE,
  close        DOUBLE,
  volume       DOUBLE,
  unique_key   STRING,
  event_time   TIMESTAMP,
  dt           DATE
) USING DELTA
PARTITIONED BY (dt)
"""
spark.sql(create_silver_ddl)

# =========================
# (C) Load Bronze (restricted to the last N days when DAYS_BACK is set)
# =========================
bronze = spark.table(BRONZE)
if DAYS_BACK is not None:
    # Keep only rows whose dt falls within the last DAYS_BACK days.
    recent_rows_predicate = f"dt >= date_sub(current_date(), {DAYS_BACK})"
    bronze = bronze.filter(recent_rows_predicate)

# raw_json is parsed as a JSON array of strings holding one kline:
# [0]=open_time(ms), [1]=open, [2]=high, [3]=low, [4]=close, [5]=volume, ...
kline_schema = ArrayType(StringType())
arr = from_json(col("raw_json"), kline_schema)

# unique_key has the form "symbol|interval|open_ms"; split() takes a regex,
# so the pipe is escaped.
uk = split(col("unique_key"), "\\|")

# =========================
# (D) Build the transformed DataFrame
# =========================
# Shapes Bronze rows into the Silver schema:
#   * symbol / interval come from the pipe-delimited unique_key,
#   * open_time / open / high / low / close / volume come from the kline array
#     parsed out of raw_json (open_time is epoch milliseconds -> timestamp),
#   * dt is the calendar date of event_time and is the Silver partition column.
#
# Rows with a NULL unique_key or NULL event_time are dropped up front: the
# MERGE condition compares unique_key and dt (derived from event_time) with
# "=", which never matches NULL, so such rows would be inserted again on every
# run and break idempotency.
#
# Deduplication is deterministic: for each unique_key the row with the latest
# event_time is kept (raw_json breaks ties). dropDuplicates() keeps an
# arbitrary row, which makes the MERGE source non-deterministic and lets a
# stale record overwrite a fresher one.
silver_df = (
    bronze
      .where(col("unique_key").isNotNull() & col("event_time").isNotNull())
      .withColumn("symbol",     uk.getItem(0))
      .withColumn("interval",   uk.getItem(1))
      # Divide before casting so the millisecond fraction survives the cast.
      .withColumn("open_time",  (arr.getItem(0).cast("long")/1000).cast("timestamp"))
      .withColumn("open",       arr.getItem(1).cast("double"))
      .withColumn("high",       arr.getItem(2).cast("double"))
      .withColumn("low",        arr.getItem(3).cast("double"))
      .withColumn("close",      arr.getItem(4).cast("double"))
      .withColumn("volume",     arr.getItem(5).cast("double"))
      .withColumn("dt",         col("event_time").cast("date"))
      # Rank rows per key, newest first; raw_json is only a stable tiebreak.
      .selectExpr(
          "*",
          "row_number() OVER ("
          "PARTITION BY unique_key "
          "ORDER BY event_time DESC, raw_json DESC"
          ") AS dedup_rank",
      )
      .where(col("dedup_rank") == 1)
      .select("symbol","interval","open_time","open","high","low","close","volume",
              "unique_key","event_time","dt")
      .repartition("dt")   # group rows by partition to reduce MERGE load (if needed)
)

# =========================
# (E) Delta MERGE (idempotent upsert)
# =========================
# A source row matches a target row when both unique_key and dt are equal.
merge_condition = "t.unique_key = s.unique_key AND t.dt = s.dt"

# On a match, every column except unique_key is overwritten from the source.
updated_columns = (
    "symbol",
    "interval",
    "open_time",
    "open",
    "high",
    "low",
    "close",
    "volume",
    "event_time",
    "dt",
)
update_assignments = {name: f"s.{name}" for name in updated_columns}

target = DeltaTable.forName(spark, SILVER)

merge_builder = target.alias("t").merge(silver_df.alias("s"), merge_condition)
merge_builder = merge_builder.whenMatchedUpdate(set=update_assignments)
# Source rows without a matching target row are inserted with all columns.
merge_builder = merge_builder.whenNotMatchedInsertAll()
merge_builder.execute()

print(f"Silver transform complete: upserted into {SILVER}")
