In [0]:
%pip install -q sqlglot
# Installs sqlglot, which the transpiler cells further down import.
# NOTE(review): the version is not pinned, so transpiled output may differ
# between runs as new sqlglot releases appear — consider `sqlglot==<version>`.

In [0]:
import yaml
# Load the ingestion settings. The file is decoded explicitly as UTF-8 so the
# result does not depend on the platform's default text encoding.
with open('./ingestion_config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Everything this notebook needs lives under the "databricks_config" section;
# read every setting from that one mapping instead of re-indexing `config`.
databricks_config = config["databricks_config"]
volume_path = databricks_config["volume_path"]

catalog = databricks_config["catalog"]
eval_schema = databricks_config["eval_schema"]

In [0]:
# Read the evaluation set from the configured volume. `multiLine=True` lets a
# single JSON record span several lines of the file.
evals = spark.read.json(
    path=f"{volume_path}/dev_20240627/dev.json",
    multiLine=True,
)

In [0]:
# Preview the freshly loaded evaluation set before it is persisted.
display(evals)

In [0]:
try:
    # Make sure the target schema exists before writing into it.
    schema_name = f"{catalog}.{eval_schema}"
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_name}")

    # Replace any previous copy of the raw evaluation table.
    table_writer = evals.write.mode("overwrite")
    table_writer.saveAsTable(f"{schema_name}.evals")
except Exception as e:
    # Best-effort: the failure is reported but not re-raised.
    print(f"Failed to create table with exception {e}")

In [0]:
from typing import Iterator
import pandas as pd
import sqlglot                                        # loaded once per executor

from pyspark.sql.functions import pandas_udf, col
from pyspark.sql.types import StringType

def _transpile_single(q: str | None) -> str | None:
    """
    Helper that turns one statement into Databricks SQL.
    Returns None for blank / NULL inputs so filters still work.
    """
    if q is None or not q.strip():
        return None
    return sqlglot.transpile(q, write="databricks", pretty=False)[0]

@pandas_udf(StringType())  # iterator flavour, selected through the type hints
def to_dbsql(batches: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """
    Pandas UDF: transpile every SQL string of each incoming batch.

    Each element of `batches` is a pandas Series; `_transpile_single` is
    applied to its values and the resulting Series is yielded in turn.

    NOTE(review): a later cell defines another `to_dbsql`; once that cell has
    run, this definition is replaced by it.
    """
    yield from (chunk.apply(_transpile_single) for chunk in batches)


In [0]:
# ---------------------------------------------------------------------------
#  UDF: fast + fault‑tolerant SQL → Databricks SQL transpiler
# ---------------------------------------------------------------------------
from typing import Iterator
import re
import pandas as pd
import sqlglot
from sqlglot.errors import ParseError
from pyspark.sql.functions import pandas_udf, col
from pyspark.sql.types import StringType

_BACKTICK_ID = re.compile(r"`([^`]+)`")          #  back‑ticked identifier

def _ansi_quote(match: re.Match) -> str:
    """`foo bar`  ->  "foo bar"  (ANSI identifier)."""
    return '"' + match.group(1) + '"'

def _dbl_to_backtick(sql: str) -> str:
    """flip all ANSI quotes back to Databricks style."""
    # sqlglot always outputs identifiers as double‑quotes when the target
    # dialect supports them, so a simple replace is safe here.
    return sql.replace('"', '`')

def _safe_transpile(q: str | None,
                    source_dialect: str = "spark") -> str | None:
    if q is None or not q.strip():
        return None

    # 1️⃣  re‑quote problem identifiers
    prepped = _BACKTICK_ID.sub(_ansi_quote, q)

    try:
        # 2️⃣  let sqlglot try again
        t = sqlglot.transpile(
            prepped,
            read=source_dialect,   # tell sqlglot what it’s reading
            write="databricks",
            pretty=False,
        )[0]
        return _dbl_to_backtick(t)           # restore back‑ticks

    except ParseError:
        # 3️⃣  give up gracefully – keep original SQL
        return q

# --- Vectorised iterator‑style Pandas UDF -----------------------------------
@pandas_udf(StringType())  # Spark infers SCALAR_ITER from the type hints
def to_dbsql(batch_iter: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Pandas UDF: run `_safe_transpile` over every value of each batch.

    Each item of `batch_iter` is a pandas Series of SQL strings; one Series
    of converted strings is yielded per input Series, in the same order.
    """
    yield from map(lambda values: values.apply(_safe_transpile), batch_iter)

In [0]:
# Put the transpiled query in a new "gold_SQL" column beside the original
# "SQL" column, then preview the result.
transpiled_sql = to_dbsql(col("SQL"))
evals_gold = evals.withColumn("gold_SQL", transpiled_sql)
display(evals_gold)

In [0]:
try:
    # Make sure the target schema exists before writing into it.
    schema_name = f"{catalog}.{eval_schema}"
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_name}")

    # Replace any previous copy of the table holding the transpiled queries.
    gold_writer = evals_gold.write.mode("overwrite")
    gold_writer.saveAsTable(f"{schema_name}.evals_gold")
except Exception as e:
    # Best-effort: the failure is reported but not re-raised.
    print(f"Failed to create table with exception {e}")