In [0]:
%pip install databricks-labs-lakebridge

In [0]:
%restart_python

In [0]:
import os
import sysconfig
import tempfile
import uuid
from datetime import datetime
from pathlib import Path

import pandas as pd
from pyspark.sql import SparkSession

from databricks.labs.lakebridge.__about__ import __version__
from databricks.labs.lakebridge.config import TranspileConfig, TranspileResult
from databricks.labs.lakebridge.transpiler.transpile_engine import TranspileEngine
from databricks.labs.lakebridge.transpiler.repository import TranspilerRepository

print(f"Lakebridge version: {__version__}")

In [0]:
class TeradataToSparkConverter:
    """Lakebridge Python API wrapper for Teradata to Spark conversion.

    Each call to :meth:`convert_sql` writes the input SQL to a temporary file,
    runs the Lakebridge transpile engine on it, reads back the first generated
    ``.sql`` file and (optionally) appends one audit row to the
    ``<catalog>.<schema>.conversion_history`` Delta table.
    """

    # Maximum number of characters of original/converted SQL kept per history row.
    _MAX_SQL_CHARS = 5000

    # Column names and Spark SQL types of one history row, in the same order as
    # the columns declared by the CREATE TABLE statement in _setup_tracking_table.
    _HISTORY_FIELDS = (
        ("conversion_id", "STRING"),
        ("timestamp", "TIMESTAMP"),
        ("original_sql", "STRING"),
        ("converted_sql", "STRING"),
        ("status", "STRING"),
        ("error_message", "STRING"),
        ("conversion_time_ms", "BIGINT"),
        ("user", "STRING"),
    )

    # DDL-formatted schema handed to createDataFrame. An explicit schema is
    # required because every row has a None in either `converted_sql` or
    # `error_message`, and Spark cannot infer a type for an all-None column.
    _HISTORY_SCHEMA = ", ".join(f"`{name}` {dtype}" for name, dtype in _HISTORY_FIELDS)

    def __init__(
        self,
        catalog: str = "az_adb_simbus_training",
        schema: str = "td2ss",
        labs_path: "str | None" = None,
    ):
        """Create the converter and make sure the tracking table exists.

        Args:
            catalog: Catalog that holds the conversion history table.
            schema: Schema (created if missing) that holds the history table.
            labs_path: Directory handed to ``TranspilerRepository``. When
                omitted, the site-packages directory of the running
                interpreter is used, instead of a path tied to one specific
                ephemeral cluster environment.
        """
        self.catalog = catalog
        self.schema = schema
        self.spark = SparkSession.builder.getOrCreate()

        if labs_path is None:
            # NOTE(review): assumes the notebook-scoped environment is the
            # interpreter's own "purelib" directory - confirm on the target cluster.
            labs_path = sysconfig.get_paths()["purelib"]

        # Initialize transpiler repository
        self.repository = TranspilerRepository(labs_path)

        # Setup tracking table
        self._setup_tracking_table()

    def _setup_tracking_table(self):
        """Create the schema and the conversion history Delta table if absent.

        Side effect: issues ``USE CATALOG``, which switches the Spark session's
        current catalog to ``self.catalog``.
        """
        self.spark.sql(f"USE CATALOG {self.catalog}")
        self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {self.schema}")

        self.spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {self.catalog}.{self.schema}.conversion_history (
                conversion_id STRING,
                timestamp TIMESTAMP,
                original_sql STRING,
                converted_sql STRING,
                status STRING,
                error_message STRING,
                conversion_time_ms LONG,
                user STRING
            ) USING DELTA
        """)

    def convert_sql(self, teradata_sql: str, save_to_catalog: bool = True):
        """Convert Teradata SQL to Spark SQL using the Lakebridge Python API.

        Args:
            teradata_sql: Teradata SQL text to convert.
            save_to_catalog: When true, append the outcome as one row to the
                ``conversion_history`` table.

        Returns:
            dict with keys ``conversion_id``, ``timestamp``, ``original_sql``,
            ``converted_sql``, ``status`` (``"SUCCESS"`` or ``"FAILED"``),
            ``error_message``, ``conversion_time_ms`` and ``user``. SQL texts
            are truncated to ``_MAX_SQL_CHARS`` characters. Errors raised while
            transpiling are caught and reported through ``status`` and
            ``error_message`` rather than propagated.
        """
        start_time = datetime.now()
        # The timestamp alone only has one-second resolution; the random suffix
        # keeps ids unique when several conversions start within the same second.
        conversion_id = (
            f"conv_{start_time.strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}"
        )

        # Create temporary directory for input/output
        with tempfile.TemporaryDirectory() as temp_dir:
            input_file = os.path.join(temp_dir, "input.sql")
            output_dir = os.path.join(temp_dir, "output")
            os.makedirs(output_dir, exist_ok=True)

            # Write input SQL to file (explicit encoding: do not depend on locale)
            with open(input_file, "w", encoding="utf-8") as f:
                f.write(teradata_sql)

            try:
                # Create transpile configuration
                config = TranspileConfig(
                    source_dialect="teradata",
                    input_source=input_file,
                    output_folder=output_dir,
                )

                # Create and run transpile engine
                # NOTE(review): the engine is built without arguments and
                # self.repository is never passed to it - confirm this matches
                # the installed Lakebridge API.
                engine = TranspileEngine()
                result = engine.transpile(config)
                print(result)

                # Read converted SQL from output. Search recursively and sort so
                # the chosen file is deterministic even if the engine nests its
                # output below output_dir.
                converted_sql = None
                output_files = sorted(Path(output_dir).rglob("*.sql"))
                if output_files:
                    converted_sql = output_files[0].read_text(encoding="utf-8")

                status = "SUCCESS" if converted_sql else "FAILED"
                error_message = None if converted_sql else "No output generated"

            except Exception as e:
                converted_sql = None
                status = "FAILED"
                error_message = str(e)
                print(f"Conversion error: {e}")

        # Calculate conversion time
        conversion_time_ms = int((datetime.now() - start_time).total_seconds() * 1000)

        # Prepare result
        max_chars = self._MAX_SQL_CHARS
        result_dict = {
            "conversion_id": conversion_id,
            "timestamp": start_time,
            "original_sql": teradata_sql[:max_chars],
            "converted_sql": converted_sql[:max_chars] if converted_sql else None,
            "status": status,
            "error_message": error_message,
            "conversion_time_ms": conversion_time_ms,
            "user": self.spark.sql("SELECT current_user()").collect()[0][0]
        }

        # Save to catalog
        if save_to_catalog:
            # Build the row in schema order and pass the schema explicitly so
            # the None-valued column does not break type inference.
            row = tuple(result_dict[name] for name, _ in self._HISTORY_FIELDS)
            df = self.spark.createDataFrame([row], schema=self._HISTORY_SCHEMA)
            df.write.mode("append").saveAsTable(
                f"{self.catalog}.{self.schema}.conversion_history"
            )

        return result_dict

In [0]:
# Build the shared converter instance; constructing it also creates the
# schema and the conversion_history tracking table when they are missing.
converter = TeradataToSparkConverter(
    catalog="az_adb_simbus_training",
    schema="td2ss",
)

In [0]:
# Test conversion: a Teradata DDL statement plus a QUALIFY query used as sample input.
teradata_query = """
CREATE SET TABLE sales.monthly_summary ,NO FALLBACK (
    month_id INTEGER,
    product_id INTEGER,
    total_sales DECIMAL(12,2),
    units_sold INTEGER
) PRIMARY INDEX (month_id, product_id);

SELECT 
    product_id,
    SUM(total_sales) as revenue,
    COUNT(*) as transactions
FROM sales.monthly_summary
QUALIFY ROW_NUMBER() OVER (PARTITION BY product_id ORDER BY revenue DESC) <= 10
GROUP BY product_id;
"""

# Try the converter
result = converter.convert_sql(teradata_query)

print(f"Status: {result['status']}")
print(f"Conversion Time: {result['conversion_time_ms']}ms")
if result['converted_sql']:
    print("\nConverted SQL:")
    print(result['converted_sql'])
else:
    print(f"\nError: {result['error_message']}")

# COMMAND ----------

# If the above doesn't work, try the direct approach.
# `convert_teradata_to_spark_direct` is not defined anywhere in this notebook,
# so look it up defensively instead of failing the whole cell with a NameError.
converted = None
direct_converter = globals().get("convert_teradata_to_spark_direct")
if direct_converter is None:
    print(
        "Skipping direct conversion: "
        "convert_teradata_to_spark_direct is not defined in this notebook"
    )
else:
    print("Trying direct conversion...")
    converted = direct_converter(teradata_query)
    if converted:
        print("Direct conversion successful:")
        print(converted)
    else:
        print("Direct conversion failed")

In [0]:
# Sample Teradata input (DDL + QUALIFY query). This re-assigns `teradata_query`
# with the same text as the earlier test cell so that this cell can be run on
# its own before the direct-conversion call below.
teradata_query = """
CREATE SET TABLE sales.monthly_summary ,NO FALLBACK (
    month_id INTEGER,
    product_id INTEGER,
    total_sales DECIMAL(12,2),
    units_sold INTEGER
) PRIMARY INDEX (month_id, product_id);

SELECT 
    product_id,
    SUM(total_sales) as revenue,
    COUNT(*) as transactions
FROM sales.monthly_summary
QUALIFY ROW_NUMBER() OVER (PARTITION BY product_id ORDER BY revenue DESC) <= 10
GROUP BY product_id;
"""

In [0]:
# `convert_teradata_to_spark_direct` is not defined anywhere in this notebook;
# guard the call so a fresh Restart & Run All does not stop here with a NameError.
direct_result = None
if "convert_teradata_to_spark_direct" in globals():
    direct_result = convert_teradata_to_spark_direct(teradata_query)
else:
    print(
        "Skipping direct conversion: "
        "convert_teradata_to_spark_direct is not defined in this notebook"
    )
# Keep the value as the cell's last expression so it is displayed when present.
direct_result

In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## Troubleshooting
# MAGIC 
# MAGIC If the above approaches don't work, the Python API may not be fully exposed. 
# MAGIC You can try:
# MAGIC 
# MAGIC 1. **Inspect available methods:**
# MAGIC ```python
# MAGIC from databricks.labs.lakebridge.transpiler.transpile_engine import TranspileEngine
# MAGIC print(dir(TranspileEngine))
# MAGIC ```
# MAGIC 
# MAGIC 2. **Check TranspileConfig attributes:**
# MAGIC ```python
# MAGIC from databricks.labs.lakebridge.config import TranspileConfig
# MAGIC config = TranspileConfig()
# MAGIC print(dir(config))
# MAGIC ```
# MAGIC 
# MAGIC 3. **Use SQLGlot instead**, which has a well-documented Python API, e.g. `sqlglot.transpile(sql, read="teradata", write="databricks")`