In [1]:
# Delta Migration and Pipeline Setup for OMIE Data

This notebook migrates existing Parquet files to Delta format and establishes the foundation for automated daily/monthly pipelines.

## Key Changes
- **Delta Format**: ACID transactions, versioning, time travel
- **Incremental Processing**: Track last processed files
- **Pipeline Ready**: Designed for automated execution
- **Change Detection**: Prevent duplicate processing

## Architecture

```
Bronze Layer (Delta):
bronze/OMIE/delta_tables/
  - daily_prices/          # Daily price data (partitioned by year/month)
  - metadata/              # Processing metadata and change tracking
  - staging/               # Temporary staging for new files
pipelines/
  - daily_update.py        # Daily incremental updates
  - monthly_maintenance.py # Monthly full validation
```

## Setup Instructions
1. Run migration cells to convert existing Parquet to Delta
2. Set up change detection and metadata tracking
3. Test incremental processing logic
4. Deploy as Fabric pipelines

SyntaxError: invalid syntax (1139332763.py, line 3)

In [None]:
# Initialize Spark with Delta Lake support
import os
from pathlib import Path
from datetime import datetime, timedelta
import hashlib
import json
import re

# Import Spark and Delta Lake
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Build (or reuse) the Spark session with the Delta Lake extension and catalog enabled.
# The settings are applied in the order listed.
_spark_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
}

_builder = SparkSession.builder.appName("OMIE_Delta_Migration")
for _key, _value in _spark_settings.items():
    _builder = _builder.config(_key, _value)
spark = _builder.getOrCreate()

# Keep the driver log at WARN so the progress messages below stay readable
spark.sparkContext.setLogLevel("WARN")

print("✅ Spark with Delta Lake initialized")
print(f"   Spark version: {spark.version}")

# Choose the lakehouse root: the Fabric mount when it exists, otherwise a
# relative 'Files' folder for local testing
_fabric_files_mount = '/lakehouse/default/Files'
_running_in_fabric = os.path.exists(_fabric_files_mount)
LAKEHOUSE_ROOT = Path(_fabric_files_mount) if _running_in_fabric else Path('Files')
if _running_in_fabric:
    print(f'🏭 Detected Fabric environment: {LAKEHOUSE_ROOT}')
else:
    print(f'💻 Local development mode: {LAKEHOUSE_ROOT}')

# New Delta layout: everything lives under bronze/OMIE/delta_tables
BRONZE_DIR = LAKEHOUSE_ROOT.joinpath('bronze', 'OMIE')
DELTA_DIR = BRONZE_DIR.joinpath('delta_tables')
STAGING_DIR, METADATA_DIR, DAILY_PRICES_DIR = (
    DELTA_DIR / leaf for leaf in ('staging', 'metadata', 'daily_prices')
)

print(f'📁 Delta Lake structure will be created at: {DELTA_DIR}')

In [None]:
# Create Delta directories and scan existing Parquet files
print("📁 Creating Delta directory structure...")

# Create Delta directories (idempotent: existing directories are left alone)
for dir_path in [DELTA_DIR, STAGING_DIR, METADATA_DIR, DAILY_PRICES_DIR]:
    dir_path.mkdir(parents=True, exist_ok=True)
    print(f"   ✅ Created: {dir_path}")

print("\n🔍 Scanning for existing Parquet files...")


def _md5_of_file(path, chunk_size=1024 * 1024):
    """Return the hex MD5 digest of the file at ``path``, or None if it cannot be read.

    The file is hashed in ``chunk_size``-byte pieces so it is never held in
    memory as a whole. The digest is only used for change detection.
    """
    digest = hashlib.md5()
    try:
        with open(path, 'rb') as handle:
            for chunk in iter(lambda: handle.read(chunk_size), b''):
                digest.update(chunk)
    except OSError as exc:
        # Best effort: a file without a checksum is still inventoried, but say why
        print(f"   ⚠️  Could not checksum {path}: {exc}")
        return None
    return digest.hexdigest()


def _size_of_file(path):
    """Return the size of ``path`` in bytes, or 0 if it cannot be stat'ed."""
    try:
        return path.stat().st_size
    except OSError:
        return 0


# Find all existing Parquet files under bronze/OMIE/<year>/
# Directories and files are visited in sorted order so the inventory is deterministic.
existing_files = []
if BRONZE_DIR.exists():
    for year_dir in sorted(BRONZE_DIR.iterdir()):
        if not (year_dir.is_dir() and year_dir.name.isdigit()):
            continue
        year = int(year_dir.name)

        for file_path in sorted(year_dir.glob('*.parquet')):
            if 'manifest' in file_path.name:  # Skip manifest files
                continue

            # Extract date from filename (first run of 8 digits, if any)
            date_match = re.search(r'(\d{8})', file_path.name)

            existing_files.append({
                'file_path': str(file_path),
                'year': year,
                'filename': file_path.name,
                'file_size': _size_of_file(file_path),
                'extraction_date': date_match.group(1) if date_match else None,
                'file_checksum': _md5_of_file(file_path)
            })

# Aggregate per-year counts and sizes in a single pass over the inventory
files_per_year = {}
bytes_per_year = {}
for entry in existing_files:
    files_per_year[entry['year']] = files_per_year.get(entry['year'], 0) + 1
    bytes_per_year[entry['year']] = bytes_per_year.get(entry['year'], 0) + entry['file_size']

print(f"📊 Found {len(existing_files)} Parquet files to migrate:")
for year in sorted(files_per_year):
    total_size = bytes_per_year[year] / (1024*1024)  # MB
    print(f"   📅 {year}: {files_per_year[year]} files ({total_size:.1f} MB)")

# Store file inventory for reference
file_inventory = {
    'scan_timestamp': datetime.now().isoformat(),
    'total_files': len(existing_files),
    'files_by_year': {str(year): files_per_year[year] for year in sorted(files_per_year)},
    'files': existing_files
}

# Save inventory
inventory_path = METADATA_DIR / 'file_inventory.json'
with open(inventory_path, 'w') as f:
    json.dump(file_inventory, f, indent=2)

print(f"✅ File inventory saved: {inventory_path}")

In [None]:
# Migration using per-year glob reads to avoid per-file HEAD requests
from pyspark.sql import functions as F
from pyspark.sql import Window
from pathlib import Path
import json
import os

# Optional manual overrides (set these to GUIDs if auto-detection fails)
# When left as None, resolve_onelake_abfss() asks mssparkutils.env for the IDs instead.
WORKSPACE_ID_OVERRIDE = None  # e.g., "ecf938c4-c449-48de-a07c-1d968a72b3d1"
LAKEHOUSE_ID_OVERRIDE = None  # e.g., "12345678-aaaa-bbbb-cccc-1234567890ab"

# Resolve OneLake ABFSS path using Workspace and Lakehouse IDs to avoid friendly-name restrictions
# If the import fails for any reason, mssparkutils is set to None; _get_env_id() then
# returns None and paths are passed through unchanged.
try:
    from notebookutils import mssparkutils  # Fabric utility
except Exception:
    mssparkutils = None

def _get_env_id(getter_names):
    """Return the first truthy value produced by one of the named ``mssparkutils.env`` getters.

    Args:
        getter_names: Attribute names to try, in order, on ``mssparkutils.env``.

    Returns:
        The first truthy value returned by a callable getter, or ``None`` when
        ``mssparkutils`` is unavailable or no getter yields one.
    """
    if mssparkutils:
        for candidate in getter_names:
            try:
                getter = getattr(mssparkutils.env, candidate, None)
                if not callable(getter):
                    continue
                result = getter()
                if result:
                    return result
            except Exception:
                # Probing is best effort: a failing getter just means "try the next name"
                continue
    return None

def resolve_onelake_abfss(rel_path: str) -> str:
    """Build a GUID-based OneLake ABFSS URI for ``rel_path`` when both IDs are known.

    Args:
        rel_path: Lakehouse-relative path, expected to begin with ``Files/...``.

    Returns:
        ``abfss://<workspace>@onelake.dfs.fabric.microsoft.com/<lakehouse>/<rel_path>``
        with backslashes turned into ``/`` and leading slashes stripped, or
        ``rel_path`` unchanged when an ID is missing or ``rel_path`` is empty.
    """
    # Manual overrides win; otherwise probe the environment under several getter names
    workspace_guid = WORKSPACE_ID_OVERRIDE
    if not workspace_guid:
        workspace_guid = _get_env_id(["getWorkspaceId", "getWorkspaceGUID", "getWorkspaceGuid"])

    lakehouse_guid = LAKEHOUSE_ID_OVERRIDE
    if not lakehouse_guid:
        lakehouse_guid = _get_env_id(["getLakehouseId", "getArtifactId", "getItemId"])

    if not (workspace_guid and lakehouse_guid and rel_path):
        return rel_path

    normalized = rel_path.replace("\\", "/").lstrip("/")
    return f"abfss://{workspace_guid}@onelake.dfs.fabric.microsoft.com/{lakehouse_guid}/{normalized}"

# Parameters
# Prefer GUID-based ABFSS path to bypass FriendlyNameSupportDisabled.
# resolve_onelake_abfss() returns friendly_base unchanged when the workspace or
# lakehouse ID cannot be determined, so base_parquet_dir may still be a relative path.
friendly_base = "Files/bronze/OMIE"
base_parquet_dir = resolve_onelake_abfss(friendly_base)
print(f"Base parquet dir resolved to: {base_parquet_dir}")

def _load_inventory_year_files(year: int):
    """Optional fallback: build per-file paths for ``year`` from the saved file inventory.

    Reads ``file_inventory.json`` from ``METADATA_DIR`` and returns one
    ``resolve_onelake_abfss()`` path for every inventory record whose ``year``
    equals ``year`` and that has a ``filename``.

    Args:
        year: Year to select from the inventory.

    Returns:
        List of path strings. Empty when the inventory is missing, unreadable or
        malformed, or holds no files for ``year``.
    """
    inventory_file = METADATA_DIR / 'file_inventory.json'
    try:
        with open(inventory_file, 'r', encoding='utf-8') as f:
            inv = json.load(f)
        return [
            resolve_onelake_abfss(f"Files/bronze/OMIE/{year}/{rec['filename']}")
            for rec in inv.get('files', [])
            if rec.get('year') == year and rec.get('filename')
        ]
    except Exception as exc:
        # Still best effort (the caller treats an empty list as "no fallback"),
        # but report the reason instead of hiding it.
        print(f"   ⚠️  Could not use file inventory {inventory_file}: {exc}")
        return []

def read_year_df(year: int):
    """Read one year's OMIE parquet files and add the columns the unified table needs.

    The files are read with a single glob (``<base_parquet_dir>/<year>/*.parquet``).
    If that read raises, the per-file paths from the saved inventory are tried
    instead; when the inventory yields nothing the original error is re-raised.

    Column names are lower-cased. The function then sets ``extraction_year``,
    ``extraction_date_parsed``, ``price_valid``, ``price_category``,
    ``data_quality_score``, ``table_created_at`` and ``extraction_month``, and
    makes sure ``marginal_price_eur_mwh`` and ``source_file`` exist.

    Args:
        year: Year whose sub-folder is read; also stored as ``extraction_year``.

    Returns:
        A Spark DataFrame with the normalized and derived columns.

    Raises:
        Exception: Whatever the glob read raised, when the inventory fallback
            has no files for ``year``.
    """
    pattern = f"{base_parquet_dir}/{year}/*.parquet"
    print(f"➡️ Reading year {year} with pattern: {pattern}")
    try:
        df = spark.read.parquet(pattern)
    except Exception as e:
        print(f"   ⚠️  Glob read failed for {year}: {e}\n   ▶️  Trying per-file fallback via inventory (if available)...")
        files = _load_inventory_year_files(year)
        if not files:
            raise
        print(f"   ℹ️  Loading {len(files)} file(s) for {year} using GUID path")
        # DataFrameReader.parquet is variadic (*paths): the list must be unpacked,
        # passing it as one argument does not read the individual files.
        df = spark.read.parquet(*files)

    # Normalize columns
    cols = [c.lower() for c in df.columns]
    df = df.toDF(*cols)

    # Expected fields & derived
    df = df.withColumn("extraction_year", F.lit(year).cast("int"))

    # Parse date from any available column (first candidate present wins)
    date_col = None
    for candidate in ["extraction_date", "_extraction_date", "extraction_date_parsed"]:
        if candidate in df.columns:
            date_col = candidate
            break

    if date_col is None:
        # Try to infer from filename if available
        if "_source_file" in df.columns:
            df = df.withColumn(
                "extraction_date_parsed",
                F.regexp_extract(F.col("_source_file"), r"(20\d{6})", 1)
            )
        else:
            df = df.withColumn("extraction_date_parsed", F.lit(None).cast("string"))
    else:
        df = df.withColumn("extraction_date_parsed", F.col(date_col).cast("string"))

    # Normalize price column for OMIE
    if "marginalpdbc" in df.columns:
        df = df.withColumn("marginal_price_eur_mwh", F.col("marginalpdbc").cast("double"))
    elif "marginal_price_eur_mwh" not in df.columns:
        df = df.withColumn("marginal_price_eur_mwh", F.lit(None).cast("double"))

    # Add basic quality & metadata
    price = F.col("marginal_price_eur_mwh")
    price_category = (
        # A missing price has no category. Without this first branch a NULL price
        # fails both comparisons below and would be labelled "HIGH".
        F.when(price.isNull(), F.lit(None).cast("string"))
        .when(price < 50, F.lit("LOW"))
        .when(price < 150, F.lit("MID"))
        .otherwise(F.lit("HIGH"))
    )
    df = (
        df
        .withColumn("price_valid", price.isNotNull() & (price >= 0))
        .withColumn("price_category", price_category)
        .withColumn("data_quality_score", F.when(F.col("price_valid"), F.lit(1.0)).otherwise(F.lit(0.5)))
        .withColumn("table_created_at", F.current_timestamp())
    )

    # Ensure source_file exists for lineage
    if "_source_file" in df.columns:
        df = df.withColumn("source_file", F.col("_source_file"))
    elif "source_file" not in df.columns:
        df = df.withColumn("source_file", F.lit(None).cast("string"))

    # Partition helper: first six digits of the date (yyyymm).
    # Non-digits are stripped first so a value such as "2023-01-15" yields "202301"
    # rather than "2023-0"; plain yyyymmdd strings give the same result as before.
    # NOTE(review): assumes the date text is in year-month-day order - confirm against the source files.
    date_digits = F.regexp_replace(F.col("extraction_date_parsed"), r"[^0-9]", "")
    df = df.withColumn("extraction_month", F.substring(date_digits, 1, 6))
    return df

# Years to process (YEARS_TO_PROCESS, when defined in the notebook, overrides the default list)
years = sorted([int(y) for y in YEARS_TO_PROCESS]) if 'YEARS_TO_PROCESS' in locals() else [2023, 2024, 2025]
print(f"Years to process: {years}")

# Read each year; a year that fails to load is skipped but remembered
dfs = []
skipped_years = []
union_row_count = 0
for y in years:
    try:
        yf = read_year_df(y)
        cnt = yf.count()
    except Exception as e:
        skipped_years.append(y)
        print(f"   ⚠️  Skipping year {y} due to error: {e}")
        continue
    print(f"   ✅ Year {y}: {cnt:,} rows")
    if cnt > 0:
        dfs.append(yf)
        union_row_count += cnt

if not dfs:
    raise RuntimeError("No data loaded from parquet. Check paths and permissions.")

# Union by column name; columns missing from one year are filled with NULL
full_df = dfs[0]
for d in dfs[1:]:
    full_df = full_df.unionByName(d, allowMissingColumns=True)

# unionByName keeps every row, so the total is the sum of the per-year counts
# already computed above - no second scan of all files is needed.
print(f"Total rows after union: {union_row_count:,}")

# Write to unified Delta table (partitioned by year and month)
unified_table = TARGET_DELTA_TABLE if 'TARGET_DELTA_TABLE' in locals() else "brz_omie_daily_unified"
print(f"Writing to Delta table: {unified_table}")
if skipped_years:
    # mode("overwrite") replaces the whole table, so skipped years will be absent from it
    print(f"⚠️  Years skipped and NOT included in the overwritten table: {skipped_years}")
(
    full_df
    .repartition("extraction_year", "extraction_month")
    .write
    .format("delta")
    .mode("overwrite")
    .partitionBy("extraction_year", "extraction_month")
    .option("overwriteSchema", "true")
    .saveAsTable(unified_table)
)

print("✅ Unified Delta write completed.")

In [None]:
# Create metadata tables for change detection and pipeline tracking
def setup_metadata_tables():
    """Create the Delta tables that track processing metadata and file changes.

    Three tables are written under ``METADATA_DIR`` with ``mode("overwrite")``,
    so calling this again replaces whatever they contained:

    * ``processing_log`` - empty; schema for one row per file-processing event.
    * ``last_processed`` - a single ``OMIE_daily_prices`` row holding the
      starting state for incremental processing.
    * ``file_change_tracking`` - empty; schema for detecting new/modified files.

    The ``last_processed`` row is seeded from the notebook-level
    ``migration_results`` list when one exists (dicts with ``status``, ``rows``
    and ``file_path``). When that variable is missing or empty the row starts
    at ``"20230101"`` with zero files and rows.

    Returns:
        dict mapping table name to its path (as ``str``), or ``None`` as soon
        as one of the writes fails.
    """
    # Start date used when no file date is known
    default_start_date = "20230101"

    def _overwrite_delta(df, path, created_label, failed_label):
        """Overwrite the Delta table at ``path``; print the outcome and return success."""
        try:
            df.write.format("delta").mode("overwrite").save(str(path))
        except Exception as e:
            print(f"   ❌ Failed to create {failed_label}: {e}")
            return False
        print(f"   ✅ {created_label}: {path}")
        return True

    print("📝 Setting up metadata tables for change detection...")

    # 1. Processing Log Table - tracks each file processing event
    processing_log_schema = StructType([
        StructField("file_name", StringType(), False),
        StructField("file_url", StringType(), True),
        StructField("file_size", LongType(), True),
        StructField("file_checksum", StringType(), True),
        StructField("extraction_date", StringType(), True),
        StructField("extraction_year", IntegerType(), True),
        StructField("processed_at", TimestampType(), False),
        StructField("processing_status", StringType(), False),
        StructField("row_count", LongType(), True),
        StructField("delta_version", LongType(), True),
        StructField("pipeline_run_id", StringType(), True),
        StructField("processing_duration_seconds", DoubleType(), True),
        StructField("error_message", StringType(), True)
    ])

    processing_log_path = METADATA_DIR / 'processing_log'
    empty_log = spark.createDataFrame([], processing_log_schema)
    if not _overwrite_delta(empty_log, processing_log_path, "Processing log table", "processing log"):
        return None

    # 2. Last Processed Table - tracks the state of incremental processing
    last_processed_schema = StructType([
        StructField("data_source", StringType(), False),
        StructField("last_processed_date", StringType(), False),
        StructField("last_processed_file", StringType(), True),
        StructField("last_update_timestamp", TimestampType(), False),
        StructField("files_processed", IntegerType(), True),
        StructField("total_rows", LongType(), True),
        StructField("current_delta_version", LongType(), True)
    ])

    # migration_results is not created by every version of the migration cell;
    # look it up defensively instead of failing with NameError.
    results = globals().get("migration_results") or []

    if results:
        succeeded = [r for r in results if r.get('status') == 'success']
        total_files = len(succeeded)
        total_rows = sum(r.get('rows') or 0 for r in succeeded)

        # Latest 8-digit date found in the names of successfully migrated files.
        # Take the real maximum: a fixed floor would report a date later than
        # the data when every migrated file is older than that floor.
        file_dates = []
        for r in succeeded:
            date_match = re.search(r'(\d{8})', Path(r.get('file_path') or '').name)
            if date_match:
                file_dates.append(date_match.group(1))
        latest_date = max(file_dates) if file_dates else default_start_date

        initial_data = [(
            "OMIE_daily_prices",
            latest_date,
            f"migration_completed_{total_files}_files",
            datetime.now(),
            total_files,
            total_rows,
            1  # Initial version after migration
        )]
    else:
        initial_data = [(
            "OMIE_daily_prices",
            default_start_date,
            None,
            datetime.now(),
            0,
            0,
            0
        )]

    last_processed_path = METADATA_DIR / 'last_processed'
    last_processed_df = spark.createDataFrame(initial_data, last_processed_schema)
    if not _overwrite_delta(last_processed_df, last_processed_path, "Last processed table", "last processed table"):
        return None

    # 3. File Change Tracking - for detecting new/modified files
    file_tracking_schema = StructType([
        StructField("file_url", StringType(), False),
        StructField("file_name", StringType(), False),
        StructField("file_size", LongType(), True),
        StructField("file_checksum", StringType(), True),
        StructField("last_modified", TimestampType(), True),
        StructField("first_seen", TimestampType(), False),
        StructField("last_checked", TimestampType(), False),
        StructField("processing_status", StringType(), False),
        StructField("extraction_date", StringType(), True)
    ])

    file_tracking_path = METADATA_DIR / 'file_change_tracking'
    empty_tracking = spark.createDataFrame([], file_tracking_schema)
    if not _overwrite_delta(empty_tracking, file_tracking_path, "File change tracking table", "file tracking table"):
        return None

    print("✅ All metadata tables created successfully!")

    return {
        'processing_log': str(processing_log_path),
        'last_processed': str(last_processed_path),
        'file_change_tracking': str(file_tracking_path)
    }

# Setup metadata tables
# metadata_tables maps table name -> path, or is None if any table could not be written
metadata_tables = setup_metadata_tables()

In [None]:
# Verify Delta table and test operations
def verify_delta_table(table_name=None):
    """Verify the migrated Delta table and exercise a few Delta features.

    The migration cell writes a catalog table with ``saveAsTable`` and
    partitions it by ``extraction_year``/``extraction_month``, so that table is
    read first. If it cannot be read, the path-based table at
    ``DAILY_PRICES_DIR`` is tried instead. Partition and ordering columns are
    chosen from whichever naming the table actually has
    (``partition_*``/``migrated_at`` or ``extraction_*``/``table_created_at``).

    Args:
        table_name: Catalog table to verify. Defaults to the notebook-level
            ``TARGET_DELTA_TABLE`` when defined, else ``"brz_omie_daily_unified"``.

    Returns:
        True when the checks ran to completion, False when reading the table or
        a non-optional check raised (the traceback is printed). Failures of the
        history and time-travel checks are reported but do not fail the run.
    """
    print("🧪 Verifying Delta table and testing operations...")

    notebook_state = globals()
    if table_name is None:
        table_name = notebook_state.get("TARGET_DELTA_TABLE", "brz_omie_daily_unified")

    try:
        # Read Delta table: catalog table first, path-based table as fallback
        try:
            delta_df = spark.table(table_name)
            delta_path = None
            history_target = table_name
            print(f"   📌 Verifying catalog table: {table_name}")
        except Exception as table_error:
            delta_path = str(DAILY_PRICES_DIR)
            history_target = f"delta.`{DAILY_PRICES_DIR}`"
            print(f"   ℹ️  Catalog table {table_name} not readable ({type(table_error).__name__}); trying path {delta_path}")
            delta_df = spark.read.format("delta").load(delta_path)

        available = set(delta_df.columns)

        def _first_present(*candidates):
            """Return the first candidate column that exists in the table, else None."""
            return next((c for c in candidates if c in available), None)

        year_col = _first_present("partition_year", "extraction_year")
        month_col = _first_present("partition_month", "extraction_month")
        order_col = _first_present("migrated_at", "table_created_at")

        # Basic statistics
        total_rows = delta_df.count()
        total_columns = len(delta_df.columns)

        print(f"✅ Delta table verification:")
        print(f"   📊 Total rows: {total_rows:,}")
        print(f"   📋 Total columns: {total_columns}")

        # Schema verification
        print(f"\n📋 Delta table schema:")
        delta_df.printSchema()

        # Partition information
        print(f"\n📁 Partition summary:")
        if year_col and month_col:
            partition_summary = delta_df.groupBy(year_col, month_col) \
                .agg(F.count("*").alias("row_count"),
                     F.min("extraction_date_parsed").alias("min_date"),
                     F.max("extraction_date_parsed").alias("max_date")) \
                .orderBy(year_col, month_col)

            partition_summary.show(20, truncate=False)
        else:
            print("   ⚠️  No year/month partition columns found; skipping partition summary")

        # Data quality checks
        print(f"\n🔍 Data quality summary:")
        quality_stats = delta_df.agg(
            F.count("*").alias("total_records"),
            F.sum(F.when(F.col("price_valid"), 1).otherwise(0)).alias("valid_prices"),
            F.avg("marginal_price_eur_mwh").alias("avg_price"),
            F.min("marginal_price_eur_mwh").alias("min_price"),
            F.max("marginal_price_eur_mwh").alias("max_price"),
            F.countDistinct("source_file").alias("unique_files")
        ).collect()[0]

        # On an empty table the sum/avg/min/max aggregates are NULL: guard the
        # division and the float formatting instead of raising.
        total_records = quality_stats['total_records']
        valid_prices = quality_stats['valid_prices'] or 0
        valid_pct = (valid_prices / total_records * 100) if total_records else 0.0

        print(f"   Total records: {total_records:,}")
        print(f"   Valid prices: {valid_prices:,} ({valid_pct:.1f}%)")
        if quality_stats['avg_price'] is not None:
            print(f"   Average price: {quality_stats['avg_price']:.2f} €/MWh")
            print(f"   Price range: {quality_stats['min_price']:.2f} - {quality_stats['max_price']:.2f} €/MWh")
        else:
            print("   Average price: n/a (no non-null prices)")
        print(f"   Unique files: {quality_stats['unique_files']}")

        # Test Delta features
        print(f"\n⏰ Testing Delta features:")

        # 1. Table history
        try:
            history_df = spark.sql(f"DESCRIBE HISTORY {history_target}")
            version_count = history_df.count()
            print(f"   📜 Table versions: {version_count}")

            # Show recent operations
            print(f"   Recent operations:")
            history_df.select("version", "timestamp", "operation", "operationMetrics") \
                .orderBy(F.desc("version")) \
                .show(3, truncate=False)

        except Exception as e:
            print(f"   ⚠️  History check failed: {e}")

        # 2. Time travel (if multiple versions exist)
        try:
            if delta_path is None:
                v0_df = spark.read.option("versionAsOf", 0).table(table_name)
            else:
                v0_df = spark.read.format("delta").option("versionAsOf", 0).load(delta_path)
            v0_count = v0_df.count()
            print(f"   🕐 Version 0 rows: {v0_count:,}")
        except Exception as e:
            print(f"   ⚠️  Time travel test failed: {e}")

        # 3. Sample data
        print(f"\n🔍 Sample data (latest 5 records):")
        sample_columns = [
            c for c in ("extraction_date_parsed", "marginal_price_eur_mwh", "price_category",
                        "source_file", year_col, month_col)
            if c in available
        ]
        sample_df = delta_df.orderBy(F.desc(order_col)) if order_col else delta_df
        sample_df.select(*sample_columns).show(5, truncate=False)

        # 4. Test metadata tables (absent or None when the setup cell did not succeed)
        metadata_tables = notebook_state.get("metadata_tables")
        if metadata_tables:
            print(f"\n📊 Metadata tables status:")
            for meta_name, table_path in metadata_tables.items():
                try:
                    meta_df = spark.read.format("delta").load(table_path)
                    row_count = meta_df.count()
                    print(f"   {meta_name}: {row_count} records")

                    if meta_name == "last_processed" and row_count > 0:
                        print("   Latest processing state:")
                        meta_df.show(truncate=False)

                except Exception as e:
                    print(f"   ❌ {meta_name}: Error - {e}")

        print(f"\n✅ All verifications completed successfully!")
        return True

    except Exception as e:
        print(f"❌ Delta table verification failed: {e}")
        import traceback
        traceback.print_exc()
        return False

# Run verification
# verification_success is True when every check completed, False when verification raised
verification_success = verify_delta_table()

In [None]:
# Quick verification adapted to unified table and OMIE fields
from pyspark.sql import functions as F

# Use the notebook-level override when one was defined, else the default table name
if 'TARGET_DELTA_TABLE' in locals():
    unified_table = TARGET_DELTA_TABLE
else:
    unified_table = "brz_omie_daily_unified"
print(f"Verifying table: {unified_table}")

# (aggregate function, source column, output alias) for the one-row summary
_summary_spec = [
    (F.min, "extraction_date_parsed", "min_date"),
    (F.max, "extraction_date_parsed", "max_date"),
    (F.min, "marginal_price_eur_mwh", "min_price"),
    (F.max, "marginal_price_eur_mwh", "max_price"),
    (F.avg, "marginal_price_eur_mwh", "avg_price"),
]

try:
    df = spark.table(unified_table)
    print(f"Rows: {df.count():,}")

    summary_columns = [aggregate(column).alias(alias) for aggregate, column, alias in _summary_spec]
    df.select(*summary_columns).show(truncate=False)

    # Delta commit history of the table
    history_df = spark.sql(f"DESCRIBE HISTORY {unified_table}")
    history_df.show(truncate=False)
except Exception as e:
    # Any failure is reported and the cell still completes
    print(f"Verification error: {e}")

In [None]:
# Generate configuration and summary for pipeline development
def generate_migration_summary():
    """Print a migration summary and export the configuration used by the pipelines.

    Notebook-level state (``migration_results``, ``verification_success``,
    ``optimization_success``, ``metadata_tables``, ``TARGET_DELTA_TABLE``) is
    looked up defensively, so the summary still works when an earlier cell was
    skipped or never defines one of these names.

    The configuration is written to ``METADATA_DIR / 'pipeline_config.json'``.
    Its ``migration.success`` flag is true when verification succeeded and
    ``optimization_success`` is not explicitly ``False`` (an optimization step
    that never ran does not count as a failure).

    Returns:
        The configuration dict that was written to disk.
    """
    notebook_state = globals()
    migration_results = notebook_state.get("migration_results") or []
    verification_ok = bool(notebook_state.get("verification_success"))
    optimization_state = notebook_state.get("optimization_success")  # None when no optimization step ran
    metadata_tables = notebook_state.get("metadata_tables") or {}
    unified_table = notebook_state.get("TARGET_DELTA_TABLE", "brz_omie_daily_unified")

    print("=" * 80)
    print("🎉 DELTA MIGRATION COMPLETED")
    print("=" * 80)

    # Migration summary (only when per-file results were recorded)
    if migration_results:
        succeeded = [r for r in migration_results if r.get('status') == 'success']
        failed_count = len([r for r in migration_results if r.get('status') == 'failed'])
        total_rows = sum(r.get('rows') or 0 for r in succeeded)

        print(f"\n📊 Migration Results:")
        print(f"   ✅ Successful files: {len(succeeded)}")
        print(f"   ❌ Failed files: {failed_count}")
        print(f"   📈 Total rows migrated: {total_rows:,}")

        # Show year breakdown, aggregated in one pass over the successful results
        files_per_year = {}
        rows_per_year = {}
        for r in succeeded:
            year = r.get('year')
            if year is None:
                continue
            files_per_year[year] = files_per_year.get(year, 0) + 1
            rows_per_year[year] = rows_per_year.get(year, 0) + (r.get('rows') or 0)

        print(f"\n📅 Data by year:")
        for year in sorted(files_per_year):
            print(f"   {year}: {files_per_year[year]} files, {rows_per_year[year]:,} rows")

    # Directory structure
    print(f"\n📁 Delta Lake Structure Created:")
    print(f"   Lakehouse root: {LAKEHOUSE_ROOT}")
    print(f"   Delta tables: {DELTA_DIR}")
    print(f"   Daily prices: {DAILY_PRICES_DIR}")
    print(f"   Metadata: {METADATA_DIR}")
    print(f"   Staging: {STAGING_DIR}")

    # Current table status
    if verification_ok:
        try:
            # The migration writes a catalog table; fall back to the path-based table
            try:
                delta_df = spark.table(unified_table)
            except Exception:
                delta_df = spark.read.format("delta").load(str(DAILY_PRICES_DIR))

            # Use whichever partition column naming the table actually has
            partition_cols = []
            for candidates in (("partition_year", "extraction_year"), ("partition_month", "extraction_month")):
                found = next((c for c in candidates if c in delta_df.columns), None)
                if found:
                    partition_cols.append(found)

            total_rows = delta_df.count()
            partitions = delta_df.select(*partition_cols).distinct().count() if partition_cols else 0

            print(f"\n📊 Delta Table Status:")
            print(f"   Total rows: {total_rows:,}")
            print(f"   Partitions: {partitions}")
            print(f"   Columns: {len(delta_df.columns)}")

            # Date range
            date_range = delta_df.agg(
                F.min("extraction_date_parsed").alias("min_date"),
                F.max("extraction_date_parsed").alias("max_date")
            ).collect()[0]

            if date_range['min_date'] and date_range['max_date']:
                print(f"   Date range: {date_range['min_date']} to {date_range['max_date']}")

        except Exception as e:
            print(f"   ⚠️  Could not read table status: {e}")

    # Export configuration for pipelines
    config = {
        'migration': {
            'completed_at': datetime.now().isoformat(),
            'lakehouse_root': str(LAKEHOUSE_ROOT),
            'delta_tables_dir': str(DELTA_DIR),
            'daily_prices_table': str(DAILY_PRICES_DIR),
            'metadata_dir': str(METADATA_DIR),
            'staging_dir': str(STAGING_DIR),
            'success': verification_ok and optimization_state is not False
        },
        'tables': {
            'daily_prices': str(DAILY_PRICES_DIR),
            # Name of the catalog table the migration cell writes with saveAsTable
            'daily_prices_unified_table': unified_table,
            'processing_log': metadata_tables.get('processing_log'),
            'last_processed': metadata_tables.get('last_processed'),
            'file_change_tracking': metadata_tables.get('file_change_tracking')
        },
        'pipeline_config': {
            'omie_base_url': 'https://www.omie.es',
            'target_years': [2023, 2024, 2025],
            'file_patterns': ['marginalpdbc'],
            'update_schedule': {
                'daily': '0 6 * * *',  # 6 AM UTC daily
                'monthly': '0 2 1 * *'  # 2 AM on 1st of month
            }
        },
        'spark_config': {
            'app_name': 'OMIE_Pipeline',
            'delta_enabled': True,
            'adaptive_query_enabled': True,
            'coalesce_partitions_enabled': True,
            'delta_optimizations': {
                'autoOptimize': True,
                'autoCompact': True
            }
        }
    }

    # Save configuration
    config_path = METADATA_DIR / 'pipeline_config.json'
    with open(config_path, 'w') as f:
        json.dump(config, f, indent=2)

    print(f"\n📋 Pipeline configuration exported: {config_path}")

    # Next steps
    print(f"\n🔄 Next Steps:")
    print(f"   1. ✅ Delta format migration complete")
    print(f"   2. 🔄 Create daily incremental update pipeline")
    print(f"   3. 🔄 Create monthly maintenance pipeline")
    print(f"   4. 🔄 Set up change detection for new files")
    print(f"   5. 🔄 Create managed Delta tables in catalog")
    print(f"   6. 🔄 Build Fabric Data Factory pipelines")
    print(f"   7. 🔄 Test end-to-end pipeline execution")

    print(f"\n💡 Ready for Pipeline Development!")
    print(f"   The unified Delta table provides:")
    print(f"   - ACID transactions and versioning")
    print(f"   - Optimized partitioning by year/month")
    print(f"   - Built-in change detection metadata")
    print(f"   - Time travel and rollback capabilities")
    print(f"   - Ready for incremental updates")

    return config

# Generate final summary and configuration
# final_config is the same dict that the function writes to pipeline_config.json
final_config = generate_migration_summary()