In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# --- 1. SETUP: Fresh Spark Session ---
# Stop any session left over from a previous run of this cell so the
# configuration below is applied to a newly created session rather than an
# already-running one being returned by getOrCreate().
try:
    spark.stop()
    print("Stopped existing session.")
except NameError:
    # First run: `spark` is not defined yet, so there is nothing to stop.
    pass
except Exception as e:
    # Best-effort cleanup: report the problem but still build a new session.
    print(f"Could not stop existing session cleanly: {e}")

print("Starting new Spark Session...")
# NOTE(review): the MinIO endpoint and credentials are hard-coded here;
# presumably acceptable for a local dev stack only -- confirm before reuse.
spark = (
    SparkSession.builder
    .appName("PipelineVerification")
    # Delta Lake plus the Hadoop S3A connector and AWS SDK it depends on.
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0,org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262")
    # Register Delta's SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # Point the s3a:// filesystem at the MinIO endpoint (plain HTTP,
    # path-style bucket addressing).
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", "minio")
    .config("spark.hadoop.fs.s3a.secret.key", "minioadmin")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .getOrCreate()
)

# --- 2. VERIFY BRONZE (Did Ingestion Work?) ---
# Reads the Bronze Delta table and sanity-checks its column names.
print("\n" + "="*40)
print("TEST 1: BRONZE LAYER (Ingestion)")
print("="*40)
# Only the read itself is guarded, so the "could not read" message is never
# printed for an unrelated error raised by the checks further down.
try:
    df_bronze = spark.read.format("delta").load("s3a://bronze/telco_churn_delta")
    cols = df_bronze.columns
except Exception as e:
    print(f"❌ FAIL: Could not read Bronze table. {e}")
else:
    print(cols)

    # Check for "PK" garbage
    # (presumably the ZIP signature of an .xlsx file that was parsed as
    # plain text and ended up in the header -- TODO confirm against ingestion)
    if any("PK" in c for c in cols):
        print("❌ FAIL: 'PK' found in columns. Excel read failed.")
    elif "customerID" in cols:
        print("✅ PASS: Columns look correct (customerID found).")
        # schema['TotalCharges'] raises KeyError when the field is absent,
        # so check for the column before reporting its type.
        if "TotalCharges" in cols:
            print(f"   Schema detected: TotalCharges is {df_bronze.schema['TotalCharges'].dataType}")
        else:
            print("⚠️ WARNING: 'TotalCharges' column is missing from the Bronze table.")
    else:
        # The full column list has already been printed above.
        print("⚠️ WARNING: Unexpected column names.")

Starting new Spark Session...

TEST 1: BRONZE LAYER (Ingestion)
['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']
✅ PASS: Columns look correct (customerID found).
   Schema detected: TotalCharges is DoubleType()
