In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# ---------------- Initialize Spark ----------------
# Configure the builder first, then obtain the session. getOrCreate() hands
# back an already-running session when one exists instead of starting a
# second one; Hive support is switched on before the session is obtained.
session_builder = SparkSession.builder.appName("CreditRisk_BronzeLayer")
spark = session_builder.enableHiveSupport().getOrCreate()

# ---------------- Define Input (Raw Data) ----------------
# Location passed to the reader in Step 1.
raw_path = "s3://creditbuc/delta/"

# ---------------- Bronze Catalog Details ----------------
# The three identifiers are joined downstream as <catalog>.<schema>.<table>.
bronze_catalog, bronze_schema, bronze_table = (
    "bronze1",        # catalog name (your Glue Catalog)
    "credit_data",    # schema that Step 3 creates if it is missing
    "credit_bronze",  # table that Step 4 writes
)

# ---------------- Step 1: Read Raw Data ----------------
# The input at raw_path is loaded with the Delta reader. A Delta table
# carries its own schema, so CSV-style reader options such as "header" and
# "inferSchema" do not apply and are intentionally not set.
# NOTE(review): if the raw drop is actually CSV/JSON/Parquet rather than
# Delta, change the format below and add the reader options that format needs.
df_raw = spark.read.format("delta").load(raw_path)

print("✅ Raw Data Loaded")
df_raw.show(5)  # print the first 5 rows as a quick sanity check

# ---------------- Step 2: (Optional) Minimal Cleaning ----------------
# Bronze keeps the data as-is; only the column names are normalised:
# surrounding whitespace removed, inner spaces turned into underscores,
# and everything lower-cased. Column order and values are untouched.
standardized_names = [
    original_name.strip().replace(" ", "_").lower()
    for original_name in df_raw.columns
]
df_bronze = df_raw.toDF(*standardized_names)

print("✅ Columns standardized for Bronze layer")
df_bronze.printSchema()

# ---------------- Step 3: Create Schema in Glue Catalog ----------------
# IF NOT EXISTS makes this statement safe to re-run: an existing schema is
# left alone rather than raising an error.
create_schema_sql = f"CREATE DATABASE IF NOT EXISTS {bronze_catalog}.{bronze_schema}"
spark.sql(create_schema_sql)

schema_ready_message = f"✅ Schema `{bronze_catalog}.{bronze_schema}` created in Glue Catalog"
print(schema_ready_message)

# ---------------- Step 4: Write to Bronze Table ----------------
# Configure a Delta writer in "overwrite" mode, then save it under the
# fully qualified <catalog>.<schema>.<table> name.
bronze_writer = df_bronze.write.format("delta").mode("overwrite")
bronze_writer.saveAsTable(f"{bronze_catalog}.{bronze_schema}.{bronze_table}")

print(f"✅ Bronze Table `{bronze_catalog}.{bronze_schema}.{bronze_table}` created")
