In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, IntegerType, LongType, StringType

# Source (bronze) file location and destination (silver) table coordinates.
bronze_path = "s3://creditbuc/credit_risk_dataset.csv"
silver_catalog = "silver1"
silver_schema = "credit_data"
silver_table = "credit_silver"

# Load the raw CSV: the first row supplies the column names and Spark is
# asked to infer each column's type from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
df_bronze = csv_reader.csv(bronze_path)

# Render the raw data in the notebook for a quick visual check.
display(df_bronze)

# Split the bronze columns by Spark data type so each group can be imputed
# with a suitable default. Types are matched with isinstance() rather than by
# comparing str(dataType): the string form of a DataType differs between
# Spark releases ("IntegerType" vs "IntegerType()"), so the string comparison
# can silently leave both lists empty and skip all imputation.
numeric_cols = [
    field.name
    for field in df_bronze.schema.fields
    if isinstance(field.dataType, (IntegerType, DoubleType, LongType))
]
categorical_cols = [
    field.name
    for field in df_bronze.schema.fields
    if isinstance(field.dataType, StringType)
]

# Impute missing values, then drop exact duplicate rows.
df_clean = df_bronze

# Replacement value per column, applied in a single fillna() call below.
fill_values = {}

# Numeric columns are filled with their column mean. All means are computed in
# one aggregation so the source is scanned once instead of once per column.
if numeric_cols:
    mean_row = df_clean.agg(
        *[F.mean(F.col(name)) for name in numeric_cols]
    ).first()
    for name, mean_value in zip(numeric_cols, mean_row):
        # A column with no non-null values has no mean; leave it untouched.
        if mean_value is not None:
            # NOTE(review): the mean is a float; for integer columns Spark is
            # expected to cast it to the column type - confirm that is the
            # intended imputation.
            fill_values[name] = mean_value

# String columns get an explicit "Unknown" placeholder.
for name in categorical_cols:
    fill_values[name] = "Unknown"

# Skip the call entirely when there is nothing to fill.
if fill_values:
    df_clean = df_clean.fillna(fill_values)

df_clean = df_clean.dropDuplicates()

# Normalise column names: trim surrounding whitespace, lower-case, and replace
# spaces with underscores. The new names are applied positionally in one call.
normalized_names = [
    name.strip().lower().replace(" ", "_") for name in df_clean.columns
]
df_clean = df_clean.toDF(*normalized_names)

# Enforce the expected type on known columns. Columns that are absent from the
# dataset are skipped rather than treated as an error.
target_types = {
    "person_age": IntegerType(),
    "person_income": DoubleType(),
    "loan_amnt": DoubleType(),
    "annual_inc": DoubleType(),
}
for column_name, spark_type in target_types.items():
    if column_name not in df_clean.columns:
        continue
    df_clean = df_clean.withColumn(
        column_name, F.col(column_name).cast(spark_type)
    )

# Make sure the destination schema exists before writing into it.
create_schema_sql = f"CREATE SCHEMA IF NOT EXISTS {silver_catalog}.{silver_schema}"
spark.sql(create_schema_sql)

# Overwrite the silver Delta table (data and schema), partitioned by the
# loan_intent column.
silver_writer = (
    df_clean.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("loan_intent")
)
silver_writer.saveAsTable(f"{silver_catalog}.{silver_schema}.{silver_table}")

# Read back a few rows of the silver table and show them in the notebook.
preview_query = (
    f"SELECT * FROM {silver_catalog}.{silver_schema}.{silver_table} LIMIT 10"
)
df_silver = spark.sql(preview_query)
display(df_silver)