In [0]:
configs = {
    "fs.azure.sas.creditcarddefaulterfraud.storageaccananyashivhare.blob.core.windows.net": "sv=2024-11-04&ss=bfqt&srt=sco&sp=rwdlacupiytfx&se=2025-10-14T05:32:59Z&st=2025-10-13T21:17:59Z&spr=https&sig=b9ZLgCIMpRTDZ0vmofsCfd8AwbZIwPxImfteCMFi3Og%3D"
}

In [0]:
# Databricks notebook: silver_layer (Complete Version)
from pyspark.sql.functions import col, when

print("="*70)
print("SILVER LAYER TRANSFORMATION")
print("="*70)


# STEP 1: DEFINE PATHS (Using /mnt/cc mount)

bronze_path = "/mnt/cc/bronze/credit_default"
silver_path = "/mnt/cc/silver/credit_default"

print(f"\n Bronze path: {bronze_path}")
print(f" Silver path: {silver_path}")


# STEP 2: READ FROM BRONZE LAYER

print("\n" + "-"*70)
print("STEP 1: READING BRONZE DATA")
print("-"*70)

try:
    df = spark.read.format("delta").load(bronze_path)
    row_count = df.count()
    col_count = len(df.columns)
    print(f"✓ Successfully loaded bronze data")
    print(f"  Rows: {row_count}")
    print(f"  Columns: {col_count}")
    print(f"  Columns: {df.columns}")
except Exception as e:
    print(f"✗ Error reading bronze layer: {e}")
    raise

# STEP 3: DATA TRANSFORMATIONS

print("\n" + "-"*70)
print("STEP 2: DATA TRANSFORMATIONS")
print("-"*70)

# 3A: Convert numeric columns to double
print("\n3A. Converting numeric columns to double...")
num_cols = [c for c, t in df.dtypes if t in ('int', 'double', 'long', 'float')]
print(f"   Found {len(num_cols)} numeric columns:")
for c in num_cols:
    print(f"     - {c}")
    df = df.withColumn(c, col(c).cast('double'))
print("   ✓ Conversion complete")

# 3B: Clean EDUCATION column (map invalid values to 4)
print("\n3B. Cleaning EDUCATION column...")
print("   - Mapping education values: 0,1,2,3 stay same, others become 4")
education_before = df.select("EDUCATION").distinct().count()
df = df.withColumn(
    "EDUCATION",
    when(col("EDUCATION").isin(0, 1, 2, 3), col("EDUCATION")).otherwise(4)
)
education_after = df.select("EDUCATION").distinct().count()
print(f"   ✓ Education values before: {education_before}, after: {education_after}")

# 3C: Calculate average bill amount
print("\n3C. Calculating average bill amount...")
print("   - Formula: (BILL_AMT1 + BILL_AMT2 + BILL_AMT3 + BILL_AMT4 + BILL_AMT5 + BILL_AMT6) / 6")
df = df.withColumn(
    "avg_bill_amt",
    (
        col("BILL_AMT1") + col("BILL_AMT2") + col("BILL_AMT3") +
        col("BILL_AMT4") + col("BILL_AMT5") + col("BILL_AMT6")
    ) / 6
)
print("   ✓ Average bill amount calculated")

print(f"\n✓ All data transformations complete")
print(f"  Total columns after transformation: {len(df.columns)}")


# STEP 4: WRITE TO SILVER LAYER

print("\n" + "-"*70)
print("STEP 3: WRITING TO SILVER LAYER")
print("-"*70)

try:
    # Create silver directory if it doesn't exist
    print(f"\nCreating silver directory: {silver_path}")
    dbutils.fs.mkdirs(silver_path)
    print("✓ Directory created/exists")
    
    print(f"\nWriting delta table...")
    df.write \
        .format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .save(silver_path)
    
    print(f"✓ Successfully written {row_count} rows to silver layer")
    print(f"✓ Location: {silver_path}")
except Exception as e:
    print(f"✗ Error writing to silver layer: {e}")
    raise

# STEP 5: CREATE DATABASE AND TABLE

print("\n" + "-"*70)
print("STEP 4: CREATING DATABASE AND TABLE")
print("-"*70)

try:
    print("\nCreating database: credit_risk")
    spark.sql("CREATE DATABASE IF NOT EXISTS credit_risk")
    print("✓ Database created/exists")
    
    # Register the Delta table using the mounted path
    print("\nCreating table: credit_risk.silver_credit_default")
    spark.sql(f"""
        CREATE TABLE IF NOT EXISTS credit_risk.silver_credit_default
        USING DELTA
        LOCATION '{silver_path}'
    """)
    print("✓ Table created/exists")
    print(f"✓ Table location: {silver_path}")
except Exception as e:
    print(f"Warning: Could not create table: {e}")
    print("But data is safely written to silver layer")

# STEP 6: DISPLAY RESULTS

print("\n" + "="*70)
print("✓ SILVER LAYER TRANSFORMATION COMPLETE!")
print("="*70)

print(f"\n📊 SUMMARY:")
print(f"  Database: credit_risk")
print(f"  Table: silver_credit_default")
print(f"  Location: {silver_path}")
print(f"  Total Rows: {row_count:,}")
print(f"  Total Columns: {len(df.columns)}")

print("\n" + "-"*70)
print("COLUMN NAMES:")
print("-"*70)
for i, col_name in enumerate(df.columns, 1):
    print(f"  {i:2d}. {col_name}")

print("\n" + "-"*70)
print("SAMPLE DATA (First 10 rows):")
print("-"*70)
display(df.limit(10))

print("\n" + "-"*70)
print("DATA SCHEMA:")
print("-"*70)
df.printSchema()

print("\n" + "-"*70)
print("SUMMARY STATISTICS:")
print("-"*70)
display(df.describe())

print("\n" + "-"*70)
print("DATA QUALITY CHECKS:")
print("-"*70)
print("\nNull values by column:")

# Simple null check using SQL/Spark operations (no serialization issues)
from pyspark.sql.functions import sum as spark_sum, when

for col_name in df.columns:
    null_count = df.filter(col(col_name).isNull()).count()
    if null_count > 0:
        print(f"  {col_name}: {null_count} nulls")
    else:
        print(f"  {col_name}: 0 nulls ✓")

print("\n" + "-"*70)
print("EDUCATION VALUE DISTRIBUTION:")
print("-"*70)
display(df.groupBy("EDUCATION").count())

print("\n" + "-"*70)
print("DEFAULT FLAG DISTRIBUTION:")
print("-"*70)
display(df.groupBy("default").count())

print("\n" + "-"*70)
print("AVERAGE BILL AMOUNT STATISTICS:")
print("-"*70)
display(df.select(
    col("avg_bill_amt").alias("avg_bill_amt")
).describe())

print("\n" + "="*70)
print("✓ ALL SILVER LAYER TASKS COMPLETED SUCCESSFULLY!")
print("="*70)

SILVER LAYER TRANSFORMATION

📁 Bronze path: /mnt/cc/bronze/credit_default
📁 Silver path: /mnt/cc/silver/credit_default

----------------------------------------------------------------------
STEP 1: READING BRONZE DATA
----------------------------------------------------------------------
✓ Successfully loaded bronze data
  Rows: 90000
  Columns: 26
  Columns: ['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'default', 'ingest_ts']

----------------------------------------------------------------------
STEP 2: DATA TRANSFORMATIONS
----------------------------------------------------------------------

3A. Converting numeric columns to double...
   Found 21 numeric columns:
     - ID
     - LIMIT_BAL
     - AGE
     - PAY_0
     - PAY_2
     - PAY_3
     - PAY_4
     - PAY_5
 

ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default,ingest_ts,avg_bill_amt
1.0,20000.0,F,4,Married,24.0,2.0,2.0,-1.0,-1.0,-2.0,-2.0,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,Y,2025-10-13T22:34:44.235+0000,1284.0
2.0,120000.0,F,4,Single,26.0,-1.0,2.0,0.0,0.0,0.0,2.0,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,Y,2025-10-13T22:34:44.235+0000,2846.1666666666665
3.0,90000.0,F,4,Single,34.0,0.0,0.0,0.0,0.0,0.0,0.0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,N,2025-10-13T22:34:44.235+0000,16942.166666666668
4.0,50000.0,F,4,Married,37.0,0.0,0.0,0.0,0.0,0.0,0.0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,N,2025-10-13T22:34:44.235+0000,38555.66666666666
5.0,50000.0,M,4,Married,57.0,-1.0,0.0,-1.0,0.0,0.0,0.0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,N,2025-10-13T22:34:44.235+0000,18223.166666666668
6.0,50000.0,M,4,Single,37.0,0.0,0.0,0.0,0.0,0.0,0.0,64400.0,57069.0,57608.0,19394.0,19619.0,20024.0,2500.0,1815.0,657.0,1000.0,1000.0,800.0,N,2025-10-13T22:34:44.235+0000,39685.66666666666
7.0,500000.0,M,4,Single,29.0,0.0,0.0,0.0,0.0,0.0,0.0,367965.0,412023.0,445007.0,542653.0,483003.0,473944.0,55000.0,40000.0,38000.0,20239.0,13750.0,13770.0,N,2025-10-13T22:34:44.235+0000,454099.1666666667
8.0,100000.0,F,4,Single,23.0,0.0,-1.0,-1.0,0.0,0.0,-1.0,11876.0,380.0,601.0,221.0,-159.0,567.0,380.0,601.0,0.0,581.0,1687.0,1542.0,N,2025-10-13T22:34:44.235+0000,2247.6666666666665
9.0,140000.0,F,4,Married,28.0,0.0,0.0,2.0,0.0,0.0,0.0,11285.0,14096.0,12108.0,12211.0,11793.0,3719.0,3329.0,0.0,432.0,1000.0,1000.0,1000.0,N,2025-10-13T22:34:44.235+0000,10868.666666666666
10.0,20000.0,M,4,Single,35.0,-2.0,-2.0,-2.0,-2.0,-1.0,-1.0,0.0,0.0,0.0,0.0,13007.0,13912.0,0.0,0.0,0.0,13007.0,1122.0,0.0,N,2025-10-13T22:34:44.235+0000,4486.5



----------------------------------------------------------------------
DATA SCHEMA:
----------------------------------------------------------------------
root
 |-- ID: double (nullable = true)
 |-- LIMIT_BAL: double (nullable = true)
 |-- SEX: string (nullable = true)
 |-- EDUCATION: string (nullable = true)
 |-- MARRIAGE: string (nullable = true)
 |-- AGE: double (nullable = true)
 |-- PAY_0: double (nullable = true)
 |-- PAY_2: double (nullable = true)
 |-- PAY_3: double (nullable = true)
 |-- PAY_4: double (nullable = true)
 |-- PAY_5: double (nullable = true)
 |-- PAY_6: double (nullable = true)
 |-- BILL_AMT1: double (nullable = true)
 |-- BILL_AMT2: double (nullable = true)
 |-- BILL_AMT3: double (nullable = true)
 |-- BILL_AMT4: double (nullable = true)
 |-- BILL_AMT5: double (nullable = true)
 |-- BILL_AMT6: double (nullable = true)
 |-- PAY_AMT1: double (nullable = true)
 |-- PAY_AMT2: double (nullable = true)
 |-- PAY_AMT3: double (nullable = true)
 |-- PAY_AMT4: double (nu

summary,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default,avg_bill_amt
count,90000.0,90000.0,90000,90000.0,90000,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000,90000.0
mean,15000.5,167484.32266666667,,3.9981333333333335,0.0,35.4855,-0.0167,-0.1337666666666666,-0.1662,-0.2206666666666666,-0.2662,-0.2911,51223.3309,49179.07516666667,47013.1548,43262.94896666666,40311.40096666667,38871.7604,5663.5805,5921.1635,5225.6815,4826.076866666666,4799.387633333334,5215.502566666667,,44976.94519999998
stddev,8660.30214595648,129746.21990249012,,0.0863901912683988,0.0,9.217801645226825,1.1237890411055755,1.1971726707464807,1.1968542696963889,1.1691256317566887,1.1331748148217171,1.149974847754654,73635.04238566419,71172.97794969511,69348.61686544509,64332.14131249017,60796.48023505294,59553.44581341183,16563.09631450986,23040.61438811869,17606.765833637284,15665.985672689074,15278.135917363705,17777.268244745588,,63260.018951766026
min,1.0,10000.0,F,0.0,0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-165580.0,-69777.0,-157264.0,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,N,-56043.16666666666
max,30000.0,1000000.0,M,4.0,Single,79.0,8.0,8.0,8.0,8.0,8.0,8.0,964511.0,983931.0,1664089.0,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,Y,877313.8333333334



----------------------------------------------------------------------
DATA QUALITY CHECKS:
----------------------------------------------------------------------

Null values by column:
  ID: 0 nulls ✓
  LIMIT_BAL: 0 nulls ✓
  SEX: 0 nulls ✓
  EDUCATION: 0 nulls ✓
  MARRIAGE: 0 nulls ✓
  AGE: 0 nulls ✓
  PAY_0: 0 nulls ✓
  PAY_2: 0 nulls ✓
  PAY_3: 0 nulls ✓
  PAY_4: 0 nulls ✓
  PAY_5: 0 nulls ✓
  PAY_6: 0 nulls ✓
  BILL_AMT1: 0 nulls ✓
  BILL_AMT2: 0 nulls ✓
  BILL_AMT3: 0 nulls ✓
  BILL_AMT4: 0 nulls ✓
  BILL_AMT5: 0 nulls ✓
  BILL_AMT6: 0 nulls ✓
  PAY_AMT1: 0 nulls ✓
  PAY_AMT2: 0 nulls ✓
  PAY_AMT3: 0 nulls ✓
  PAY_AMT4: 0 nulls ✓
  PAY_AMT5: 0 nulls ✓
  PAY_AMT6: 0 nulls ✓
  default: 0 nulls ✓
  ingest_ts: 0 nulls ✓
  avg_bill_amt: 0 nulls ✓

----------------------------------------------------------------------
EDUCATION VALUE DISTRIBUTION:
----------------------------------------------------------------------


EDUCATION,count
0,42
4,89958



----------------------------------------------------------------------
DEFAULT FLAG DISTRIBUTION:
----------------------------------------------------------------------


default,count
Y,19908
N,70092



----------------------------------------------------------------------
AVERAGE BILL AMOUNT STATISTICS:
----------------------------------------------------------------------


summary,avg_bill_amt
count,90000.0
mean,44976.94519999998
stddev,63260.018951766026
min,-56043.16666666666
max,877313.8333333334



✓ ALL SILVER LAYER TASKS COMPLETED SUCCESSFULLY!
