In [0]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,DoubleType,FloatType,DateType
from pyspark.sql.functions import col, concat, current_timestamp,regexp_replace,lit,to_date,when,sha2

In [0]:
# Explicit schema for the loan defaulters CSV so no type inference is needed.
# Each .add() call is (column name, Spark type, nullable).
loan_default_schema = (
    StructType()
    # Identifier columns are declared non-nullable.
    .add("loan_id", StringType(), False)
    .add("mem_id", StringType(), False)
    .add("def_id", StringType(), False)
    # Delinquency / public-record / enquiry columns.
    .add("delinq_2yrs", IntegerType(), True)
    .add("delinq_amnt", FloatType(), True)
    .add("pub_rec", IntegerType(), True)
    .add("pub_rec_bankruptcies", IntegerType(), True)
    .add("inq_last_6mths", IntegerType(), True)
    .add("total_rec_late_fee", FloatType(), True)
    # Hardship columns.
    .add("hardship_flag", StringType(), True)
    .add("hardship_type", StringType(), True)
    .add("hardship_length", IntegerType(), True)
    .add("hardship_amount", FloatType(), True)
)

#### Read the CSV file into a dataframe

In [0]:
# Load the raw loan defaulters file from the bronze mount, treating the first
# line as a header and applying the explicit schema instead of inferring types.
loan_default_df = spark.read.csv(
    "/mnt/bronze/lending_loan/loan_defaulters.csv",
    schema=loan_default_schema,
    header=True,
)

In [0]:
# Render the freshly loaded dataframe for a visual check of the parsed columns.
display(loan_default_df)

#### Add the ingestion date to the dataframe

In [0]:
# Keep every source column and append an "ingest_date" column holding the
# timestamp at which this load was evaluated.
df_ingest_date = loan_default_df.select(
    "*",
    current_timestamp().alias("ingest_date"),
)
display(df_ingest_date)

#### Add a surrogate key to the dataframe

In [0]:
# Derive a surrogate key: SHA-256 hex digest of loan_id + mem_id + def_id
# concatenated in that order with no separator.
# NOTE(review): with no delimiter, different id triples can concatenate to the
# same string, and concat is expected to yield NULL if any of the three inputs
# is NULL — confirm the ids make both cases impossible before relying on this key.
loan_default_key = df_ingest_date.withColumn(
    "loan_default_key",
    sha2(concat(*map(col, ("loan_id", "mem_id", "def_id"))), 256),
)
display(loan_default_key)

#### Replace the "null" strings with NULL values

In [0]:
# Turn the literal string "null" into a real NULL wherever it appears.
# No column subset is given, so the replacement is not restricted to particular columns.
df_null = loan_default_key.na.replace("null", None)

In [0]:
# Expose the null-cleaned frame to Spark SQL, then show the rows where both
# delinq_2yrs and hardship_flag are NULL as a spot check of the replacement.
df_null.createOrReplaceTempView("null_df_table")
display(
    spark.sql(
        "select * from null_df_table where delinq_2yrs is null and hardship_flag is null"
    )
)

#### Rename the columns of the dataframe


In [0]:
# Rename the abbreviated source columns to descriptive names.
# The chain starts from df_null (the frame whose "null" strings were replaced by
# real NULLs) so that the cleaning step is carried into the renamed frame and
# everything derived from it; starting from loan_default_key would bypass that
# replacement and leave the "null" strings in place.
loan_df_rename = (
    df_null
    .withColumnRenamed("mem_id", "member_id")
    .withColumnRenamed("def_id", "loan_default_id")
    .withColumnRenamed("delinq_2yrs", "defaulters_2yrs")
    .withColumnRenamed("delinq_amnt", "defaulters_amount")
    .withColumnRenamed("pub_rec", "public_records")
    .withColumnRenamed("pub_rec_bankruptcies", "public_records_bankruptcies")
    .withColumnRenamed("inq_last_6mths", "enquiries_6mnths")
    .withColumnRenamed("total_rec_late_fee", "late_fee")
)

In [0]:
# Register the renamed frame as the temp view "temp" and project the columns in
# their final order: surrogate key and ingest timestamp first, then the ids and
# the remaining attributes.
loan_df_rename.createOrReplaceTempView("temp")
display_df = spark.sql(
    "select loan_default_key, ingest_date, loan_id,member_id,loan_default_id,"
    "defaulters_2yrs,defaulters_amount,public_records,public_records_bankruptcies,"
    "enquiries_6mnths,late_fee,hardship_flag,hardship_type,hardship_length,"
    "hardship_amount from temp"
)
display(display_df)

#### Write the cleaned dataframe to the data lake

In [0]:
# Persist the final projection to the silver layer in Parquet format.
# No `header` option is passed: it is a CSV setting and has no meaning for a
# Parquet write, so including it only misleads the reader.
# NOTE(review): mode("append") adds to whatever already exists at the target
# path, so re-running this notebook on the same input file writes the same rows
# again — confirm append (rather than overwrite) is the intended load strategy.
display_df.write.mode("append").parquet("/mnt/silver/lending_loan/loan_defaulters")

In [0]:
# Read the silver Parquet output back and render it to verify the write.
display(spark.read.parquet("/mnt/silver/lending_loan/loan_defaulters"))