In [0]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,DoubleType,FloatType,DateType
from pyspark.sql.functions import col, concat, current_timestamp,regexp_replace,lit,to_date,sha2

In [0]:
# Explicit schema for the raw loan CSV, declared as (name, type, nullable) rows.
# `term` and `interest` are kept as strings here; later cells strip the
# "months" and "%" text from them.
_loan_field_specs = [
    ("loan_id", StringType(), False),
    ("mem_id", StringType(), False),
    ("acc_id", StringType(), False),
    ("loan_amt", DoubleType(), True),
    ("fnd_amt", DoubleType(), True),
    ("term", StringType(), True),
    ("interest", StringType(), True),
    ("installment", FloatType(), True),
    ("issue_date", DateType(), True),
    ("loan_status", StringType(), True),
    ("purpose", StringType(), True),
    ("title", StringType(), True),
    ("disbursement_method", StringType(), True),
]

loan_schema = StructType(
    fields=[
        StructField(field_name, field_type, is_nullable)
        for field_name, field_type, is_nullable in _loan_field_specs
    ]
)

#### Read the CSV file into a dataframe


In [0]:
# Read the loan CSV from the bronze mount. The first row is treated as a
# header and the explicit `loan_schema` is applied instead of inferring types.
loan_csv_reader = spark.read.option("header", True).schema(loan_schema)
loan_df = loan_csv_reader.csv("/mnt/bronze/lending_loan/loan_details.csv")

In [0]:
# Register the raw dataframe as the temp view "loan_table" so it can be
# queried with Spark SQL, then preview all of its rows.
loan_df.createOrReplaceTempView("loan_table")
loan_sql = spark.sql(
    "select * from loan_table"
)
display(loan_sql)

In [0]:
# Preview raw loans that have a 36-month term OR an interest rate above 5.
#
# At this point `term` and `interest` are still the raw string columns from
# the CSV schema. The cleaning cells later in this notebook strip "months"
# and "%" from them, which implies the raw values carry that text (e.g.
# "36 months", "10.5%" -- assumption, confirm against the data). Comparing
# such strings directly with numbers makes Spark's implicit cast produce NULL,
# so the filter matches nothing. Strip the text and cast explicitly so the
# preview actually returns the intended rows.
loan_sql.createOrReplaceTempView("loan_data")
loan_data_df = spark.sql(
    """
    select *
    from loan_data
    where cast(trim(regexp_replace(term, 'months', '')) as int) = 36
       or cast(trim(regexp_replace(interest, '%', '')) as double) > 5.0
    """
)
display(loan_data_df)

#### Cleaning techniques to apply

In [0]:
# Remove the text "months" from the `term` column.
# regexp_replace interprets its pattern as a regular expression; "months"
# contains no metacharacters, so it is matched literally.
term_suffix = "months"
term_without_suffix = regexp_replace(loan_df["term"], term_suffix, "")
clean_term_df = loan_df.withColumn("term", term_without_suffix)

# NOTE(review): any whitespace around the removed text stays in the value --
# confirm whether `term` should also be trimmed.
display(clean_term_df)

In [0]:
# Remove the percent sign from the `interest` column.
# "%" has no special meaning in the regex pattern, so it is matched literally.
# The column remains a string after this step.
percent_sign = "%"
interest_without_percent = regexp_replace(clean_term_df["interest"], percent_sign, "")
clean_interest_df = clean_term_df.withColumn("interest", interest_without_percent)

display(clean_interest_df)

#### Rename columns in the dataframe


In [0]:
# Expand the abbreviated source column names; every other column is left as is.
column_renames = {
    "mem_id": "member_id",
    "acc_id": "account_id",
    "loan_amt": "loan_amount",
    "fnd_amt": "funded_amount",
}

loan_df_rename = clean_interest_df
for old_name, new_name in column_renames.items():
    loan_df_rename = loan_df_rename.withColumnRenamed(old_name, new_name)

#### Add the ingestion date to the dataframe

In [0]:
# Add an `ingest_date` column holding Spark's current timestamp.
ingest_timestamp = current_timestamp()
loan_df_ingestDate = loan_df_rename.withColumn("ingest_date", ingest_timestamp)
display(loan_df_ingestDate)

#### Add a surrogate key to the dataframe

In [0]:
# Build `loan_key` as the SHA-256 hex digest of loan_id, member_id and
# loan_amount concatenated (in that order, with no separator).
#
# NOTE(review): `concat` returns NULL when any input is NULL, and
# `loan_amount` is nullable in the schema, so such rows get a NULL key.
# The missing separator also allows different value combinations to produce
# the same concatenated text. Confirm whether this is acceptable before
# relying on the key for joins; changing it would alter keys already written.
key_source = concat(col("loan_id"), col("member_id"), col("loan_amount"))
loan_df_key = loan_df_ingestDate.withColumn("loan_key", sha2(key_source, 256))
display(loan_df_key)

In [0]:
# Find rows whose `interest` holds the literal text 'null' (a string value,
# not a real SQL NULL).
loan_df_key.createOrReplaceTempView("df_null")
null_df = spark.sql(
    "select * from df_null where interest='null' "
)
display(null_df)

In [0]:
# Same check as the 'null'-text query, but for real SQL NULLs in `interest`.
# This runs before the "null" strings are replaced.
loan_df_key.createOrReplaceTempView("df_null")
null_df = spark.sql(
    "select * from df_null where interest is null "
)
display(null_df)

#### Replace the "null" strings with NULL values

In [0]:
# Turn the literal text "null" into a real NULL. No column subset is given,
# so the replacement is not restricted to a single column.
final_df = loan_df_key.replace(to_replace="null", value=None)

In [0]:
# Re-run the NULL check against the cleaned dataframe to confirm that the
# "null" strings now show up as real NULLs in `interest`.
final_df.createOrReplaceTempView("df_null")
null_df = spark.sql("select * from df_null where interest is null ")

# `DataFrame.show()` prints the rows and returns None, so it must not be
# chained into the assignment; otherwise `null_df` would be None instead of
# the query result.
null_df.show()

In [0]:
# Print the column names, types and nullability of the cleaned dataframe.
final_df.printSchema()

In [0]:
# Query the cleaned data for loans with a 36-month term AND interest above 5.
# `term` and `interest` are still string columns, so both comparisons rely on
# Spark implicitly casting them to numbers.
final_df.createOrReplaceTempView("loan_data")
loan_data_df = spark.sql(
    "select * from loan_data where term=36 and interest > 5.0"
)
display(loan_data_df)

In [0]:
# Project the cleaned dataframe into its output column order: surrogate key
# and ingest timestamp first, followed by the loan attributes.
final_df.createOrReplaceTempView("temp_table")

# Adjacent string literals are joined by Python into one query string.
ordered_columns_query = (
    "select loan_key, ingest_date,"
    "loan_id,member_id,account_id,"
    "loan_amount,funded_amount,"
    "term,interest,installment,issue_date,"
    "loan_status,purpose,title,disbursement_method"
    " from temp_table"
)
display_df = spark.sql(ordered_columns_query)
display(display_df)

#### Write the cleaned dataframe to the data lake

In [0]:
# Persist the cleaned loans as Parquet under the silver mount.
# The `header` option was dropped: it is a CSV option and has no effect on a
# Parquet write, so it only suggested behaviour that does not exist.
# NOTE(review): mode("append") adds the rows again on every run of this cell,
# so re-running the notebook duplicates data -- confirm whether "overwrite"
# (or a merge) is the intended behaviour.
display_df.write.mode("append").parquet("/mnt/silver/lending_loan/loan_details")

In [0]:
# List the contents of the silver output directory to check what the write produced.
dbutils.fs.ls("/mnt/silver/lending_loan/loan_details")

In [0]:
# Read the Parquet output back from the silver path and display it as a final check.
silver_loans_df = spark.read.parquet("/mnt/silver/lending_loan/loan_details")
display(silver_loans_df)