In [None]:
from pyspark.sql.functions import coalesce, col, concat, current_timestamp, lit, sha2
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

In [None]:
# Explicit (not inferred) schema for the raw customer CSV.
# Each .add(name, type, nullable) call appends one column in file order.
# NOTE: several columns are declared nullable=False here, yet the printSchema()
# output recorded in this notebook reports every column as nullable = true
# after the CSV read, so do not rely on these flags to reject missing values.
customer_schema = (
    StructType()
    .add("cust_id", StringType(), True)
    .add("mem_id", StringType(), True)
    .add("fst_name", StringType(), False)
    .add("lst_name", StringType(), False)
    .add("prm_status", StringType(), False)
    .add("age", IntegerType(), False)
    .add("state", StringType(), False)
    .add("country", StringType(), False)
)

#### Read the CSV file into a dataframe

In [None]:

# Load the raw customer CSV from the bronze layer.
# The first line of the file is treated as a header row, and the explicit
# customer_schema is applied instead of letting Spark infer column types.
customer_df = (
    spark.read
    .option("header", True)
    .schema(customer_schema)
    .csv("/mnt/bronze/lending_loan/loan_customer_data.csv")
)

In [None]:
# Print the column names, types and nullability of the loaded dataframe
# to confirm the explicit schema was applied as expected.
customer_df.printSchema()

root
 |-- cust_id: string (nullable = true)
 |-- mem_id: string (nullable = true)
 |-- fst_name: string (nullable = true)
 |-- lst_name: string (nullable = true)
 |-- prm_status: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)



In [None]:
# Project the eight source columns by name, in schema order.
customer_selected_df = customer_df.select(
    "cust_id",
    "mem_id",
    "fst_name",
    "lst_name",
    "prm_status",
    "age",
    "state",
    "country",
)
 

In [None]:
# Render the selected columns as a table to eyeball the raw values.
display(customer_selected_df)

#### Rename the columns in the dataframe

In [None]:
# Replace the abbreviated source column names with descriptive ones.
# age, state and country are already readable and are left untouched.
customer_df_change = (
    customer_df
    .withColumnRenamed("cust_id", "customer_id")
    .withColumnRenamed("mem_id", "member_id")
    .withColumnRenamed("fst_name", "first_name")
    .withColumnRenamed("lst_name", "last_name")
    .withColumnRenamed("prm_status", "premium_status")
)

#### Add the ingestion date to the dataframe

In [None]:
# Stamp every row with an ingest_date column recording when it was
# ingested into the data lake.
customer_df_ingestDate = customer_df_change.withColumn(
    "ingest_date",
    current_timestamp(),
)

In [None]:
# Render the dataframe to confirm the renamed columns and the new ingest_date column.
display(customer_df_ingestDate)

#### Add a surrogate key to the dataframe

In [None]:
# Add a customer_key column that acts as a surrogate key for the table.
# The key is the SHA-256 digest (sha2 with a 256-bit length) of
# member_id, age and state concatenated as strings.
#
# concat() yields NULL as soon as any one of its inputs is NULL, which would
# leave customer_key NULL for every row missing member_id, age or state.
# Each part is therefore cast to string and NULLs are replaced with an empty
# string before concatenation. Rows where all three values are present hash
# to exactly the same key as the plain concat() did.
#
# NOTE(review): the parts are joined without a delimiter, so different
# (member_id, age) pairs can produce the same input string (e.g. "A1"+"23"
# and "A12"+"3"). Adding a separator would change every existing key, so it
# is left as a deliberate follow-up rather than changed here.
key_parts = [
    coalesce(col(part_name).cast("string"), lit(""))
    for part_name in ("member_id", "age", "state")
]
customer_df_final = customer_df_ingestDate.withColumn(
    "customer_key",
    sha2(concat(*key_parts), 256),
)
display(customer_df_final)

#### Use Spark SQL to query the data

In [None]:
# Expose the final dataframe to Spark SQL under the name "temp_table",
# then use a SELECT to put the columns in their output order
# (surrogate key and ingest timestamp first).
customer_df_final.createOrReplaceTempView("temp_table")
display_df = spark.sql(
    "select customer_key,ingest_date,customer_id,member_id,first_name,last_name,premium_status,age,state,country from temp_table"
)
display(display_df)

#### Write the cleaned dataframe into the data lake

In [None]:
# Persist the cleaned customer data to the silver layer as Parquet.
# The previous `header` option was dropped: it is a CSV setting and has no
# effect on a Parquet write, which stores column names in the file metadata.
# NOTE: mode("append") adds to whatever is already at the target path, so
# re-running this cell writes the same rows again.
display_df.write.mode("append").parquet("/mnt/silver/lending_loan/customer_details")

In [None]:

# List the files under the silver output path to confirm the write produced output.
dbutils.fs.ls("/mnt/silver/lending_loan/customer_details")

In [None]:
# Also publish the cleaned customer data to the gold layer as CSV,
# with a header row of column names. mode("append") adds to any files
# already present at the target path.
(
    display_df.write
    .options(header='True')
    .mode("append")
    .csv("/mnt/gold/lending_loan/customer_details")
)