In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType,DoubleType
from pyspark.sql.functions import col, concat, current_timestamp,sha2

In [None]:
# Explicit (not inferred) schema for the investor loan CSV.
# Each entry is (column name, Spark data type, nullable flag).
# NOTE(review): Spark file sources generally relax user-supplied schemas to
# nullable on read, so nullable=False may not be enforced here — confirm.
investor_columns = [
    ("investor_loan_id", StringType(), True),
    ("loan_id", StringType(), True),
    ("investor_id", StringType(), False),
    ("loan_funded_amt", DoubleType(), False),
    ("investor_type", StringType(), False),
    ("age", IntegerType(), False),
    ("state", StringType(), False),
    ("country", StringType(), False),
]

investor_schema = StructType(
    fields=[
        StructField(column_name, column_type, is_nullable)
        for column_name, column_type, is_nullable in investor_columns
    ]
)

#### Read the CSV file into a DataFrame


In [None]:

# Load the investor loan CSV from the bronze layer using the explicit schema;
# the first line of the file is treated as a header row.
investor_df = (
    spark.read
    .option("header", True)
    .schema(investor_schema)
    .csv("/mnt/bronze/lending_loan/loan_investors.csv")
)

In [None]:
# Register the loaded DataFrame as the temp view "investor_data" and print a
# preview of it through Spark SQL.
investor_df.createOrReplaceTempView("investor_data")
investor_preview_df = spark.sql("select * from investor_data")
investor_preview_df.show()

#### Add the ingestion date to the DataFrame


In [None]:
# Add an ingest_date column recording when the rows were ingested into the
# data lake.
# NOTE(review): DataFrames are lazy, so the timestamp is evaluated when an
# action runs; the value displayed here may differ from the one written
# later — confirm this is acceptable.
ingest_ts = current_timestamp()
investor_df_ingestDate = investor_df.withColumn("ingest_date", ingest_ts)
display(investor_df_ingestDate)

#### Add a surrogate key to the DataFrame

In [None]:
# Add investor_loan_key, the surrogate key for this table: the SHA-256 hex
# digest of investor_loan_id, loan_id and investor_id concatenated in that
# order.
# NOTE(review): concat() returns NULL when any input is NULL, and
# investor_loan_id / loan_id are declared nullable, so such rows get a NULL
# key. The parts are also joined without a delimiter, so different id
# combinations can produce the same string. Changing the derivation would
# change keys that are already persisted, so this is flagged rather than
# altered here.
business_key = concat(col("investor_loan_id"), col("loan_id"), col("investor_id"))
investor_df_key = investor_df_ingestDate.withColumn(
    "investor_loan_key",
    sha2(business_key, 256),
)
display(investor_df_key)

#### Use Spark SQL to query the data


In [None]:
# Expose the keyed DataFrame as the temp view "investor_temp_table" and select
# the output columns in their final order (key and ingest date first).
investor_df_key.createOrReplaceTempView("investor_temp_table")
display_df = spark.sql(
    "select investor_loan_key,ingest_date,investor_loan_id,loan_id,investor_id,"
    "loan_funded_amt,investor_type,age,state,country "
    "from investor_temp_table"
)
display(display_df)

#### Write the cleaned DataFrame to the data lake

In [None]:

# Persist the selected investor loan columns to the silver layer as Parquet.
# No `header` option is set: it is a CSV option and has no meaning for the
# Parquet writer, which stores the schema in the files themselves.
# NOTE(review): mode("append") adds these rows on every run, so re-running the
# notebook against the same source file duplicates data in the silver table —
# confirm whether "overwrite" (or a merge) is the intended load semantics.
display_df.write.mode("append").parquet("/mnt/silver/lending_loan/investor_loan_details")

In [None]:
# Read the silver Parquet dataset back and display it to verify the write.
investor_loan_details_df = spark.read.parquet("/mnt/silver/lending_loan/investor_loan_details")
display(investor_loan_details_df)