In [31]:
from pyspark.sql import SparkSession
import getpass
# Login name of whoever is running the notebook; used below so the warehouse
# location follows the current user instead of one hard-coded account.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session that runs on YARN.
spark = (
    SparkSession.builder
    .config("spark.shuffle.useOldFetchProtocol", "true")
    # The original passed an f-string with no placeholder and a fixed user id,
    # leaving `username` unused; interpolate the detected user instead.
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [32]:
# Exploratory read of the raw repayments CSV: the first line is treated as the
# header and Spark is asked to infer the column types.
loan_repay_raw_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", True)
    .load("/public/trendytech/lendingclubproject/raw/loans_repayments_csv")
)

In [3]:
# Display the inferred-schema frame; the recorded output below shows the raw
# header names (total_rec_prncp, last_pymnt_d, ...) and a handful of rows.
loan_repay_raw_df

loan_id,total_rec_prncp,total_rec_int,total_rec_late_fee,total_pymnt,last_pymnt_amnt,last_pymnt_d,next_pymnt_d
141581221,1055.81,2591.7,0.0,3647.51,709.23,Mar-2019,Apr-2019
141506948,1252.75,306.04,0.0,1558.79,312.63,Mar-2019,Apr-2019
141357400,626.37,354.96,0.0,981.33,197.27,Mar-2019,Apr-2019
139445427,1118.16,297.36,0.0,1415.52,283.95,Mar-2019,Apr-2019
141407409,1169.72,3605.3,0.0,4775.02,964.9,Mar-2019,Apr-2019
141360802,2313.98,2512.88,0.0,4826.86,952.02,Mar-2019,Apr-2019
141163960,4689.63,1994.93,0.0,6684.56,1342.57,Mar-2019,Apr-2019
141533932,585.29,640.53,15.0,1240.82,235.13,Mar-2019,Apr-2019
141441276,2030.82,762.81,0.0,2793.63,477.62,Mar-2019,Apr-2019
141569080,1803.55,1110.59,0.0,2914.14,585.91,Mar-2019,Apr-2019


In [33]:
# DDL-style schema string applied when re-reading the repayments CSV.
# One entry per column, in file order: descriptive names replace the raw
# header names, amounts are declared float, the id and date columns string.
loans_repay_schema = ",".join(
    (
        "loan_id string",
        "total_principal_received float",
        "total_interest_received float",
        "total_late_fee_received float",
        "total_payment_received float",
        "last_payment_amount float",
        "last_payment_date string",
        "next_payment_date string",
    )
)

In [34]:
# Read the repayments CSV again, this time with the hand-written schema rather
# than inferred types. This rebinds loan_repay_raw_df to the typed frame.
loan_repay_raw_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(loans_repay_schema)
    .load("/public/trendytech/lendingclubproject/raw/loans_repayments_csv")
)

In [35]:
from pyspark.sql.functions import current_timestamp

In [36]:
# Add an ingest_date column holding the timestamp Spark evaluates for the
# query, so each row records when it was loaded.
repay_ingest = (
    loan_repay_raw_df
    .withColumn("ingest_date", current_timestamp())
)

In [9]:
# Display the typed frame with the added ingest_date column.
repay_ingest

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
141581221,1055.81,2591.7,0.0,3647.51,709.23,Mar-2019,Apr-2019,2025-11-11 03:36:...
141506948,1252.75,306.04,0.0,1558.79,312.63,Mar-2019,Apr-2019,2025-11-11 03:36:...
141357400,626.37,354.96,0.0,981.33,197.27,Mar-2019,Apr-2019,2025-11-11 03:36:...
139445427,1118.16,297.36,0.0,1415.52,283.95,Mar-2019,Apr-2019,2025-11-11 03:36:...
141407409,1169.72,3605.3,0.0,4775.02,964.9,Mar-2019,Apr-2019,2025-11-11 03:36:...
141360802,2313.98,2512.88,0.0,4826.86,952.02,Mar-2019,Apr-2019,2025-11-11 03:36:...
141163960,4689.63,1994.93,0.0,6684.56,1342.57,Mar-2019,Apr-2019,2025-11-11 03:36:...
141533932,585.29,640.53,15.0,1240.82,235.13,Mar-2019,Apr-2019,2025-11-11 03:36:...
141441276,2030.82,762.81,0.0,2793.63,477.62,Mar-2019,Apr-2019,2025-11-11 03:36:...
141569080,1803.55,1110.59,0.0,2914.14,585.91,Mar-2019,Apr-2019,2025-11-11 03:36:...


In [37]:
# Register the ingested frame as the temp view "repay" so it can be queried
# through spark.sql.
repay_ingest.createOrReplaceTempView("repay")

In [38]:
# Count rows in the "repay" view whose total_principal_received is null
# (the recorded output below shows 69).
spark.sql("select count (*) from repay where total_principal_received is null")

count(1)
69


In [39]:
# Amount columns used as the null-check subset when dropping incomplete rows.
columns_to_check = [
    "total_principal_received",
    "total_interest_received",
    "total_late_fee_received",
    "total_payment_received",
    "last_payment_amount",
]

In [40]:
# Drop every row that has a null in any of the listed amount columns.
repay_filtered = repay_ingest.dropna(subset=columns_to_check)

In [16]:
# Display the frame after rows with null amount columns were dropped.
repay_filtered

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
141581221,1055.81,2591.7,0.0,3647.51,709.23,Mar-2019,Apr-2019,2025-11-11 03:41:...
141506948,1252.75,306.04,0.0,1558.79,312.63,Mar-2019,Apr-2019,2025-11-11 03:41:...
141357400,626.37,354.96,0.0,981.33,197.27,Mar-2019,Apr-2019,2025-11-11 03:41:...
139445427,1118.16,297.36,0.0,1415.52,283.95,Mar-2019,Apr-2019,2025-11-11 03:41:...
141407409,1169.72,3605.3,0.0,4775.02,964.9,Mar-2019,Apr-2019,2025-11-11 03:41:...
141360802,2313.98,2512.88,0.0,4826.86,952.02,Mar-2019,Apr-2019,2025-11-11 03:41:...
141163960,4689.63,1994.93,0.0,6684.56,1342.57,Mar-2019,Apr-2019,2025-11-11 03:41:...
141533932,585.29,640.53,15.0,1240.82,235.13,Mar-2019,Apr-2019,2025-11-11 03:41:...
141441276,2030.82,762.81,0.0,2793.63,477.62,Mar-2019,Apr-2019,2025-11-11 03:41:...
141569080,1803.55,1110.59,0.0,2914.14,585.91,Mar-2019,Apr-2019,2025-11-11 03:41:...


In [41]:
# Re-register the temp view "repay" against the null-filtered frame; this
# replaces the earlier registration made from repay_ingest.
repay_filtered.createOrReplaceTempView("repay")

In [42]:
from pyspark.sql.functions import when, col

In [43]:
# Repair rows whose total payment is recorded as 0.0 even though some principal
# was received: rebuild the total as principal + interest + late fees.
# Every other row keeps its recorded total unchanged.
repay_fixed = repay_filtered.withColumn(
    "total_payment_received",
    when(
        (col("total_payment_received") == 0.0)
        & (col("total_principal_received") != 0.0),
        col("total_principal_received")
        + col("total_interest_received")
        + col("total_late_fee_received"),
    ).otherwise(col("total_payment_received")),
)

In [22]:
# Display the frame after the total_payment_received repair.
repay_fixed

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
141581221,1055.81,2591.7,0.0,3647.51,709.23,Mar-2019,Apr-2019,2025-11-11 03:55:...
141506948,1252.75,306.04,0.0,1558.79,312.63,Mar-2019,Apr-2019,2025-11-11 03:55:...
141357400,626.37,354.96,0.0,981.33,197.27,Mar-2019,Apr-2019,2025-11-11 03:55:...
139445427,1118.16,297.36,0.0,1415.52,283.95,Mar-2019,Apr-2019,2025-11-11 03:55:...
141407409,1169.72,3605.3,0.0,4775.02,964.9,Mar-2019,Apr-2019,2025-11-11 03:55:...
141360802,2313.98,2512.88,0.0,4826.86,952.02,Mar-2019,Apr-2019,2025-11-11 03:55:...
141163960,4689.63,1994.93,0.0,6684.56,1342.57,Mar-2019,Apr-2019,2025-11-11 03:55:...
141533932,585.29,640.53,15.0,1240.82,235.13,Mar-2019,Apr-2019,2025-11-11 03:55:...
141441276,2030.82,762.81,0.0,2793.63,477.62,Mar-2019,Apr-2019,2025-11-11 03:55:...
141569080,1803.55,1110.59,0.0,2914.14,585.91,Mar-2019,Apr-2019,2025-11-11 03:55:...


In [44]:
# How many rows still have a total payment of 0.0 after the repair
# (the recorded output below shows 949).
repay_fixed.where("total_payment_received = 0.0").count()

949

In [45]:
# Keep only the rows whose total payment is non-zero.
repay_fixed2 = repay_fixed.where("total_payment_received != 0.0")

In [25]:
# Display the frame after zero-total rows were removed.
repay_fixed2

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
141581221,1055.81,2591.7,0.0,3647.51,709.23,Mar-2019,Apr-2019,2025-11-11 03:59:...
141506948,1252.75,306.04,0.0,1558.79,312.63,Mar-2019,Apr-2019,2025-11-11 03:59:...
141357400,626.37,354.96,0.0,981.33,197.27,Mar-2019,Apr-2019,2025-11-11 03:59:...
139445427,1118.16,297.36,0.0,1415.52,283.95,Mar-2019,Apr-2019,2025-11-11 03:59:...
141407409,1169.72,3605.3,0.0,4775.02,964.9,Mar-2019,Apr-2019,2025-11-11 03:59:...
141360802,2313.98,2512.88,0.0,4826.86,952.02,Mar-2019,Apr-2019,2025-11-11 03:59:...
141163960,4689.63,1994.93,0.0,6684.56,1342.57,Mar-2019,Apr-2019,2025-11-11 03:59:...
141533932,585.29,640.53,15.0,1240.82,235.13,Mar-2019,Apr-2019,2025-11-11 03:59:...
141441276,2030.82,762.81,0.0,2793.63,477.62,Mar-2019,Apr-2019,2025-11-11 03:59:...
141569080,1803.55,1110.59,0.0,2914.14,585.91,Mar-2019,Apr-2019,2025-11-11 03:59:...


In [46]:
# Replace last_payment_date with null wherever it compares equal to 0.0;
# all other values are carried over unchanged.
# NOTE(review): the read schema declares this column as string, so the
# comparison against 0.0 depends on Spark's implicit type coercion — confirm
# it matches the placeholder actually present in the data.
repay_ldate_fixed = repay_fixed2.withColumn(
    "last_payment_date",
    when(col("last_payment_date") == 0.0, None)
    .otherwise(col("last_payment_date")),
)

In [47]:
# Replace next_payment_date with null wherever it compares equal to 0.0;
# all other values are carried over unchanged.
# NOTE(review): the read schema declares this column as string, so the
# comparison against 0.0 depends on Spark's implicit type coercion — confirm
# it matches the placeholder actually present in the data.
repay_ndate_fixed = repay_ldate_fixed.withColumn(
    "next_payment_date",
    when(col("next_payment_date") == 0.0, None)
    .otherwise(col("next_payment_date")),
)

In [48]:
# Persist the cleaned repayments frame, overwriting any previous output at the
# target path.
# The format is pinned explicitly: the original named no format, so the result
# depended on the session's default data source (parquet unless
# spark.sql.sources.default is overridden), while also passing the
# CSV-oriented "header" option, which does nothing for parquet output.
# NOTE(review): if CSV output was actually intended, switch the format to
# "csv" and restore .option("header", True).
(
    repay_ndate_fixed.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv021558/lendingclubproject/cleaned/loans_repayments")
    .save()
)

In [49]:
# Shut down the Spark session now that the cleaned data has been written.
spark.stop()