In [1]:
from pyspark.sql import SparkSession
import getpass
# Login name of the user running the notebook; used below to build the
# per-user warehouse directory.
username = getpass.getuser()

# Create (or reuse) a Hive-enabled Spark session with YARN as the master.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [2]:
# List the contents of the raw Lending Club project directory on HDFS.
!! hadoop fs -ls /public/trendytech/lendingclubproject/raw

['Found 4 items',
 'drwxr-xr-x   - itv005857 supergroup          0 2023-09-15 14:40 /public/trendytech/lendingclubproject/raw/customers_data_csv',
 'drwxr-xr-x   - itv005857 supergroup          0 2023-09-17 22:57 /public/trendytech/lendingclubproject/raw/loans_data_csv',
 'drwxr-xr-x   - itv005857 supergroup          0 2023-09-18 07:32 /public/trendytech/lendingclubproject/raw/loans_defaulters_csv',
 'drwxr-xr-x   - itv005857 supergroup          0 2023-09-18 07:31 /public/trendytech/lendingclubproject/raw/loans_repayments_csv']

In [3]:
# First exploratory load of the defaulters CSV: the first line is treated as a
# header and Spark is asked to infer the column types.
loan_defaulters_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferschema", True)
    .load("/public/trendytech/lendingclubproject/raw/loans_defaulters_csv")
)

In [4]:
# Display the DataFrame that was loaded with an inferred schema.
loan_defaulters_df

# Column notes:
# delinq_2yrs - whether the person has paid late at any time in the last two years
# delinq_amnt - the amount that was paid late
# pub_rec - whether there are any public records against the person

member_id,delinq_2yrs,delinq_amnt,pub_rec,pub_rec_bankruptcies,inq_last_6mths,total_rec_late_fee,mths_since_last_delinq,mths_since_last_record
9cb79aa7323e81be1...,2.0,0.0,0.0,0.0,0.0,0.0,11.0,
0dd2bbc517e3c8f9e...,0.0,0.0,1.0,1.0,3.0,0.0,,115.0
458458599d3df3bfc...,0.0,0.0,1.0,1.0,1.0,0.0,,76.0
05ea141ec28b5c7f7...,0.0,0.0,0.0,0.0,0.0,0.0,,
aac68850fdac09fd0...,1.0,0.0,0.0,0.0,0.0,0.0,21.0,
3a423e4589e89f429...,0.0,0.0,0.0,0.0,0.0,0.0,,
f1efcf7dfbfef21be...,0.0,0.0,0.0,0.0,1.0,0.0,,
c89986155a070db2e...,1.0,0.0,0.0,0.0,1.0,15.0,5.0,
118dc629b6e134419...,0.0,0.0,0.0,0.0,0.0,0.0,,
a86fa4b7493708333...,0.0,0.0,0.0,0.0,0.0,0.0,,


In [5]:
# Print the inferred schema to see which type Spark assigned to each column.
loan_defaulters_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- delinq_2yrs: string (nullable = true)
 |-- delinq_amnt: double (nullable = true)
 |-- pub_rec: string (nullable = true)
 |-- pub_rec_bankruptcies: double (nullable = true)
 |-- inq_last_6mths: string (nullable = true)
 |-- total_rec_late_fee: string (nullable = true)
 |-- mths_since_last_delinq: string (nullable = true)
 |-- mths_since_last_record: string (nullable = true)



In [6]:
# Register the inferred-schema DataFrame as the temp view "loan_defaulters"
# so it can be queried through spark.sql.
loan_defaulters_df.createOrReplaceTempView("loan_defaulters")

In [7]:
# List the distinct delinq_2yrs values to inspect what the column actually contains.
spark.sql("select distinct(delinq_2yrs) from loan_defaulters")

delinq_2yrs
1.0
I bike to work on...
271 monthly payme...
183xx
VISA and AMEX cre...
etc. and I feel t...
AZ
017xx
923xx
446xx


In [8]:
# Count the rows per inq_last_6mths value, most frequent value first.
spark.sql("select inq_last_6mths, count(*) as total from loan_defaulters group by inq_last_6mths order by total desc")

inq_last_6mths,total
0.0,1381626
1.0,584320
2.0,200180
3.0,68996
4.0,17374
5.0,6224
6.0,1228
7.0,191
8.0,123
,64


In [9]:
# Count the rows per delinq_2yrs value, most frequent value first, and print
# up to 40 rows of the result.
spark.sql("select delinq_2yrs, count(*) as total from loan_defaulters group by delinq_2yrs order by total desc").show(40)

+------------------+-------+
|       delinq_2yrs|  total|
+------------------+-------+
|               0.0|1838878|
|               1.0| 281335|
|               2.0|  81285|
|               3.0|  29539|
|               4.0|  13179|
|               5.0|   6599|
|               6.0|   3717|
|               7.0|   2062|
|               8.0|   1223|
|               9.0|    818|
|              10.0|    556|
|              11.0|    363|
|              12.0|    264|
|              13.0|    165|
|              14.0|    120|
|              15.0|     87|
|              null|     63|
|              16.0|     55|
|              17.0|     30|
|              18.0|     30|
|              19.0|     23|
|              20.0|     17|
|              21.0|     12|
|                CA|      8|
|                TX|      6|
|                IL|      5|
|    small_business|      5|
|debt_consolidation|      5|
|              22.0|      5|
|              24.0|      4|
|                FL|      4|
|             

In [10]:
# Explicit DDL-style schema for the defaulters CSV: member_id is a string,
# every other column is declared as float. One entry per column, joined into
# the single comma-separated string that is passed to .schema().
loan_defaulters_schema = ', '.join([
    'member_id string',
    'delinq_2yrs float',
    'delinq_amnt float',
    'pub_rec float',
    'pub_rec_bankruptcies float',
    'inq_last_6mths float',
    'total_rec_late_fee float',
    'mths_since_last_delinq float',
    'mths_since_last_record float',
])

In [11]:
# Reload the defaulters CSV, this time applying the explicit schema string
# instead of letting Spark infer the column types.
loan_defaulters_raw_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(loan_defaulters_schema)
    .load("/public/trendytech/lendingclubproject/raw/loans_defaulters_csv")
)

In [12]:
# Re-point the "loan_defaulters" temp view at the explicitly-typed DataFrame;
# queries run after this cell see this DataFrame under that name.
loan_defaulters_raw_df.createOrReplaceTempView("loan_defaulters")

In [13]:
# Re-run the delinq_2yrs frequency count against the "loan_defaulters" view
# and print up to 40 rows.
spark.sql("select delinq_2yrs, count(*) as total from loan_defaulters group by delinq_2yrs order by total desc").show(40)

+-----------+-------+
|delinq_2yrs|  total|
+-----------+-------+
|        0.0|1838878|
|        1.0| 281335|
|        2.0|  81285|
|        3.0|  29539|
|        4.0|  13179|
|        5.0|   6599|
|        6.0|   3717|
|        7.0|   2062|
|        8.0|   1223|
|        9.0|    818|
|       10.0|    556|
|       11.0|    363|
|       12.0|    264|
|       null|    261|
|       13.0|    165|
|       14.0|    120|
|       15.0|     87|
|       16.0|     55|
|       18.0|     30|
|       17.0|     30|
|       19.0|     23|
|       20.0|     17|
|       21.0|     12|
|       22.0|      5|
|       24.0|      4|
|       26.0|      3|
|       30.0|      2|
|       29.0|      2|
|       3.44|      2|
|       25.0|      2|
|       23.0|      2|
|       6.52|      1|
|       9.56|      1|
|      21.72|      1|
|       5.52|      1|
|       39.0|      1|
|       58.0|      1|
|       1.41|      1|
|      17.17|      1|
|       1.37|      1|
+-----------+-------+
only showing top 40 rows



In [14]:
from pyspark.sql.functions import col

In [15]:
# Cast delinq_2yrs to integer, then replace the remaining nulls in that
# column with 0.
# NOTE(review): the fractional values visible before this step (e.g. 3.44)
# no longer appear in the counts afterwards - presumably the cast folds them
# into whole numbers; confirm that this is the intended treatment.
loans_defaulters_processed_df = (
    loan_defaulters_raw_df
    .withColumn("delinq_2yrs", col("delinq_2yrs").cast("integer"))
    .fillna(0, subset=["delinq_2yrs"])
)

In [16]:
# Re-point the "loan_defaulters" temp view at the DataFrame whose delinq_2yrs
# column has been cast to integer and null-filled.
loans_defaulters_processed_df.createOrReplaceTempView("loan_defaulters")

In [17]:
# Re-run the delinq_2yrs frequency count against the "loan_defaulters" view
# to check the result of the cast and null fill; prints up to 40 rows.
spark.sql("select delinq_2yrs, count(*) as total from loan_defaulters group by delinq_2yrs order by total desc").show(40)

+-----------+-------+
|delinq_2yrs|  total|
+-----------+-------+
|          0|1839141|
|          1| 281337|
|          2|  81285|
|          3|  29545|
|          4|  13180|
|          5|   6601|
|          6|   3719|
|          7|   2063|
|          8|   1226|
|          9|    821|
|         10|    558|
|         11|    363|
|         12|    266|
|         13|    167|
|         14|    123|
|         15|     90|
|         16|     56|
|         17|     33|
|         18|     32|
|         19|     24|
|         20|     19|
|         21|     16|
|         22|      7|
|         24|      6|
|         23|      5|
|         26|      4|
|         25|      2|
|         30|      2|
|         29|      2|
|         27|      1|
|         35|      1|
|         28|      1|
|         42|      1|
|         39|      1|
|         32|      1|
|         36|      1|
|         58|      1|
+-----------+-------+



## Let's create two separate datasets for delinq and pub_rec

In [18]:
# Delinquency subset: keep the rows where delinq_2yrs or
# mths_since_last_delinq is greater than 0, with mths_since_last_delinq
# converted to int in the projection.
loans_defaulters_delinq_df = spark.sql(
    "select member_id, delinq_2yrs, delinq_amnt, int(mths_since_last_delinq) "
    "from loan_defaulters "
    "where delinq_2yrs > 0 or mths_since_last_delinq > 0"
)

In [19]:
# Display the delinquency subset.
loans_defaulters_delinq_df

member_id,delinq_2yrs,delinq_amnt,mths_since_last_delinq
9cb79aa7323e81be1...,2,0.0,11
aac68850fdac09fd0...,1,0.0,21
c89986155a070db2e...,1,0.0,5
6e8d94bf446e97025...,0,0.0,36
42f73fd8a01f1c475...,0,0.0,46
1eef79a0e79b72c7a...,1,0.0,21
1dd1d1b51473d4993...,0,0.0,44
ec1953dba2cfb89ad...,2,0.0,13
8241a6bb3a9350fb8...,0,0.0,57
cdc94fa1c29a6a70a...,0,0.0,44


In [20]:
# Member ids of the rows where at least one of pub_rec, pub_rec_bankruptcies
# or inq_last_6mths is greater than 0.
loan_defaulters_records_enquire_df = spark.sql(
    "select member_id from loan_defaulters "
    "where pub_rec > 0.0 or pub_rec_bankruptcies > 0.0 or inq_last_6mths > 0.0"
)

In [21]:
# Display the member ids selected by the records/enquiries filter.
loan_defaulters_records_enquire_df

# These are the members in our dataset with a non-zero pub_rec,
# pub_rec_bankruptcies or inq_last_6mths value (not only public records).

member_id
0dd2bbc517e3c8f9e...
458458599d3df3bfc...
f1efcf7dfbfef21be...
c89986155a070db2e...
e88945f86a96f8d71...
4e1c30a5dfe9f1e20...
76cbefe31f7834f47...
47d002f59a274c6f2...
09a1c6855801dad88...
56d4375718ad6940d...


In [22]:
# Count the rows in the records/enquiries subset.
loan_defaulters_records_enquire_df.count()

1070125

In [23]:
# Write the delinquency subset as CSV with a header row, overwriting any
# output left by a previous run.
# The target directory is built from `username` (set in the session-setup
# cell) rather than a hard-coded account name, so the cell also works when
# the notebook is run by a different user.
(
    loans_defaulters_delinq_df.write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .option("path", f"/user/{username}/lendingclubproject/cleaned/loans_defaulters_delinq_csv")
    .save()
)

In [24]:
# Write the delinquency subset as Parquet, overwriting any output left by a
# previous run.
# The target directory is built from `username` (set in the session-setup
# cell) rather than a hard-coded account name, so the cell also works when
# the notebook is run by a different user.
(
    loans_defaulters_delinq_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", f"/user/{username}/lendingclubproject/cleaned/loans_defaulters_delinq_parquet")
    .save()
)

In [25]:
# Write the records/enquiries member ids as CSV with a header row,
# overwriting any output left by a previous run.
# The target directory is built from `username` (set in the session-setup
# cell) rather than a hard-coded account name, so the cell also works when
# the notebook is run by a different user.
(
    loan_defaulters_records_enquire_df.write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .option("path", f"/user/{username}/lendingclubproject/cleaned/loans_defaulters_records_enq_csv")
    .save()
)

In [26]:
# Write the records/enquiries member ids as Parquet, overwriting any output
# left by a previous run.
# The target directory is built from `username` (set in the session-setup
# cell) rather than a hard-coded account name, so the cell also works when
# the notebook is run by a different user.
(
    loan_defaulters_records_enquire_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", f"/user/{username}/lendingclubproject/cleaned/loans_defaulters_records_enq_parquet")
    .save()
)

## We are preparing this data for the loan score calculation in the next part

## **We will replace the null values in each of these columns with 0: pub_rec, pub_rec_bankruptcies, inq_last_6mths**

In [27]:
# Cast pub_rec to integer, then replace the remaining nulls in that column
# with 0.
loans_def_processed_pub_rec_df = (
    loans_defaulters_processed_df
    .withColumn("pub_rec", col("pub_rec").cast("integer"))
    .fillna(0, subset=["pub_rec"])
)

In [28]:
# Cast pub_rec_bankruptcies to integer, then replace the remaining nulls in
# that column with 0.
loans_def_processed_pub_rec_bankruptcies_df = (
    loans_def_processed_pub_rec_df
    .withColumn("pub_rec_bankruptcies", col("pub_rec_bankruptcies").cast("integer"))
    .fillna(0, subset=["pub_rec_bankruptcies"])
)

In [29]:
# Cast inq_last_6mths to integer, then replace the remaining nulls in that
# column with 0.
loans_def_processed_inq_last_6mths_df = (
    loans_def_processed_pub_rec_bankruptcies_df
    .withColumn("inq_last_6mths", col("inq_last_6mths").cast("integer"))
    .fillna(0, subset=["inq_last_6mths"])
)

In [30]:
# Re-point the "loan_defaulters" temp view at the DataFrame in which pub_rec,
# pub_rec_bankruptcies and inq_last_6mths have been cast to integer and
# null-filled.
loans_def_processed_inq_last_6mths_df.createOrReplaceTempView("loan_defaulters")

In [31]:
# Project member_id together with the three columns pub_rec,
# pub_rec_bankruptcies and inq_last_6mths from the "loan_defaulters" view;
# no row filter is applied here.
loan_defaulters_detail_records_enquire_df = spark.sql(
    """select member_id, pub_rec, pub_rec_bankruptcies, inq_last_6mths 
    from loan_defaulters """)

## So we have removed all the nulls from the three columns; now let's write the result back to HDFS

In [32]:
# Write the detailed records/enquiries columns as CSV with a header row,
# overwriting any output left by a previous run.
# The target directory is built from `username` (set in the session-setup
# cell) rather than a hard-coded account name, so the cell also works when
# the notebook is run by a different user.
(
    loan_defaulters_detail_records_enquire_df.write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .option("path", f"/user/{username}/lendingclubproject/cleaned/loans_defaulters_detail_records_enquire_csv")
    .save()
)

In [33]:
# Write the detailed records/enquiries columns as Parquet, overwriting any
# output left by a previous run.
# The target directory is built from `username` (set in the session-setup
# cell) rather than a hard-coded account name, so the cell also works when
# the notebook is run by a different user.
(
    loan_defaulters_detail_records_enquire_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", f"/user/{username}/lendingclubproject/cleaned/loans_defaulters_detail_records_enquire_parquet")
    .save()
)

In [34]:
# List the files in the detailed records/enquiries CSV output directory.
!!  hadoop fs -ls /user/itv022692/lendingclubproject/cleaned/loans_defaulters_detail_records_enquire_csv

['Found 3 items',
 '-rw-r--r--   3 itv022692 supergroup          0 2026-01-19 12:40 /user/itv022692/lendingclubproject/cleaned/loans_defaulters_detail_records_enquire_csv/_SUCCESS',
 '-rw-r--r--   3 itv022692 supergroup   81799785 2026-01-19 12:40 /user/itv022692/lendingclubproject/cleaned/loans_defaulters_detail_records_enquire_csv/part-00000-1c057f6c-7562-41f9-955e-38bf5cae5e47-c000.csv',
 '-rw-r--r--   3 itv022692 supergroup   78710821 2026-01-19 12:40 /user/itv022692/lendingclubproject/cleaned/loans_defaulters_detail_records_enquire_csv/part-00001-1c057f6c-7562-41f9-955e-38bf5cae5e47-c000.csv']

In [35]:
# Peek at the beginning of the first part file of the CSV output.
# NOTE(review): the part file name contains an id that presumably changes on
# every write - confirm and update this path after re-running the write cell.
!! hadoop fs -head /user/itv022692/lendingclubproject/cleaned/loans_defaulters_detail_records_enquire_csv/part-00000-1c057f6c-7562-41f9-955e-38bf5cae5e47-c000.csv

['member_id,pub_rec,pub_rec_bankruptcies,inq_last_6mths',
 '9cb79aa7323e81be1dac0536c0af072f66888b7aac448d024c2d0590278ab2c6,0,0,0',
 '0dd2bbc517e3c8f9eb331658c3ec72424bfa5553b5d0d123453ca0ee9612fd21,1,1,3',
 '458458599d3df3bfc8855871e0abae663072f12171499cd2dee7b66881e6a9c8,1,1,1',
 '05ea141ec28b5c7f77ebb8644aa7e9e47aa85dde93986c23cb75400e0b938a62,0,0,0',
 'aac68850fdac09fd0e1f3524b804485cd897ec13ea03932cae6879a40ab5913e,0,0,0',
 '3a423e4589e89f429c5dbec4ef657ff07bf522843d1b1916c6768db7ad26749a,0,0,0',
 'f1efcf7dfbfef21be251164867ba011ceebd6ac7403882485d6d999f4b2324c6,0,0,1',
 'c89986155a070db2e133035d16be2230c668b3fe1199c7012fb2c716b429f844,0,0,1',
 '118dc629b6e134419a8026e6a3c3e206c2789130bb375421dea60986c1a5555a,0,0,0',
 'a86fa4b74937083333c968b47629858506205a274e6a8cdbaca21e0227b9dec1,0,0,0',
 '6e8d94bf446e970251bd9159664d1b704d94814792d817c7e7b8c71f6f3a6494,0,0,0',
 '3de585156dc6b73f6282c372ffef7aa0d9b6aebcd7db293b5ccc913f8428a327,0,0,0',
 'e88945f86a96f8d71467a6ce8facaa5f9e75a019

## Here we have the cleaned data containing member_id, pub_rec, pub_rec_bankruptcies and inq_last_6mths