In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sha2, concat_ws
from pyspark.sql.functions import *
# Build (or reuse) the SparkSession for this notebook: runs on YARN with
# Hive support enabled and a user-specific SQL warehouse directory.
spark = (
    SparkSession.builder
    .appName("lendingClubProject")
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .config("spark.sql.warehouse.dir", "/user/anil/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

# Load the raw data with an inferred schema first; an explicit schema is defined further down

In [2]:
# First pass over the raw defaulters CSV: let Spark infer the column types
# so the quality of the data can be inspected before a schema is imposed.
loanDefaulters = (
    spark.read
    .format('csv')
    .option('header', "true")
    .option('inferSchema', "true")
    .load('/user/anil/lendingClubProject/loans_defaulters')
)
# Bare expression so the notebook renders the DataFrame.
loanDefaulters

member,delinq_2yrs,delinq_amnt,pub_rec,pub_rec_bankruptcies,inq_last_6mths,total_rec_late_fee,mths_since_last_delinq,mths_since_last_record
b59d80da191f5b573...,0.0,0.0,0.0,0.0,1.0,0.0,31.0,
202d9f56ecb7c3bc9...,1.0,0.0,0.0,0.0,0.0,0.0,6.0,
e5a140c0922b554b9...,0.0,0.0,0.0,0.0,0.0,0.0,47.0,
e12aefc548f750777...,0.0,0.0,0.0,0.0,0.0,0.0,33.0,
1b3a50d854fbbf97e...,1.0,0.0,0.0,0.0,0.0,0.0,21.0,
1c4329e5f17697127...,0.0,0.0,0.0,0.0,0.0,0.0,,
5026c86ad983175eb...,0.0,0.0,1.0,0.0,2.0,0.0,,71.0
9847d8c1e9d0b2084...,1.0,0.0,2.0,0.0,0.0,0.0,6.0,63.0
8340dbe1adea41fb4...,0.0,0.0,0.0,0.0,0.0,0.0,36.0,
d4de0de3ab7d79ad4...,0.0,0.0,0.0,0.0,0.0,0.0,35.0,


In [3]:
# Show the column types Spark inferred for the raw CSV.
loanDefaulters.printSchema()

root
 |-- member: string (nullable = true)
 |-- delinq_2yrs: string (nullable = true)
 |-- delinq_amnt: double (nullable = true)
 |-- pub_rec: string (nullable = true)
 |-- pub_rec_bankruptcies: double (nullable = true)
 |-- inq_last_6mths: string (nullable = true)
 |-- total_rec_late_fee: string (nullable = true)
 |-- mths_since_last_delinq: string (nullable = true)
 |-- mths_since_last_record: string (nullable = true)



In [4]:
# List the distinct raw values of delinq_2yrs to see what the column
# actually contains.
(
    loanDefaulters
    .select('delinq_2yrs')
    .distinct()
)

delinq_2yrs
1.0
271 monthly payme...
I bike to work on...
183xx
VISA and AMEX cre...
etc. and I feel t...
AZ
017xx
923xx
446xx


In [5]:
# Frequency of every delinq_2yrs value in the inferred-schema frame,
# most common first; print the first 30 rows.
(
    loanDefaulters
    .groupBy('delinq_2yrs')
    .count()
    .orderBy(col('count').desc())
    .show(30)
)

+------------------+-------+
|       delinq_2yrs|  count|
+------------------+-------+
|               0.0|1838878|
|               1.0| 281335|
|               2.0|  81285|
|               3.0|  29539|
|               4.0|  13179|
|               5.0|   6599|
|               6.0|   3717|
|               7.0|   2062|
|               8.0|   1223|
|               9.0|    818|
|              10.0|    556|
|              11.0|    363|
|              12.0|    264|
|              13.0|    165|
|              14.0|    120|
|              15.0|     87|
|              null|     63|
|              16.0|     55|
|              18.0|     30|
|              17.0|     30|
|              19.0|     23|
|              20.0|     17|
|              21.0|     12|
|                CA|      8|
|                TX|      6|
|debt_consolidation|      5|
|    small_business|      5|
|                IL|      5|
|              22.0|      5|
|              24.0|      4|
+------------------+-------+
only showing t

In [6]:
# The delinq_2yrs column contains many unexpected string values. Declaring
# the numeric columns as float makes Spark read those values as null.
# The schema is written as adjacent string literals, one column per line;
# Python joins them into a single DDL string.
loanDefaulterSchema = (
    'member_id string, '
    'delinq_2yrs float, '
    'delinq_amnt float, '
    'pub_rec float, '
    'pub_rec_bankruptcies float,'
    'inq_last_6mths float, '
    'total_rec_late_fee float, '
    'mths_since_last_delinq float, '
    'mths_since_last_record float'
)

In [7]:
# Re-read the same CSV, this time applying the explicit schema instead of
# inferring the column types.
loanDefaulters = (
    spark.read
    .format('csv')
    .schema(loanDefaulterSchema)
    .option('header', "true")
    .load('/user/anil/lendingClubProject/loans_defaulters')
)

In [8]:
# Confirm the explicit schema was applied to the re-read frame.
loanDefaulters.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- delinq_2yrs: float (nullable = true)
 |-- delinq_amnt: float (nullable = true)
 |-- pub_rec: float (nullable = true)
 |-- pub_rec_bankruptcies: float (nullable = true)
 |-- inq_last_6mths: float (nullable = true)
 |-- total_rec_late_fee: float (nullable = true)
 |-- mths_since_last_delinq: float (nullable = true)
 |-- mths_since_last_record: float (nullable = true)



In [9]:
# Frequency of every delinq_2yrs value after reading with the explicit
# schema, most common first; print the first 30 rows.
(
    loanDefaulters
    .groupBy('delinq_2yrs')
    .count()
    .orderBy(col('count').desc())
    .show(30)
)

+-----------+-------+
|delinq_2yrs|  count|
+-----------+-------+
|        0.0|1838878|
|        1.0| 281335|
|        2.0|  81285|
|        3.0|  29539|
|        4.0|  13179|
|        5.0|   6599|
|        6.0|   3717|
|        7.0|   2062|
|        8.0|   1223|
|        9.0|    818|
|       10.0|    556|
|       11.0|    363|
|       12.0|    264|
|       null|    261|
|       13.0|    165|
|       14.0|    120|
|       15.0|     87|
|       16.0|     55|
|       18.0|     30|
|       17.0|     30|
|       19.0|     23|
|       20.0|     17|
|       21.0|     12|
|       22.0|      5|
|       24.0|      4|
|       26.0|      3|
|       29.0|      2|
|       23.0|      2|
|       30.0|      2|
|       3.44|      2|
+-----------+-------+
only showing top 30 rows



In [10]:
# Replace null delinq_2yrs values with 0; every other column is untouched.
loanDefaultersModified = loanDefaulters.fillna(0, subset=['delinq_2yrs'])

In [11]:
# Frequency of every delinq_2yrs value after the null fill, most common
# first; print up to 100 rows.
(
    loanDefaultersModified
    .groupBy('delinq_2yrs')
    .count()
    .orderBy(col('count').desc())
    .show(100)
)

+-----------+-------+
|delinq_2yrs|  count|
+-----------+-------+
|        0.0|1839139|
|        1.0| 281335|
|        2.0|  81285|
|        3.0|  29539|
|        4.0|  13179|
|        5.0|   6599|
|        6.0|   3717|
|        7.0|   2062|
|        8.0|   1223|
|        9.0|    818|
|       10.0|    556|
|       11.0|    363|
|       12.0|    264|
|       13.0|    165|
|       14.0|    120|
|       15.0|     87|
|       16.0|     55|
|       17.0|     30|
|       18.0|     30|
|       19.0|     23|
|       20.0|     17|
|       21.0|     12|
|       22.0|      5|
|       24.0|      4|
|       26.0|      3|
|       29.0|      2|
|       30.0|      2|
|       23.0|      2|
|       25.0|      2|
|       3.44|      2|
|      21.72|      1|
|      20.04|      1|
|      18.53|      1|
|       9.44|      1|
|       39.0|      1|
|       58.0|      1|
|      22.95|      1|
|       9.56|      1|
|      26.24|      1|
|       6.52|      1|
|      17.17|      1|
|      14.15|      1|
|       3.

In [12]:
# Check the schema after the null fill on delinq_2yrs.
loanDefaultersModified.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- delinq_2yrs: float (nullable = false)
 |-- delinq_amnt: float (nullable = true)
 |-- pub_rec: float (nullable = true)
 |-- pub_rec_bankruptcies: float (nullable = true)
 |-- inq_last_6mths: float (nullable = true)
 |-- total_rec_late_fee: float (nullable = true)
 |-- mths_since_last_delinq: float (nullable = true)
 |-- mths_since_last_record: float (nullable = true)



In [13]:
# Convert delinq_2yrs from float to int.
# NOTE(review): the counts printed earlier include fractional values
# (e.g. 3.44, 21.72); this cast folds them into whole-number buckets.
# Confirm that is intended rather than discarding those rows.
loanDefaultersModified = loanDefaultersModified.withColumn(
    'delinq_2yrs',
    col('delinq_2yrs').cast('int'),
)

In [14]:
# Check the schema after casting delinq_2yrs to int.
loanDefaultersModified.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- delinq_2yrs: integer (nullable = true)
 |-- delinq_amnt: float (nullable = true)
 |-- pub_rec: float (nullable = true)
 |-- pub_rec_bankruptcies: float (nullable = true)
 |-- inq_last_6mths: float (nullable = true)
 |-- total_rec_late_fee: float (nullable = true)
 |-- mths_since_last_delinq: float (nullable = true)
 |-- mths_since_last_record: float (nullable = true)



In [15]:
# Frequency of every delinq_2yrs value after the integer cast, most common
# first; print up to 100 rows.
(
    loanDefaultersModified
    .groupBy('delinq_2yrs')
    .count()
    .orderBy(col('count').desc())
    .show(100)
)

+-----------+-------+
|delinq_2yrs|  count|
+-----------+-------+
|          0|1839141|
|          1| 281337|
|          2|  81285|
|          3|  29545|
|          4|  13180|
|          5|   6601|
|          6|   3719|
|          7|   2063|
|          8|   1226|
|          9|    821|
|         10|    558|
|         11|    363|
|         12|    266|
|         13|    167|
|         14|    123|
|         15|     90|
|         16|     56|
|         17|     33|
|         18|     32|
|         19|     24|
|         20|     19|
|         21|     16|
|         22|      7|
|         24|      6|
|         23|      5|
|         26|      4|
|         29|      2|
|         30|      2|
|         25|      2|
|         28|      1|
|         35|      1|
|         27|      1|
|         39|      1|
|         32|      1|
|         58|      1|
|         42|      1|
|         36|      1|
+-----------+-------+



In [20]:
# Apply the same two-step clean-up to three more columns, one per stage:
# cast the column to integer, then replace any null in it with 0.

# Stage 1: pub_rec
loanDefaultersCleansed = (
    loanDefaultersModified
    .withColumn("pub_rec", col("pub_rec").cast("integer"))
    .fillna(0, subset=["pub_rec"])
)

# Stage 2: pub_rec_bankruptcies
loanDefaultersCleansedV = (
    loanDefaultersCleansed
    .withColumn("pub_rec_bankruptcies", col("pub_rec_bankruptcies").cast("integer"))
    .fillna(0, subset=["pub_rec_bankruptcies"])
)

# Stage 3: inq_last_6mths
loanDefaultersCleansedFinal = (
    loanDefaultersCleansedV
    .withColumn("inq_last_6mths", col("inq_last_6mths").cast("integer"))
    .fillna(0, subset=["inq_last_6mths"])
)


In [22]:
# Register the cleansed frame as the temp view "loan_defaulters" so the
# following cells can query it with Spark SQL.
loanDefaultersCleansedFinal.createOrReplaceTempView("loan_defaulters")

In [23]:
# Sanity check: count the rows in the view where delinq_2yrs is still null.
spark.sql("select count(*) from loan_defaulters where delinq_2yrs is null")

count(1)
0


In [24]:
# Delinquency subset: rows with at least one delinquency in delinq_2yrs or
# a positive mths_since_last_delinq. mths_since_last_delinq is converted to
# int in the projection. The query is split over adjacent string literals,
# which Python joins into one SQL string.
loansDefDelinq = spark.sql(
    "select member_id,delinq_2yrs, delinq_amnt, int(mths_since_last_delinq) "
    "from loan_defaulters "
    "where delinq_2yrs > 0 or mths_since_last_delinq > 0"
)

In [25]:
# Display the delinquency subset.
loansDefDelinq

member_id,delinq_2yrs,delinq_amnt,mths_since_last_delinq
b59d80da191f5b573...,0,0.0,31
202d9f56ecb7c3bc9...,1,0.0,6
e5a140c0922b554b9...,0,0.0,47
e12aefc548f750777...,0,0.0,33
1b3a50d854fbbf97e...,1,0.0,21
9847d8c1e9d0b2084...,1,0.0,6
8340dbe1adea41fb4...,0,0.0,36
d4de0de3ab7d79ad4...,0,0.0,35
1d4e1ef4353b73c00...,0,0.0,30
6f196952e71277fd4...,4,0.0,5


In [26]:
# Public-record subset: member id plus the three cleansed integer columns,
# taken from every row of the view (no filter). The query is split over
# adjacent string literals, which Python joins into one SQL string.
loansDefRecords = spark.sql(
    "select member_id,pub_rec, pub_rec_bankruptcies, inq_last_6mths "
    "from loan_defaulters "
)

In [31]:
# Display the public-record subset.
loansDefRecords

member_id,pub_rec,pub_rec_bankruptcies,inq_last_6mths
b59d80da191f5b573...,0,0,1
202d9f56ecb7c3bc9...,0,0,0
e5a140c0922b554b9...,0,0,0
e12aefc548f750777...,0,0,0
1b3a50d854fbbf97e...,0,0,0
1c4329e5f17697127...,0,0,0
5026c86ad983175eb...,1,0,2
9847d8c1e9d0b2084...,2,0,0
8340dbe1adea41fb4...,0,0,0
d4de0de3ab7d79ad4...,0,0,0


In [27]:
# Persist the delinquency subset as CSV with a header row, replacing any
# previous output at the target path.
(
    loansDefDelinq.write
    .format("csv")
    .mode("overwrite")
    .option("header", True)
    .option("path", "/user/anil/lendingClubProject/cleansed/NoLoansDefaultersCsv")
    .save()
)

In [28]:
# Persist the delinquency subset as Parquet, replacing any previous output
# at the target path.
(
    loansDefDelinq.write
    .mode("overwrite")
    .format("parquet")
    .option("path", "/user/anil/lendingClubProject/cleansed/NoLoansDefaultersParquet")
    .save()
)

In [32]:
# Persist the public-record subset as CSV with a header row, replacing any
# previous output at the target path.
(
    loansDefRecords.write
    .format("csv")
    .mode("overwrite")
    .option("header", True)
    .option("path", "/user/anil/lendingClubProject/cleansed/LoansDefaultersCsv")
    .save()
)

In [33]:
# Persist the public-record subset as Parquet, replacing any previous
# output at the target path.
(
    loansDefRecords.write
    .mode("overwrite")
    .format("parquet")
    .option("path", "/user/anil/lendingClubProject/cleansed/LoansDefaultersParquet")
    .save()
)