In [1]:
from pyspark.sql import SparkSession
import getpass
# Current OS user; used to build a per-user Hive warehouse directory.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session that runs on YARN.
spark = (
    SparkSession.builder
    .appName("SparkDemo")
    # Port 0 asks for any free port, avoiding clashes on a shared gateway.
    .config("spark.ui.port", "0")
    # Fixed key: it was misspelled "useOldFethProtocol", which Spark does not
    # recognise, so the old shuffle fetch protocol was never actually enabled.
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [2]:
# DDL schema applied when reading the raw defaulters CSV: one string key
# followed by eight float columns. Adjacent literals are concatenated by
# Python, so the resulting string is unchanged; one column per line keeps
# future edits reviewable.
loans_def_schema = (
    "member_id string, "
    "dealing_2yrs float, "
    "dealing_amnt float, "
    "pub_rec float, "
    "pub_rec_bankruptcies float ,"
    "inquiry_last_6months float, "
    "total_recorded_late_fee float, "
    "months_since_last_dealing float, "
    "months_since_last_record float"
)

In [3]:
# Load the raw defaulters CSV with the explicit schema (no schema inference);
# the first line of each file is treated as a header.
raw_csv_reader = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(loans_def_schema)
)
loans_def_raw_df = raw_csv_reader.load(
    "/user/itv009959/lendingclubproject/raw/loans_defaulters_csv"
)

In [4]:
# Bare expression so the notebook renders the raw DataFrame as the cell output.
# NOTE(review): the table shown below presumably relies on Spark's eager
# evaluation being enabled for the REPL; it is not configured in this notebook.
loans_def_raw_df

member_id,dealing_2yrs,dealing_amnt,pub_rec,pub_rec_bankruptcies,inquiry_last_6months,total_recorded_late_fee,months_since_last_dealing,months_since_last_record
b59d80da191f5b573...,0.0,0.0,0.0,0.0,1.0,0.0,31.0,
202d9f56ecb7c3bc9...,1.0,0.0,0.0,0.0,0.0,0.0,6.0,
e5a140c0922b554b9...,0.0,0.0,0.0,0.0,0.0,0.0,47.0,
e12aefc548f750777...,0.0,0.0,0.0,0.0,0.0,0.0,33.0,
1b3a50d854fbbf97e...,1.0,0.0,0.0,0.0,0.0,0.0,21.0,
1c4329e5f17697127...,0.0,0.0,0.0,0.0,0.0,0.0,,
5026c86ad983175eb...,0.0,0.0,1.0,0.0,2.0,0.0,,71.0
9847d8c1e9d0b2084...,1.0,0.0,2.0,0.0,0.0,0.0,6.0,63.0
8340dbe1adea41fb4...,0.0,0.0,0.0,0.0,0.0,0.0,36.0,
d4de0de3ab7d79ad4...,0.0,0.0,0.0,0.0,0.0,0.0,35.0,


In [5]:
# Expose the raw DataFrame to Spark SQL under the view name "loans_def".
# This same view name is re-registered later in the notebook with cleaned
# versions of the data, so queries against it depend on execution order.
loans_def_raw_df.createOrReplaceTempView("loans_def")

In [6]:
# Row count per dealing_2yrs value, most frequent first. The bare spark.sql
# call is the cell's last expression so the result is displayed.
dealing_2yrs_counts_sql = "select dealing_2yrs,count(*) as total from loans_def group by dealing_2yrs order by total desc"
spark.sql(dealing_2yrs_counts_sql)

dealing_2yrs,total
0.0,1838878
1.0,281335
2.0,81285
3.0,29539
4.0,13179
5.0,6599
6.0,3717
7.0,2062
8.0,1223
9.0,818


In [7]:
from pyspark.sql.functions import col

In [8]:
# Cast dealing_2yrs to int, then replace nulls in that column with 0.
dealing_2yrs_as_int = col("dealing_2yrs").cast("int")
loans_dealing_modified = (
    loans_def_raw_df
    .withColumn("dealing_2yrs", dealing_2yrs_as_int)
    .fillna(0, subset=["dealing_2yrs"])
)

In [9]:
# Display the frame after the dealing_2yrs cast/fill to eyeball the result.
loans_dealing_modified

member_id,dealing_2yrs,dealing_amnt,pub_rec,pub_rec_bankruptcies,inquiry_last_6months,total_recorded_late_fee,months_since_last_dealing,months_since_last_record
b59d80da191f5b573...,0,0.0,0.0,0.0,1.0,0.0,31.0,
202d9f56ecb7c3bc9...,1,0.0,0.0,0.0,0.0,0.0,6.0,
e5a140c0922b554b9...,0,0.0,0.0,0.0,0.0,0.0,47.0,
e12aefc548f750777...,0,0.0,0.0,0.0,0.0,0.0,33.0,
1b3a50d854fbbf97e...,1,0.0,0.0,0.0,0.0,0.0,21.0,
1c4329e5f17697127...,0,0.0,0.0,0.0,0.0,0.0,,
5026c86ad983175eb...,0,0.0,1.0,0.0,2.0,0.0,,71.0
9847d8c1e9d0b2084...,1,0.0,2.0,0.0,0.0,0.0,6.0,63.0
8340dbe1adea41fb4...,0,0.0,0.0,0.0,0.0,0.0,36.0,
d4de0de3ab7d79ad4...,0,0.0,0.0,0.0,0.0,0.0,35.0,


In [10]:
# Rebind the "loans_def" view to the frame whose dealing_2yrs column is an
# int with nulls filled, replacing the earlier registration of the raw frame.
loans_dealing_modified.createOrReplaceTempView("loans_def")

In [12]:
    # Same distribution query as before, now against the re-registered view.
    # show(50) prints up to 50 rows so the whole tail of the distribution is visible.
    # NOTE(review): this cell is indented by four spaces; it presumably only
    # runs because the notebook dedents cell input. Consider removing the indent.
    spark.sql("select dealing_2yrs,count(*) as total from loans_def group by dealing_2yrs order by total desc").show(50)

+------------+-------+
|dealing_2yrs|  total|
+------------+-------+
|           0|1839141|
|           1| 281337|
|           2|  81285|
|           3|  29545|
|           4|  13180|
|           5|   6601|
|           6|   3719|
|           7|   2063|
|           8|   1226|
|           9|    821|
|          10|    558|
|          11|    363|
|          12|    266|
|          13|    167|
|          14|    123|
|          15|     90|
|          16|     56|
|          17|     33|
|          18|     32|
|          19|     24|
|          20|     19|
|          21|     16|
|          22|      7|
|          24|      6|
|          23|      5|
|          26|      4|
|          29|      2|
|          25|      2|
|          30|      2|
|          28|      1|
|          27|      1|
|          32|      1|
|          35|      1|
|          39|      1|
|          58|      1|
|          42|      1|
|          36|      1|
+------------+-------+



In [13]:
# Keep only rows where both dealing_2yrs and months_since_last_dealing are
# positive; months_since_last_dealing is cast to int in the projection.
dealing_members_sql = "select member_id,dealing_2yrs,dealing_amnt,int(months_since_last_dealing)  from loans_def where dealing_2yrs>0 and months_since_last_dealing>0"
loans_def_dealing_df = spark.sql(dealing_members_sql)

In [14]:
# Display the filtered frame to check the projection and the int cast.
loans_def_dealing_df

member_id,dealing_2yrs,dealing_amnt,months_since_last_dealing
202d9f56ecb7c3bc9...,1,0.0,6
1b3a50d854fbbf97e...,1,0.0,21
9847d8c1e9d0b2084...,1,0.0,6
6f196952e71277fd4...,4,0.0,5
9c617cbc6e3e3d6a1...,1,0.0,20
8ff4d0ed17a1cab92...,1,0.0,15
aec13cfd611b2cfea...,1,0.0,19
af7a8a661df3318bd...,2,0.0,7
176d6002f8f60ea33...,1,1850.0,1
2a589bfc0ceedd44d...,1,0.0,1


In [45]:
# Member ids with at least one positive value among pub_rec,
# pub_rec_bankruptcies and inquiry_last_6months.
# NOTE(review): "loans_def" is registered more than once in this notebook and
# this cell's execution count is out of sequence, so which version of the view
# it reads depends on run order. Confirm the intended one.
inquiry_members_sql = "select member_id from loans_def where pub_rec>0.0 or pub_rec_bankruptcies>0.0 or inquiry_last_6months>0.0"
loans_def_inquiry = spark.sql(inquiry_members_sql)

In [46]:
# Display the selected member ids as the cell output.
loans_def_inquiry

member_id
b59d80da191f5b573...
5026c86ad983175eb...
9847d8c1e9d0b2084...
9dd72636b1b4045b4...
1d4e1ef4353b73c00...
9c617cbc6e3e3d6a1...
45eb04dd75400a942...
08d233c81f9e50726...
8ff4d0ed17a1cab92...
aec13cfd611b2cfea...


In [47]:
# Persist the dealing subset as CSV with a header row; "overwrite" replaces
# any previous output at the target path.
dealing_csv_writer = (
    loans_def_dealing_df.write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .option("path", "/user/itv009959/lendingclubproject/cleaned/loans_defaulters_dealing_csv")
)
dealing_csv_writer.save()

In [48]:
# Persist the dealing subset as Parquet, replacing any previous output.
dealing_parquet_writer = (
    loans_def_dealing_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv009959/lendingclubproject/cleaned/loans_defaulters_dealing_parquet")
)
dealing_parquet_writer.save()

In [49]:
# Persist the inquiry member ids as CSV with a header row; "overwrite"
# replaces any previous output at the target path.
inquiry_csv_writer = (
    loans_def_inquiry.write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .option("path", "/user/itv009959/lendingclubproject/cleaned/loans_defaulters_inquiry_csv")
)
inquiry_csv_writer.save()

In [50]:
# Persist the inquiry member ids as Parquet, replacing any previous output.
inquiry_parquet_writer = (
    loans_def_inquiry.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv009959/lendingclubproject/cleaned/loans_defaulters_inquiry_parquet")
)
inquiry_parquet_writer.save()

In [15]:
# Cast pub_rec to int, then replace nulls in that column with 0.
pub_rec_as_int = col("pub_rec").cast("int")
loans_dealing_p_public_rec_modified = (
    loans_dealing_modified
    .withColumn("pub_rec", pub_rec_as_int)
    .fillna(0, subset=["pub_rec"])
)

In [16]:
# Cast pub_rec_bankruptcies to int, then replace nulls in that column with 0.
pub_rec_bankruptcies_as_int = col("pub_rec_bankruptcies").cast("int")
loans_dealing_p_public_rec_bank_modified = (
    loans_dealing_p_public_rec_modified
    .withColumn("pub_rec_bankruptcies", pub_rec_bankruptcies_as_int)
    .fillna(0, subset=["pub_rec_bankruptcies"])
)

In [17]:
# Cast inquiry_last_6months to int, then replace nulls in that column with 0.
inquiry_last_6months_as_int = col("inquiry_last_6months").cast("int")
loans_dealing_inq6months_modified = (
    loans_dealing_p_public_rec_bank_modified
    .withColumn("inquiry_last_6months", inquiry_last_6months_as_int)
    .fillna(0, subset=["inquiry_last_6months"])
)

In [18]:
# Rebind the "loans_def" view to the frame in which inquiry_last_6months has
# been cast to int and null-filled (on top of the earlier column clean-ups),
# so later SQL reads the cleaned columns.
loans_dealing_inq6months_modified.createOrReplaceTempView("loans_def")

In [19]:
# Project the member key plus the three public-record / inquiry columns for
# every row of the current "loans_def" view (no filter).
detail_inquiry_sql = "select member_id, pub_rec, pub_rec_bankruptcies, inquiry_last_6months from loans_def "
loans_def_detail_inquiry = spark.sql(detail_inquiry_sql)

In [21]:
# Row-count check on the detail projection before it is written out; count()
# is an action, so this executes the query.
loans_def_detail_inquiry.count()

2260701

In [22]:
# Persist the detail records as CSV with a header row; "overwrite" replaces
# any previous output at the target path.
detail_csv_writer = (
    loans_def_detail_inquiry.write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .option("path", "/user/itv009959/lendingclubproject/cleaned/loans_defaulters_detail_records_csv")
)
detail_csv_writer.save()

In [23]:
# Persist the detail records as Parquet, replacing any previous output.
detail_parquet_writer = (
    loans_def_detail_inquiry.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv009959/lendingclubproject/cleaned/loans_defaulters_detail_records_parquet")
)
detail_parquet_writer.save()