# cleansing Loans data

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sha2, concat_ws
from pyspark.sql.functions import *
spark  = SparkSession.builder\
                    .appName("lendingClubProject")\
                    .config('spark.shuffle.useOldFetchProtocol', 'true') \
                    .config("spark.sql.warehouse.dir","/user/anil/warehouse")\
                    .enableHiveSupport()\
                    .master('yarn')\
                    .getOrCreate()

In [2]:
loansSchema = 'loan_id string, member_id string, loan_amount float, funded_amount float, loan_term_months string, interest_rate float, monthly_installment float, issue_date string, loan_status string, loan_purpose string, loan_title string'

# Adding proper Schema

In [3]:
loans = spark.read \
.format("csv") \
.option("header",True) \
.schema(loansSchema) \
.load("/user/anil/lendingClubProject/loans_data")

In [4]:
loans

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title
40391322,4480925324607267c...,35000.0,35000.0,60 months,19.99,927.1,Feb-2015,Fully Paid,debt_consolidation,Debt consolidation
40461475,b54711d4a553ea330...,5600.0,5600.0,36 months,14.31,192.24,Feb-2015,Charged Off,debt_consolidation,Debt consolidation
40361493,db06b45a938f1a3b5...,2925.0,2925.0,36 months,16.49,103.55,Feb-2015,Fully Paid,credit_card,Credit card refin...
38615026,ad9d9524477e85c11...,12300.0,12300.0,36 months,7.49,382.55,Feb-2015,Fully Paid,credit_card,Credit card refin...
40451319,c67f6ac3fea6ef46d...,12000.0,12000.0,36 months,10.49,389.98,Feb-2015,Fully Paid,debt_consolidation,Debt consolidation
40287423,bb36e2cb69517fac3...,11625.0,11625.0,36 months,19.99,431.97,Feb-2015,Charged Off,debt_consolidation,Debt consolidation
39987662,af69a7dff814fb213...,6500.0,6500.0,36 months,14.99,225.3,Feb-2015,Fully Paid,debt_consolidation,Debt consolidation
40360143,c9c794b5025e14a7d...,20950.0,20950.0,36 months,17.86,755.93,Feb-2015,Charged Off,debt_consolidation,Debt consolidation
40510540,61b48d763bd82369a...,22000.0,22000.0,36 months,9.49,704.63,Feb-2015,Fully Paid,credit_card,Credit card refin...
40581708,adc390ceaa6428ba4...,5000.0,5000.0,36 months,12.99,168.45,Feb-2015,Fully Paid,debt_consolidation,Debt consolidation


# Adding current time stanmp

In [5]:
loansIngestedTime = loans.withColumn('IngestedTimeStamp', current_timestamp())

In [6]:
loansIngestedTime

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,IngestedTimeStamp
40391322,4480925324607267c...,35000.0,35000.0,60 months,19.99,927.1,Feb-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-11-13 09:02:...
40461475,b54711d4a553ea330...,5600.0,5600.0,36 months,14.31,192.24,Feb-2015,Charged Off,debt_consolidation,Debt consolidation,2024-11-13 09:02:...
40361493,db06b45a938f1a3b5...,2925.0,2925.0,36 months,16.49,103.55,Feb-2015,Fully Paid,credit_card,Credit card refin...,2024-11-13 09:02:...
38615026,ad9d9524477e85c11...,12300.0,12300.0,36 months,7.49,382.55,Feb-2015,Fully Paid,credit_card,Credit card refin...,2024-11-13 09:02:...
40451319,c67f6ac3fea6ef46d...,12000.0,12000.0,36 months,10.49,389.98,Feb-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-11-13 09:02:...
40287423,bb36e2cb69517fac3...,11625.0,11625.0,36 months,19.99,431.97,Feb-2015,Charged Off,debt_consolidation,Debt consolidation,2024-11-13 09:02:...
39987662,af69a7dff814fb213...,6500.0,6500.0,36 months,14.99,225.3,Feb-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-11-13 09:02:...
40360143,c9c794b5025e14a7d...,20950.0,20950.0,36 months,17.86,755.93,Feb-2015,Charged Off,debt_consolidation,Debt consolidation,2024-11-13 09:02:...
40510540,61b48d763bd82369a...,22000.0,22000.0,36 months,9.49,704.63,Feb-2015,Fully Paid,credit_card,Credit card refin...,2024-11-13 09:02:...
40581708,adc390ceaa6428ba4...,5000.0,5000.0,36 months,12.99,168.45,Feb-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-11-13 09:02:...


# Dropping the rows which has null values in the mentioned columns

In [7]:
columnsLookup = ["loan_amount", "funded_amount", "loan_term_months", "interest_rate", "monthly_installment", "issue_date", "loan_status", "loan_purpose"]

In [8]:
loansFiltered = loansIngestedTime.na.drop(subset = columnsLookup)

In [9]:
loansFiltered.count()

2260667

In [10]:
loansIngestedTime.count()

2260701

# convert loan_term_months to integer

In [16]:
loansReplaced = loansFiltered.withColumn('loan_term_months',(regexp_replace(col('loan_term_months'),'(\D)',"").cast('int')/12).cast('int'))
loans_Replaced = loansReplaced.withColumnRenamed('loan_term_months','loan_term_years')
loans_Replaced

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,IngestedTimeStamp
40391322,4480925324607267c...,35000.0,35000.0,5,19.99,927.1,Feb-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-11-13 09:04:...
40461475,b54711d4a553ea330...,5600.0,5600.0,3,14.31,192.24,Feb-2015,Charged Off,debt_consolidation,Debt consolidation,2024-11-13 09:04:...
40361493,db06b45a938f1a3b5...,2925.0,2925.0,3,16.49,103.55,Feb-2015,Fully Paid,credit_card,Credit card refin...,2024-11-13 09:04:...
38615026,ad9d9524477e85c11...,12300.0,12300.0,3,7.49,382.55,Feb-2015,Fully Paid,credit_card,Credit card refin...,2024-11-13 09:04:...
40451319,c67f6ac3fea6ef46d...,12000.0,12000.0,3,10.49,389.98,Feb-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-11-13 09:04:...
40287423,bb36e2cb69517fac3...,11625.0,11625.0,3,19.99,431.97,Feb-2015,Charged Off,debt_consolidation,Debt consolidation,2024-11-13 09:04:...
39987662,af69a7dff814fb213...,6500.0,6500.0,3,14.99,225.3,Feb-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-11-13 09:04:...
40360143,c9c794b5025e14a7d...,20950.0,20950.0,3,17.86,755.93,Feb-2015,Charged Off,debt_consolidation,Debt consolidation,2024-11-13 09:04:...
40510540,61b48d763bd82369a...,22000.0,22000.0,3,9.49,704.63,Feb-2015,Fully Paid,credit_card,Credit card refin...,2024-11-13 09:04:...
40581708,adc390ceaa6428ba4...,5000.0,5000.0,3,12.99,168.45,Feb-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-11-13 09:04:...


In [17]:
loansReplaced.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_months: integer (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)
 |-- IngestedTimeStamp: timestamp (nullable = false)



# Clean the loans_purpose column

In [18]:
loansReplaced.select('loan_purpose').distinct()

loan_purpose
"guaranteed!"""
and if they are a...
never had any tro...
Bank of America c...
<br/><br/>Lending...
stocks
please feel free ...
I became his prim...
brakes
on one of the bus...


In [20]:
loansReplaced.groupBy('loan_purpose').count().orderBy(desc('count'))

loan_purpose,count
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139413
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [21]:
purposeLookUp = ["debt_consolidation", "credit_card", "home_improvement", "other", "major_purchase", "medical", "small_business", "car", "vacation", "moving", "house", "wedding", "renewable_energy", "educational"]

In [23]:
loansPurposeModified = loansReplaced.withColumn("loan_purpose", when(col("loan_purpose").isin(purposeLookUp), col("loan_purpose")).otherwise("other"))

In [24]:
loansPurposeModified.groupBy('loan_purpose').count().orderBy(desc('count'))

loan_purpose,count
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139667
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [26]:
loansPurposeModified.write \
.format("parquet") \
.mode("overwrite") \
.option("path", "/user/anil/lendingClubProject/cleansed/loansParquet") \
.save()

In [None]:
loansPurposeModified.write \
.format("csv") \
.mode("overwrite") \
.option("path", "/user/anil/lendingClubProject/cleansed/loanscsv") \
.save()