In [1]:
from pyspark.sql import SparkSession
import getpass
# The current OS account name is used to build the per-user warehouse directory.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session with YARN as the master.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [2]:
# Exploratory read of the raw loans CSV: the first line is treated as the
# header and Spark is asked to infer the column types.
loans_raw_df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load("/user/itv015970/lendingclubproject/raw/loans_data_csv")
)

In [3]:
# Preview the raw DataFrame as read with schema inference.
loans_raw_df

loan_id,member_id,loan_amnt,funded_amnt,term,int_rate,installment,issue_d,loan_status,purpose,title
14408468,5d7676571dee53d06...,5000.0,5000.0,36 months,14.16,171.28,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation
14520567,31fd1ef036c5caf26...,11000.0,11000.0,36 months,9.67,353.24,Apr-2014,Fully Paid,other,Other
14708730,9b48253ca5848fa78...,25000.0,25000.0,36 months,11.99,830.24,May-2014,Charged Off,credit_card,Credit card refin...
14491003,d6b8f2e32be148721...,1500.0,1500.0,36 months,23.43,58.41,Apr-2014,Fully Paid,renewable_energy,Green loan
14510981,0a81ad556e20bcdb9...,10000.0,10000.0,36 months,12.99,336.9,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation
14177845,843b995d39160ddab...,13200.0,13200.0,60 months,12.99,300.28,Apr-2014,Charged Off,debt_consolidation,Debt consolidation
13947687,04a8bd7c4e1407ba8...,10000.0,10000.0,60 months,17.57,251.61,Apr-2014,Fully Paid,other,Other
12905731,3e9a89fa96988af68...,18000.0,18000.0,36 months,18.25,653.01,May-2014,Fully Paid,home_improvement,Home improvement
14137736,394a8db0bd3bcf7de...,12000.0,12000.0,36 months,9.67,385.35,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation
13207083,09cea8d2b6f1272f7...,3200.0,3200.0,36 months,20.49,119.73,Apr-2014,Fully Paid,credit_card,Credit card refin...


In [4]:
# Print the column names and the types Spark inferred for them.
loans_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amnt: double (nullable = true)
 |-- funded_amnt: double (nullable = true)
 |-- term: string (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- installment: double (nullable = true)
 |-- issue_d: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- title: string (nullable = true)



In [5]:
# Explicit DDL schema for the loans CSV, one "<name> <type>" entry per column.
# It replaces schema inference and gives the columns more descriptive names
# (e.g. loan_amnt -> loan_amount, int_rate -> interest_rate).
loans_schema = ", ".join(
    [
        "loan_id string",
        "member_id string",
        "loan_amount float",
        "funded_amount float",
        "loan_term_months string",
        "interest_rate float",
        "monthly_installment float",
        "issue_date string",
        "loan_status string",
        "loan_purpose string",
        "loan_title string",
    ]
)

In [6]:
# Re-read the raw loans CSV, this time applying the explicit schema
# (names and types) instead of inferring it; the file's header row is skipped.
loans_raw_df = (
    spark.read
    .format("csv")
    .schema(loans_schema)
    .option("header", True)
    .load("/user/itv015970/lendingclubproject/raw/loans_data_csv")
)

In [7]:
# Preview the DataFrame read with the explicit schema (renamed columns).
loans_raw_df

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title
14408468,5d7676571dee53d06...,5000.0,5000.0,36 months,14.16,171.28,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation
14520567,31fd1ef036c5caf26...,11000.0,11000.0,36 months,9.67,353.24,Apr-2014,Fully Paid,other,Other
14708730,9b48253ca5848fa78...,25000.0,25000.0,36 months,11.99,830.24,May-2014,Charged Off,credit_card,Credit card refin...
14491003,d6b8f2e32be148721...,1500.0,1500.0,36 months,23.43,58.41,Apr-2014,Fully Paid,renewable_energy,Green loan
14510981,0a81ad556e20bcdb9...,10000.0,10000.0,36 months,12.99,336.9,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation
14177845,843b995d39160ddab...,13200.0,13200.0,60 months,12.99,300.28,Apr-2014,Charged Off,debt_consolidation,Debt consolidation
13947687,04a8bd7c4e1407ba8...,10000.0,10000.0,60 months,17.57,251.61,Apr-2014,Fully Paid,other,Other
12905731,3e9a89fa96988af68...,18000.0,18000.0,36 months,18.25,653.01,May-2014,Fully Paid,home_improvement,Home improvement
14137736,394a8db0bd3bcf7de...,12000.0,12000.0,36 months,9.67,385.35,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation
13207083,09cea8d2b6f1272f7...,3200.0,3200.0,36 months,20.49,119.73,Apr-2014,Fully Paid,credit_card,Credit card refin...


In [8]:
# Confirm that the explicit schema's column names and types were applied.
loans_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_months: string (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)



In [9]:
# Import current_timestamp, used to add an ingestion-timestamp column to the DataFrame
from pyspark.sql.functions import current_timestamp

In [10]:
# Append an "ingest_date" column holding the current timestamp.
# NOTE(review): the timestamp shown in later outputs differs from the one
# shown right after this cell, so the value appears to be re-evaluated per
# action rather than fixed once here - confirm this is acceptable.
loans_df_ingestd = loans_raw_df.withColumn(
    "ingest_date",
    current_timestamp(),
)

In [11]:
# Preview the DataFrame with the new ingest_date column.
loans_df_ingestd

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
14408468,5d7676571dee53d06...,5000.0,5000.0,36 months,14.16,171.28,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation,2025-05-26 12:04:...
14520567,31fd1ef036c5caf26...,11000.0,11000.0,36 months,9.67,353.24,Apr-2014,Fully Paid,other,Other,2025-05-26 12:04:...
14708730,9b48253ca5848fa78...,25000.0,25000.0,36 months,11.99,830.24,May-2014,Charged Off,credit_card,Credit card refin...,2025-05-26 12:04:...
14491003,d6b8f2e32be148721...,1500.0,1500.0,36 months,23.43,58.41,Apr-2014,Fully Paid,renewable_energy,Green loan,2025-05-26 12:04:...
14510981,0a81ad556e20bcdb9...,10000.0,10000.0,36 months,12.99,336.9,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation,2025-05-26 12:04:...
14177845,843b995d39160ddab...,13200.0,13200.0,60 months,12.99,300.28,Apr-2014,Charged Off,debt_consolidation,Debt consolidation,2025-05-26 12:04:...
13947687,04a8bd7c4e1407ba8...,10000.0,10000.0,60 months,17.57,251.61,Apr-2014,Fully Paid,other,Other,2025-05-26 12:04:...
12905731,3e9a89fa96988af68...,18000.0,18000.0,36 months,18.25,653.01,May-2014,Fully Paid,home_improvement,Home improvement,2025-05-26 12:04:...
14137736,394a8db0bd3bcf7de...,12000.0,12000.0,36 months,9.67,385.35,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation,2025-05-26 12:04:...
13207083,09cea8d2b6f1272f7...,3200.0,3200.0,36 months,20.49,119.73,Apr-2014,Fully Paid,credit_card,Credit card refin...,2025-05-26 12:04:...


In [12]:
# Register the timestamped DataFrame as the temp view "loans" so it can be
# queried with Spark SQL. The same view name is re-registered in later cells.
loans_df_ingestd.createOrReplaceTempView("loans")

In [13]:
# Total number of rows in the "loans" view, before any rows are filtered out.
spark.sql("select count(*) from loans")

count(1)
2260701


In [14]:
# Count the rows whose loan_amount is null.
spark.sql("select count(*) from loans where loan_amount is null")

count(1)
33


In [15]:
# Inspect the rows with a null loan_amount. In the displayed sample these rows
# carry free text in loan_id and nulls in every other loan attribute, i.e. they
# do not look like real loan records.
spark.sql("select * from loans where loan_amount is null")

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2025-05-26 12:04:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2025-05-26 12:04:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2025-05-26 12:04:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2025-05-26 12:04:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2025-05-26 12:04:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2025-05-26 12:04:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2025-05-26 12:04:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2025-05-26 12:04:...
Loans that do not...,e3b0c44298fc1c149...,,,,,,,,,,2025-05-26 12:04:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2025-05-26 12:04:...


In [16]:
# Columns that must all be non-null for a row to be kept.
columns_to_check = [
    "loan_amount",
    "funded_amount",
    "loan_term_months",
    "interest_rate",
    "monthly_installment",
    "issue_date",
    "loan_status",
    "loan_purpose",
]

In [17]:
# Discard every row that has a null in any of the required columns.
loans_filtered_df = loans_df_ingestd.dropna(
    how="any",
    subset=columns_to_check,
)

In [18]:
# Row count after dropping rows with nulls in the required columns.
loans_filtered_df.count()

2260667

In [19]:
# Point the "loans" temp view at the null-filtered DataFrame.
loans_filtered_df.createOrReplaceTempView("loans")

In [20]:
# Preview the null-filtered DataFrame.
loans_filtered_df

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
14408468,5d7676571dee53d06...,5000.0,5000.0,36 months,14.16,171.28,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation,2025-05-26 12:05:...
14520567,31fd1ef036c5caf26...,11000.0,11000.0,36 months,9.67,353.24,Apr-2014,Fully Paid,other,Other,2025-05-26 12:05:...
14708730,9b48253ca5848fa78...,25000.0,25000.0,36 months,11.99,830.24,May-2014,Charged Off,credit_card,Credit card refin...,2025-05-26 12:05:...
14491003,d6b8f2e32be148721...,1500.0,1500.0,36 months,23.43,58.41,Apr-2014,Fully Paid,renewable_energy,Green loan,2025-05-26 12:05:...
14510981,0a81ad556e20bcdb9...,10000.0,10000.0,36 months,12.99,336.9,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation,2025-05-26 12:05:...
14177845,843b995d39160ddab...,13200.0,13200.0,60 months,12.99,300.28,Apr-2014,Charged Off,debt_consolidation,Debt consolidation,2025-05-26 12:05:...
13947687,04a8bd7c4e1407ba8...,10000.0,10000.0,60 months,17.57,251.61,Apr-2014,Fully Paid,other,Other,2025-05-26 12:05:...
12905731,3e9a89fa96988af68...,18000.0,18000.0,36 months,18.25,653.01,May-2014,Fully Paid,home_improvement,Home improvement,2025-05-26 12:05:...
14137736,394a8db0bd3bcf7de...,12000.0,12000.0,36 months,9.67,385.35,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation,2025-05-26 12:05:...
13207083,09cea8d2b6f1272f7...,3200.0,3200.0,36 months,20.49,119.73,Apr-2014,Fully Paid,credit_card,Credit card refin...,2025-05-26 12:05:...


In [21]:
from pyspark.sql.functions import regexp_replace, col

In [22]:
# Strip the " months" suffix from the term text and read the rest as an integer.
term_in_months = regexp_replace(col("loan_term_months"), " months", "").cast("int")

# Convert months to years; the division result is cast back to an integer.
term_in_years = (term_in_months / 12).cast("int")

# Overwrite the column in place (keeps its position), then rename it to
# reflect the new unit.
loans_term_modified_df = (
    loans_filtered_df
    .withColumn("loan_term_months", term_in_years)
    .withColumnRenamed("loan_term_months", "loan_term_years")
)

In [23]:
# Preview the DataFrame with the term expressed in years.
loans_term_modified_df

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
14408468,5d7676571dee53d06...,5000.0,5000.0,3,14.16,171.28,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation,2025-05-26 12:05:...
14520567,31fd1ef036c5caf26...,11000.0,11000.0,3,9.67,353.24,Apr-2014,Fully Paid,other,Other,2025-05-26 12:05:...
14708730,9b48253ca5848fa78...,25000.0,25000.0,3,11.99,830.24,May-2014,Charged Off,credit_card,Credit card refin...,2025-05-26 12:05:...
14491003,d6b8f2e32be148721...,1500.0,1500.0,3,23.43,58.41,Apr-2014,Fully Paid,renewable_energy,Green loan,2025-05-26 12:05:...
14510981,0a81ad556e20bcdb9...,10000.0,10000.0,3,12.99,336.9,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation,2025-05-26 12:05:...
14177845,843b995d39160ddab...,13200.0,13200.0,5,12.99,300.28,Apr-2014,Charged Off,debt_consolidation,Debt consolidation,2025-05-26 12:05:...
13947687,04a8bd7c4e1407ba8...,10000.0,10000.0,5,17.57,251.61,Apr-2014,Fully Paid,other,Other,2025-05-26 12:05:...
12905731,3e9a89fa96988af68...,18000.0,18000.0,3,18.25,653.01,May-2014,Fully Paid,home_improvement,Home improvement,2025-05-26 12:05:...
14137736,394a8db0bd3bcf7de...,12000.0,12000.0,3,9.67,385.35,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation,2025-05-26 12:05:...
13207083,09cea8d2b6f1272f7...,3200.0,3200.0,3,20.49,119.73,Apr-2014,Fully Paid,credit_card,Credit card refin...,2025-05-26 12:05:...


In [24]:
# Verify that loan_term_years is now an integer column.
loans_term_modified_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_years: integer (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [25]:
# Point the "loans" temp view at the DataFrame with the converted term column.
loans_term_modified_df.createOrReplaceTempView("loans")

In [26]:
# List the distinct loan_purpose values. The displayed sample includes
# free-text fragments alongside category names, which motivates the clean-up
# of this column in the cells below.
spark.sql("select distinct(loan_purpose) from loans")

loan_purpose
"guaranteed!"""
and if they are a...
never had any tro...
<br/><br/>Lending...
Bank of America c...
stocks
please feel free ...
I became his prim...
brakes
on one of the bus...


In [27]:
# Rank loan_purpose values by frequency, most common first.
spark.sql("select loan_purpose, count(*) as total from loans group by loan_purpose order by total desc")

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139413
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [28]:
# Loan-purpose categories that are kept as-is; any other value is mapped to
# "other" in the normalisation step.
loan_purpose_lookup = [
    "debt_consolidation",
    "credit_card",
    "home_improvement",
    "other",
    "major_purchase",
    "medical",
    "small_business",
    "car",
    "vacation",
    "moving",
    "house",
    "wedding",
    "renewable_energy",
    "educational",
]

In [29]:
from pyspark.sql.functions import when

In [30]:
# True for rows whose purpose is one of the recognised categories.
is_known_purpose = col("loan_purpose").isin(loan_purpose_lookup)

# Keep recognised purposes unchanged and collapse everything else into "other".
loan_purpose_modifed_df = loans_term_modified_df.withColumn(
    "loan_purpose",
    when(is_known_purpose, col("loan_purpose")).otherwise("other"),
)

In [31]:
# Preview the DataFrame with the normalised loan_purpose column.
loan_purpose_modifed_df

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
14408468,5d7676571dee53d06...,5000.0,5000.0,3,14.16,171.28,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation,2025-05-26 12:05:...
14520567,31fd1ef036c5caf26...,11000.0,11000.0,3,9.67,353.24,Apr-2014,Fully Paid,other,Other,2025-05-26 12:05:...
14708730,9b48253ca5848fa78...,25000.0,25000.0,3,11.99,830.24,May-2014,Charged Off,credit_card,Credit card refin...,2025-05-26 12:05:...
14491003,d6b8f2e32be148721...,1500.0,1500.0,3,23.43,58.41,Apr-2014,Fully Paid,renewable_energy,Green loan,2025-05-26 12:05:...
14510981,0a81ad556e20bcdb9...,10000.0,10000.0,3,12.99,336.9,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation,2025-05-26 12:05:...
14177845,843b995d39160ddab...,13200.0,13200.0,5,12.99,300.28,Apr-2014,Charged Off,debt_consolidation,Debt consolidation,2025-05-26 12:05:...
13947687,04a8bd7c4e1407ba8...,10000.0,10000.0,5,17.57,251.61,Apr-2014,Fully Paid,other,Other,2025-05-26 12:05:...
12905731,3e9a89fa96988af68...,18000.0,18000.0,3,18.25,653.01,May-2014,Fully Paid,home_improvement,Home improvement,2025-05-26 12:05:...
14137736,394a8db0bd3bcf7de...,12000.0,12000.0,3,9.67,385.35,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation,2025-05-26 12:05:...
13207083,09cea8d2b6f1272f7...,3200.0,3200.0,3,20.49,119.73,Apr-2014,Fully Paid,credit_card,Credit card refin...,2025-05-26 12:05:...


In [32]:
# Point the "loans" temp view at the DataFrame with normalised purposes.
loan_purpose_modifed_df.createOrReplaceTempView("loans")

In [33]:
# Re-check the distinct loan_purpose values after normalisation.
spark.sql("select distinct(loan_purpose) from loans")

loan_purpose
wedding
educational
other
small_business
debt_consolidation
credit_card
moving
vacation
renewable_energy
house


In [34]:
# Frequency of each loan_purpose after normalisation, most common first.
spark.sql("select loan_purpose, count(*) as total from loans group by loan_purpose order by total desc")

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139667
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [35]:
from pyspark.sql.functions import count, desc

In [36]:
# DataFrame-API version of the purpose-frequency query, ordering with
# Column.desc() passed to orderBy().
(
    loan_purpose_modifed_df
    .groupBy("loan_purpose")
    .agg(count("*").alias("total"))
    .orderBy(col("total").desc())
)

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139667
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [37]:
# Same aggregation again, this time ordering with the desc() function
# passed to sort().
(
    loan_purpose_modifed_df
    .groupBy("loan_purpose")
    .agg(count("*").alias("total"))
    .sort(desc("total"))
)

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139667
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [39]:
# Persist the cleaned loans as CSV with a header row, replacing any output
# already present at the target path.
(
    loan_purpose_modifed_df.write
    .format("csv")
    .mode("overwrite")
    .option("header", "true")
    .save("/user/itv015970/lendingclubproject/cleaned/loans_csv")
)

In [40]:
# Persist the cleaned loans as Parquet, replacing any output already present
# at the target path.
(
    loan_purpose_modifed_df.write
    .format("parquet")
    .mode("overwrite")
    .save("/user/itv015970/lendingclubproject/cleaned/loans_parquet")
)