In [1]:
from pyspark.sql import SparkSession
import getpass
# The OS account name determines the per-user Hive warehouse directory below.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Exploratory read of the raw loans CSV: the first line is treated as a header
# and Spark is asked to infer the column types.
loans_raw_df = (
    spark.read
    .format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("/public/trendytech/lendingclubproject/raw/loans_data_csv")
)

In [3]:
# Preview the frame read with the inferred schema (original CSV column names).
loans_raw_df

loan_id,member_id,loan_amnt,funded_amnt,term,int_rate,installment,issue_d,loan_status,purpose,title
56633077,b59d80da191f5b573...,3000.0,3000.0,36 months,7.89,93.86,Aug-2015,Fully Paid,credit_card,Credit card refin...
55927518,202d9f56ecb7c3bc9...,15600.0,15600.0,36 months,7.89,488.06,Aug-2015,Fully Paid,credit_card,Credit card refin...
56473345,e5a140c0922b554b9...,20000.0,20000.0,36 months,9.17,637.58,Aug-2015,Fully Paid,debt_consolidation,Debt consolidation
56463188,e12aefc548f750777...,11200.0,11200.0,60 months,21.99,309.27,Aug-2015,Fully Paid,home_improvement,Home improvement
56473316,1b3a50d854fbbf97e...,16000.0,16000.0,60 months,20.99,432.77,Aug-2015,Charged Off,debt_consolidation,Debt consolidation
56663266,1c4329e5f17697127...,20000.0,20000.0,60 months,13.33,458.45,Aug-2015,Charged Off,debt_consolidation,Debt consolidation
56483027,5026c86ad983175eb...,10000.0,10000.0,36 months,12.69,335.45,Aug-2015,Fully Paid,other,Other
56613385,9847d8c1e9d0b2084...,23400.0,23400.0,60 months,19.19,609.46,Aug-2015,Current,small_business,Business
56643620,8340dbe1adea41fb4...,16000.0,16000.0,36 months,5.32,481.84,Jul-2015,Fully Paid,debt_consolidation,Debt consolidation
56533114,d4de0de3ab7d79ad4...,25450.0,25450.0,36 months,27.31,1043.24,Aug-2015,Charged Off,debt_consolidation,Debt consolidation


In [4]:
# DDL-style schema string passed to spark.read.schema(): it gives the raw CSV
# columns project-specific names and types the amount/rate fields as float.
# NOTE: the spelling "intrest_rate" is part of the column name that later cells
# reference (e.g. columns_to_check), so it must not be "corrected" here alone.
loans_schema = '''loan_id string, member_id string, loan_amount float, funded_amount float, 
loan_term_months string, intrest_rate float, monthly_installment float, 
issue_date string, loan_status string, loan_purpose string, loan_title string'''

In [5]:
# Re-read the raw loans CSV, this time applying the explicit loans_schema
# (renamed, typed columns) instead of schema inference. This rebinds
# loans_raw_df, replacing the exploratory frame read earlier.
loans_raw_df = (
    spark.read
    .format("csv")
    .schema(loans_schema)
    .option("header", "true")
    .load("/public/trendytech/lendingclubproject/raw/loans_data_csv")
)

In [6]:
# Preview the frame re-read with the explicit schema (renamed columns).
loans_raw_df

loan_id,member_id,loan_amount,funded_amount,loan_term_months,intrest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title
56633077,b59d80da191f5b573...,3000.0,3000.0,36 months,7.89,93.86,Aug-2015,Fully Paid,credit_card,Credit card refin...
55927518,202d9f56ecb7c3bc9...,15600.0,15600.0,36 months,7.89,488.06,Aug-2015,Fully Paid,credit_card,Credit card refin...
56473345,e5a140c0922b554b9...,20000.0,20000.0,36 months,9.17,637.58,Aug-2015,Fully Paid,debt_consolidation,Debt consolidation
56463188,e12aefc548f750777...,11200.0,11200.0,60 months,21.99,309.27,Aug-2015,Fully Paid,home_improvement,Home improvement
56473316,1b3a50d854fbbf97e...,16000.0,16000.0,60 months,20.99,432.77,Aug-2015,Charged Off,debt_consolidation,Debt consolidation
56663266,1c4329e5f17697127...,20000.0,20000.0,60 months,13.33,458.45,Aug-2015,Charged Off,debt_consolidation,Debt consolidation
56483027,5026c86ad983175eb...,10000.0,10000.0,36 months,12.69,335.45,Aug-2015,Fully Paid,other,Other
56613385,9847d8c1e9d0b2084...,23400.0,23400.0,60 months,19.19,609.46,Aug-2015,Current,small_business,Business
56643620,8340dbe1adea41fb4...,16000.0,16000.0,36 months,5.32,481.84,Jul-2015,Fully Paid,debt_consolidation,Debt consolidation
56533114,d4de0de3ab7d79ad4...,25450.0,25450.0,36 months,27.31,1043.24,Aug-2015,Charged Off,debt_consolidation,Debt consolidation


In [7]:
# Confirm the column names and types that loans_schema applied.
loans_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_months: string (nullable = true)
 |-- intrest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)



## Add a new column, ingest_date, holding the ingestion time (current_timestamp)

In [8]:
from pyspark.sql.functions import current_timestamp

In [9]:
# Tag every row with an ingestion timestamp in a new "ingest_date" column.
# NOTE(review): current_timestamp() appears to be evaluated when a query runs,
# not when this cell runs -- the saved previews show 01:53 in some cells and
# 02:00 in later ones. Confirm whether the CSV and Parquet outputs need an
# identical ingest timestamp.
loans_df_ingested  = loans_raw_df.withColumn("ingest_date", current_timestamp())

In [10]:
# Preview: ingest_date is appended as the last column.
loans_df_ingested

loan_id,member_id,loan_amount,funded_amount,loan_term_months,intrest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
56633077,b59d80da191f5b573...,3000.0,3000.0,36 months,7.89,93.86,Aug-2015,Fully Paid,credit_card,Credit card refin...,2026-01-18 01:53:...
55927518,202d9f56ecb7c3bc9...,15600.0,15600.0,36 months,7.89,488.06,Aug-2015,Fully Paid,credit_card,Credit card refin...,2026-01-18 01:53:...
56473345,e5a140c0922b554b9...,20000.0,20000.0,36 months,9.17,637.58,Aug-2015,Fully Paid,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56463188,e12aefc548f750777...,11200.0,11200.0,60 months,21.99,309.27,Aug-2015,Fully Paid,home_improvement,Home improvement,2026-01-18 01:53:...
56473316,1b3a50d854fbbf97e...,16000.0,16000.0,60 months,20.99,432.77,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56663266,1c4329e5f17697127...,20000.0,20000.0,60 months,13.33,458.45,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56483027,5026c86ad983175eb...,10000.0,10000.0,36 months,12.69,335.45,Aug-2015,Fully Paid,other,Other,2026-01-18 01:53:...
56613385,9847d8c1e9d0b2084...,23400.0,23400.0,60 months,19.19,609.46,Aug-2015,Current,small_business,Business,2026-01-18 01:53:...
56643620,8340dbe1adea41fb4...,16000.0,16000.0,36 months,5.32,481.84,Jul-2015,Fully Paid,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56533114,d4de0de3ab7d79ad4...,25450.0,25450.0,36 months,27.31,1043.24,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2026-01-18 01:53:...


In [11]:
# Expose the ingested frame to Spark SQL as the temp view "loans".
# The same view name is re-registered after each later cleaning stage.
loans_df_ingested.createOrReplaceTempView("loans")

In [12]:
# Total number of rows in the view before any cleaning.
spark.sql("select count(*) from loans")

count(1)
2260701


In [13]:
# Preview the view contents through SQL.
spark.sql("select * from loans")

loan_id,member_id,loan_amount,funded_amount,loan_term_months,intrest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
56633077,b59d80da191f5b573...,3000.0,3000.0,36 months,7.89,93.86,Aug-2015,Fully Paid,credit_card,Credit card refin...,2026-01-18 01:53:...
55927518,202d9f56ecb7c3bc9...,15600.0,15600.0,36 months,7.89,488.06,Aug-2015,Fully Paid,credit_card,Credit card refin...,2026-01-18 01:53:...
56473345,e5a140c0922b554b9...,20000.0,20000.0,36 months,9.17,637.58,Aug-2015,Fully Paid,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56463188,e12aefc548f750777...,11200.0,11200.0,60 months,21.99,309.27,Aug-2015,Fully Paid,home_improvement,Home improvement,2026-01-18 01:53:...
56473316,1b3a50d854fbbf97e...,16000.0,16000.0,60 months,20.99,432.77,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56663266,1c4329e5f17697127...,20000.0,20000.0,60 months,13.33,458.45,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56483027,5026c86ad983175eb...,10000.0,10000.0,36 months,12.69,335.45,Aug-2015,Fully Paid,other,Other,2026-01-18 01:53:...
56613385,9847d8c1e9d0b2084...,23400.0,23400.0,60 months,19.19,609.46,Aug-2015,Current,small_business,Business,2026-01-18 01:53:...
56643620,8340dbe1adea41fb4...,16000.0,16000.0,36 months,5.32,481.84,Jul-2015,Fully Paid,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56533114,d4de0de3ab7d79ad4...,25450.0,25450.0,36 months,27.31,1043.24,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2026-01-18 01:53:...


In [14]:
# Count the rows that have no loan_amount.
spark.sql("select count(*) from loans where loan_amount is null")


count(1)
33


In [15]:
spark.sql("select * from loans where loan_amount is null")

# Inspect the rows whose loan_amount is null. In the preview, every other loan
# attribute is null too (only loan_id, member_id and ingest_date are populated,
# and loan_id holds free text rather than an id), so these rows should be dropped.

loan_id,member_id,loan_amount,funded_amount,loan_term_months,intrest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2026-01-18 01:53:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2026-01-18 01:53:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2026-01-18 01:53:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2026-01-18 01:53:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2026-01-18 01:53:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2026-01-18 01:53:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2026-01-18 01:53:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2026-01-18 01:53:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2026-01-18 01:53:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2026-01-18 01:53:...


In [16]:
# Columns that must be non-null for a row to be kept by the null-drop step.
columns_to_check = [
    "loan_amount",
    "funded_amount",
    "loan_term_months",
    "intrest_rate",
    "monthly_installment",
    "issue_date",
    "loan_status",
    "loan_purpose",
]

In [17]:
# Discard every row that has a null in any of the columns listed in columns_to_check.
loans_filtered_df = loans_df_ingested.dropna(subset=columns_to_check)

In [18]:
# Row count after dropping rows with nulls in the checked columns.
loans_filtered_df.count()

2260667

In [19]:
# Re-point the "loans" temp view at the null-filtered frame.
loans_filtered_df.createOrReplaceTempView("loans")

In [20]:
# Preview the null-filtered frame.
loans_filtered_df

loan_id,member_id,loan_amount,funded_amount,loan_term_months,intrest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
56633077,b59d80da191f5b573...,3000.0,3000.0,36 months,7.89,93.86,Aug-2015,Fully Paid,credit_card,Credit card refin...,2026-01-18 01:53:...
55927518,202d9f56ecb7c3bc9...,15600.0,15600.0,36 months,7.89,488.06,Aug-2015,Fully Paid,credit_card,Credit card refin...,2026-01-18 01:53:...
56473345,e5a140c0922b554b9...,20000.0,20000.0,36 months,9.17,637.58,Aug-2015,Fully Paid,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56463188,e12aefc548f750777...,11200.0,11200.0,60 months,21.99,309.27,Aug-2015,Fully Paid,home_improvement,Home improvement,2026-01-18 01:53:...
56473316,1b3a50d854fbbf97e...,16000.0,16000.0,60 months,20.99,432.77,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56663266,1c4329e5f17697127...,20000.0,20000.0,60 months,13.33,458.45,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56483027,5026c86ad983175eb...,10000.0,10000.0,36 months,12.69,335.45,Aug-2015,Fully Paid,other,Other,2026-01-18 01:53:...
56613385,9847d8c1e9d0b2084...,23400.0,23400.0,60 months,19.19,609.46,Aug-2015,Current,small_business,Business,2026-01-18 01:53:...
56643620,8340dbe1adea41fb4...,16000.0,16000.0,36 months,5.32,481.84,Jul-2015,Fully Paid,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56533114,d4de0de3ab7d79ad4...,25450.0,25450.0,36 months,27.31,1043.24,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2026-01-18 01:53:...


## So far we have renamed the columns, added an ingestion-timestamp column (ingest_date),
## and removed every row that has a null in any of the key columns used for later calculations.

## Next: convert loan_term_months from months to years.

In [21]:
from pyspark.sql.functions import regexp_replace, col

In [22]:
# Convert the textual term ("36 months") into whole years: strip the " months"
# suffix, cast to int, divide by 12 and cast the quotient back to int. The
# column is overwritten in place first and renamed afterwards, so it keeps its
# position in the frame.
loan_term_modified_df = (
    loans_filtered_df
    .withColumn(
        "loan_term_months",
        (regexp_replace(col("loan_term_months"), " months", "").cast("int") / 12).cast("int"),
    )
    .withColumnRenamed("loan_term_months", "loan_term_years")
)

In [23]:
# Preview: the term column is now loan_term_years with integer values.
loan_term_modified_df

loan_id,member_id,loan_amount,funded_amount,loan_term_years,intrest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
56633077,b59d80da191f5b573...,3000.0,3000.0,3,7.89,93.86,Aug-2015,Fully Paid,credit_card,Credit card refin...,2026-01-18 01:53:...
55927518,202d9f56ecb7c3bc9...,15600.0,15600.0,3,7.89,488.06,Aug-2015,Fully Paid,credit_card,Credit card refin...,2026-01-18 01:53:...
56473345,e5a140c0922b554b9...,20000.0,20000.0,3,9.17,637.58,Aug-2015,Fully Paid,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56463188,e12aefc548f750777...,11200.0,11200.0,5,21.99,309.27,Aug-2015,Fully Paid,home_improvement,Home improvement,2026-01-18 01:53:...
56473316,1b3a50d854fbbf97e...,16000.0,16000.0,5,20.99,432.77,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56663266,1c4329e5f17697127...,20000.0,20000.0,5,13.33,458.45,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56483027,5026c86ad983175eb...,10000.0,10000.0,3,12.69,335.45,Aug-2015,Fully Paid,other,Other,2026-01-18 01:53:...
56613385,9847d8c1e9d0b2084...,23400.0,23400.0,5,19.19,609.46,Aug-2015,Current,small_business,Business,2026-01-18 01:53:...
56643620,8340dbe1adea41fb4...,16000.0,16000.0,3,5.32,481.84,Jul-2015,Fully Paid,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56533114,d4de0de3ab7d79ad4...,25450.0,25450.0,3,27.31,1043.24,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2026-01-18 01:53:...


In [24]:
# Re-point the "loans" temp view at the frame with loan_term_years.
loan_term_modified_df.createOrReplaceTempView("loans")

In [25]:
# Preview the refreshed view through SQL.
spark.sql("select * from loans")

loan_id,member_id,loan_amount,funded_amount,loan_term_years,intrest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
56633077,b59d80da191f5b573...,3000.0,3000.0,3,7.89,93.86,Aug-2015,Fully Paid,credit_card,Credit card refin...,2026-01-18 01:53:...
55927518,202d9f56ecb7c3bc9...,15600.0,15600.0,3,7.89,488.06,Aug-2015,Fully Paid,credit_card,Credit card refin...,2026-01-18 01:53:...
56473345,e5a140c0922b554b9...,20000.0,20000.0,3,9.17,637.58,Aug-2015,Fully Paid,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56463188,e12aefc548f750777...,11200.0,11200.0,5,21.99,309.27,Aug-2015,Fully Paid,home_improvement,Home improvement,2026-01-18 01:53:...
56473316,1b3a50d854fbbf97e...,16000.0,16000.0,5,20.99,432.77,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56663266,1c4329e5f17697127...,20000.0,20000.0,5,13.33,458.45,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56483027,5026c86ad983175eb...,10000.0,10000.0,3,12.69,335.45,Aug-2015,Fully Paid,other,Other,2026-01-18 01:53:...
56613385,9847d8c1e9d0b2084...,23400.0,23400.0,5,19.19,609.46,Aug-2015,Current,small_business,Business,2026-01-18 01:53:...
56643620,8340dbe1adea41fb4...,16000.0,16000.0,3,5.32,481.84,Jul-2015,Fully Paid,debt_consolidation,Debt consolidation,2026-01-18 01:53:...
56533114,d4de0de3ab7d79ad4...,25450.0,25450.0,3,27.31,1043.24,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2026-01-18 01:53:...


In [26]:
# List the distinct loan_purpose values. The preview shows free-text fragments
# mixed in with the expected category codes.
spark.sql("select distinct(loan_purpose) from loans ")

loan_purpose
"guaranteed!"""
and if they are a...
never had any tro...
<br/><br/>Lending...
Bank of America c...
stocks
please feel free ...
I became his prim...
brakes
on one of the bus...


In [27]:
# Rank the loan_purpose values by how many rows carry each one, most frequent first.
spark.sql("select loan_purpose, count(*) as total from loans group by loan_purpose order by total desc")

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139413
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [28]:
# Accepted loan_purpose categories. Any value outside this list is treated as
# noise and will be folded into "other" by the normalisation step.
loan_purpose_lookup = [
    "debt_consolidation",
    "credit_card",
    "home_improvement",
    "other",
    "major_purchase",
    "medical",
    "small_business",
    "car",
    "vacation",
    "moving",
    "house",
    "wedding",
    "renewable_energy",
    "educational",
]

In [29]:
from pyspark.sql.functions import when

In [34]:
# Normalise loan_purpose: values found in loan_purpose_lookup are kept as they
# are, anything else is replaced with "other".
loans_purpose_modified = loan_term_modified_df.withColumn(
    "loan_purpose",
    when(col("loan_purpose").isin(loan_purpose_lookup), col("loan_purpose")).otherwise("other"),
)

In [35]:
# Preview the frame with the normalised loan_purpose column.
loans_purpose_modified

loan_id,member_id,loan_amount,funded_amount,loan_term_years,intrest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
56633077,b59d80da191f5b573...,3000.0,3000.0,3,7.89,93.86,Aug-2015,Fully Paid,credit_card,Credit card refin...,2026-01-18 02:00:...
55927518,202d9f56ecb7c3bc9...,15600.0,15600.0,3,7.89,488.06,Aug-2015,Fully Paid,credit_card,Credit card refin...,2026-01-18 02:00:...
56473345,e5a140c0922b554b9...,20000.0,20000.0,3,9.17,637.58,Aug-2015,Fully Paid,debt_consolidation,Debt consolidation,2026-01-18 02:00:...
56463188,e12aefc548f750777...,11200.0,11200.0,5,21.99,309.27,Aug-2015,Fully Paid,home_improvement,Home improvement,2026-01-18 02:00:...
56473316,1b3a50d854fbbf97e...,16000.0,16000.0,5,20.99,432.77,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2026-01-18 02:00:...
56663266,1c4329e5f17697127...,20000.0,20000.0,5,13.33,458.45,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2026-01-18 02:00:...
56483027,5026c86ad983175eb...,10000.0,10000.0,3,12.69,335.45,Aug-2015,Fully Paid,other,Other,2026-01-18 02:00:...
56613385,9847d8c1e9d0b2084...,23400.0,23400.0,5,19.19,609.46,Aug-2015,Current,small_business,Business,2026-01-18 02:00:...
56643620,8340dbe1adea41fb4...,16000.0,16000.0,3,5.32,481.84,Jul-2015,Fully Paid,debt_consolidation,Debt consolidation,2026-01-18 02:00:...
56533114,d4de0de3ab7d79ad4...,25450.0,25450.0,3,27.31,1043.24,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2026-01-18 02:00:...


In [37]:
# Re-point the "loans" temp view at the frame with the normalised loan_purpose.
loans_purpose_modified.createOrReplaceTempView("loans")

In [38]:
# Re-check the purpose distribution after normalisation, most frequent first.
spark.sql("""select loan_purpose, count(*) as total from loans group by loan_purpose order by total desc""")

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139667
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [39]:
from pyspark.sql.functions import count

In [41]:
# DataFrame-API version of the purpose-frequency query: count the rows for each
# loan_purpose and list the most frequent purposes first.
(
    loans_purpose_modified
    .groupBy("loan_purpose")
    .agg(count("*").alias("total"))
    .orderBy(col("total").desc())
)

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139667
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [43]:
# Persist the cleaned loans as CSV (with a header row), replacing any previous
# output. The target directory is built from `username` (resolved with
# getpass.getuser() in the session cell) instead of a hard-coded account name,
# so the notebook writes under whichever user runs it.
(
    loans_purpose_modified.write
    .option("header", True)
    .mode("overwrite")
    .format("csv")
    .option("path", f"/user/{username}/lendingclubproject/cleaned/loans_csv")
    .save()
)

In [44]:
# Persist the cleaned loans as Parquet, replacing any previous output. The
# target directory is built from `username` (resolved with getpass.getuser()
# in the session cell) instead of a hard-coded account name, so the notebook
# writes under whichever user runs it.
# The CSV-only "header" option is dropped: Parquet stores its schema in the
# file itself, so the option had no effect here.
(
    loans_purpose_modified.write
    .mode("overwrite")
    .format("parquet")
    .option("path", f"/user/{username}/lendingclubproject/cleaned/loans_parquet")
    .save()
)