In [26]:
from pyspark.sql import SparkSession
import getpass 
# OS account running this kernel (not referenced by the cells visible here).
username = getpass.getuser()

# Build, or reuse, a Hive-enabled Spark session submitted to YARN.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.sql.warehouse.dir", "/user/itv015970/warehouse")
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [9]:
# Load the cleaned customers dataset from its Parquet directory.
customers_df = (
    spark.read
    .format("parquet")
    .load("/public/trendytech/lendingclubproject/cleaned/customers_parquet")
)

In [10]:
# Sanity check: number of rows in the customers DataFrame loaded above.
customers_df.count()

2260633

In [12]:
# Create the project database.
# IF NOT EXISTS keeps this cell safe under Restart & Run All: without it, a
# second execution fails because the database is already present.
spark.sql("create database if not exists itv015970_lending_club")

In [13]:
# Register the cleaned customers Parquet directory as an external table.
# IF NOT EXISTS keeps the cell re-runnable once the table has been registered.
# NOTE(review): customers_df is read from /public/trendytech/..., while this
# table points at the /user/itv015970 copy - confirm both locations are intended.
spark.sql("""
create external table if not exists itv015970_lending_club.customers(
    member_id string,
    emp_title string,
    emp_length int,
    home_ownership string,
    annual_income float,
    address_state string,
    address_zipcode string,
    address_country string,
    grade string,
    sub_grade string,
    verification_status string,
    total_high_credit_limit float,
    application_type string,
    join_annual_income float,
    verification_status_joint string,
    ingest_date timestamp
)
stored as parquet
location '/user/itv015970/lendingclubproject/cleaned/customers_parquet'
""")

In [14]:
# Preview the customers table to confirm the external table resolves to data.
spark.sql(
    "select * from itv015970_lending_club.customers"
)

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
708d98caa175dd0c8...,Sales,3,RENT,70000.0,IL,605xx,USA,C,C4,Not Verified,93126.0,Individual,,,2025-05-25 14:25:...
5ea065801950c1e09...,Legal Assistant,6,RENT,40000.0,FL,342xx,USA,A,A3,Source Verified,15100.0,Individual,,,2025-05-25 14:25:...
8f28370d17fa7f7c3...,Owner,10,MORTGAGE,50000.0,FL,321xx,USA,B,B5,Source Verified,17300.0,Individual,,,2025-05-25 14:25:...
98880317ad2282220...,,6,OWN,68157.0,PA,191xx,USA,A,A5,Source Verified,78100.0,Individual,,,2025-05-25 14:25:...
2073d31164f13ca42...,Financial System ...,10,MORTGAGE,86000.0,IN,460xx,USA,A,A1,Not Verified,242487.0,Individual,,,2025-05-25 14:25:...
606afa30c591d2509...,Director of Opera...,4,RENT,82000.0,VA,201xx,USA,D,D3,Not Verified,76236.0,Individual,,,2025-05-25 14:25:...
31b1fd5018ed5f677...,ROC,10,MORTGAGE,98000.0,AR,718xx,USA,A,A2,Verified,244271.0,Individual,,,2025-05-25 14:25:...
39ab02739a9e17f5f...,ISO2,10,RENT,99450.0,CA,926xx,USA,D,D1,Not Verified,91583.0,Individual,,,2025-05-25 14:25:...
821523201d33bd276...,General Manager,3,RENT,65000.0,NY,112xx,USA,D,D4,Source Verified,47439.0,Individual,,,2025-05-25 14:25:...
a6a2e493d205b8432...,purchasing agent,10,RENT,42000.0,MI,496xx,USA,B,B5,Source Verified,52700.0,Individual,,,2025-05-25 14:25:...


In [15]:
# Register the cleaned loans Parquet directory as an external table.
# IF NOT EXISTS keeps the cell re-runnable once the table has been registered.
spark.sql("""
create external table if not exists itv015970_lending_club.loans(
    loan_id string,
    member_id string,
    loan_amount float,
    funded_amount float,
    loan_term_years integer,
    interest_rate float,
    monthly_installment float,
    issue_date string,
    loan_status string,
    loan_purpose string,
    loan_title string,
    ingest_date timestamp
)
stored as parquet
location '/user/itv015970/lendingclubproject/cleaned/loans_parquet'
""")

In [16]:
# Preview the loans table to confirm the external table resolves to data.
spark.sql(
    "select * from itv015970_lending_club.loans"
)

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
14408468,5d7676571dee53d06...,5000.0,5000.0,3,14.16,171.28,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation,2025-05-26 12:09:...
14520567,31fd1ef036c5caf26...,11000.0,11000.0,3,9.67,353.24,Apr-2014,Fully Paid,other,Other,2025-05-26 12:09:...
14708730,9b48253ca5848fa78...,25000.0,25000.0,3,11.99,830.24,May-2014,Charged Off,credit_card,Credit card refin...,2025-05-26 12:09:...
14491003,d6b8f2e32be148721...,1500.0,1500.0,3,23.43,58.41,Apr-2014,Fully Paid,renewable_energy,Green loan,2025-05-26 12:09:...
14510981,0a81ad556e20bcdb9...,10000.0,10000.0,3,12.99,336.9,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation,2025-05-26 12:09:...
14177845,843b995d39160ddab...,13200.0,13200.0,5,12.99,300.28,Apr-2014,Charged Off,debt_consolidation,Debt consolidation,2025-05-26 12:09:...
13947687,04a8bd7c4e1407ba8...,10000.0,10000.0,5,17.57,251.61,Apr-2014,Fully Paid,other,Other,2025-05-26 12:09:...
12905731,3e9a89fa96988af68...,18000.0,18000.0,3,18.25,653.01,May-2014,Fully Paid,home_improvement,Home improvement,2025-05-26 12:09:...
14137736,394a8db0bd3bcf7de...,12000.0,12000.0,3,9.67,385.35,Apr-2014,Fully Paid,debt_consolidation,Debt consolidation,2025-05-26 12:09:...
13207083,09cea8d2b6f1272f7...,3200.0,3200.0,3,20.49,119.73,Apr-2014,Fully Paid,credit_card,Credit card refin...,2025-05-26 12:09:...


In [17]:
# Register the cleaned loan-repayments Parquet directory as an external table.
# IF NOT EXISTS keeps the cell re-runnable once the table has been registered.
spark.sql("""
CREATE EXTERNAL TABLE IF NOT EXISTS itv015970_lending_club.loans_repayments(
    loan_id string,
    total_principal_received float,
    total_interest_received float,
    total_late_fee_received float,
    total_payment_received float,
    last_payment_amount float,
    last_payment_date string,
    next_payment_date string,
    ingest_date timestamp
)
stored as parquet
LOCATION '/user/itv015970/lendingclubproject/cleaned/loans_repayments_parquet'
""")

In [18]:
# Preview the loans_repayments table to confirm the external table resolves to data.
spark.sql(
    "select * from itv015970_lending_club.loans_repayments"
)

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
141581221,1055.81,2591.7,0.0,3647.51,709.23,Mar-2019,Apr-2019,2025-05-26 17:14:...
141506948,1252.75,306.04,0.0,1558.79,312.63,Mar-2019,Apr-2019,2025-05-26 17:14:...
141357400,626.37,354.96,0.0,981.33,197.27,Mar-2019,Apr-2019,2025-05-26 17:14:...
139445427,1118.16,297.36,0.0,1415.52,283.95,Mar-2019,Apr-2019,2025-05-26 17:14:...
141407409,1169.72,3605.3,0.0,4775.02,964.9,Mar-2019,Apr-2019,2025-05-26 17:14:...
141360802,2313.98,2512.88,0.0,4826.86,952.02,Mar-2019,Apr-2019,2025-05-26 17:14:...
141163960,4689.63,1994.93,0.0,6684.56,1342.57,Mar-2019,Apr-2019,2025-05-26 17:14:...
141533932,585.29,640.53,15.0,1240.82,235.13,Mar-2019,Apr-2019,2025-05-26 17:14:...
141441276,2030.82,762.81,0.0,2793.63,477.62,Mar-2019,Apr-2019,2025-05-26 17:14:...
141569080,1803.55,1110.59,0.0,2914.14,585.91,Mar-2019,Apr-2019,2025-05-26 17:14:...


In [22]:
# Remove any earlier definition of the delinquency table before it is recreated.
# IF EXISTS lets this cell run on a fresh environment where the table has never
# been created; a plain DROP TABLE fails in that case.
spark.sql("drop table if exists itv015970_lending_club.loans_defaulters_delinq")

In [23]:
# Register the delinquency details Parquet directory as an external table.
# IF NOT EXISTS keeps the cell re-runnable once the table has been registered.
# NOTE(review): the directory name is spelled "deling" while the table name uses
# "delinq" - confirm the path matches what the cleaning step actually wrote.
spark.sql("""
CREATE EXTERNAL TABLE IF NOT EXISTS itv015970_lending_club.loans_defaulters_delinq(
    member_id string,
    delinq_2yrs integer,
    delinq_amnt float,
    mths_since_last_delinq integer
)
stored as parquet
LOCATION '/user/itv015970/lendingclubproject/cleaned/loans_defaulters_deling_parquet'
""")

In [27]:
# Preview the loans_defaulters_delinq table to confirm it resolves to data.
spark.sql(
    "select * from itv015970_lending_club.loans_defaulters_delinq"
)

member_id,delinq_2yrs,delinq_amnt,mths_since_last_delinq
9cb79aa7323e81be1...,2,0.0,11
aac68850fdac09fd0...,1,0.0,21
c89986155a070db2e...,1,0.0,5
6e8d94bf446e97025...,0,0.0,36
42f73fd8a01f1c475...,0,0.0,46
1eef79a0e79b72c7a...,1,0.0,21
1dd1d1b51473d4993...,0,0.0,44
ec1953dba2cfb89ad...,2,0.0,13
8241a6bb3a9350fb8...,0,0.0,57
cdc94fa1c29a6a70a...,0,0.0,44


In [29]:
# Register the public-records / enquiries Parquet directory as an external table.
# IF NOT EXISTS keeps the cell re-runnable once the table has been registered.
# NOTE(review): the preview of this table shows NULL in pub_rec,
# pub_rec_bankruptcies and inq_last_6mths for every displayed row. That suggests
# the declared column names or types may not match the Parquet files at this
# location - verify with spark.read.parquet(<location>).printSchema() and, if
# they differ, drop and recreate the table with the corrected definition.
spark.sql("""
CREATE EXTERNAL TABLE IF NOT EXISTS itv015970_lending_club.loans_defaulters_detail_rec_enq(
    member_id string,
    pub_rec integer,
    pub_rec_bankruptcies integer,
    inq_last_6mths integer
)
stored as parquet
LOCATION '/user/itv015970/lendingclubproject/cleaned/loans_def_detail_records_enq_df_parquet'
""")

In [31]:
# Preview the loans_defaulters_detail_rec_enq table.
spark.sql(
    "select * from itv015970_lending_club.loans_defaulters_detail_rec_enq"
)

member_id,pub_rec,pub_rec_bankruptcies,inq_last_6mths
0dd2bbc517e3c8f9e...,,,
458458599d3df3bfc...,,,
f1efcf7dfbfef21be...,,,
c89986155a070db2e...,,,
e88945f86a96f8d71...,,,
4e1c30a5dfe9f1e20...,,,
76cbefe31f7834f47...,,,
47d002f59a274c6f2...,,,
09a1c6855801dad88...,,,
56d4375718ad6940d...,,,
