In [1]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
    builder. \
    config('spark.ui.port','0'). \
    config("spark.shuffle.useOldFetchProtocol","true"). \
    config("spark.sql.warehouse.dir",f"/user/itv010130/warehouse"). \
    enableHiveSupport(). \
    master('yarn'). \
    getOrCreate()

### 1. Customers External Table Creation

In [2]:
customer_df = spark.read \
.format("parquet") \
.load("/user/itv010130/loanproject/clean/customer_data_parquet")

In [3]:
customer_df

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingestion_date
d80ddb5ed2f44ba36...,Data Scientist,3,RENT,120000.0,WA,981xx,USA,C,C2,Not Verified,69331.0,Individual,,,2024-07-15 16:44:...
e469c81f404cb8812...,IT Tech,9,OWN,75000.0,CA,922xx,USA,C,C4,Verified,272816.0,Individual,,,2024-07-15 16:44:...
94e00bfa41db06702...,Nurse Practitioner,2,MORTGAGE,84000.0,IL,623xx,USA,C,C3,Verified,297344.0,Individual,,,2024-07-15 16:44:...
c34c85e958da5056b...,Wine Consultant,6,MORTGAGE,66000.0,IL,606xx,USA,B,B3,Source Verified,210800.0,Individual,,,2024-07-15 16:44:...
57060caf0bd84d6cf...,Plumber,1,MORTGAGE,92500.0,TX,750xx,USA,C,C5,Not Verified,194963.0,Individual,,,2024-07-15 16:44:...
bee8ccb6b9064de7e...,,6,RENT,30000.0,NY,104xx,USA,E,E1,Not Verified,30600.0,Joint App,60000.0,Not Verified,2024-07-15 16:44:...
949bcf76b68e46448...,,6,MORTGAGE,42000.0,NY,148xx,USA,F,F5,Verified,145519.0,Individual,,,2024-07-15 16:44:...
9c20b5d0ba86a6733...,IT Analyst,3,RENT,48650.0,NC,275xx,USA,B,B1,Source Verified,154424.0,Individual,,,2024-07-15 16:44:...
cd52a7fc8e2d2fcf5...,Librarian,3,MORTGAGE,60000.0,CA,913xx,USA,B,B1,Verified,169124.0,Individual,,,2024-07-15 16:44:...
a17b112d7a0b07684...,Fireman,8,MORTGAGE,96000.0,IL,601xx,USA,C,C4,Verified,359850.0,Individual,,,2024-07-15 16:44:...


In [4]:
customer_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: integer (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- join_annual_income: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingestion_date: timestamp (nullable = true)



In [8]:
spark.sql("create database itv010130_Loan_Database")

In [7]:
spark.sql("""
CREATE EXTERNAL TABLE itv010130_Loan_Database.customers(member_id string,emp_title string,emp_length int,home_ownership string,annual_income float,address_state string,address_zipcode string,address_country string,grade string,sub_grade string,verification_status string,total_high_credit_limit float,application_type string,join_annual_income float,verification_status_joint string,ingestion_date timestamp) stored as parquet LOCATION '/user/itv010130/loanproject/clean/customer_data_parquet'
""")

In [8]:
spark.sql("select * from itv010130_Loan_Database.customers")

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingestion_date
d80ddb5ed2f44ba36...,Data Scientist,3,RENT,120000.0,WA,981xx,USA,C,C2,Not Verified,69331.0,Individual,,,2024-07-15 16:44:...
e469c81f404cb8812...,IT Tech,9,OWN,75000.0,CA,922xx,USA,C,C4,Verified,272816.0,Individual,,,2024-07-15 16:44:...
94e00bfa41db06702...,Nurse Practitioner,2,MORTGAGE,84000.0,IL,623xx,USA,C,C3,Verified,297344.0,Individual,,,2024-07-15 16:44:...
c34c85e958da5056b...,Wine Consultant,6,MORTGAGE,66000.0,IL,606xx,USA,B,B3,Source Verified,210800.0,Individual,,,2024-07-15 16:44:...
57060caf0bd84d6cf...,Plumber,1,MORTGAGE,92500.0,TX,750xx,USA,C,C5,Not Verified,194963.0,Individual,,,2024-07-15 16:44:...
bee8ccb6b9064de7e...,,6,RENT,30000.0,NY,104xx,USA,E,E1,Not Verified,30600.0,Joint App,60000.0,Not Verified,2024-07-15 16:44:...
949bcf76b68e46448...,,6,MORTGAGE,42000.0,NY,148xx,USA,F,F5,Verified,145519.0,Individual,,,2024-07-15 16:44:...
9c20b5d0ba86a6733...,IT Analyst,3,RENT,48650.0,NC,275xx,USA,B,B1,Source Verified,154424.0,Individual,,,2024-07-15 16:44:...
cd52a7fc8e2d2fcf5...,Librarian,3,MORTGAGE,60000.0,CA,913xx,USA,B,B1,Verified,169124.0,Individual,,,2024-07-15 16:44:...
a17b112d7a0b07684...,Fireman,8,MORTGAGE,96000.0,IL,601xx,USA,C,C4,Verified,359850.0,Individual,,,2024-07-15 16:44:...


### 2. Loans External Table Creation

In [16]:
loan_df = spark.read \
.format("parquet") \
.load("/user/itv010130/loanproject/clean/loan_data_parquet")

In [17]:
loan_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_years: integer (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)
 |-- ingestion_date: timestamp (nullable = true)



In [18]:
spark.sql("""
CREATE EXTERNAL TABLE itv010130_Loan_Database.loans(loan_id string,member_id string,loan_amount float,funded_amount float,loan_term_years int,interest_rate float,monthly_installment float,issue_date string,loan_status string,loan_purpose string,loan_title string,ingestion_date timestamp) stored as parquet LOCATION '/user/itv010130/loanproject/clean/loan_data_parquet'
""")

In [19]:
spark.sql("select * from itv010130_Loan_Database.loans")

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingestion_date
5640558,75aa37b53c5c4dfd7...,25200.0,25200.0,3,14.33,865.33,Jun-2013,Fully Paid,debt_consolidation,2013 Credit Card ...,2024-07-04 23:42:...
5630489,3973abca4e9278951...,20000.0,20000.0,3,14.09,684.43,Jun-2013,Fully Paid,debt_consolidation,Pay Off,2024-07-04 23:42:...
5640536,66276848f487b4eea...,8000.0,8000.0,3,13.11,269.98,Jun-2013,Fully Paid,debt_consolidation,Debt consolidation,2024-07-04 23:42:...
5616872,f95485d0f0e57fafa...,35000.0,35000.0,3,21.0,1318.63,Jun-2013,Charged Off,debt_consolidation,LC consolidation ...,2024-07-04 23:42:...
5610618,a476e72aa5a196663...,20000.0,20000.0,3,7.9,625.81,Jun-2013,Fully Paid,credit_card,Payoff Cap One,2024-07-04 23:42:...
5610358,ed35f7be776a76eaa...,9000.0,9000.0,3,15.31,313.36,Jun-2013,Fully Paid,debt_consolidation,debt consolidation,2024-07-04 23:42:...
5610638,f5f2cad30f88066b9...,8875.0,8875.0,3,15.31,309.01,Jun-2013,Fully Paid,credit_card,Credit card refin...,2024-07-04 23:42:...
5619890,72d09470db841a41f...,20000.0,20000.0,3,11.14,656.11,Jul-2013,Fully Paid,debt_consolidation,Debt pay off,2024-07-04 23:42:...
5620443,89bceb321d30c19f1...,9000.0,9000.0,3,12.12,299.45,Jun-2013,Fully Paid,debt_consolidation,my loan,2024-07-04 23:42:...
5619319,4454029a0decbf5bd...,7000.0,7000.0,3,14.33,240.37,Jun-2013,Fully Paid,debt_consolidation,Home Refinance,2024-07-04 23:42:...


### 3. Repayment External Table Creation

In [21]:
repayment_df = spark.read \
.format("parquet") \
.load("/user/itv010130/loanproject/clean/repayment_data_parquet")

In [22]:
repayment_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- total_principle_received: float (nullable = true)
 |-- total_interest_received: float (nullable = true)
 |-- total_late_fee_received: float (nullable = true)
 |-- total_payment: float (nullable = true)
 |-- last_payment_amount: float (nullable = true)
 |-- last_payment_date: string (nullable = true)
 |-- next_payment_date: string (nullable = true)
 |-- ingestion_date: timestamp (nullable = true)



In [23]:
spark.sql("""
CREATE EXTERNAL TABLE itv010130_Loan_Database.repayment(loan_id string,total_principle_received float,total_interest_received float,total_late_fee_received float,total_payment float,last_payment_amount float,last_payment_date string,next_payment_date string,ingestion_date timestamp) stored as parquet LOCATION '/user/itv010130/loanproject/clean/repayment_data_parquet'
""")

In [24]:
spark.sql("select * from itv010130_Loan_Database.repayment")

loan_id,total_principle_received,total_interest_received,total_late_fee_received,total_payment,last_payment_amount,last_payment_date,next_payment_date,ingestion_date
84647143,4000.0,614.11,0.0,4614.1104,1235.28,Oct-2018,,2024-07-05 15:01:...
86463837,9967.93,6417.24,0.0,16385.17,529.11,Mar-2019,Apr-2019,2024-07-05 15:01:...
86643264,15200.0,2329.24,0.0,17529.244,14837.07,Mar-2017,,2024-07-05 15:01:...
85610005,1001.45,469.25,0.0,1470.7,246.05,Feb-2017,,2024-07-05 15:01:...
86101113,1316.02,517.0,0.0,2001.23,87.39,May-2018,,2024-07-05 15:01:...
85952284,14000.0,1851.72,0.0,15851.718,4733.71,Oct-2018,,2024-07-05 15:01:...
84666301,25193.35,5450.84,0.0,30644.19,989.14,Mar-2019,Apr-2019,2024-07-05 15:01:...
85872910,4525.0,616.49,0.0,5141.489,3693.97,Jun-2017,,2024-07-05 15:01:...
85615895,7500.0,1764.84,0.0,9264.843,2976.53,Sep-2018,,2024-07-05 15:01:...
86523233,2522.48,520.2,0.0,3042.68,98.21,Mar-2019,Apr-2019,2024-07-05 15:01:...


### 4. Defaulters External Table Creation

In [26]:
defaulters_df=spark.read \
.format("parquet") \
.load("/user/itv010130/loanproject/clean/detailed_defaulters_data_parquet")

In [30]:
defaulters_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- pub_rec: integer (nullable = true)
 |-- pub_record_bankruptcies: integer (nullable = true)
 |-- inquiry_in_last_6mths: integer (nullable = true)



In [32]:
spark.sql("""
CREATE EXTERNAL TABLE itv010130_Loan_Database.defaulters(member_id string,pub_rec int,pub_record_bankruptcies int,inquiry_in_last_6mths int) stored as parquet LOCATION '/user/itv010130/loanproject/clean/detailed_defaulters_data_parquet'
""")

In [33]:
spark.sql("select * from itv010130_Loan_Database.defaulters")

member_id,pub_rec,pub_record_bankruptcies,inquiry_in_last_6mths
b59d80da191f5b573...,0,0,1
202d9f56ecb7c3bc9...,0,0,0
e5a140c0922b554b9...,0,0,0
e12aefc548f750777...,0,0,0
1b3a50d854fbbf97e...,0,0,0
1c4329e5f17697127...,0,0,0
5026c86ad983175eb...,1,0,2
9847d8c1e9d0b2084...,2,0,0
8340dbe1adea41fb4...,0,0,0
d4de0de3ab7d79ad4...,0,0,0


### 5. Delinquent External Table Creation

In [36]:
delinquent_df=spark.read \
.format("parquet") \
.load("/user/itv010130/loanproject/clean/defaulters_data_parquet")

In [37]:
delinquent_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- delinq_in_2yrs: integer (nullable = true)
 |-- delinq_amount: float (nullable = true)
 |-- months_since_last_delinq: float (nullable = true)



In [50]:
spark.sql("""
CREATE EXTERNAL TABLE itv010130_Loan_Database.defaulters_delinquent(member_id string,delinq_in_2yrs int,delinq_amount float,months_since_last_delinq float) stored as parquet LOCATION '/user/itv010130/loanproject/clean/defaulters_data_parquet'
""")

In [51]:
spark.sql("select * from itv010130_Loan_Database.defaulters_delinquent")

member_id,delinq_in_2yrs,delinq_amount,months_since_last_delinq
b59d80da191f5b573...,0,0.0,31.0
202d9f56ecb7c3bc9...,1,0.0,6.0
e5a140c0922b554b9...,0,0.0,47.0
e12aefc548f750777...,0,0.0,33.0
1b3a50d854fbbf97e...,1,0.0,21.0
9847d8c1e9d0b2084...,1,0.0,6.0
8340dbe1adea41fb4...,0,0.0,36.0
d4de0de3ab7d79ad4...,0,0.0,35.0
1d4e1ef4353b73c00...,0,0.0,30.0
6f196952e71277fd4...,4,0.0,5.0


### 5.1 Drop defaulters_delinquent Table

In [48]:
spark.sql("drop table itv010130_Loan_Database.defaulters_delinquent")

In [49]:
spark.sql("select * from itv010130_Loan_Database.defaulters_delinquent")

AnalysisException: Table or view not found: itv010130_Loan_Database.defaulters_delinquent; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [itv010130_Loan_Database, defaulters_delinquent], [], false


In [9]:
spark.stop()