In [71]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
config('spark.ui.port', '0'). \
config("spark.shuffle.useOldFetchProtocol", "true"). \
config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [72]:
customers_schema = "member_id string, emp_title string, emp_length string, home_ownership string, annual_inc float, addr_state string, zip_code string, country string, grade string, sub_grade string, verification_status string, tot_hi_cred_lim float, application_type string, annual_inc_joint float, verification_status_joint string"

In [73]:
customers_raw_df = spark.read \
.format("csv") \
.option("header", "true") \
.schema(customers_schema) \
.load("/user/itv015970/lendingclubproject/raw/customers_data_csv")

In [74]:
customers_raw_df

member_id,emp_title,emp_length,home_ownership,annual_inc,addr_state,zip_code,country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,annual_inc_joint,verification_status_joint
707271898dcabc8b2...,Physician Service...,3 years,RENT,40400.0,CO,801xx,USA,A,A2,Not Verified,68759.0,Individual,,
8e1ea10aca3c4ad8f...,Operations,10+ years,MORTGAGE,53000.0,AR,720xx,USA,B,B2,Source Verified,63143.0,Individual,,
1d6546a2cbc1fd240...,Underwriter,2 years,RENT,65000.0,ME,040xx,USA,B,B4,Not Verified,66695.0,Individual,,
d6208beced388988f...,Crome restorer sp...,10+ years,MORTGAGE,60000.0,IL,606xx,USA,C,C1,Not Verified,68900.0,Individual,,
b4af936688c28c165...,Program Coordinator,1 year,RENT,38000.0,FL,322xx,USA,A,A5,Not Verified,76877.0,Individual,,
2c04e047879ada04e...,Executive Director,10+ years,MORTGAGE,166000.0,IL,601xx,USA,C,C2,Not Verified,217868.0,Individual,,
39dfcd293cb7b2c17...,Emergency Managme...,4 years,MORTGAGE,81000.0,TX,761xx,USA,C,C4,Not Verified,293276.0,Individual,,
5e6e1f8ad59c71a0b...,Clinical Applicat...,3 years,MORTGAGE,82000.0,CO,801xx,USA,A,A1,Not Verified,393500.0,Individual,,
afd3b57e55eb95ed8...,Systems Analyst 3,4 years,OWN,118030.0,MI,482xx,USA,A,A3,Not Verified,82137.0,Individual,,
8b5eed45ac53a0238...,Director of Front...,4 years,RENT,62000.0,NY,110xx,USA,A,A5,Not Verified,17400.0,Individual,,


In [75]:
customers_raw_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: float (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- tot_hi_cred_lim: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)



In [76]:
# renaming column

customers_df_renamed = customers_raw_df.withColumnRenamed("annual_inc", "annual_income") \
.withColumnRenamed("addr_state", "address_state") \
.withColumnRenamed("zip_code", "address_zipcode") \
.withColumnRenamed("country", "address_country") \
.withColumnRenamed("tot_hi_cred_lim", "total_high_credit_limit") \
.withColumnRenamed("annaul_in_joint", "join_annaul_income")


In [77]:
customers_df_renamed

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,annual_inc_joint,verification_status_joint
707271898dcabc8b2...,Physician Service...,3 years,RENT,40400.0,CO,801xx,USA,A,A2,Not Verified,68759.0,Individual,,
8e1ea10aca3c4ad8f...,Operations,10+ years,MORTGAGE,53000.0,AR,720xx,USA,B,B2,Source Verified,63143.0,Individual,,
1d6546a2cbc1fd240...,Underwriter,2 years,RENT,65000.0,ME,040xx,USA,B,B4,Not Verified,66695.0,Individual,,
d6208beced388988f...,Crome restorer sp...,10+ years,MORTGAGE,60000.0,IL,606xx,USA,C,C1,Not Verified,68900.0,Individual,,
b4af936688c28c165...,Program Coordinator,1 year,RENT,38000.0,FL,322xx,USA,A,A5,Not Verified,76877.0,Individual,,
2c04e047879ada04e...,Executive Director,10+ years,MORTGAGE,166000.0,IL,601xx,USA,C,C2,Not Verified,217868.0,Individual,,
39dfcd293cb7b2c17...,Emergency Managme...,4 years,MORTGAGE,81000.0,TX,761xx,USA,C,C4,Not Verified,293276.0,Individual,,
5e6e1f8ad59c71a0b...,Clinical Applicat...,3 years,MORTGAGE,82000.0,CO,801xx,USA,A,A1,Not Verified,393500.0,Individual,,
afd3b57e55eb95ed8...,Systems Analyst 3,4 years,OWN,118030.0,MI,482xx,USA,A,A3,Not Verified,82137.0,Individual,,
8b5eed45ac53a0238...,Director of Front...,4 years,RENT,62000.0,NY,110xx,USA,A,A5,Not Verified,17400.0,Individual,,


In [78]:
# ingest current timestamp inside the data frame
from pyspark.sql.functions import current_timestamp

In [79]:
customers_df_ingestd = customers_df_renamed.withColumn("ingest_date", current_timestamp())

In [80]:
customers_df_ingestd

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,annual_inc_joint,verification_status_joint,ingest_date
707271898dcabc8b2...,Physician Service...,3 years,RENT,40400.0,CO,801xx,USA,A,A2,Not Verified,68759.0,Individual,,,2025-05-25 14:07:...
8e1ea10aca3c4ad8f...,Operations,10+ years,MORTGAGE,53000.0,AR,720xx,USA,B,B2,Source Verified,63143.0,Individual,,,2025-05-25 14:07:...
1d6546a2cbc1fd240...,Underwriter,2 years,RENT,65000.0,ME,040xx,USA,B,B4,Not Verified,66695.0,Individual,,,2025-05-25 14:07:...
d6208beced388988f...,Crome restorer sp...,10+ years,MORTGAGE,60000.0,IL,606xx,USA,C,C1,Not Verified,68900.0,Individual,,,2025-05-25 14:07:...
b4af936688c28c165...,Program Coordinator,1 year,RENT,38000.0,FL,322xx,USA,A,A5,Not Verified,76877.0,Individual,,,2025-05-25 14:07:...
2c04e047879ada04e...,Executive Director,10+ years,MORTGAGE,166000.0,IL,601xx,USA,C,C2,Not Verified,217868.0,Individual,,,2025-05-25 14:07:...
39dfcd293cb7b2c17...,Emergency Managme...,4 years,MORTGAGE,81000.0,TX,761xx,USA,C,C4,Not Verified,293276.0,Individual,,,2025-05-25 14:07:...
5e6e1f8ad59c71a0b...,Clinical Applicat...,3 years,MORTGAGE,82000.0,CO,801xx,USA,A,A1,Not Verified,393500.0,Individual,,,2025-05-25 14:07:...
afd3b57e55eb95ed8...,Systems Analyst 3,4 years,OWN,118030.0,MI,482xx,USA,A,A3,Not Verified,82137.0,Individual,,,2025-05-25 14:07:...
8b5eed45ac53a0238...,Director of Front...,4 years,RENT,62000.0,NY,110xx,USA,A,A5,Not Verified,17400.0,Individual,,,2025-05-25 14:07:...


In [81]:
# remove duplicate
customers_df_ingestd.count()

2260701

In [82]:
customers_distinct = customers_df_ingestd.distinct()

In [83]:
customers_distinct.count()

2260638

In [84]:
customers_distinct.createOrReplaceTempView("customers")

In [85]:
spark.sql("select * from customers")

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,annual_inc_joint,verification_status_joint,ingest_date
ccd8c24c89fc03167...,pca,6 years,RENT,33000.0,NY,134xx,USA,C,C4,Source Verified,19200.0,Individual,,,2025-05-25 14:07:...
fc686ab48dead233e...,Graphic Design/Pr...,10+ years,MORTGAGE,67925.0,CT,064xx,USA,A,A2,Verified,527314.0,Joint App,132425.0,,2025-05-25 14:07:...
f7237afa33700d087...,Pick up/ driver m...,8 years,RENT,35000.0,NJ,070xx,USA,D,D4,Verified,38216.0,Individual,,,2025-05-25 14:07:...
68262cc2ce7494729...,Sr. Program Director,10+ years,RENT,55000.0,KS,675xx,USA,B,B2,Not Verified,48900.0,Individual,,,2025-05-25 14:07:...
061a0c8db21c788e9...,Technology manager,2 years,MORTGAGE,57500.0,NE,685xx,USA,A,A3,Source Verified,173830.0,Individual,,,2025-05-25 14:07:...
0ed23313b791df8fd...,,,MORTGAGE,22000.0,FL,322xx,USA,B,B2,Not Verified,176200.0,Individual,,,2025-05-25 14:07:...
c5ff3314a7e32977a...,Manager,3 years,OWN,73000.0,NY,115xx,USA,B,B2,Source Verified,54859.0,Individual,,,2025-05-25 14:07:...
d328ec19fe4bf1f15...,Accounting Manager,5 years,MORTGAGE,66361.0,OR,970xx,USA,A,A1,Source Verified,103775.0,Individual,,,2025-05-25 14:07:...
1060b546e93e7a34d...,Supervisor,10+ years,MORTGAGE,64000.0,CA,952xx,USA,D,D1,Verified,248293.0,Individual,,,2025-05-25 14:07:...
0d74158f85a6cccf8...,Sales,5 years,OWN,52000.0,IL,600xx,USA,C,C1,Source Verified,100824.0,Individual,,,2025-05-25 14:07:...


In [86]:
spark.sql("select count(*) from customers where annual_income is null")

count(1)
5


In [87]:
customers_income_filtered = spark.sql("select * from customers where annual_income is not null")

In [88]:
customers_income_filtered.createOrReplaceTempView("customers")

In [89]:
spark.sql("select count(*) from customers where annual_income is null")

count(1)
0


In [90]:
spark.sql("select distinct(emp_length) from customers").show()

+----------+
|emp_length|
+----------+
|   5 years|
|   9 years|
|      null|
|    1 year|
|   2 years|
|   7 years|
|   8 years|
|   4 years|
|   6 years|
|   3 years|
| 10+ years|
|  < 1 year|
+----------+



In [91]:
from pyspark.sql.functions import regexp_replace, col

In [92]:
customers_emplength_cleaned = customers_income_filtered.withColumn("emp_length", regexp_replace(col("emp_length"), "(\D)",""))

In [93]:
customers_emplength_cleaned

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,annual_inc_joint,verification_status_joint,ingest_date
66ab9fd8dd5a1bc47...,Dispatch,3.0,RENT,61000.0,MS,397xx,USA,B,B4,Not Verified,106782.0,Individual,,,2025-05-25 14:08:...
9c32ca57993bd8834...,HR Coordinator,2.0,RENT,53000.0,GA,300xx,USA,C,C5,Verified,58500.0,Individual,,,2025-05-25 14:08:...
089a84334a3691918...,Senior UX Designer,3.0,MORTGAGE,95000.0,FL,334xx,USA,A,A3,Source Verified,511823.0,Individual,,,2025-05-25 14:08:...
01986ddb2ab734cbe...,Facility Manager,4.0,RENT,45000.0,NJ,087xx,USA,D,D2,Not Verified,90837.0,Individual,,,2025-05-25 14:08:...
7ec1824f76410f3c4...,Supply Chain Rep,10.0,MORTGAGE,64000.0,CA,950xx,USA,C,C1,Not Verified,454560.0,Individual,,,2025-05-25 14:08:...
c50c67adcb0886c18...,Continuous improv...,10.0,MORTGAGE,83000.0,MN,559xx,USA,A,A1,Not Verified,226345.0,Individual,,,2025-05-25 14:08:...
009ec0f3b791ede25...,Teacher's Aid,1.0,RENT,25000.0,NY,113xx,USA,B,B5,Not Verified,46800.0,Joint App,79000.0,Not Verified,2025-05-25 14:08:...
af9f426baa2ebd2cc...,Sales,10.0,OWN,58000.0,TX,782xx,USA,B,B4,Source Verified,133981.0,Individual,,,2025-05-25 14:08:...
42c470971c6172f49...,,6.0,MORTGAGE,150000.0,MA,017xx,USA,B,B1,Source Verified,51542.0,Individual,,,2025-05-25 14:08:...
c0a5fe976f24a0f44...,Account Director,2.0,MORTGAGE,70000.0,OH,441xx,USA,C,C4,Verified,208901.0,Individual,,,2025-05-25 14:08:...


In [94]:
customers_emplength_cleaned.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [95]:
customers_emplength_casted = customers_emplength_cleaned.withColumn("emp_length", customers_emplength_cleaned.emp_length.cast('int'))

In [96]:
customers_emplength_casted

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,annual_inc_joint,verification_status_joint,ingest_date
19102335af8c3851a...,taxi driver,10.0,OWN,65000.0,NY,115xx,USA,B,B1,Source Verified,32500.0,Individual,,,2025-05-25 14:08:...
bc83d3638c5608036...,Administrative As...,10.0,OWN,52370.0,PA,186xx,USA,C,C2,Source Verified,69600.0,Individual,,,2025-05-25 14:08:...
017db60a8fd54538f...,Quality Manager,3.0,MORTGAGE,147180.0,MO,641xx,USA,D,D5,Verified,243445.0,Individual,,,2025-05-25 14:08:...
5bca837037b6fad27...,public work director,10.0,MORTGAGE,80000.0,GA,317xx,USA,C,C2,Verified,114665.0,Individual,,,2025-05-25 14:08:...
89c6d7bd5a559bb11...,social worker,10.0,RENT,53500.0,ME,047xx,USA,C,C1,Not Verified,24299.0,Individual,,,2025-05-25 14:08:...
f3b1031957f585b47...,Department Manager,8.0,OWN,24000.0,UT,847xx,USA,D,D5,Verified,22806.0,Individual,,,2025-05-25 14:08:...
a25822c4728d1743d...,IT Systems Manager,1.0,MORTGAGE,72000.0,TX,786xx,USA,B,B3,Verified,463397.0,Joint App,196000.0,Not Verified,2025-05-25 14:08:...
07ffd138b3f7401c8...,maintenance super...,10.0,MORTGAGE,65000.0,PA,196xx,USA,B,B5,Not Verified,258709.0,Individual,,,2025-05-25 14:08:...
1d90cc2ed3a0a8aff...,,,RENT,30000.0,NY,136xx,USA,C,C5,Source Verified,12200.0,Individual,,,2025-05-25 14:08:...
651480bed5b889bf9...,CONSTRUCTION SUPE...,6.0,MORTGAGE,65000.0,AZ,850xx,USA,B,B3,Not Verified,341713.0,Individual,,,2025-05-25 14:08:...


In [97]:
customers_emplength_casted.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: integer (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [98]:
customers_emplength_casted.filter("emp_length is null").count()

146903

In [99]:
customers_emplength_casted.createOrReplaceTempView("customers")

In [101]:
avg_emp_length = spark.sql("select floor(avg(emp_length)) as avg_emp_length from customers").collect()

In [102]:
print(avg_emp_length)

[Row(avg_emp_length=6)]


In [104]:
avg_emp_duration = avg_emp_length[0][0]

In [105]:
avg_emp_duration

6

In [106]:
customers_emplength_replaced = customers_emplength_casted.na.fill(avg_emp_duration, subset=['emp_length'])

In [107]:
customers_emplength_replaced

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,annual_inc_joint,verification_status_joint,ingest_date
859c0e613aab8fbe1...,,1,RENT,28000.0,NY,114xx,USA,B,B3,Not Verified,18100.0,Individual,,,2025-05-25 14:11:...
bb44000500d396c26...,Lead IT Applicati...,5,MORTGAGE,128000.0,NC,282xx,USA,A,A4,Not Verified,549345.0,Individual,,,2025-05-25 14:11:...
d2a5f8aa376d2c6e9...,Creative Director,1,OWN,175000.0,NY,117xx,USA,D,D1,Source Verified,91675.0,Individual,,,2025-05-25 14:11:...
8af2c5726369d4b2b...,,6,MORTGAGE,10000.0,CA,959xx,USA,A,A1,Verified,446573.0,Joint App,95000.0,,2025-05-25 14:11:...
750dee3586c64fbf2...,Assistant Manager,4,OWN,44500.0,AL,352xx,USA,A,A1,Not Verified,146933.0,Individual,,,2025-05-25 14:11:...
a6f7ebd3091771828...,Director of opera...,10,MORTGAGE,70000.0,NV,891xx,USA,D,D5,Verified,603760.0,Individual,,,2025-05-25 14:11:...
4e9f5a0edea322b56...,technician,9,MORTGAGE,55000.0,NY,126xx,USA,A,A4,Not Verified,38900.0,Individual,,,2025-05-25 14:11:...
4528859236914b5f0...,,6,RENT,30000.0,FL,331xx,USA,C,C3,Source Verified,16600.0,Individual,,,2025-05-25 14:11:...
f97e485b701bb96b2...,Director of maint...,6,MORTGAGE,130000.0,MI,481xx,USA,B,B4,Source Verified,354297.0,Individual,,,2025-05-25 14:11:...
e5aca269c7be31f4d...,,1,RENT,30000.0,CA,946xx,USA,E,E4,Source Verified,35854.0,Individual,,,2025-05-25 14:11:...


In [108]:
customers_emplength_replaced.filter("emp_length is null").count()

0

In [109]:
customers_emplength_replaced.createOrReplaceTempView("customers")

In [110]:
spark.sql("select distinct(address_state) from customers")

address_state
Helping Kenya's D...
223xx
175 (total projec...
SC
AZ
I am 56 yrs. old ...
"so Plan """"C"""" is ..."
financially I mad...
but no one will l...
LA


In [113]:
spark.sql("select count(address_state) from customers where length(address_state)>2")

count(address_state)
254


In [114]:
from pyspark.sql.functions import when, col, length

In [115]:
customers_state_cleaned = customers_emplength_replaced.withColumn("address_state", when(length(col("address_state"))>2, "NA").otherwise(col("address_state")))

In [116]:
customers_state_cleaned

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,annual_inc_joint,verification_status_joint,ingest_date
1f66b3fdbaf5ba407...,Executive Assistant,10,RENT,50000.0,IL,606xx,USA,D,D2,Source Verified,48458.0,Individual,,,2025-05-25 14:20:...
c96bf369eba7f5cbf...,ATCS,10,MORTGAGE,120000.0,WA,980xx,USA,A,A1,Source Verified,41180.0,Individual,,,2025-05-25 14:20:...
a3310b1fbec5b56d7...,carpenter,2,RENT,75000.0,ND,581xx,USA,B,B1,Not Verified,27306.0,Individual,,,2025-05-25 14:20:...
636aeaa8a5da655c4...,Production Director,2,RENT,110000.0,WA,985xx,USA,D,D3,Source Verified,93991.0,Individual,,,2025-05-25 14:20:...
255d282d2a7fbb09e...,Dual Enrollment C...,10,RENT,53000.0,AL,256xx,USA,F,F2,Verified,94751.0,Individual,,,2025-05-25 14:20:...
3cf187a11402e95d2...,PM Analyst,8,RENT,70000.0,CA,958xx,USA,B,B2,Not Verified,43126.0,Individual,,,2025-05-25 14:20:...
7ccfc05f375e6435f...,Sr. Funder,1,MORTGAGE,58000.0,TX,750xx,USA,B,B1,Source Verified,213595.0,Individual,,,2025-05-25 14:20:...
450ac448c977cf8d2...,,6,MORTGAGE,82000.0,IN,464xx,USA,B,B5,Verified,160133.0,Individual,,,2025-05-25 14:20:...
6364e71faec5f09d7...,human resources s...,2,OWN,50000.0,NY,105xx,USA,C,C4,Source Verified,40378.0,Individual,,,2025-05-25 14:20:...
b8cadbcce06475d0f...,Supervisor,10,RENT,43000.0,NY,104xx,USA,C,C3,Source Verified,41800.0,Individual,,,2025-05-25 14:20:...


In [117]:
customers_state_cleaned.select("address_state").distinct()

address_state
SC
AZ
LA
MN
NJ
DC
OR
""
VA
""


In [119]:
customers_state_cleaned.write \
.format("parquet") \
.mode("overwrite") \
.option("path", "/user/itv015970/lendingclubproject/cleaned/customers_parquet") \
.save()

In [120]:
customers_state_cleaned.write \
.option("header", True) \
.format("csv") \
.mode("overwrite") \
.option("path", "/user/itv015970/lendingclubproject/cleaned/customers_csv") \
.save()