In [2]:
from pyspark.sql import SparkSession
import getpass
# Resolve the OS-level user running the notebook; it selects the per-user warehouse directory below.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session that runs on YARN.
spark = (
    SparkSession.builder
    .config("spark.shuffle.useOldFetchProtocol", "true")
    # The previous f-string had no placeholder and hard-coded a single user's
    # home directory, which left `username` unused; interpolate it instead.
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [3]:
# Explicit schema for the raw customers CSV, as a comma-separated "name type" string.
# One entry per column keeps the list easy to scan. The "emp_lenght" spelling is
# kept as-is because later cells refer to the column by exactly this name.
customer_schema = ", ".join([
    "member_id string",
    "emp_title string",
    "emp_lenght string",
    "home_ownership string",
    "annual_inc float",
    "addr_state string",
    "zip_code string",
    "country string",
    "grade string",
    "sub_grade string",
    "verification_status string",
    "tot_hi_cred_lim float",
    "application_type string",
    "annual_inc_joint float",
    "verification_status_joint string",
])

In [4]:
# Read the raw customers CSV using the explicit schema defined earlier
# (no schema inference); the first line of each file is treated as a header.
customer_raw_df = spark.read.csv(
    "/public/trendytech/lendingclubproject/raw/customers_data_csv",
    schema=customer_schema,
    header=True,
)

In [5]:
# Old column name -> new column name, applied in this order.
column_renames = {
    "annual_inc": "annual_income",
    "addr_state": "address_state",
    "zip_code": "address_zipcode",
    "country": "address_country",
    "tot_hi_cred_lim": "total_high_credit_limit",
    "annual_inc_joint": "join_annual_income",
}

# Start from the raw frame each time so re-running this cell gives the same result.
customers_df_renamed = customer_raw_df
for old_name, new_name in column_renames.items():
    customers_df_renamed = customers_df_renamed.withColumnRenamed(old_name, new_name)

In [13]:
# Preview the DataFrame after the column renames.
customers_df_renamed

member_id,emp_title,emp_lenght,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint
b59d80da191f5b573...,,,RENT,50000.0,OR,973xx,USA,A,A5,Source Verified,8600.0,Individual,,
202d9f56ecb7c3bc9...,police officer,7 years,OWN,85000.0,TX,799xx,USA,A,A5,Source Verified,272384.0,Individual,,
e5a140c0922b554b9...,community living ...,6 years,RENT,48000.0,NY,146xx,USA,B,B2,Source Verified,85092.0,Individual,,
e12aefc548f750777...,Office,10+ years,OWN,33000.0,CT,067xx,USA,F,F1,Verified,7100.0,Individual,,
1b3a50d854fbbf97e...,Special Tooling I...,10+ years,MORTGAGE,81000.0,TX,791xx,USA,E,E5,Verified,190274.0,Individual,,
1c4329e5f17697127...,Mine ops tech 6,2 years,MORTGAGE,68000.0,AZ,855xx,USA,C,C3,Not Verified,182453.0,Individual,,
5026c86ad983175eb...,caregiver,4 years,RENT,76020.0,WA,993xx,USA,C,C2,Source Verified,15308.0,Individual,,
9847d8c1e9d0b2084...,,,OWN,65000.0,IL,624xx,USA,E,E3,Verified,128800.0,Individual,,
8340dbe1adea41fb4...,Vice President Re...,8 years,MORTGAGE,111000.0,CT,063xx,USA,A,A1,Not Verified,343507.0,Individual,,
d4de0de3ab7d79ad4...,FOREMAN,10+ years,MORTGAGE,67000.0,WA,992xx,USA,G,G2,Verified,211501.0,Individual,,


In [6]:
from pyspark.sql.functions import current_timestamp

In [7]:
# Append an "ingest_date" column holding Spark's current_timestamp().
ingest_ts = current_timestamp()
customers_df_ingest_date = customers_df_renamed.withColumn("ingest_date", ingest_ts)

In [8]:
# Expose the DataFrame (now carrying ingest_date) to Spark SQL under the view name "customers".
customers_df_ingest_date.createOrReplaceTempView("customers")

In [17]:
# Preview every column of the "customers" view.
spark.sql("select * from customers")

member_id,emp_title,emp_lenght,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
b59d80da191f5b573...,,,RENT,50000.0,OR,973xx,USA,A,A5,Source Verified,8600.0,Individual,,,2025-11-10 23:39:...
202d9f56ecb7c3bc9...,police officer,7 years,OWN,85000.0,TX,799xx,USA,A,A5,Source Verified,272384.0,Individual,,,2025-11-10 23:39:...
e5a140c0922b554b9...,community living ...,6 years,RENT,48000.0,NY,146xx,USA,B,B2,Source Verified,85092.0,Individual,,,2025-11-10 23:39:...
e12aefc548f750777...,Office,10+ years,OWN,33000.0,CT,067xx,USA,F,F1,Verified,7100.0,Individual,,,2025-11-10 23:39:...
1b3a50d854fbbf97e...,Special Tooling I...,10+ years,MORTGAGE,81000.0,TX,791xx,USA,E,E5,Verified,190274.0,Individual,,,2025-11-10 23:39:...
1c4329e5f17697127...,Mine ops tech 6,2 years,MORTGAGE,68000.0,AZ,855xx,USA,C,C3,Not Verified,182453.0,Individual,,,2025-11-10 23:39:...
5026c86ad983175eb...,caregiver,4 years,RENT,76020.0,WA,993xx,USA,C,C2,Source Verified,15308.0,Individual,,,2025-11-10 23:39:...
9847d8c1e9d0b2084...,,,OWN,65000.0,IL,624xx,USA,E,E3,Verified,128800.0,Individual,,,2025-11-10 23:39:...
8340dbe1adea41fb4...,Vice President Re...,8 years,MORTGAGE,111000.0,CT,063xx,USA,A,A1,Not Verified,343507.0,Individual,,,2025-11-10 23:39:...
d4de0de3ab7d79ad4...,FOREMAN,10+ years,MORTGAGE,67000.0,WA,992xx,USA,G,G2,Verified,211501.0,Individual,,,2025-11-10 23:39:...


In [24]:
# Count the rows in the "customers" view whose annual_income is null.
spark.sql("select count (*) from customers where annual_income is null")

count(1)
2260701


In [9]:
# Keep only the rows of the "customers" view that have an annual_income value.
income_present_query = "select *from customers where annual_income is not null"
customers_filtered = spark.sql(income_present_query)

In [10]:
# Point the "customers" view at the filtered DataFrame, replacing the earlier registration of the same name.
customers_filtered.createOrReplaceTempView("customers")

In [11]:
from pyspark.sql.functions import regexp_replace, col

In [12]:
# Strip every non-digit character from emp_lenght so only the digits remain.
# The pattern is written as a raw string: "\D" inside a normal string literal is
# an invalid escape sequence that Python reports with a warning. The regex that
# Spark receives is unchanged.
customers_emp_cleaned = customers_filtered.withColumn(
    "emp_lenght",
    regexp_replace(col("emp_lenght"), r"(\D)", ""),
)

In [22]:
# Preview emp_lenght after the non-digit characters were stripped.
customers_emp_cleaned

member_id,emp_title,emp_lenght,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
b59d80da191f5b573...,,,RENT,50000.0,OR,973xx,USA,A,A5,Source Verified,8600.0,Individual,,,2025-11-10 23:41:...
202d9f56ecb7c3bc9...,police officer,7.0,OWN,85000.0,TX,799xx,USA,A,A5,Source Verified,272384.0,Individual,,,2025-11-10 23:41:...
e5a140c0922b554b9...,community living ...,6.0,RENT,48000.0,NY,146xx,USA,B,B2,Source Verified,85092.0,Individual,,,2025-11-10 23:41:...
e12aefc548f750777...,Office,10.0,OWN,33000.0,CT,067xx,USA,F,F1,Verified,7100.0,Individual,,,2025-11-10 23:41:...
1b3a50d854fbbf97e...,Special Tooling I...,10.0,MORTGAGE,81000.0,TX,791xx,USA,E,E5,Verified,190274.0,Individual,,,2025-11-10 23:41:...
1c4329e5f17697127...,Mine ops tech 6,2.0,MORTGAGE,68000.0,AZ,855xx,USA,C,C3,Not Verified,182453.0,Individual,,,2025-11-10 23:41:...
5026c86ad983175eb...,caregiver,4.0,RENT,76020.0,WA,993xx,USA,C,C2,Source Verified,15308.0,Individual,,,2025-11-10 23:41:...
9847d8c1e9d0b2084...,,,OWN,65000.0,IL,624xx,USA,E,E3,Verified,128800.0,Individual,,,2025-11-10 23:41:...
8340dbe1adea41fb4...,Vice President Re...,8.0,MORTGAGE,111000.0,CT,063xx,USA,A,A1,Not Verified,343507.0,Individual,,,2025-11-10 23:41:...
d4de0de3ab7d79ad4...,FOREMAN,10.0,MORTGAGE,67000.0,WA,992xx,USA,G,G2,Verified,211501.0,Individual,,,2025-11-10 23:41:...


In [13]:
# Replace the string emp_lenght column with its integer cast.
emp_length_as_int = customers_emp_cleaned["emp_lenght"].cast("int")
customers_emplength = customers_emp_cleaned.withColumn("emp_lenght", emp_length_as_int)

In [24]:
# Preview the DataFrame after emp_lenght was cast to int.
customers_emplength

member_id,emp_title,emp_lenght,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
b59d80da191f5b573...,,,RENT,50000.0,OR,973xx,USA,A,A5,Source Verified,8600.0,Individual,,,2025-11-10 23:44:...
202d9f56ecb7c3bc9...,police officer,7.0,OWN,85000.0,TX,799xx,USA,A,A5,Source Verified,272384.0,Individual,,,2025-11-10 23:44:...
e5a140c0922b554b9...,community living ...,6.0,RENT,48000.0,NY,146xx,USA,B,B2,Source Verified,85092.0,Individual,,,2025-11-10 23:44:...
e12aefc548f750777...,Office,10.0,OWN,33000.0,CT,067xx,USA,F,F1,Verified,7100.0,Individual,,,2025-11-10 23:44:...
1b3a50d854fbbf97e...,Special Tooling I...,10.0,MORTGAGE,81000.0,TX,791xx,USA,E,E5,Verified,190274.0,Individual,,,2025-11-10 23:44:...
1c4329e5f17697127...,Mine ops tech 6,2.0,MORTGAGE,68000.0,AZ,855xx,USA,C,C3,Not Verified,182453.0,Individual,,,2025-11-10 23:44:...
5026c86ad983175eb...,caregiver,4.0,RENT,76020.0,WA,993xx,USA,C,C2,Source Verified,15308.0,Individual,,,2025-11-10 23:44:...
9847d8c1e9d0b2084...,,,OWN,65000.0,IL,624xx,USA,E,E3,Verified,128800.0,Individual,,,2025-11-10 23:44:...
8340dbe1adea41fb4...,Vice President Re...,8.0,MORTGAGE,111000.0,CT,063xx,USA,A,A1,Not Verified,343507.0,Individual,,,2025-11-10 23:44:...
d4de0de3ab7d79ad4...,FOREMAN,10.0,MORTGAGE,67000.0,WA,992xx,USA,G,G2,Verified,211501.0,Individual,,,2025-11-10 23:44:...


In [14]:
# How many rows still have no emp_lenght value after the cast?
customers_emplength.where("emp_lenght is null").count()

146907

In [15]:
# Re-register the "customers" view so SQL queries see the integer-typed emp_lenght column.
customers_emplength.createOrReplaceTempView("customers")

In [16]:
# Compute the floored average of emp_lenght over the "customers" view and
# bring the result rows back to the driver.
avg_emplength_df = spark.sql(
    "select floor(avg(emp_lenght)) as avg_emp_length from customers"
)
avg_emplength = avg_emplength_df.collect()

In [17]:
# Pull the single aggregated value out of the collected result:
# first row, then its first (only selected) column.
first_row = avg_emplength[0]
avg_emp_duration = first_row[0]

In [18]:
# Show the value that is later used to fill missing emp_lenght entries.
print(avg_emp_duration)

6


In [19]:
# Fill missing emp_lenght values with the floored average computed above;
# only that one column is touched.
customer_emplength_replaced = customers_emplength.fillna(
    avg_emp_duration,
    subset=["emp_lenght"],
)

In [34]:
# Preview the DataFrame after missing emp_lenght values were filled.
customer_emplength_replaced

member_id,emp_title,emp_lenght,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
b59d80da191f5b573...,,6,RENT,50000.0,OR,973xx,USA,A,A5,Source Verified,8600.0,Individual,,,2025-11-10 23:52:...
202d9f56ecb7c3bc9...,police officer,7,OWN,85000.0,TX,799xx,USA,A,A5,Source Verified,272384.0,Individual,,,2025-11-10 23:52:...
e5a140c0922b554b9...,community living ...,6,RENT,48000.0,NY,146xx,USA,B,B2,Source Verified,85092.0,Individual,,,2025-11-10 23:52:...
e12aefc548f750777...,Office,10,OWN,33000.0,CT,067xx,USA,F,F1,Verified,7100.0,Individual,,,2025-11-10 23:52:...
1b3a50d854fbbf97e...,Special Tooling I...,10,MORTGAGE,81000.0,TX,791xx,USA,E,E5,Verified,190274.0,Individual,,,2025-11-10 23:52:...
1c4329e5f17697127...,Mine ops tech 6,2,MORTGAGE,68000.0,AZ,855xx,USA,C,C3,Not Verified,182453.0,Individual,,,2025-11-10 23:52:...
5026c86ad983175eb...,caregiver,4,RENT,76020.0,WA,993xx,USA,C,C2,Source Verified,15308.0,Individual,,,2025-11-10 23:52:...
9847d8c1e9d0b2084...,,6,OWN,65000.0,IL,624xx,USA,E,E3,Verified,128800.0,Individual,,,2025-11-10 23:52:...
8340dbe1adea41fb4...,Vice President Re...,8,MORTGAGE,111000.0,CT,063xx,USA,A,A1,Not Verified,343507.0,Individual,,,2025-11-10 23:52:...
d4de0de3ab7d79ad4...,FOREMAN,10,MORTGAGE,67000.0,WA,992xx,USA,G,G2,Verified,211501.0,Individual,,,2025-11-10 23:52:...


In [20]:
# Re-register the "customers" view on the DataFrame whose emp_lenght nulls have been filled.
customer_emplength_replaced.createOrReplaceTempView("customers")

In [21]:
# List the distinct address_state values in the "customers" view to inspect their quality.
spark.sql("select distinct(address_state) from customers")

address_state
Helping Kenya's D...
223xx
175 (total projec...
AZ
SC
"so Plan """"C"""" is ..."
I am 56 yrs. old ...
financially I mad...
but no one will l...
LA


In [22]:
# Count the rows whose address_state is longer than two characters.
spark.sql("select count(address_state) from customers where length(address_state)>2")

count(address_state)
254


In [23]:
from pyspark.sql.functions import when, col, length

In [25]:
# Replace any address_state longer than two characters with the placeholder "NA";
# every other value is carried over unchanged.
# NOTE(review): null address_state values presumably fall into the "unchanged" branch - confirm.
state_value = col("address_state")
is_too_long = length(state_value) > 2
customers_state_cleaned = customer_emplength_replaced.withColumn(
    "address_state",
    when(is_too_long, "NA").otherwise(state_value),
)

In [26]:
# Preview the DataFrame after the address_state cleanup.
customers_state_cleaned

member_id,emp_title,emp_lenght,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
b59d80da191f5b573...,,6,RENT,50000.0,OR,973xx,USA,A,A5,Source Verified,8600.0,Individual,,,2025-11-11 00:05:...
202d9f56ecb7c3bc9...,police officer,7,OWN,85000.0,TX,799xx,USA,A,A5,Source Verified,272384.0,Individual,,,2025-11-11 00:05:...
e5a140c0922b554b9...,community living ...,6,RENT,48000.0,NY,146xx,USA,B,B2,Source Verified,85092.0,Individual,,,2025-11-11 00:05:...
e12aefc548f750777...,Office,10,OWN,33000.0,CT,067xx,USA,F,F1,Verified,7100.0,Individual,,,2025-11-11 00:05:...
1b3a50d854fbbf97e...,Special Tooling I...,10,MORTGAGE,81000.0,TX,791xx,USA,E,E5,Verified,190274.0,Individual,,,2025-11-11 00:05:...
1c4329e5f17697127...,Mine ops tech 6,2,MORTGAGE,68000.0,AZ,855xx,USA,C,C3,Not Verified,182453.0,Individual,,,2025-11-11 00:05:...
5026c86ad983175eb...,caregiver,4,RENT,76020.0,WA,993xx,USA,C,C2,Source Verified,15308.0,Individual,,,2025-11-11 00:05:...
9847d8c1e9d0b2084...,,6,OWN,65000.0,IL,624xx,USA,E,E3,Verified,128800.0,Individual,,,2025-11-11 00:05:...
8340dbe1adea41fb4...,Vice President Re...,8,MORTGAGE,111000.0,CT,063xx,USA,A,A1,Not Verified,343507.0,Individual,,,2025-11-11 00:05:...
d4de0de3ab7d79ad4...,FOREMAN,10,MORTGAGE,67000.0,WA,992xx,USA,G,G2,Verified,211501.0,Individual,,,2025-11-11 00:05:...


In [27]:
# List the distinct address_state values that remain after the cleanup.
customers_state_cleaned.select("address_state").distinct()

address_state
AZ
SC
LA
MN
NJ
DC
OR
""
VA
""


In [29]:
# Persist the cleaned customers data as Parquet, replacing any previous output at this path.
customers_state_cleaned.write.mode("overwrite").parquet(
    "/user/itv021558/lendingclubproject/cleaned/customers_parquet"
)

In [30]:
# Persist the same cleaned data as CSV with a header row, replacing any previous output at this path.
customers_state_cleaned.write.mode("overwrite").csv(
    "/user/itv021558/lendingclubproject/cleaned/customers_csv",
    header=True,
)