In [1]:
from pyspark.sql import SparkSession
import getpass
# Resolve the current OS user; the warehouse directory below is placed under /user/<username>.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session with YARN as the master.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
!! hadoop fs -ls /public/trendytech/lendingclubproject/raw

['Found 4 items',
 'drwxr-xr-x   - itv005857 supergroup          0 2023-09-15 14:40 /public/trendytech/lendingclubproject/raw/customers_data_csv',
 'drwxr-xr-x   - itv005857 supergroup          0 2023-09-17 22:57 /public/trendytech/lendingclubproject/raw/loans_data_csv',
 'drwxr-xr-x   - itv005857 supergroup          0 2023-09-18 07:32 /public/trendytech/lendingclubproject/raw/loans_defaulters_csv',
 'drwxr-xr-x   - itv005857 supergroup          0 2023-09-18 07:31 /public/trendytech/lendingclubproject/raw/loans_repayments_csv']

In [3]:
# First pass over the raw customers CSV: treat the first line as a header and
# let Spark infer the column types so they can be inspected in the next cells.
customers_raw_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferschema", True)
    .load("/public/trendytech/lendingclubproject/raw/customers_data_csv")
)

In [4]:
customers_raw_df

member_id,emp_title,emp_length,home_ownership,annual_inc,addr_state,zip_code,country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,annual_inc_joint,verification_status_joint
b59d80da191f5b573...,,,RENT,50000.0,OR,973xx,USA,A,A5,Source Verified,8600.0,Individual,,
202d9f56ecb7c3bc9...,police officer,7 years,OWN,85000.0,TX,799xx,USA,A,A5,Source Verified,272384.0,Individual,,
e5a140c0922b554b9...,community living ...,6 years,RENT,48000.0,NY,146xx,USA,B,B2,Source Verified,85092.0,Individual,,
e12aefc548f750777...,Office,10+ years,OWN,33000.0,CT,067xx,USA,F,F1,Verified,7100.0,Individual,,
1b3a50d854fbbf97e...,Special Tooling I...,10+ years,MORTGAGE,81000.0,TX,791xx,USA,E,E5,Verified,190274.0,Individual,,
1c4329e5f17697127...,Mine ops tech 6,2 years,MORTGAGE,68000.0,AZ,855xx,USA,C,C3,Not Verified,182453.0,Individual,,
5026c86ad983175eb...,caregiver,4 years,RENT,76020.0,WA,993xx,USA,C,C2,Source Verified,15308.0,Individual,,
9847d8c1e9d0b2084...,,,OWN,65000.0,IL,624xx,USA,E,E3,Verified,128800.0,Individual,,
8340dbe1adea41fb4...,Vice President Re...,8 years,MORTGAGE,111000.0,CT,063xx,USA,A,A1,Not Verified,343507.0,Individual,,
d4de0de3ab7d79ad4...,FOREMAN,10+ years,MORTGAGE,67000.0,WA,992xx,USA,G,G2,Verified,211501.0,Individual,,


In [5]:
customers_raw_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: string (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- tot_hi_cred_lim: double (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: string (nullable = true)
 |-- verification_status_joint: string (nullable = true)



### 1. Create dataframe with proper datatypes

In [6]:
# Explicit DDL-style schema for the customers CSV, used instead of schema inference.
# The three numeric columns (annual_inc, tot_hi_cred_lim, annual_inc_joint) are
# declared as float; every other column is read as string.
customers_schema = '''member_id string, emp_title string, emp_length string, home_ownership string, 
annual_inc float, addr_state string, zip_code string, country string, grade string, sub_grade string, 
verification_status string, tot_hi_cred_lim float, application_type string, annual_inc_joint float, verification_status_joint string'''

In [7]:
# Re-read the raw customers CSV, this time enforcing the explicit schema
# in customers_schema rather than relying on inference.
customers_raw_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(customers_schema)
    .load("/public/trendytech/lendingclubproject/raw/customers_data_csv")
)

In [8]:
customers_raw_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: float (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- tot_hi_cred_lim: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)



### 2. Rename a few columns

#### withColumnRenamed("annual_inc","annual_income")
#### withColumnRenamed("addr_state","address_state")
#### withColumnRenamed("zip_code","address_zipcode")
#### withColumnRenamed("country","address_country")
#### withColumnRenamed("tot_hi_cred_lim","total_high_credit_limit")
#### withColumnRenamed("annual_inc_joint","join_annual_income")

In [9]:
# Replace the abbreviated source column names with more descriptive ones.
customers_df_renamed = (
    customers_raw_df
    .withColumnRenamed("annual_inc", "annual_income")
    .withColumnRenamed("addr_state", "address_state")
    .withColumnRenamed("zip_code", "address_zipcode")
    .withColumnRenamed("country", "address_country")
    .withColumnRenamed("tot_hi_cred_lim", "total_high_credit_limit")
    .withColumnRenamed("annual_inc_joint", "join_annual_income")
)

In [10]:
customers_df_renamed

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint
b59d80da191f5b573...,,,RENT,50000.0,OR,973xx,USA,A,A5,Source Verified,8600.0,Individual,,
202d9f56ecb7c3bc9...,police officer,7 years,OWN,85000.0,TX,799xx,USA,A,A5,Source Verified,272384.0,Individual,,
e5a140c0922b554b9...,community living ...,6 years,RENT,48000.0,NY,146xx,USA,B,B2,Source Verified,85092.0,Individual,,
e12aefc548f750777...,Office,10+ years,OWN,33000.0,CT,067xx,USA,F,F1,Verified,7100.0,Individual,,
1b3a50d854fbbf97e...,Special Tooling I...,10+ years,MORTGAGE,81000.0,TX,791xx,USA,E,E5,Verified,190274.0,Individual,,
1c4329e5f17697127...,Mine ops tech 6,2 years,MORTGAGE,68000.0,AZ,855xx,USA,C,C3,Not Verified,182453.0,Individual,,
5026c86ad983175eb...,caregiver,4 years,RENT,76020.0,WA,993xx,USA,C,C2,Source Verified,15308.0,Individual,,
9847d8c1e9d0b2084...,,,OWN,65000.0,IL,624xx,USA,E,E3,Verified,128800.0,Individual,,
8340dbe1adea41fb4...,Vice President Re...,8 years,MORTGAGE,111000.0,CT,063xx,USA,A,A1,Not Verified,343507.0,Individual,,
d4de0de3ab7d79ad4...,FOREMAN,10+ years,MORTGAGE,67000.0,WA,992xx,USA,G,G2,Verified,211501.0,Individual,,


### 3. Add a new column named ingest_date holding the ingestion timestamp (current time)

In [11]:
from pyspark.sql.functions import current_timestamp

In [12]:
# Append an ingest_date column populated from current_timestamp().
customers_df_ingested = customers_df_renamed.withColumn(
    "ingest_date",
    current_timestamp(),
)

In [13]:
customers_df_ingested

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
b59d80da191f5b573...,,,RENT,50000.0,OR,973xx,USA,A,A5,Source Verified,8600.0,Individual,,,2026-01-17 23:21:...
202d9f56ecb7c3bc9...,police officer,7 years,OWN,85000.0,TX,799xx,USA,A,A5,Source Verified,272384.0,Individual,,,2026-01-17 23:21:...
e5a140c0922b554b9...,community living ...,6 years,RENT,48000.0,NY,146xx,USA,B,B2,Source Verified,85092.0,Individual,,,2026-01-17 23:21:...
e12aefc548f750777...,Office,10+ years,OWN,33000.0,CT,067xx,USA,F,F1,Verified,7100.0,Individual,,,2026-01-17 23:21:...
1b3a50d854fbbf97e...,Special Tooling I...,10+ years,MORTGAGE,81000.0,TX,791xx,USA,E,E5,Verified,190274.0,Individual,,,2026-01-17 23:21:...
1c4329e5f17697127...,Mine ops tech 6,2 years,MORTGAGE,68000.0,AZ,855xx,USA,C,C3,Not Verified,182453.0,Individual,,,2026-01-17 23:21:...
5026c86ad983175eb...,caregiver,4 years,RENT,76020.0,WA,993xx,USA,C,C2,Source Verified,15308.0,Individual,,,2026-01-17 23:21:...
9847d8c1e9d0b2084...,,,OWN,65000.0,IL,624xx,USA,E,E3,Verified,128800.0,Individual,,,2026-01-17 23:21:...
8340dbe1adea41fb4...,Vice President Re...,8 years,MORTGAGE,111000.0,CT,063xx,USA,A,A1,Not Verified,343507.0,Individual,,,2026-01-17 23:21:...
d4de0de3ab7d79ad4...,FOREMAN,10+ years,MORTGAGE,67000.0,WA,992xx,USA,G,G2,Verified,211501.0,Individual,,,2026-01-17 23:21:...


### 4. Remove all the duplicate rows

In [14]:
customers_df_ingested.count()

2260701

In [15]:
# Drop rows that are exact duplicates of another row across all columns.
customers_distinct = customers_df_ingested.distinct()

In [16]:
customers_distinct.count()

2260638

## customers_distinct no longer has any duplicate rows, so let's create a temp view on top of it

In [17]:
# Expose the de-duplicated DataFrame to Spark SQL as the temp view "customers".
customers_distinct.createOrReplaceTempView("customers")

In [18]:
spark.sql("select count(*) from customers")

count(1)
2260638


In [19]:
spark.sql("select * from customers")

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
090c1ce20b0f4d911...,ELEMENTARY P.E. T...,10+ years,MORTGAGE,70000.0,IN,462xx,USA,B,B2,Not Verified,220441.0,Individual,,,2026-01-17 23:22:...
92bdafec308b52b72...,Operator II,10+ years,MORTGAGE,32000.0,CA,958xx,USA,D,D1,Not Verified,230405.0,Individual,,,2026-01-17 23:22:...
1dec47b624368acfb...,Social services r...,1 year,RENT,50800.0,DC,200xx,USA,D,D1,Source Verified,13500.0,Individual,,,2026-01-17 23:22:...
be3a81921721e09d9...,Certified Nursing...,10+ years,OWN,26000.0,TN,378xx,USA,E,E1,Verified,33762.0,Individual,,,2026-01-17 23:22:...
5c8694ea573a0d037...,Housing Finance M...,2 years,OWN,75000.0,LA,708xx,USA,D,D4,Source Verified,75818.0,Individual,,,2026-01-17 23:22:...
a68b8181187999a83...,Locomotive Engineer,6 years,MORTGAGE,95000.0,NE,693xx,USA,A,A4,Not Verified,213930.0,Individual,,,2026-01-17 23:22:...
d4a3da6904fd0a206...,truck driver,10+ years,RENT,70000.0,NY,104xx,USA,C,C1,Not Verified,24799.0,Individual,,,2026-01-17 23:22:...
6eb614a030a286f5e...,Purchasing Manager,< 1 year,MORTGAGE,48000.0,KS,671xx,USA,C,C1,Not Verified,79959.0,Individual,,,2026-01-17 23:22:...
5e21d9d60f155bafd...,Coordinator II,10+ years,RENT,25000.0,LA,700xx,USA,E,E3,Verified,20200.0,Individual,,,2026-01-17 23:22:...
eafa5505545a58b34...,"Director, Project...",10+ years,MORTGAGE,148000.0,MI,481xx,USA,D,D3,Source Verified,8641.0,Individual,,,2026-01-17 23:22:...


## 5. Remove the rows where annual_income is null

In [20]:
spark.sql("select count(*) from customers where annual_income is null")

count(1)
5


In [21]:
spark.sql("select count(*) from customers where annual_income is not null")

count(1)
2260633


In [22]:
# Keep only the rows of the "customers" view that have an annual_income value.
customers_income_filtered = spark.sql(
    "select * from customers where annual_income is not null"
)

In [23]:
# Re-point the "customers" temp view at the income-filtered DataFrame, so SQL
# cells from here on no longer see rows with a null annual_income.
customers_income_filtered.createOrReplaceTempView("customers")

# So the customers view now excludes all rows where annual_income is null

## 6. Convert emp_length into integer

In [24]:
spark.sql("select distinct(emp_length) from customers").show()

+----------+
|emp_length|
+----------+
|   5 years|
|   9 years|
|      null|
|    1 year|
|   2 years|
|   7 years|
|   8 years|
|   4 years|
|   6 years|
|   3 years|
| 10+ years|
|  < 1 year|
+----------+



In [25]:
from pyspark.sql.functions import regexp_replace, col

In [26]:
# Strip every non-digit character from emp_length so that only the number of
# years remains (e.g. "10+ years" -> "10", "< 1 year" -> "1").
# The pattern is written as a raw string so that "\D" reaches the regex engine
# unchanged; in a normal string literal "\D" is an invalid Python escape
# sequence, which newer Python versions warn about.
customers_emplength_cleaned = customers_income_filtered.withColumn(
    "emp_length",
    regexp_replace(col("emp_length"), r"(\D)", ""),
)


## withColumn creates the column if it does not exist, or replaces it if it already exists.
# Here we apply withColumn to emp_length together with a regular expression:
# regexp_replace replaces every non-digit character (pattern "\D") in emp_length with the empty string "".

In [27]:
customers_emplength_cleaned

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
01c5ee91048ab393d...,General manager,2.0,RENT,50000.0,GA,302xx,USA,C,C2,Source Verified,20886.0,Individual,,,2026-01-17 23:22:...
417d708fe1f44149b...,ETN2,5.0,RENT,64000.0,VA,234xx,USA,A,A5,Not Verified,52700.0,Individual,,,2026-01-17 23:22:...
ccddb5c18900d5ee6...,Senior Merchandis...,7.0,MORTGAGE,65000.0,CA,917xx,USA,A,A1,Not Verified,334400.0,Individual,,,2026-01-17 23:22:...
dbf1f0be9db10137d...,Workforce Plannin...,1.0,RENT,73000.0,CA,944xx,USA,A,A3,Source Verified,159883.0,Individual,,,2026-01-17 23:22:...
97332f92dc02e6cf7...,Aircraft technician,8.0,OWN,89000.0,CA,902xx,USA,C,C3,Not Verified,71898.0,Individual,,,2026-01-17 23:22:...
4818021fe36f7bf22...,Deputy Assessor,10.0,MORTGAGE,58597.0,IL,605xx,USA,A,A3,Not Verified,76534.0,Individual,,,2026-01-17 23:22:...
5bc696d29ebaca2a6...,Health Informatio...,10.0,MORTGAGE,50000.0,NE,680xx,USA,C,C4,Not Verified,332017.0,Individual,,,2026-01-17 23:22:...
006073596a788a4b0...,Financial assistant,10.0,MORTGAGE,60000.0,NY,119xx,USA,C,C5,Not Verified,215882.0,Joint App,120000.0,Not Verified,2026-01-17 23:22:...
baf6e218833f4aca3...,Service Manager,10.0,MORTGAGE,75000.0,CA,925xx,USA,B,B4,Source Verified,175916.0,Individual,,,2026-01-17 23:22:...
fbaf59275cb201ef9...,Graphic Arts,2.0,RENT,52000.0,GA,307xx,USA,A,A5,Source Verified,36950.0,Individual,,,2026-01-17 23:22:...


In [28]:
customers_emplength_cleaned.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- join_annual_income: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [29]:
# Convert the digit-only emp_length strings into integers.
customers_emplength_casted = customers_emplength_cleaned.withColumn(
    "emp_length",
    col("emp_length").cast("int"),
)

In [30]:
customers_emplength_casted

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
5f12d7d3cc72d3063...,Construction Manager,1.0,MORTGAGE,75000.0,FL,329xx,USA,C,C1,Not Verified,165084.0,Individual,,,2026-01-17 23:22:...
bb57345e6331f98fc...,drilling supervisor,9.0,MORTGAGE,60000.0,NC,274xx,USA,D,D3,Verified,248383.0,Individual,,,2026-01-17 23:22:...
2e893fd171ce59df3...,Senior Underwriter,3.0,MORTGAGE,115000.0,WA,981xx,USA,C,C1,Verified,1346864.0,Individual,,,2026-01-17 23:22:...
4567a8873973b661f...,Executive Director,5.0,MORTGAGE,93500.0,DC,200xx,USA,B,B4,Verified,496131.0,Individual,,,2026-01-17 23:22:...
fd9f62788bf90f881...,Executive Assistant,3.0,RENT,84000.0,IL,600xx,USA,D,D2,Source Verified,131656.0,Individual,,,2026-01-17 23:22:...
47fbab503f80ed52a...,Brand Ambassador,1.0,RENT,38200.0,NC,284xx,USA,B,B4,Not Verified,3300.0,Individual,,,2026-01-17 23:22:...
30d38c27d2279e997...,Computer Analyst,10.0,RENT,56000.0,NY,112xx,USA,B,B3,Not Verified,61600.0,Individual,,,2026-01-17 23:22:...
23188cde088f760ee...,Accounting clerk,10.0,RENT,372000.0,AR,720xx,USA,C,C2,Source Verified,29163.0,Individual,,,2026-01-17 23:22:...
7bd915f4b4608f908...,,,RENT,50000.0,MO,638xx,USA,A,A4,Not Verified,60379.0,Individual,,,2026-01-17 23:22:...
ac1599361dee4c717...,RN,9.0,MORTGAGE,66000.0,IN,472xx,USA,B,B4,Not Verified,178988.0,Individual,,,2026-01-17 23:22:...


In [31]:
customers_emplength_casted.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: integer (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- join_annual_income: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



## 7. Replace all the nulls in emp_length column with average of this column

In [32]:
customers_emplength_casted.filter("emp_length is null").count()

146903

In [33]:
customers_emplength_casted.createOrReplaceTempView("customers")

In [34]:
# Average emp_length over the "customers" view, rounded down with floor();
# collect() brings the single-row result back to the driver as a list of Rows.
avg_emp_length = spark.sql(
    """select floor(avg(emp_length)) as avg_emp_length from customers"""
).collect()

In [35]:
print(avg_emp_length)

[Row(avg_emp_length=6)]


In [36]:
# Unwrap the scalar: first (only) Row of the collected result, first column.
avg_emp_duration = avg_emp_length[0][0]

In [37]:
avg_emp_duration

6

In [38]:
# Fill null emp_length values with the rounded-down average; the subset
# argument restricts the fill to the emp_length column only.
customers_emplength_replaced = customers_emplength_casted.na.fill(
    avg_emp_duration,
    subset=['emp_length'],
)

In [39]:
customers_emplength_replaced.filter("emp_length is null").count()

0

In [40]:
customers_emplength_replaced

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
37e6df85348941598...,Supervisor,1,RENT,30000.0,MI,496xx,USA,B,B1,Verified,22738.0,Individual,,,2026-01-17 23:23:...
12c49d4235702c6b3...,School Psychologist,7,RENT,68000.0,OR,972xx,USA,B,B4,Verified,168765.0,Individual,,,2026-01-17 23:23:...
59f1439f00d331af0...,Senior Associate,1,RENT,95000.0,WA,980xx,USA,A,A5,Verified,166228.0,Individual,,,2026-01-17 23:23:...
b8d48a78d332022b2...,Nurse Practitioner,10,MORTGAGE,70000.0,OH,441xx,USA,B,B2,Verified,513322.0,Individual,,,2026-01-17 23:23:...
0b81f14da6883aada...,Admin assistant,10,MORTGAGE,30000.0,IN,474xx,USA,B,B1,Source Verified,31774.0,Individual,,,2026-01-17 23:23:...
97ecda0bf4789fe6a...,Managing Consultant,8,MORTGAGE,152000.0,VA,222xx,USA,B,B1,Source Verified,57031.0,Individual,,,2026-01-17 23:23:...
097ad8d42079f5b9e...,Driver,2,OWN,75000.0,TX,755xx,USA,E,E3,Verified,37949.0,Individual,,,2026-01-17 23:23:...
ceb14ae1478c36f49...,Shop Mgr,10,MORTGAGE,55000.0,TX,775xx,USA,E,E2,Source Verified,123175.0,Individual,,,2026-01-17 23:23:...
e2eb676eb74700ebc...,Sales,2,RENT,126000.0,CA,949xx,USA,B,B2,Source Verified,67591.0,Individual,,,2026-01-17 23:23:...
67c7386557e6b253c...,Educational Diagn...,10,MORTGAGE,85000.0,TX,761xx,USA,B,B4,Source Verified,233979.0,Individual,,,2026-01-17 23:23:...


## Clean address state (it should be two characters only); replace all others with NA

In [41]:
# Re-point the "customers" temp view at the DataFrame whose emp_length nulls
# have been filled, so the address_state checks below run against it.
customers_emplength_replaced.createOrReplaceTempView("customers")

In [42]:
spark.sql("select distinct(address_state) from customers")

address_state
Helping Kenya's D...
175 (total projec...
223xx
SC
AZ
"so Plan """"C"""" is ..."
I am 56 yrs. old ...
financially I mad...
but no one will l...
LA


In [43]:
spark.sql("select count(address_state) from customers where length(address_state)>2")

count(address_state)
254


In [44]:
from pyspark.sql.functions import when, col, length

In [45]:
# Any address_state longer than two characters is treated as invalid and
# replaced with the literal "NA"; all other values are kept as they are.
# NOTE(review): only length > 2 is caught - null values and values shorter
# than two characters pass through unchanged; confirm that is intended.
state_is_too_long = length(col("address_state")) > 2
customers_state_cleaned = customers_emplength_replaced.withColumn(
    "address_state",
    when(state_is_too_long, "NA").otherwise(col("address_state")),
)

In [46]:
customers_state_cleaned

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
8e50656ff1d4e88a7...,Northeast OMS,10,MORTGAGE,40000.0,SC,290xx,USA,D,D5,Verified,,Individual,,,2026-01-17 23:23:...
936d2c862daddc994...,Philos Technologi...,5,RENT,49000.0,IL,607xx,USA,C,C2,Not Verified,,Individual,,,2026-01-17 23:23:...
b504a4c03b997b69e...,Truitt Bros Inc,10,RENT,70000.0,OR,973xx,USA,B,B3,Verified,,Individual,,,2026-01-17 23:23:...
99e4b0214dc6e572c...,GM Financial,1,OWN,60000.0,TX,775xx,USA,A,A3,Verified,,Individual,,,2026-01-17 23:23:...
87071e4f1d2aa6afe...,TD Auto Finance,10,MORTGAGE,77000.0,MI,480xx,USA,C,C1,Not Verified,,Individual,,,2026-01-17 23:23:...
a30aa4c71a4dffca3...,University at Alb...,10,MORTGAGE,145000.0,NY,121xx,USA,B,B1,Source Verified,,Individual,,,2026-01-17 23:23:...
29bbbe3ee15688349...,,6,MORTGAGE,44000.0,IL,615xx,USA,A,A1,Not Verified,,Individual,,,2026-01-17 23:23:...
859cc815b2f5da0ca...,King Food Service,8,RENT,60000.0,HI,968xx,USA,E,E4,Verified,,Individual,,,2026-01-17 23:23:...
24986bd1a608203fb...,Minerva Biotechno...,3,RENT,15000.0,NY,105xx,USA,A,A2,Not Verified,,Individual,,,2026-01-17 23:23:...
c5d11cdba957c7ab7...,town and country ...,10,MORTGAGE,95000.0,NC,282xx,USA,C,C5,Verified,,Individual,,,2026-01-17 23:23:...


In [47]:
customers_state_cleaned.select("address_state").distinct()

address_state
AZ
SC
LA
MN
NJ
DC
OR
""
VA
""


## Write the cleaned customers data to a cleaned folder in hdfs

In [49]:
# Persist the cleaned customers data as Parquet, overwriting any previous run.
# The target directory is derived from `username` (resolved via getpass when
# the Spark session was created) instead of a hard-coded account name, so the
# notebook writes under /user/<current user>/ for whoever runs it.
(
    customers_state_cleaned.write
    .format("parquet")
    .mode("overwrite")
    .option("path", f"/user/{username}/lendingclubproject/cleaned/customers_parquet")
    .save()
)


In [50]:
# Persist the cleaned customers data as CSV, overwriting any previous run.
# The target directory is derived from `username` (resolved via getpass when
# the Spark session was created) instead of a hard-coded account name, so the
# notebook writes under /user/<current user>/ for whoever runs it.
(
    customers_state_cleaned.write
    .format("csv")
    .mode("overwrite")
    .option("path", f"/user/{username}/lendingclubproject/cleaned/customers_csv")
    .save()
)
