# Cleansing customers Data

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sha2, concat_ws
from pyspark.sql.functions import *
spark  = SparkSession.builder\
                    .appName("lendingClubProject")\
                    .config("spark.sql.warehouse.dir","/user/anil/warehouse")\
                    .enableHiveSupport()\
                    .master('yarn')\
                    .getOrCreate()

# proper Type casting 

In [3]:
customersSchema = 'member_id string, emp_title string, emp_length string, home_ownership string, annual_inc float, addr_state string, zip_code string, country string, grade string, sub_grade string, verification_status string, tot_hi_cred_lim float, application_type string, annual_inc_joint float, verification_status_joint string'
customers = spark.read.format('csv')\
                        .option('header','true')\
                        .schema(customersSchema)\
                        .load('/user/anil/lendingClubProject/customers')
customers.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: float (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- tot_hi_cred_lim: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)



In [17]:
customers

member_id,emp_title,emp_length,home_ownership,annual_inc,addr_state,zip_code,country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,annual_inc_joint,verification_status_joint
9cb79aa7323e81be1...,Supervisor,10+ years,MORTGAGE,125000.0,KY,424xx,USA,E,E3,Verified,594540.0,Individual,,
0dd2bbc517e3c8f9e...,Security,< 1 year,RENT,54000.0,WI,532xx,USA,A,A4,Not Verified,57729.0,Individual,,
458458599d3df3bfc...,Manager,5 years,MORTGAGE,28000.0,AR,721xx,USA,C,C4,Verified,82780.0,Individual,,
05ea141ec28b5c7f7...,Teller,3 years,RENT,26000.0,WI,532xx,USA,A,A5,Not Verified,41114.0,Individual,,
aac68850fdac09fd0...,Quality Control A...,3 years,RENT,67000.0,TX,760xx,USA,F,F2,Verified,43449.0,Individual,,
3a423e4589e89f429...,Technician,2 years,RENT,65000.0,CA,900xx,USA,C,C3,Verified,127916.0,Joint App,118000.0,Verified
f1efcf7dfbfef21be...,Host,< 1 year,MORTGAGE,325000.0,NY,111xx,USA,B,B5,Source Verified,41000.0,Individual,,
c89986155a070db2e...,Guest Services Su...,2 years,MORTGAGE,60000.0,AZ,850xx,USA,C,C2,Source Verified,280890.0,Individual,,
118dc629b6e134419...,general manager,10+ years,MORTGAGE,74000.0,GA,310xx,USA,A,A3,Source Verified,306897.0,Individual,,
a86fa4b7493708333...,Sheet Metal Mechanic,9 years,MORTGAGE,54496.0,GA,307xx,USA,B,B1,Verified,153345.0,Joint App,119496.0,Verified


# Renaming few columns

In [4]:
customersRenamed = customers.withColumnRenamed("annual_inc", "annual_income")\
.withColumnRenamed("addr_state", "address_state") \
.withColumnRenamed("zip_code", "address_zipcode") \
.withColumnRenamed("country", "address_country") \
.withColumnRenamed("tot_hi_cred_lim", "total_high_credit_limit") \
.withColumnRenamed("annual_inc_joint", "join_annual_income")

# add current timestamp for processed/ingested data & time

In [5]:
customersIngested =customersRenamed.withColumn('Ingeste_date',current_timestamp())

In [6]:
customersIngested

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,Ingeste_date
9cb79aa7323e81be1...,Supervisor,10+ years,MORTGAGE,125000.0,KY,424xx,USA,E,E3,Verified,594540.0,Individual,,,2024-11-13 07:33:...
0dd2bbc517e3c8f9e...,Security,< 1 year,RENT,54000.0,WI,532xx,USA,A,A4,Not Verified,57729.0,Individual,,,2024-11-13 07:33:...
458458599d3df3bfc...,Manager,5 years,MORTGAGE,28000.0,AR,721xx,USA,C,C4,Verified,82780.0,Individual,,,2024-11-13 07:33:...
05ea141ec28b5c7f7...,Teller,3 years,RENT,26000.0,WI,532xx,USA,A,A5,Not Verified,41114.0,Individual,,,2024-11-13 07:33:...
aac68850fdac09fd0...,Quality Control A...,3 years,RENT,67000.0,TX,760xx,USA,F,F2,Verified,43449.0,Individual,,,2024-11-13 07:33:...
3a423e4589e89f429...,Technician,2 years,RENT,65000.0,CA,900xx,USA,C,C3,Verified,127916.0,Joint App,118000.0,Verified,2024-11-13 07:33:...
f1efcf7dfbfef21be...,Host,< 1 year,MORTGAGE,325000.0,NY,111xx,USA,B,B5,Source Verified,41000.0,Individual,,,2024-11-13 07:33:...
c89986155a070db2e...,Guest Services Su...,2 years,MORTGAGE,60000.0,AZ,850xx,USA,C,C2,Source Verified,280890.0,Individual,,,2024-11-13 07:33:...
118dc629b6e134419...,general manager,10+ years,MORTGAGE,74000.0,GA,310xx,USA,A,A3,Source Verified,306897.0,Individual,,,2024-11-13 07:33:...
a86fa4b7493708333...,Sheet Metal Mechanic,9 years,MORTGAGE,54496.0,GA,307xx,USA,B,B1,Verified,153345.0,Joint App,119496.0,Verified,2024-11-13 07:33:...


# Remove Dupicate Rows

In [7]:
customersIngested.count()

2260701

In [8]:
customersIngested.distinct().count()

2260638

In [9]:
customersDistinct = customersIngested.distinct()

# Remove the rows where annual_income is null

In [10]:
customersDistinct.filter(col('annual_income').isNull()).count()

5

In [12]:
customersFiltered = customersDistinct.filter(col('annual_income').isNotNull())
customersFiltered.count()

2260633

# convert emp_length to integer

In [24]:
customersdf = customersFiltered.withColumn("emp_length", regexp_replace(col("emp_length"), "\D","").cast('int') )

In [25]:
customersdf

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,Ingeste_date
f91a9c6d6490ae166...,Office Manager,10.0,MORTGAGE,37232.0,FL,334xx,USA,C,C4,Source Verified,185360.0,Individual,,,2024-11-13 07:37:...
153db23fe98162aba...,SENIOR SERVICE REP,10.0,MORTGAGE,97000.0,NJ,070xx,USA,B,B3,Verified,217055.0,Individual,,,2024-11-13 07:37:...
9809f807816c481cd...,Production Manager,10.0,MORTGAGE,65000.0,IL,610xx,USA,A,A1,Not Verified,186287.0,Individual,,,2024-11-13 07:37:...
ccf59e5c346acbdf5...,Laborer,6.0,RENT,30000.0,ME,044xx,USA,B,B4,Not Verified,52839.0,Individual,,,2024-11-13 07:37:...
2b0958ddeed614647...,psychologist,6.0,MORTGAGE,165000.0,FL,330xx,USA,B,B4,Not Verified,464285.0,Individual,,,2024-11-13 07:37:...
39a33db1b3f9aa00d...,,,MORTGAGE,79328.0,MO,656xx,USA,C,C2,Verified,238089.0,Individual,,,2024-11-13 07:37:...
fbb9ac9c0008278a8...,Solutions Consultant,4.0,RENT,66000.0,TX,770xx,USA,A,A2,Source Verified,116349.0,Individual,,,2024-11-13 07:37:...
1cc41378cf5a2a361...,Import Specialist,10.0,OWN,79500.0,CA,945xx,USA,A,A1,Source Verified,286100.0,Individual,,,2024-11-13 07:37:...
9eabc6f0eda96861a...,surgical coordinator,10.0,MORTGAGE,60000.0,IL,604xx,USA,C,C2,Not Verified,269690.0,Individual,,,2024-11-13 07:37:...
b0c7878a8e6994d69...,Associate Warden,10.0,RENT,50000.0,TN,380xx,USA,C,C2,Verified,58225.0,Individual,,,2024-11-13 07:37:...


In [26]:
customersdf.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: integer (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- join_annual_income: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- Ingeste_date: timestamp (nullable = false)



# Replace all the nulls in emp_length column with average of this column

In [39]:
avgEmp_experience = customersdf.agg(floor(avg('emp_length'))).collect()[0][0]
customersModified = customersdf.withColumn('emp_length', when(col('emp_length').isNull(), avgEmp_experience).otherwise(col('emp_length')))
customersModified

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,Ingeste_date
62989615616a2f4fb...,Stockroom Clerk,3,MORTGAGE,54000.0,NY,129xx,USA,B,B2,Not Verified,88695.0,Individual,,,2024-11-13 07:50:...
7ccdfb5ac896c8eb9...,Network Engineer,3,MORTGAGE,53000.0,IL,617xx,USA,C,C5,Source Verified,216399.0,Individual,,,2024-11-13 07:50:...
5b7bdf3da54079a6b...,Group Leader,10,MORTGAGE,37824.0,KY,421xx,USA,B,B1,Not Verified,189164.0,Joint App,74208.0,,2024-11-13 07:50:...
e899fd1d151804b0e...,Barber,1,RENT,45000.0,IL,606xx,USA,D,D1,Source Verified,31208.0,Individual,,,2024-11-13 07:50:...
5095e67678079101c...,Estimator,10,MORTGAGE,67600.0,TX,780xx,USA,A,A4,Not Verified,136394.0,Individual,,,2024-11-13 07:50:...
93f51d819a9c3396b...,Delivery Driver,10,OWN,70000.0,IL,606xx,USA,B,B2,Source Verified,247054.0,Individual,,,2024-11-13 07:50:...
ed4de75e1cf2c4f0f...,Consumer Solution...,10,MORTGAGE,60000.0,MO,652xx,USA,D,D1,Source Verified,237699.0,Individual,,,2024-11-13 07:50:...
c58ce191ca41d5ca8...,Corporate Controller,8,MORTGAGE,155000.0,MD,217xx,USA,C,C3,Verified,719981.0,Joint App,235000.0,Verified,2024-11-13 07:50:...
22005e49d7c0b9eab...,Longshoreman Labor,10,OWN,100000.0,CA,945xx,USA,C,C2,Source Verified,518800.0,Individual,,,2024-11-13 07:50:...
f586099a4b7567f5e...,Sales,2,RENT,85000.0,FL,333xx,USA,C,C2,Source Verified,90326.0,Individual,,,2024-11-13 07:50:...


# Clean the address_state(it should be 2 characters only),replace all others with NA

In [44]:
customersModified.select('address_state').distinct()

address_state
Helping Kenya's D...
223xx
175 (total projec...
AZ
SC
"so Plan """"C"""" is ..."
I am 56 yrs. old ...
financially I mad...
but no one will l...
LA


In [51]:
customersStateCleaned = customersModified.withColumn("address_state",when(length(col('address_state')) > 2 , 'NA').otherwise(col('address_state')))
customersStateCleaned

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,Ingeste_date
830acbbf3d11f4c72...,data processor,10,MORTGAGE,30800.0,TX,769xx,USA,C,C4,Source Verified,21062.0,Individual,,,2024-11-13 08:00:...
77028e3163b9e2447...,Member,4,MORTGAGE,90000.0,MN,553xx,USA,C,C4,Source Verified,374083.0,Individual,,,2024-11-13 08:00:...
ce9a09b0451f7cc5a...,Area Manager,10,MORTGAGE,120000.0,PA,161xx,USA,C,C5,Verified,272117.0,Individual,,,2024-11-13 08:00:...
565dd59818a21d009...,Sales rep,7,MORTGAGE,115000.0,IL,605xx,USA,A,A4,Not Verified,355495.0,Individual,,,2024-11-13 08:00:...
5b619a268051f7691...,Associate Directo...,1,RENT,143000.0,MA,018xx,USA,D,D5,Source Verified,228550.0,Individual,,,2024-11-13 08:00:...
e0b56c8ca640ba6d7...,HR,4,MORTGAGE,36500.0,OK,730xx,USA,B,B5,Verified,125776.0,Individual,,,2024-11-13 08:00:...
ddc4c5d5e9d8f7c16...,Network Administr...,1,MORTGAGE,110000.0,FL,330xx,USA,C,C3,Not Verified,364342.0,Individual,,,2024-11-13 08:00:...
7b44a9c47b29ec573...,Route supervisor,10,MORTGAGE,60000.0,CT,064xx,USA,B,B4,Source Verified,143563.0,Individual,,,2024-11-13 08:00:...
ff1b89e07e37ae11f...,Supervisor mainte...,6,RENT,36000.0,NY,112xx,USA,C,C3,Not Verified,20400.0,Individual,,,2024-11-13 08:00:...
acc38c259e4610da4...,Chief Engineer,2,RENT,63500.0,NY,112xx,USA,D,D1,Source Verified,121348.0,Individual,,,2024-11-13 08:00:...


In [53]:
customersStateCleaned.select('address_state').distinct()

address_state
AZ
SC
LA
MN
NJ
DC
OR
""
VA
""


In [54]:
customersStateCleaned.write \
.format("parquet") \
.mode("overwrite") \
.option("path", "/user/anil/lendingClubProject/cleansed/customersParquet") \
.save()

In [None]:
customersStateCleaned.write \
.format("csv") \
.mode("overwrite") \
.option("path", "/user/anil/lendingClubProject/cleansed/customersCSV") \
.save()