# Data Preprocessing

# --- Step 1: Data Cleaning (Pre-Split) ---

Cleaning functions before split:
1. Remove Duplicates
2. Drop null values

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('LoansDataset.csv')
df.head()

  df = pd.read_csv('LoansDataset.csv')


Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Years in current job,Home Ownership,Annual Income,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,6cf51492-02a2-423e-b93d-676f05b9ad53,7c202b37-2add-44e8-9aea-d5b119aea935,Charged Off,12232,Short Term,7280.0,< 1 year,Rent,46643.0,Debt Consolidation,777.39,18.0,10.0,12,0,6762,7946,0.0,0.0
1,552e7ade-4292-4354-9ff9-c48031697d72,e7217b0a-07ac-47dd-b379-577b5a35b7c6,Charged Off,25014,Long Term,7330.0,10+ years,Home Mortgage,81099.0,Debt Consolidation,892.09,26.7,,14,0,35706,77961,0.0,0.0
2,9b5e32b3-8d76-4801-afc8-d729d5a2e6b9,0a62fc41-16c8-40b5-92ff-9e4b763ce714,Charged Off,16117,Short Term,7240.0,9 years,Home Mortgage,60438.0,Home Improvements,1244.02,16.7,32.0,11,1,11275,14815,1.0,0.0
3,5419b7c7-ac11-4be2-a8a7-b131fb6d6dbe,30f36c59-5182-4482-8bbb-5b736849ae43,Charged Off,11716,Short Term,7400.0,3 years,Rent,34171.0,Debt Consolidation,990.94,10.0,,21,0,7009,43533,0.0,0.0
4,1450910f-9495-4fc9-afaf-9bdf4b9821df,70c26012-bba5-42c0-8dcb-75295ada31bb,Charged Off,9789,Long Term,6860.0,10+ years,Home Mortgage,47003.0,Home Improvements,503.71,16.7,25.0,13,1,16913,19553,1.0,0.0


### Step 1: Data Cleaning - 1. Remove Duplicates

In [12]:
def drop_duplicates(data):
  data = data.copy()
  return data.drop_duplicates()

### Step 1: Data Cleaning - 2. Drop nulls
Drop null values in the following features:
  - Credit Score
  - Annual Income
  - Bankruptcies
  - Tax Liens

In [24]:
def drop_nulls(data, columns=['Credit Score','Annual Income','Bankruptcies','Tax Liens']):
  data = data.copy()
  return data.dropna(subset=columns)

# --- Step 2: Split train/test/val ---

In [56]:
X = df.drop(columns='Loan Status')
y = df['Loan Status'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [57]:
print(f'Train shape: {X_train.shape}')
print(f'Test shape: {X_test.shape}')
print(f'Test shape: {X_val.shape}')

Train shape: (88885, 18)
Test shape: (11111, 18)
Test shape: (11111, 18)


# --- Step 3: Data Cleaning (Train Only) ---

Cleaning functions for train data only:
1. Current Loan Amount - drop all rows where value = 99999999
2. Credit Score - 

In [72]:
df_drop_duplicated = df.drop_duplicates()

In [77]:
df_drop_duplicated[df_drop_duplicated['Loan ID'].duplicated()]

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Years in current job,Home Ownership,Annual Income,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
88910,a9382c50-9176-4639-9fa1-6bdf32fe9702,6fdbbbd6-0e34-4039-9f47-c371d352bf40,Fully Paid,20891,Long Term,714.0,10+ years,Home Mortgage,88537.0,Debt Consolidation,1719.09,18.0,,13,0,56708,60456,0.0,0.0
88912,e7569ce9-f3ec-4461-9604-ccefe9164788,b9aefd67-0d4f-4180-805f-6c1870e280c3,Fully Paid,11000,Long Term,707.0,7 years,Rent,52640.0,Debt Consolidation,666.76,8.8,14.0,7,0,1946,11446,0.0,0.0
88913,96d15e73-541a-44be-8c11-b2e46d73d1b8,78281f18-48b5-40d4-9064-be5d18263d04,Fully Paid,5924,Short Term,716.0,7 years,Rent,97743.0,Debt Consolidation,2248.08,22.0,,5,0,62261,69956,0.0,0.0
88914,12adaace-17a8-4bbe-a6ce-dc5ffcc46f54,fc4ad10d-b42d-4c44-89fd-ac33c8bdbffe,Fully Paid,11145,Short Term,685.0,2 years,Home Mortgage,61998.0,other,503.74,13.1,28.0,5,0,296,17424,0.0,0.0
88915,e5f6b3cd-3507-4d84-b856-e89672fd0c63,7c761d99-2743-4884-9113-bb190478f327,Fully Paid,14780,Short Term,679.0,10+ years,Rent,61089.0,Debt Consolidation,636.34,21.5,12.0,9,0,11531,13000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111102,503f01e2-c3c4-4b10-886e-f15de57b6f1a,27a17610-1d8c-48a7-b0db-2d81a4fa099a,Fully Paid,15654,Long Term,686.0,10+ years,Rent,43050.0,Debt Consolidation,889.69,14.1,49.0,18,0,14730,29757,0.0,0.0
111103,8af53bc0-56a4-4801-a960-52a46a90b33f,c124ca71-f42a-47cc-8ec2-236f7e9868a5,Fully Paid,15237,Short Term,706.0,10+ years,Home Mortgage,60948.0,Debt Consolidation,700.9,15.9,48.0,11,0,17733,23060,0.0,0.0
111104,96efd327-f965-42ae-8b2b-d2555205ce19,7d959a54-8eeb-4df6-b582-046f889e4cff,Fully Paid,10413,Short Term,719.0,5 years,Home Mortgage,66945.0,Debt Consolidation,1210.59,8.7,,11,0,5205,6348,0.0,0.0
111105,ec19166c-70c5-4d49-adb6-fb3fc15024a2,5723fbf2-1ce8-4752-8bee-1bebf339f278,Fully Paid,34447,Short Term,626.0,10+ years,Home Mortgage,78736.0,Debt Consolidation,1476.3,30.4,9.0,12,0,32318,37580,0.0,0.0


In [79]:
df_drop_duplicated[df_drop_duplicated['Loan ID'] == 'a9382c50-9176-4639-9fa1-6bdf32fe9702'].T

Unnamed: 0,56850,88910
Loan ID,a9382c50-9176-4639-9fa1-6bdf32fe9702,a9382c50-9176-4639-9fa1-6bdf32fe9702
Customer ID,6fdbbbd6-0e34-4039-9f47-c371d352bf40,6fdbbbd6-0e34-4039-9f47-c371d352bf40
Loan Status,Fully Paid,Fully Paid
Current Loan Amount,20891,20891
Term,Long Term,Long Term
Credit Score,714.0,714.0
Years in current job,10+ years,10+ years
Home Ownership,Home Mortgage,Home Mortgage
Annual Income,88537.0,88537.0
Purpose,Debt Consolidation,Debt Consolidation


In [81]:
df_drop_duplicated.duplicated().sum()

np.int64(0)

In [82]:
differences = df_drop_duplicated.iloc[56850].compare(df_drop_duplicated.iloc[88910])

In [91]:
print(differences)

                        self    other
Monthly Debt         1719.09  1719.09
Maximum Open Credit    60456    60456


In [92]:
print(differences.apply(lambda x: [type(x[0]), type(x[1])]))

                              self            other
Monthly Debt         <class 'str'>  <class 'float'>
Maximum Open Credit  <class 'str'>    <class 'int'>


  print(differences.apply(lambda x: [type(x[0]), type(x[1])]))
