# Data Preprocessing

# --- Step 1: Data Cleaning ---

Cleaning functions before split:
1. Clean data types:
    - Monthly Debt - remove non-numeric characters, then transform strings to floats.
    - Maximum Open Credit - remove non-numeric characters, then transform strings into floats.
2. Correct inconsistent values:
    -  Home Ownership - change "HaveMortgage" to "Home Mortgage"
3. Remove null values
4. Drop bad values
5. Remove Duplicates
6. Set index to Loan ID

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn import set_config
set_config(transform_output='pandas')

import category_encoders as ce

In [47]:
# Load the raw loans table, then print its column dtypes and non-null counts.
df = pd.read_csv(filepath_or_buffer='Data/LoansDataset.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111107 entries, 0 to 111106
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Loan ID                       111107 non-null  object 
 1   Customer ID                   111107 non-null  object 
 2   Loan Status                   111107 non-null  object 
 3   Current Loan Amount           111107 non-null  int64  
 4   Term                          111107 non-null  object 
 5   Credit Score                  89769 non-null   float64
 6   Years in current job          106414 non-null  object 
 7   Home Ownership                111107 non-null  object 
 8   Annual Income                 89769 non-null   float64
 9   Purpose                       111107 non-null  object 
 10  Monthly Debt                  111107 non-null  object 
 11  Years of Credit History       111107 non-null  float64
 12  Months since last delinquent  52104 non-null

  df = pd.read_csv('Data/LoansDataset.csv')


### 1. Clean data types:
 - Monthly Debt - remove non-numeric characters, then transform strings to floats.
 - Maximum Open Credit - remove non-numeric characters, then transform strings into floats.

In [48]:
def string_to_float(data, column_name):
  """Return a copy of ``data`` with ``column_name`` parsed into numbers.

  Each value is converted to its string form, every character other than a
  digit or ``.`` is removed, and the remainder is parsed with
  ``pd.to_numeric``. Values that are empty after stripping, or that still do
  not parse, become NaN. Note that a leading ``-`` is stripped as well, so
  negative amounts written as text lose their sign.

  A column that already has a numeric dtype is returned unchanged: its
  string form can contain ``e``/``-`` (e.g. ``1e-05`` or ``-5``), and
  stripping those characters would silently change the value.

  Parameters
  ----------
  data : pandas.DataFrame
      Input frame; it is not modified.
  column_name : str
      Name of the column to convert.

  Returns
  -------
  pandas.DataFrame
      A copy of ``data`` with the converted column.
  """
  data = data.copy()

  # Nothing to clean when the column is numeric already; re-parsing it
  # through text would corrupt signs and scientific notation.
  if pd.api.types.is_numeric_dtype(data[column_name]):
    return data

  as_text = data[column_name].astype(str)
  digits_only = as_text.str.replace(r'[^0-9.]', '', regex=True)
  # Values with no digits at all (including stringified nulls) end up empty.
  digits_only = digits_only.replace('', np.nan)
  data[column_name] = pd.to_numeric(digits_only, errors='coerce')
  return data

In [49]:
# The cleaner is passed directly with ``kw_args`` rather than wrapped in a
# lambda: lambdas cannot be pickled, which would prevent saving a pipeline
# that contains these transformers.
monthly_debt_to_float_transformer = FunctionTransformer(
  string_to_float, kw_args={'column_name': 'Monthly Debt'}
)
# NOTE: "tranformer" is misspelled, but the name is kept because the
# pre-split pipeline refers to it under this spelling.
max_credit_to_float_tranformer = FunctionTransformer(
  string_to_float, kw_args={'column_name': 'Maximum Open Credit'}
)

### 2. Correct inconsistent values:
- Home Ownership - change "HaveMortgage" to "Home Mortgage"

In [50]:
def correct_label(data, column_name, wrong_label, right_label):
  """Return a copy of ``data`` where ``wrong_label`` is rewritten in one column.

  Every cell of ``column_name`` equal to ``wrong_label`` becomes
  ``right_label``; all other cells and columns are left as they are. The
  input frame is not modified.
  """
  relabel = {wrong_label: right_label}
  fixed = data.copy()
  fixed[column_name] = fixed[column_name].replace(relabel)
  return fixed

In [51]:
# ``kw_args`` replaces the former lambda wrapper so the transformer stays
# picklable (lambdas are not).
correct_label_transformer = FunctionTransformer(
  correct_label,
  kw_args={
    'column_name': 'Home Ownership',
    'wrong_label': 'HaveMortgage',
    'right_label': 'Home Mortgage',
  },
)

### 3. Drop nulls
Drop null values in the following features:
   - Credit Score
   - Annual Income
   - Bankruptcies
   - Tax Liens

In [52]:
def drop_nulls(data, columns_list):
  """Return a copy of ``data`` without rows that are null in ``columns_list``.

  A row is removed when any of the listed columns is missing; nulls in
  other columns are ignored. The input frame is not modified.
  """
  snapshot = data.copy()
  return snapshot.dropna(axis=0, how='any', subset=columns_list)

In [53]:
# ``kw_args`` replaces the former lambda wrapper so the transformer stays
# picklable (lambdas are not).
drop_nulls_transformer = FunctionTransformer(
  drop_nulls,
  kw_args={'columns_list': ['Credit Score', 'Annual Income', 'Bankruptcies', 'Tax Liens']},
)

### 4. Drop bad values

In [54]:
def drop_bad_values(data, column_name, bad_value):
  """Return the rows of ``data`` whose ``column_name`` differs from ``bad_value``.

  Rows keep their original index labels. The input frame is not modified.
  """
  snapshot = data.copy()
  keep_rows = snapshot[column_name].ne(bad_value)
  return snapshot.loc[keep_rows]

In [55]:
# Reuse the ``drop_bad_values`` helper instead of repeating its filter in a
# lambda: this keeps the transformer picklable and returns a filtered copy,
# consistent with the other cleaning steps. 99999999 is the value treated as
# bad in 'Current Loan Amount'.
drop_bad_loan_amount_transformer = FunctionTransformer(
  drop_bad_values,
  kw_args={'column_name': 'Current Loan Amount', 'bad_value': 99999999},
)

### 5. Remove Duplicates

In [56]:
def drop_duplicates(data):
  """Return a copy of ``data`` with fully duplicated rows removed.

  Rows are compared across all columns and the first occurrence of each
  duplicate group is kept. The input frame is not modified.
  """
  snapshot = data.copy()
  return snapshot.drop_duplicates(keep='first')

In [57]:
# The function is passed directly; the former ``lambda X: drop_duplicates(X)``
# wrapper added nothing and made the transformer unpicklable.
drop_duplicates_transformer = FunctionTransformer(drop_duplicates)

### 6. Set Index to Loan ID

In [58]:
def change_index(data):
  """Return a copy of ``data`` indexed by its 'Loan ID' column.

  The 'Loan ID' column is moved into the index and no longer appears among
  the columns. The input frame is not modified.
  """
  reindexed = data.copy().set_index('Loan ID')
  return reindexed

In [59]:
# The function is passed directly; the former ``lambda X: change_index(X)``
# wrapper added nothing and made the transformer unpicklable.
set_index_transformer = FunctionTransformer(change_index)

# --- Step 2: Function Transformers ---

- Credit Score - For all values above 800, remove last 0. 
- Months since last delinquent - change to binary, null as "0", non-null as "1"

### 1. Credit Score
- For all values above 800, remove last 0

In [60]:
# Number of non-null credit scores above 800.
df.loc[df['Credit Score'] > 800, 'Credit Score'].count()

np.int64(5044)

In [61]:
def adjust_credit(data):
  """Return a copy of ``data`` with 'Credit Score' values above 800 divided by 10.

  Scores of 800 or less, and missing scores, are left untouched. The input
  frame is not modified.
  """
  adjusted = data.copy()
  too_high = adjusted['Credit Score'] > 800
  adjusted.loc[too_high, 'Credit Score'] = adjusted.loc[too_high, 'Credit Score'] / 10
  return adjusted

# Pipeline-ready wrapper around adjust_credit.
credit_transformer = FunctionTransformer(func=adjust_credit)

### 2. Months since last delinquent
- Change to binary, null as "0", non-null as "1"
- Rename column to "Ever Delinquent"

In [62]:
# Number of rows with no 'Months since last delinquent' value.
df['Months since last delinquent'].isnull().sum()

np.int64(59003)

In [63]:
def delinquent_binary(data):
  """Return a copy of ``data`` with the delinquency column turned into a 0/1 flag.

  'Months since last delinquent' becomes 1 where a value is present and 0
  where it is null, and the column is renamed to 'Ever Delinquent' (keeping
  its position). The input frame is not modified.
  """
  source_col = 'Months since last delinquent'
  flagged = data.copy()
  has_record = flagged[source_col].notna()
  flagged[source_col] = has_record.astype(int)
  return flagged.rename(columns={source_col: 'Ever Delinquent'})

In [64]:
# Pipeline-ready wrapper around delinquent_binary.
delinquent_transformer = FunctionTransformer(func=delinquent_binary)

# --- Step 3: Feature Engineering ---
1. Debt to Income Ratio (Monthly Debt / Annual Income)
2. Credit Usage Ratio (Current Credit Balance / Maximum Open Credit)
3. Have Bankruptcies (Binary Yes/No)(Bankruptcies = 0 vs all others)
4. Have Tax Liens (Binary Yes/No)(Tax Liens = 0 vs all others)
5. Have Credit Problems (Binary Yes/No)(Number of Credit Problems = 0 vs all others)

### 1. Debt to Income Ratio (Monthly Debt / Annual Income)

In [65]:
def add_debt_to_income(data):
  """Return a copy of ``data`` with a 'Debt to Income Ratio' column added.

  The ratio is 'Monthly Debt' divided by 'Annual Income', computed row by
  row. The input frame is not modified.
  """
  ratio = data['Monthly Debt'].div(data['Annual Income'])
  return data.assign(**{'Debt to Income Ratio': ratio})

In [66]:
# Pipeline-ready wrapper around add_debt_to_income.
debt_income_transformer = FunctionTransformer(func=add_debt_to_income)

### 2. Have Bankruptcies (Binary Yes/No)(Bankruptcies = 0 vs all others)

In [67]:
def bankruptcies_binary(data):
  """Return a copy of ``data`` with 'Bankruptcies' reduced to a 0/1 flag.

  The flag is 0 where the count equals 0 and 1 everywhere else. A missing
  value also compares unequal to 0, so it maps to 1. The input frame is not
  modified.
  """
  flagged = data.copy()
  is_nonzero = flagged['Bankruptcies'] != 0
  flagged['Bankruptcies'] = is_nonzero.astype(int)
  return flagged

In [68]:
# Pipeline-ready wrapper around bankruptcies_binary.
bankruptcies_transformer = FunctionTransformer(func=bankruptcies_binary)

### 3. Have Tax Liens (Binary Yes/No)(Tax Liens = 0 vs all others)

In [69]:
def tax_liens_binary(data):
  """Return a copy of ``data`` with 'Tax Liens' reduced to a 0/1 flag.

  The flag is 0 where the count equals 0 and 1 everywhere else. A missing
  value also compares unequal to 0, so it maps to 1. The input frame is not
  modified.
  """
  flagged = data.copy()
  is_nonzero = flagged['Tax Liens'] != 0
  flagged['Tax Liens'] = is_nonzero.astype(int)
  return flagged

In [70]:
# Pipeline-ready wrapper around tax_liens_binary.
tax_liens_transformer = FunctionTransformer(func=tax_liens_binary)

### 4. Have Credit Problems (Binary Yes/No)(Number of Credit Problems = 0 vs all others)

In [71]:
def credit_problems_binary(data):
  """Return a copy of ``data`` with 'Number of Credit Problems' as a 0/1 flag.

  The flag is 0 where the count equals 0 and 1 everywhere else. A missing
  value also compares unequal to 0, so it maps to 1. The input frame is not
  modified.
  """
  flagged = data.copy()
  is_nonzero = flagged['Number of Credit Problems'] != 0
  flagged['Number of Credit Problems'] = is_nonzero.astype(int)
  return flagged

In [72]:
# Pipeline-ready wrapper around credit_problems_binary.
credit_problems_transformer = FunctionTransformer(func=credit_problems_binary)

# --- Step 4: Encoding & Scaling ---
Categorical, nominal - OHE
  1. Term
  2. Home Ownership
  3. Purpose

Categorical, ordinal - Ordinal Encoder
  1. Years in current job

Numerical - StandardScaler:
  1. Current Loan Amount
  2. Credit Score
  3. Years of Credit History

Numerical - RobustScaler:
  1. Annual Income
  2. Monthly Debt
  3. Number of Open Accounts
  4. Current Credit Balance
  5. Maximum Open Credit

Feature is ready, no encoding or scaling:
  1. Ever Delinquent
  2. Bankruptcies
  3. Tax Liens
  4. Number of Credit Problems
  5. Debt to Income Ratio

In [73]:
# Column groups that share a single encoder/scaler in the ColumnTransformer.
categorical_nominal = ['Term', 'Home Ownership', 'Purpose']
numeric_standard = ['Current Loan Amount', 'Years of Credit History']
numeric_robust = [
  'Annual Income',
  'Monthly Debt',
  'Number of Open Accounts',
  'Current Credit Balance',
]
ready_features = ['Ever Delinquent', 'Debt to Income Ratio']

# Missing job tenure is filled with the placeholder '0', which the ordinal
# mapping sends to rank 0 (below '< 1 year').
impute_encode_years_job = Pipeline(steps=[
  ('years_job_imputer', SimpleImputer(strategy='constant', fill_value='0')),
  ('years_job_encoder', ce.OrdinalEncoder(mapping=[
    {
      'col': 'Years in current job',
      'mapping': {
        '0': 0,
        '< 1 year': 1,
        '1 year': 2,
        '2 years': 3,
        '3 years': 4,
        '4 years': 5,
        '5 years': 6,
        '6 years': 7,
        '7 years': 8,
        '8 years': 9,
        '9 years': 10,
        '10+ years': 11,
      },
    },
  ])),
])

# Maximum Open Credit: median-impute, then robust-scale.
impute_encode_max_credit = Pipeline(steps=[
  ('max_credit_imputer', SimpleImputer(strategy='median')),
  ('max_credit_scaler', RobustScaler()),
])

# Credit Score: divide values above 800 by 10 first, then standardize.
transform_encode_credit_score = Pipeline(steps=[
  ('credit_transformer', credit_transformer),
  ('credit_score_encoder', StandardScaler()),
])

# Columns not listed in any entry are dropped (remainder='drop').
preprocessor = ColumnTransformer(
  transformers=[
    # Count columns reduced to 0/1 flags.
    ('bankruptcies_transformer', bankruptcies_transformer, ['Bankruptcies']),
    ('tax_liens_transformer', tax_liens_transformer, ['Tax Liens']),
    ('credit_problems_transformer', credit_problems_transformer, ['Number of Credit Problems']),

    # Columns with their own dedicated sub-pipeline.
    ('impute_encode_years_job', impute_encode_years_job, ['Years in current job']),
    ('impute_encode_max_credit', impute_encode_max_credit, ['Maximum Open Credit']),
    ('transform_encode_credit_score', transform_encode_credit_score, ['Credit Score']),

    # Nominal categoricals.
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_nominal),

    # Plain numeric scaling and untouched features.
    ('standardscaler', StandardScaler(), numeric_standard),
    ('robustscaler', RobustScaler(), numeric_robust),
    ('passthrough', 'passthrough', ready_features),
  ],
  remainder='drop',
)



# --- Step 5: Pipelines ---

In [74]:
# Cleaning steps applied to the full dataset before the train/val/test split.
pre_split_pipeline = Pipeline(steps=[
  # Parse text amounts into numbers.
  ('monthly_debt_to_float_transformer', monthly_debt_to_float_transformer),
  ('max_credit_to_float_transformer', max_credit_to_float_tranformer),
  # Fix inconsistent labels.
  ('correct_label_transformer', correct_label_transformer),
  # Remove unusable rows.
  ('drop_nulls_transformer', drop_nulls_transformer),
  ('drop_bad_loan_amount_transformer', drop_bad_loan_amount_transformer),
  ('drop_duplicates_transformer', drop_duplicates_transformer),
  # Index by Loan ID, then derive the 'Ever Delinquent' flag.
  ('set_index_transformer', set_index_transformer),
  ('delinquent_transformer', delinquent_transformer),
])



In [75]:
# Run the pre-split cleaning pipeline on the raw frame.
df_processed = pd.DataFrame(data=pre_split_pipeline.fit_transform(df))

In [76]:
# Separate the 'Loan Status' target from the feature columns.
y = df_processed['Loan Status'].copy()
X = df_processed.drop('Loan Status', axis=1)

# First split: 70% of the rows for training, 30% held out.
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.3, random_state=42
)

# Second split of the held-out rows: test_size=0.333 sends roughly a third
# of them to the final test set and the rest to validation.
X_val, X_test, y_val, y_test = train_test_split(
  X_test, y_test, test_size=0.333, random_state=42
)

In [77]:
# Report the (rows, columns) of each feature split.
for split_name, split_frame in (('Train', X_train), ('Val', X_val), ('Test', X_test)):
  print(f'{split_name} shape: {split_frame.shape}')

Train shape: (45059, 17)
Val shape: (12881, 17)
Test shape: (6431, 17)


In [78]:
# Post-split pipeline: add the debt-to-income feature, then encode and scale.
main_pipeline = Pipeline(steps=[
  ('debt_income_transformer', debt_income_transformer),
  ('preprocessor', preprocessor),
])

In [79]:
# Fit on the training split only, so that imputers, encoders and scalers
# learn their statistics from training data alone.
main_pipeline.fit(X_train)

# Apply the fitted pipeline to every split.
X_train_proc = main_pipeline.transform(X_train)
X_val_proc = main_pipeline.transform(X_val)
X_test_proc = main_pipeline.transform(X_test)

In [80]:
# Summary statistics of the transformed training features.
X_train_proc.describe()

Unnamed: 0,bankruptcies_transformer__Bankruptcies,tax_liens_transformer__Tax Liens,credit_problems_transformer__Number of Credit Problems,impute_encode_years_job__Years in current job,impute_encode_max_credit__Maximum Open Credit,transform_encode_credit_score__Credit Score,ohe__Term_Long Term,ohe__Term_Short Term,ohe__Home Ownership_Home Mortgage,ohe__Home Ownership_Own Home,...,ohe__Purpose_vacation,ohe__Purpose_wedding,standardscaler__Current Loan Amount,standardscaler__Years of Credit History,robustscaler__Annual Income,robustscaler__Monthly Debt,robustscaler__Number of Open Accounts,robustscaler__Current Credit Balance,passthrough__Ever Delinquent,passthrough__Debt to Income Ratio
count,45059.0,45059.0,45059.0,45059.0,45059.0,45059.0,45059.0,45059.0,45059.0,45059.0,...,45059.0,45059.0,45059.0,45059.0,45059.0,45059.0,45059.0,45059.0,45059.0,45059.0
mean,0.109434,0.018731,0.137864,6.59735,0.609162,1.590162e-15,0.280565,0.719435,0.482412,0.091635,...,0.000954,0.000954,-9.177653e-17,2.48522e-16,0.257796,0.164612,0.189463,0.324479,0.465811,0.014416
std,0.312187,0.135575,0.344761,3.811419,17.871258,1.000011,0.449281,0.449281,0.499696,0.288514,...,0.030877,0.030877,1.000011,1.000011,1.517479,0.89392,0.834436,1.530871,0.498835,0.006696
min,0.0,0.0,0.0,0.0,-0.91802,-4.645636,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.566467,-2.074651,-1.364618,-1.179782,-1.5,-0.826534,0.0,0.0
25%,0.0,0.0,0.0,3.0,-0.380426,-0.4732245,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.7505116,-0.6694473,-0.403087,-0.434176,-0.333333,-0.383689,0.0,0.009417
50%,0.0,0.0,0.0,7.0,0.0,0.2280212,0.0,1.0,0.0,0.0,...,0.0,0.0,-0.2387844,-0.1819278,0.0,0.0,0.0,0.0,0.0,0.014167
75%,0.0,0.0,0.0,11.0,0.619574,0.7539554,1.0,1.0,1.0,0.0,...,0.0,0.0,0.6540315,0.5063351,0.596913,0.565824,0.666667,0.616311,1.0,0.019083
max,1.0,1.0,1.0,11.0,3017.305928,1.174703,1.0,1.0,1.0,1.0,...,1.0,1.0,2.585888,7.503675,206.732462,30.594933,11.0,128.588416,1.0,0.033333


In [81]:
# Summary statistics of the transformed test features.
X_test_proc.describe()

Unnamed: 0,bankruptcies_transformer__Bankruptcies,tax_liens_transformer__Tax Liens,credit_problems_transformer__Number of Credit Problems,impute_encode_years_job__Years in current job,impute_encode_max_credit__Maximum Open Credit,transform_encode_credit_score__Credit Score,ohe__Term_Long Term,ohe__Term_Short Term,ohe__Home Ownership_Home Mortgage,ohe__Home Ownership_Own Home,...,ohe__Purpose_vacation,ohe__Purpose_wedding,standardscaler__Current Loan Amount,standardscaler__Years of Credit History,robustscaler__Annual Income,robustscaler__Monthly Debt,robustscaler__Number of Open Accounts,robustscaler__Current Credit Balance,passthrough__Ever Delinquent,passthrough__Debt to Income Ratio
count,6431.0,6431.0,6431.0,6431.0,6431.0,6431.0,6431.0,6431.0,6431.0,6431.0,...,6431.0,6431.0,6431.0,6431.0,6431.0,6431.0,6431.0,6431.0,6431.0,6431.0
mean,0.105738,0.02037,0.135593,6.574405,0.417838,-0.003278,0.283315,0.716685,0.480174,0.09392,...,0.000466,0.000933,-0.015016,0.007442,0.222129,0.16307,0.196859,0.309375,0.462448,0.014591
std,0.307526,0.141274,0.342383,3.823805,3.460916,0.980429,0.450643,0.450643,0.499646,0.29174,...,0.021595,0.030533,0.980285,1.018045,1.074962,0.869782,0.84152,1.286516,0.498627,0.006683
min,0.0,0.0,0.0,0.0,-0.91802,-4.610574,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.566348,-2.031634,-1.273965,-1.179782,-1.5,-0.826534,0.0,0.0
25%,0.0,0.0,0.0,3.0,-0.377817,-0.473225,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.743075,-0.669447,-0.404843,-0.431995,-0.333333,-0.37636,0.0,0.009667
50%,0.0,0.0,0.0,7.0,0.002264,0.228021,0.0,1.0,0.0,0.0,...,0.0,0.0,-0.246042,-0.167589,-0.01393,0.002853,0.0,0.012788,0.0,0.014333
75%,0.0,0.0,0.0,11.0,0.615564,0.753955,1.0,1.0,1.0,0.0,...,0.0,0.0,0.60995,0.506335,0.561443,0.553717,0.666667,0.604083,1.0,0.019167
max,1.0,1.0,1.0,11.0,191.215107,1.174703,1.0,1.0,1.0,1.0,...,1.0,1.0,2.585531,6.069794,20.866984,6.933692,7.666667,25.635942,1.0,0.033333


In [82]:
# Summary statistics of the transformed validation features.
X_val_proc.describe()

Unnamed: 0,bankruptcies_transformer__Bankruptcies,tax_liens_transformer__Tax Liens,credit_problems_transformer__Number of Credit Problems,impute_encode_years_job__Years in current job,impute_encode_max_credit__Maximum Open Credit,transform_encode_credit_score__Credit Score,ohe__Term_Long Term,ohe__Term_Short Term,ohe__Home Ownership_Home Mortgage,ohe__Home Ownership_Own Home,...,ohe__Purpose_vacation,ohe__Purpose_wedding,standardscaler__Current Loan Amount,standardscaler__Years of Credit History,robustscaler__Annual Income,robustscaler__Monthly Debt,robustscaler__Number of Open Accounts,robustscaler__Current Credit Balance,passthrough__Ever Delinquent,passthrough__Debt to Income Ratio
count,12881.0,12881.0,12881.0,12881.0,12881.0,12881.0,12881.0,12881.0,12881.0,12881.0,...,12881.0,12881.0,12881.0,12881.0,12881.0,12881.0,12881.0,12881.0,12881.0,12881.0
mean,0.106125,0.019952,0.135393,6.589861,0.442093,0.00149,0.282199,0.717801,0.490179,0.093782,...,0.000776,0.000932,0.006835,0.011454,0.260722,0.174405,0.188119,0.354831,0.464327,0.014422
std,0.30801,0.13984,0.342156,3.834567,3.486656,1.003189,0.450087,0.450087,0.499923,0.291536,...,0.027853,0.030509,1.007753,1.017554,1.101452,0.879768,0.83478,1.414441,0.498745,0.006624
min,0.0,0.0,0.0,0.0,-0.91802,-4.645636,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.599067,-2.017296,-1.370233,-1.179782,-1.5,-0.826534,0.0,0.0
25%,0.0,0.0,0.0,3.0,-0.383099,-0.438162,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.749203,-0.683786,-0.403637,-0.428324,-0.333333,-0.372658,0.0,0.0095
50%,0.0,0.0,0.0,7.0,0.000798,0.228021,0.0,1.0,0.0,0.0,...,0.0,0.0,-0.233787,-0.167589,0.001242,0.010929,0.0,0.011367,0.0,0.014167
75%,0.0,0.0,0.0,11.0,0.621968,0.789018,1.0,1.0,1.0,0.0,...,0.0,0.0,0.65998,0.506335,0.60702,0.578907,0.666667,0.644131,1.0,0.019083
max,1.0,1.0,1.0,11.0,285.092373,1.174703,1.0,1.0,1.0,1.0,...,1.0,1.0,2.585055,6.71504,21.672799,9.283878,5.5,28.394571,1.0,0.033333
