# Data Preprocessing

In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin

from sklearn import set_config
set_config(transform_output='pandas')

import category_encoders as ce

# --- Step 1: Data Cleaning ---

In [152]:
df = pd.read_csv('../Data/LoansDataset.csv')

  df = pd.read_csv('../Data/LoansDataset.csv')


### 1. Clean data types:
 - Monthly Debt - remove non-numeric characters, then transform strings to floats.
 - Maximum Open Credit - remove non-numeric characters, then transform strings into floats.

In [114]:
def string_to_float(data, column_name):
  data = data.copy()
  data[column_name] = data[column_name].astype(str)
  data[column_name] = data[column_name].str.replace(r'[^0-9.]', '', regex=True)
  data[column_name] = data[column_name].replace('', np.nan)
  data[column_name] = pd.to_numeric(data[column_name], errors='coerce')
  return data

In [115]:
monthly_debt_to_float_transformer = FunctionTransformer(lambda X: string_to_float(X, 'Monthly Debt'))
max_credit_to_float_tranformer = FunctionTransformer(lambda X: string_to_float(X, 'Maximum Open Credit'))

### 2. Correct inconsistent values:
- Home Ownership - change "HaveMortgage" to "Home Mortgage"

In [116]:
def correct_label(data, column_name, wrong_label, right_label):
  data = data.copy()
  data[column_name] = data[column_name].replace({wrong_label: right_label})
  return data

In [117]:
correct_label_transformer = FunctionTransformer(lambda X: correct_label(X, 'Home Ownership', 'HaveMortgage', 'Home Mortgage'))

### 3. Drop nulls
Drop null values in the following features:
   - Credit Score
   - Annual Income
   - Bankruptcies
   - Tax Liens

In [118]:
def drop_nulls(data, columns_list):
  data = data.copy()
  return data.dropna(subset=columns_list)

In [119]:
drop_nulls_transformer = FunctionTransformer(lambda X: drop_nulls(X,['Credit Score','Annual Income','Bankruptcies','Tax Liens']))

### 4. Drop bad values

In [120]:
def drop_bad_values(data, column_name, bad_value):
  data = data.copy()
  mask = data[column_name] != bad_value
  return data[mask]

In [121]:
drop_bad_loan_amount_transformer = FunctionTransformer(lambda X: X[X['Current Loan Amount'] != 99999999])

### 5. Remove Duplicates

In [122]:
def drop_duplicates(data):
  data = data.copy()
  return data.drop_duplicates()

In [123]:
drop_duplicates_transformer = FunctionTransformer(lambda X: drop_duplicates(X))

### 6. Set Index to Loan ID

In [124]:
def change_index(data):
  data = data.copy()
  data.set_index('Loan ID', inplace=True)
  return data

In [125]:
set_index_transformer = FunctionTransformer(lambda X: change_index(X))

# --- Step 2: Function Transformers ---

- Credit Score - For all values above 800, remove last 0. 
- Months since last delinquent - change to binary, null as "0", non-null as "1"

### 1. Credit Score
- For all values above 800, remove last 0

In [126]:
df[df['Credit Score'] > 800]['Credit Score'].count()

np.int64(5044)

In [127]:
def adjust_credit(data):
  data = data.copy()
  data.loc[data['Credit Score'] > 800, 'Credit Score'] /= 10
  return data

credit_transformer = FunctionTransformer(adjust_credit)

### 2. Months since last delinquent
- Change to binary, null as "0", non-null as "1"
- Rename column to "Ever Delinquent"

In [128]:
df['Months since last delinquent'].isna().sum()

np.int64(59003)

In [129]:
def delinquent_binary(data):
  data = data.copy()
  data['Months since last delinquent'] = data['Months since last delinquent'].notna().astype(int)
  data.rename(columns={'Months since last delinquent': 'Ever Delinquent'}, inplace=True)
  return data

In [130]:
delinquent_transformer = FunctionTransformer(delinquent_binary)

# --- Step 3: Feature Engineering ---
1. Debt to Income Ratio (Monthly Debt / Annual Income)
2. Have Bankruptcies (Binary Yes/No)(Bankruptcies = 0 vs all others)
3. Have Tax Liens (Binary Yes/No)(Tax Liens = 0 vs all others)
4. Have Credit Problems (Binary Yes/No)(Number of Credit Problems = 0 vs all others)
5. Debt Consolidation (Binary Yes/No)(Purpose = Debt Consolidation)

### 1. Debt to Income Ratio (Monthly Debt / Annual Income)

In [131]:
def add_debt_to_income(data):
  data = data.copy()
  data['Debt to Income Ratio'] = data['Monthly Debt'] / data['Annual Income']
  return data

In [132]:
debt_income_transformer = FunctionTransformer(add_debt_to_income)

### 2. Have Bankruptcies (Binary Yes/No)(Bankruptcies = 0 vs all others)

In [133]:
def bankruptcies_binary(data):
  data = data.copy()
  data['Bankruptcies'] = data['Bankruptcies'].ne(0).astype(int)
  return data

In [134]:
bankruptcies_transformer = FunctionTransformer(bankruptcies_binary)

### 3. Have Tax Liens (Binary Yes/No)(Tax Liens = 0 vs all others)

In [135]:
def tax_liens_binary(data):
  data = data.copy()
  data['Tax Liens'] = data['Tax Liens'].ne(0).astype(int)
  return data

In [136]:
tax_liens_transformer = FunctionTransformer(tax_liens_binary)

### 4. Have Credit Problems (Binary Yes/No)(Number of Credit Problems = 0 vs all others)

In [137]:
def credit_problems_binary(data):
  data = data.copy()
  data['Number of Credit Problems'] = data['Number of Credit Problems'].ne(0).astype(int)
  return data

In [138]:
credit_problems_transformer = FunctionTransformer(credit_problems_binary)

### 5. Debt Consolidation (Binary Yes/No)(Purpose = Debt Consolidation)

In [139]:
def purpose_binary(data):
  data = data.copy()
  data['Purpose'] = data['Purpose'].eq('Debt Consolidation').astype(int)
  data.rename(columns={'Purpose': 'Debt Consolidation'}, inplace=True)
  return data

In [140]:
purpose_binary_transformer = FunctionTransformer(purpose_binary)

# --- 4. Custom Transformer ---

In [141]:
class RemoveOutliers(TransformerMixin):
  def __init__(self, column_name, num_std):
    """Remove outliers based on the number of standard deviations.
    
    Parameters:
    - column_name (str): from which column you wish to remove the outlier from
    - num_std (float): how many standard deviations away is your threshold for determining an outlier
    """

    self.num_std = num_std
    self.column_name = column_name
    self.mean_ = None
    self.std_ = None

  def fit(self, X, y=None):
    self.mean_ = X[self.column_name].mean()
    self.std_ = X[self.column_name].std()
    return self 

  def transform(self, X, y=None):
    X = X.copy()

    lower_bound = self.mean_ - self.num_std * self.std_
    upper_bound = self.mean_ + self.num_std * self.std_

    X_adjusted = X[(X[self.column_name] >= lower_bound) & (X[self.column_name] <= upper_bound)]
    return X_adjusted
  
remove_outlier_transformer = RemoveOutliers(column_name='Annual Income', num_std=20)


# --- 5. Encoding & Scaling: ---

In [142]:
categorical_nominal = ['Term', 'Home Ownership']
numeric_standard = ['Current Loan Amount', 'Years of Credit History']
numeric_robust = ['Monthly Debt', 'Number of Open Accounts', 'Current Credit Balance']
ready_features = ['Ever Delinquent', 'Debt to Income Ratio', 'Debt Consolidation']

impute_encode_years_job = Pipeline([
    ('years_job_imputer', SimpleImputer(strategy='constant', fill_value='0')),
    ('years_job_encoder', ce.OrdinalEncoder(mapping = [
    {'col': 'Years in current job', 'mapping': {
        '0': 0, '< 1 year': 1, '1 year': 2, '2 years': 3, '3 years': 4, '4 years': 5, 
        '5 years': 6, '6 years': 7, '7 years': 8, '8 years': 9, '9 years': 10, '10+ years': 11
      }}
    ]))
])  

impute_scale_max_credit = Pipeline([
  ('max_credit_imputer', SimpleImputer(strategy='median')),
  ('max_credit_scaler', RobustScaler())
])

transform_scale_credit_score = Pipeline([
  ('credit_transformer', credit_transformer),
  ('credit_score_scaler', StandardScaler())
])

transform_scale_annual_income = Pipeline([
  ('remove_outlier_transformer', remove_outlier_transformer),
  ('annual_income_scaler', RobustScaler())
])

preprocessor = ColumnTransformer(transformers=[

  ('bankruptcies_transformer', bankruptcies_transformer, ['Bankruptcies']),
  ('tax_liens_transformer', tax_liens_transformer, ['Tax Liens']),
  ('credit_problems_transformer', credit_problems_transformer, ['Number of Credit Problems']),

  ('impute_encode_years_job', impute_encode_years_job, ['Years in current job']),
  ('impute_encode_max_credit', impute_scale_max_credit, ['Maximum Open Credit']),
  ('transform_encode_credit_score', transform_scale_credit_score, ['Credit Score']),
  ('transform_scale_annual_income', transform_scale_annual_income, ['Annual Income']),

  ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_nominal),

  ('standardscaler', StandardScaler(), numeric_standard),
  ('robustscaler', RobustScaler(), numeric_robust),
  ('passthrough', 'passthrough', ready_features)
], remainder='drop')



# --- 6. Pipelines ---

##### Pre-split pipeline

In [143]:
pre_split_pipeline = Pipeline([
  ('monthly_debt_to_float_transformer', monthly_debt_to_float_transformer),
  ('max_credit_to_float_transformer', max_credit_to_float_tranformer),
  ('correct_label_transformer', correct_label_transformer),
  ('drop_nulls_transformer', drop_nulls_transformer),
  ('drop_bad_loan_amount_transformer', drop_bad_loan_amount_transformer),
  ('drop_duplicates_transformer', drop_duplicates_transformer),
  ('set_index_transformer', set_index_transformer), 
  ('delinquent_transformer', delinquent_transformer),
  ('purpose_binary', purpose_binary_transformer)
])



##### Fit & transform the pre-split pipeline

In [144]:
df_processed = pd.DataFrame(pre_split_pipeline.fit_transform(df))

##### Train/Test/Val Split

In [145]:
X = df_processed.drop(columns='Loan Status')
y = df_processed['Loan Status'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [146]:
print(f'Train shape: {X_train.shape}')
print(f'Test shape: {X_test.shape}')
print(f'Test shape: {X_val.shape}')

Train shape: (51496, 17)
Test shape: (6438, 17)
Test shape: (6437, 17)


##### Main Pipeline

In [147]:
main_pipeline = Pipeline([
  ('debt_income_transformer', debt_income_transformer),
  ('preprocessor', preprocessor),
])

##### Fit and transform main pipeline

In [148]:
main_pipeline.fit(X_train)

X_train_proc = main_pipeline.transform(X_train)
X_test_proc = main_pipeline.transform(X_test)
X_val_proc = main_pipeline.transform(X_val)

##### Check processed datasets

In [149]:
X_train_proc.describe()

Unnamed: 0,bankruptcies_transformer__Bankruptcies,tax_liens_transformer__Tax Liens,credit_problems_transformer__Number of Credit Problems,impute_encode_years_job__Years in current job,impute_encode_max_credit__Maximum Open Credit,transform_encode_credit_score__Credit Score,transform_scale_annual_income__Annual Income,ohe__Term_Long Term,ohe__Term_Short Term,ohe__Home Ownership_Home Mortgage,ohe__Home Ownership_Own Home,ohe__Home Ownership_Rent,standardscaler__Current Loan Amount,standardscaler__Years of Credit History,robustscaler__Monthly Debt,robustscaler__Number of Open Accounts,robustscaler__Current Credit Balance,passthrough__Ever Delinquent,passthrough__Debt to Income Ratio,passthrough__Debt Consolidation
count,51496.0,51496.0,51496.0,51496.0,51496.0,51496.0,51492.0,51496.0,51496.0,51496.0,51496.0,51496.0,51496.0,51496.0,51496.0,51496.0,51496.0,51496.0,51496.0,51496.0
mean,0.108455,0.01905,0.137001,6.598707,0.580757,-1.4619e-15,0.250317,0.281187,0.718813,0.48229,0.092104,0.425606,4.139405e-18,3.444675e-16,0.1649784,0.189202,0.32158,0.465978,0.014417,0.788799
std,0.310957,0.136702,0.343852,3.814329,16.764024,1.00001,1.116569,0.449583,0.449583,0.499691,0.289176,0.494439,1.00001,1.00001,0.8898871,0.8345,1.492167,0.498846,0.006687,0.408165
min,0.0,0.0,0.0,0.0,-0.919482,-4.637975,-1.362898,0.0,0.0,0.0,0.0,0.0,-1.566024,-2.06998,-1.178836,-1.5,-0.826004,0.0,0.0,0.0
25%,0.0,0.0,0.0,3.0,-0.382365,-0.4716789,-0.402387,0.0,0.0,0.0,0.0,0.0,-0.7503976,-0.66827,-0.4326861,-0.333333,-0.383934,0.0,0.009417,1.0
50%,0.0,0.0,0.0,7.0,0.0,0.2285389,0.0,0.0,1.0,0.0,0.0,0.0,-0.2375072,-0.1819624,-7.871308e-17,0.0,0.0,0.0,0.014167,1.0
75%,0.0,0.0,0.0,11.0,0.617635,0.7537023,0.597613,1.0,1.0,1.0,0.0,1.0,0.6534465,0.5045897,0.5673139,0.666667,0.616066,1.0,0.019083,1.0
max,1.0,1.0,1.0,11.0,3021.899604,1.173833,28.891439,1.0,1.0,1.0,1.0,1.0,2.587079,7.484535,30.58587,11.0,128.482571,1.0,0.033333,1.0


In [150]:
X_test_proc.describe()

Unnamed: 0,bankruptcies_transformer__Bankruptcies,tax_liens_transformer__Tax Liens,credit_problems_transformer__Number of Credit Problems,impute_encode_years_job__Years in current job,impute_encode_max_credit__Maximum Open Credit,transform_encode_credit_score__Credit Score,transform_scale_annual_income__Annual Income,ohe__Term_Long Term,ohe__Term_Short Term,ohe__Home Ownership_Home Mortgage,ohe__Home Ownership_Own Home,ohe__Home Ownership_Rent,standardscaler__Current Loan Amount,standardscaler__Years of Credit History,robustscaler__Monthly Debt,robustscaler__Number of Open Accounts,robustscaler__Current Credit Balance,passthrough__Ever Delinquent,passthrough__Debt to Income Ratio,passthrough__Debt Consolidation
count,6438.0,6438.0,6438.0,6438.0,6438.0,6438.0,6438.0,6438.0,6438.0,6438.0,6438.0,6438.0,6438.0,6438.0,6438.0,6438.0,6438.0,6438.0,6438.0,6438.0
mean,0.105468,0.019727,0.136067,6.556229,0.465627,0.004639,0.238424,0.278037,0.721963,0.489904,0.092731,0.417366,-0.00724,0.007545,0.170045,0.204204,0.356368,0.464585,0.01457,0.782231
std,0.307179,0.13907,0.342886,3.819225,3.48989,0.979715,1.080417,0.448066,0.448066,0.499937,0.290077,0.493163,1.005074,1.018182,0.873401,0.838734,1.451789,0.498783,0.006661,0.412762
min,0.0,0.0,0.0,0.0,-0.919482,-4.637975,-1.262782,0.0,0.0,0.0,0.0,0.0,-1.565072,-2.027071,-1.178836,-1.5,-0.826004,0.0,0.0,0.0
25%,0.0,0.0,0.0,3.0,-0.374634,-0.471679,-0.40549,0.0,0.0,0.0,0.0,0.0,-0.747839,-0.66827,-0.429314,-0.333333,-0.363366,0.0,0.009667,1.0
50%,0.0,0.0,0.0,7.0,0.012245,0.228539,-0.008904,0.0,1.0,0.0,0.0,0.0,-0.248515,-0.167659,0.006238,0.0,0.016776,0.0,0.01425,1.0
75%,0.0,0.0,0.0,11.0,0.639781,0.788713,0.571205,1.0,1.0,1.0,0.0,1.0,0.640297,0.490286,0.574175,0.666667,0.644461,1.0,0.019167,1.0
max,1.0,1.0,1.0,11.0,191.50616,1.173833,20.847874,1.0,1.0,1.0,1.0,1.0,2.586722,6.697861,6.932082,7.666667,25.614721,1.0,0.033333,1.0


In [151]:
X_val_proc.describe()

Unnamed: 0,bankruptcies_transformer__Bankruptcies,tax_liens_transformer__Tax Liens,credit_problems_transformer__Number of Credit Problems,impute_encode_years_job__Years in current job,impute_encode_max_credit__Maximum Open Credit,transform_encode_credit_score__Credit Score,transform_scale_annual_income__Annual Income,ohe__Term_Long Term,ohe__Term_Short Term,ohe__Home Ownership_Home Mortgage,ohe__Home Ownership_Own Home,ohe__Home Ownership_Rent,standardscaler__Current Loan Amount,standardscaler__Years of Credit History,robustscaler__Monthly Debt,robustscaler__Number of Open Accounts,robustscaler__Current Credit Balance,passthrough__Ever Delinquent,passthrough__Debt to Income Ratio,passthrough__Debt Consolidation
count,6437.0,6437.0,6437.0,6437.0,6437.0,6437.0,6437.0,6437.0,6437.0,6437.0,6437.0,6437.0,6437.0,6437.0,6437.0,6437.0,6437.0,6437.0,6437.0,6437.0
mean,0.110921,0.019264,0.139351,6.589716,0.462318,0.003587,0.259379,0.284139,0.715861,0.489203,0.093366,0.41743,0.013169,0.017866,0.179523,0.181503,0.357228,0.459375,0.014443,0.78375
std,0.314059,0.137461,0.346339,3.838923,4.299627,0.992467,1.106089,0.451038,0.451038,0.499922,0.290968,0.493173,0.992809,1.010088,0.891705,0.837271,1.45092,0.498386,0.006648,0.411719
min,0.0,0.0,0.0,0.0,-0.919482,-4.637975,-1.368508,0.0,0.0,0.0,0.0,0.0,-1.59863,-2.012768,-1.178836,-1.5,-0.826004,0.0,0.0,0.0
25%,0.0,0.0,0.0,3.0,-0.380854,-0.436668,-0.401528,0.0,0.0,0.0,0.0,0.0,-0.739331,-0.682573,-0.427905,-0.333333,-0.373473,0.0,0.0095,1.0
50%,0.0,0.0,0.0,7.0,-0.007213,0.228539,0.002876,0.0,1.0,0.0,0.0,0.0,-0.229177,-0.167659,0.021927,0.0,0.01756,0.0,0.014084,1.0
75%,0.0,0.0,0.0,11.0,0.622159,0.753702,0.596962,1.0,1.0,1.0,0.0,1.0,0.667012,0.518893,0.563624,0.666667,0.620811,1.0,0.0192,1.0
max,1.0,1.0,1.0,11.0,285.526352,1.173833,19.272247,1.0,1.0,1.0,1.0,1.0,2.586246,4.924268,9.00008,5.5,28.371082,1.0,0.033333,1.0


In [154]:
X_train_proc.to_csv('../Data/X_train_proc.csv')