# Import libraries

In [1]:
# General
import os
import pathlib
import pickle

# Analysis
import numpy as np
import pandas as pd

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# Autoreload
%load_ext autoreload
%autoreload 2

In [2]:
# Import project preprocessd data
from package_folder import preprocessor_light_PCA

# 1. Load data, create X and Y, preprocess

In [16]:
#Load raw data
ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
raw_data_path = os.path.join(ROOT_PATH, 'raw_data', 'Loan_Default.csv')
print(raw_data_path)

/home/yann/code/YannAll/automated_loan_review_project/raw_data/Loan_Default.csv


In [17]:
data = pd.read_csv(raw_data_path)

In [18]:
data.shape

(148670, 34)

In [19]:
data.columns

Index(['ID', 'year', 'loan_limit', 'Gender', 'approv_in_adv', 'loan_type',
       'loan_purpose', 'Credit_Worthiness', 'open_credit',
       'business_or_commercial', 'loan_amount', 'rate_of_interest',
       'Interest_rate_spread', 'Upfront_charges', 'term', 'Neg_ammortization',
       'interest_only', 'lump_sum_payment', 'property_value',
       'construction_type', 'occupancy_type', 'Secured_by', 'total_units',
       'income', 'credit_type', 'Credit_Score', 'co-applicant_credit_type',
       'age', 'submission_of_application', 'LTV', 'Region', 'Security_Type',
       'Status', 'dtir1'],
      dtype='object')

In [27]:
full_pipeline = preprocessor_light_PCA.create_preprocessing_pipeline()

In [28]:
data_processed=full_pipeline.fit_transform(data)

✅ Data cleaned
✅ Columns ['credit_type','year','ID','dtir1','Upfront_charges', 'LTV', 'Interest_rate_spread'] dropped
✅ Missing values in categorical variables imputed
✅ Categorical variables encoded successfully, including 'term'
✅ Missing values imputed with Simple Imputer (mean), remaining NaNs filled with 0
✅ Outliers removed based on IQR threshold
✅ Continuous variables scaled and centered around 0


In [9]:
X = data_processed.drop(columns='Status')
y = data_processed["Status"]

In [10]:
X_column_names=list(X.columns)

In [11]:
print(f"X_shape: {X.shape} y shape: {y.shape}")

X_shape: (144218, 86) y shape: (144218,)


In [12]:
# Instantiate the base model
logistic_regression = LogisticRegression()
#Create train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2,random_state=42)
# Train the model 
logistic_regression.fit(X_train, y_train)
# Make predictions on the test set 
y_pred = logistic_regression.predict(X_test) 
# Evaluate the model accuracy
accuracy= accuracy_score(y_test, y_pred) 
conf_matrix = confusion_matrix(y_test, y_pred) 
class_report = classification_report(y_test, y_pred)
print(f"Accuracy score: {accuracy}\n")
print(f"Confusion matrix: {conf_matrix}\n'")
print(f"Class_report: {class_report}\n")

NameError: name 'accuracy_score' is not defined

# 2. Principal Component Analysis

In [None]:
#Import and instantiate a PCA
from sklearn.decomposition import PCA
pca=PCA()
#Fit and transform X and display it as a dataframe
pca.fit(X)
X_proj=pca.transform(X)
X_proj=pd.DataFrame(X_proj,columns=[f"PC{i}" for i in range(1,87)])

In [None]:
#Display correlations after PCA treatment 
sns.heatmap(X_proj.corr(),cmap='coolwarm')

In [None]:
#Observation: unlike expected, the correlations are not disappearing fully post PCA treatment. Let's investigate this later.

In [None]:
#For now let's focus on reducing dimensionality
pca.explained_variance_ratio_
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.title("Variance explained by PCA factors")
plt.xlabel('PCA')
plt.ylabel('Variance')

In [None]:
#Put the % of variance explained in a dataframe
percentage=[round(pca, 3) for pca in list(np.cumsum(pca.explained_variance_ratio_))]
percentage=[float(pca) for pca in percentage]
PCA_index=[i for i in range(1,87)]
summary=pd.DataFrame(percentage,PCA_index,columns=['% of cumulated variance explained'])
#Check how many PCA factors are requested to explain 95% of variance
summary[summary['% of cumulated variance explained']<0.951]

In [None]:
#Conclusion: 2=with only 24 PCA factors, we would still explain 95% of our initial X variance

In [None]:
#Build a PCA model to reduce dimensionality from currently 95 to 24 features
pca_24=PCA(n_components=24)
#Fit and transform X and display it as a dataframe
pca_24.fit(X)
X_proj_24=pca_24.transform(X)
X_proj_24=pd.DataFrame(X_proj,columns=[f"PC{i}" for i in range(1,25)])

# 3.Run a logistic regression on X_proj_24

In [None]:
# Instantiate the base model
logistic_regression = LogisticRegression()

In [None]:
#Create train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X_proj_24,y,test_size=0.2,random_state=42)

In [None]:
# Train the model 
logistic_regression.fit(X_train, y_train)

In [None]:
X_train.shape

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Make predictions on the test set 
y_pred = logistic_regression.predict(X_test) 
# Evaluate the model accuracy
accuracy= accuracy_score(y_test, y_pred) 
conf_matrix = confusion_matrix(y_test, y_pred) 
class_report = classification_report(y_test, y_pred)

In [None]:
print(f"Accuracy score: {accuracy}\n")
print(f"Confusion matrix: {conf_matrix}\n'")
print(f"Class_report: {class_report}\n")

Observation: the logistic regression recall drops from previously 29% to 25% when we add a PCA treatment to the preprocessing
This is significant but logical given the 95% of variance kept post-PCA treatment.
The main difference between preprocessing_light and preprocessing_light_PCA is actually not triggered by the PCA itself but by the scaler. PCA requires the data to be centered around 0 i.e. a standard scaler, while our preprocessing_light model worked with a MinMax scaler. It appears MinMax has a much better performance.

In [None]:
X_train_pd=pd.DataFrame(X_train,columns=[f"PC{i}" for i in range(1,25)])

In [None]:
#Check correlation matrix of X_train
sns.heatmap(X_train.corr(), cmap='coolwarm')

In [None]:
#This time the correlation matrix is clean!

# END OF THE NOTEBOOK