# Import libraries

In [1]:
# General
import os
import pathlib
import pickle

# Analysis
import numpy as np
import pandas as pd

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# Autoreload
%load_ext autoreload
%autoreload 2

In [2]:
# Import project preprocessd data
from package_folder.preprocessor import * #load_loan_data

# 1. Load data, create X and Y, preprocess

In [181]:
#Load raw data
ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
raw_data_path = os.path.join(ROOT_PATH, 'raw_data', 'Loan_Default.csv')
print(raw_data_path)

/home/yann/code/YannAll/automated_loan_review_project/raw_data/Loan_Default.csv


In [182]:
data = pd.read_csv(raw_data_path)

In [191]:
data_processed=full_pipeline.fit_transform(data)

✅ Data cleaned
✅ Columns ['year', 'ID'] dropped
✅ Missing values in categorical variables imputed
✅ Categorical variables encoded successfully, including 'term'
✅ Missing values imputed with Simple Imputer (mean), remaining NaNs filled with 0
✅ Outliers removed based on IQR threshold
✅ Continuous variables scaled between 0 and 1


In [196]:
X = data_processed.drop(columns='Status')
y = data_processed["Status"]

In [197]:
X_processed_column_names=list(X_processed.columns)

In [204]:
print(f"X_shape: {X.shape} y shape: {y.shape}")

X_shape: (141441, 95) y shape: (141441,)


# 2. Instantiate, cross validate, train and evaluate the base model

In [208]:
# Instantiate the base model
logistic_regression = LogisticRegression()

In [205]:
#Create train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [209]:
#Cross validate data
from sklearn.model_selection import cross_validate
base_model_score=cross_validate(logistic_regression,X_train,y_train,cv=5,verbose=0)['test_score'].mean()
print(f"base model: {base_model_score}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

base model: 0.8708727877342334


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [211]:
# Train the model 
logistic_regression.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [214]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Make predictions on the test set 
y_pred = logistic_regression.predict(X_test) 
# Evaluate the model accuracy
accuracy= accuracy_score(y_test, y_pred) 
conf_matrix = confusion_matrix(y_test, y_pred) 
class_report = classification_report(y_test, y_pred)

In [215]:
print(accuracy_score,conf_matrix,class_report)

<function accuracy_score at 0x7fb1812df880> [[21020   246]
 [ 3386  3637]]               precision    recall  f1-score   support

         0.0       0.86      0.99      0.92     21266
         1.0       0.94      0.52      0.67      7023

    accuracy                           0.87     28289
   macro avg       0.90      0.75      0.79     28289
weighted avg       0.88      0.87      0.86     28289



## 3. Predict one raw including user input

In [216]:
from package_folder import preprocessor_light 

In [235]:
#Import raw data
data=preprocessor_light.load_loan_data()
X_raw=data.drop(columns='Status')

🔍 Checking for file at path: /home/yann/code/YannAll/automated_loan_review_project/raw_data/Loan_Default.csv
✅ Data loaded successfully


In [236]:
X_small=data.drop(columns='Status').iloc[0]

In [237]:
# Add user inputs
X_small['age']='25-34'
X_small['income']=2500
X_small['loan_limit']='cf'

In [238]:
X_small=pd.DataFrame(X_small).transpose()

In [244]:
full_pipeline = preprocessor_light.create_preprocessing_pipeline()
full_pipeline.fit(X_raw)
X_small_processed= full_pipeline.transform(X_small)

✅ Data cleaned
✅ Columns ['year', 'ID'] dropped
✅ Missing values in categorical variables imputed
✅ Categorical variables encoded successfully, including 'term'
✅ Missing values imputed with Simple Imputer (mean), remaining NaNs filled with 0
✅ Outliers removed based on IQR threshold
⚠️ Small dataset detected, skipping column removal
✅ Columns ['year', 'ID'] dropped
✅ Missing values in categorical variables imputed
✅ Categorical variables encoded successfully, including 'term'
✅ Missing values imputed with Simple Imputer (mean), remaining NaNs filled with 0
⚠️ Small dataset detected, skipping outlier removal to avoid excessive data loss
✅ Continuous variables scaled between 0 and 1


In [245]:
X_small_processed

Unnamed: 0,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,property_value,income,Credit_Score,LTV,dtir1,loan_limit_cf,...,age_<25,age_>74,submission_of_application_not_inst,submission_of_application_to_inst,Region_North,Region_North-East,Region_central,Region_south,Security_Type_Indriect,Security_Type_direct
0,0.087719,0.512993,0.506896,0.272279,0.068027,0.118708,0.645,0.677815,0.714286,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [246]:
X_small_processed_column_names=list(X_small_processed.columns)

In [249]:
#Check shapes of X and X_small to ensure they are compatible
difference = list(set(X_small_processed_column_names) -set(X_processed_column_names))
print(f"X_shape: {X.shape} y shape: {y.shape}")
print(f"X_small_processed shape: {X_small_processed.shape}")

X_shape: (141441, 95) y shape: (141441,)
X_small_processed shape: (1, 95)


In [252]:
#Predict
y_small=logistic_regression.predict(X_small_processed)
if int(y_small)==0:
    print('Your credit application is approved')
else:
    print ('Sorry. Your credit application is NOT approved')

Your credit application is approved


  if int(y_small)==0:


# END OF THE NOTEBOOK