# Import libraries

In [2]:
# General
import os
import pathlib
import pickle

# Analysis
import numpy as np
import pandas as pd

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# Autoreload
%load_ext autoreload
%autoreload 2

In [3]:
# Import project preprocessd data
from package_folder.preprocessor import * #load_loan_data

# 1. MVP model and predict function to plug into API

## Using already preprocessed data

### Import data

In [13]:
ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
raw_data_path = os.path.join(ROOT_PATH, 'raw_data', 'loan_preprocessed_shared_manually_by_Gilian.csv')
print(raw_data_path)

/home/yann/code/YannAll/automated_loan_review_project/raw_data/loan_preprocessed_shared_manually_by_Gilian.csv


In [14]:
# Full dataset from the csv file
data = pd.read_csv(raw_data_path)
data.head(3)

Unnamed: 0,loan_amount,rate_of_interest,Interest_rate_spread,property_value,income,Credit_Score,LTV,Status,dtir1,loan_limit_cf,...,age_<25,age_>74,submission_of_application_not_inst,submission_of_application_to_inst,Region_North,Region_North-East,Region_central,Region_south,Security_Type_Indriect,Security_Type_direct
0,0.087719,0.333333,0.29075,0.062112,0.07754,0.645,0.622958,1.0,0.714286,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.166667,0.361905,0.443806,0.144928,0.221925,0.13,0.532207,1.0,0.529762,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.342105,0.556571,0.354616,0.304348,0.42246,0.835,0.501101,0.0,0.732143,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


### Define the features (X) and the target (y)

In [18]:
X = data.drop(columns='Status')
X.head(3)

Unnamed: 0,loan_amount,rate_of_interest,Interest_rate_spread,property_value,income,Credit_Score,LTV,dtir1,loan_limit_cf,loan_limit_ncf,...,age_<25,age_>74,submission_of_application_not_inst,submission_of_application_to_inst,Region_North,Region_North-East,Region_central,Region_south,Security_Type_Indriect,Security_Type_direct
0,0.087719,0.333333,0.29075,0.062112,0.07754,0.645,0.622958,0.714286,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.166667,0.361905,0.443806,0.144928,0.221925,0.13,0.532207,0.529762,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.342105,0.556571,0.354616,0.304348,0.42246,0.835,0.501101,0.732143,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [16]:
y = data[["Status"]]
y.head(3)

Unnamed: 0,Status
0,1.0
1,1.0
2,0.0


In [140]:
preprocessor = process_data()

🔍 Checking for file at path: /home/nicolas/code/YannAll/automated_loan_review_project/raw_data/Loan_Default.csv
✅ Data loaded successfully
🔍 Checking for file at path: /home/nicolas/code/YannAll/automated_loan_review_project/raw_data/Loan_Default.csv
✅ Data loaded successfully
✅ Data cleaned
✅ Categorical variables encoded successfully, including 'term'
✅ Outliers removed based on 3 * IQR threshold
✅ Columns 'year' and 'ID' dropped
✅ Missing values imputed with KNN Imputer
✅ Tree-based imputation models fitted
✅ Missing values imputed with tree-based models
✅ Continuous variables scaled between 0 and 1
✅ Transformed data saved successfully at /home/nicolas/code/YannAll/automated_loan_review_project/raw_data/loan_preprocessed.csv


In [127]:
ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
raw_data_path = os.path.join(ROOT_PATH, 'raw_data', 'loan_preprocessed.csv')

In [128]:
data = pd.read_csv(raw_data_path)
data.head(3)

Unnamed: 0,income,Status,age_25-34,age_35-44,age_45-54,age_55-64,age_65-74,age_<25,age_>74,age_nan,loan_limit_cf,loan_limit_ncf,loan_limit_nan
0,0.030623,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.087645,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.166843,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Define the features (X) and the target (y)

In [32]:
X = data.drop(columns="Status")
X.head(3)

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,income,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,1740.0,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,4980.0,EQUI,552,EXP,55-64,to_inst,,North,direct,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,9480.0,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,46.0


In [33]:
y = data[["Status"]]
y.head(3)

Unnamed: 0,Status
0,1
1,1
2,0


# Instantiating, fitting and saving the model

In [19]:
# Instantiate the model
model = LogisticRegression()

In [None]:
from sklearn.model_selection import cross_validate
base_model_score=cross_validate(model,X_scaled,y,cv=5)['test_score'].mean()
base_model_score

In [20]:
# Train the model on the full dataset
model.fit(X, y)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# Save the pretrain model
with open('../models/mvp_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# Predict

In [70]:
# Build a one row small dataframe X_predict with the 3 inputs given by the user. For the other features we simply take the average value of X
X_predict=pd.DataFrame(X.mean()).transpose()
X_predict['age_35-44']=1
X_predict['income']=100
X_predict['loan_limit_ncf']=1

In [74]:
X_predict

Unnamed: 0,loan_amount,rate_of_interest,Interest_rate_spread,property_value,income,Credit_Score,LTV,dtir1,loan_limit_cf,loan_limit_ncf,...,age_<25,age_>74,submission_of_application_not_inst,submission_of_application_to_inst,Region_North,Region_North-East,Region_central,Region_south,Security_Type_Indriect,Security_Type_direct
0,0.268322,0.419955,0.408185,0.27477,100,0.499578,0.462871,0.588397,0.939623,1,...,0.009138,0.047978,0.349531,0.650469,0.502397,0.008448,0.058961,0.430195,0.000221,0.999779


In [65]:
from package_folder.preprocessor import CategoricalEncoder
categorical_encoder = CategoricalEncoder()
categorical_encoder.fit(X)
X_predict_encoded=categorical_encoder.transform(X_predict)

✅ Categorical variables encoded successfully, including 'term'


In [69]:
X_predict_encoded

Unnamed: 0,loan_amount,rate_of_interest,Interest_rate_spread,property_value,income,Credit_Score,LTV,dtir1,loan_limit_cf,loan_limit_ncf,...,age_<25,age_>74,submission_of_application_not_inst,submission_of_application_to_inst,Region_North,Region_North-East,Region_central,Region_south,Security_Type_Indriect,Security_Type_direct
0,0.268322,0.419955,0.408185,0.27477,100.0,0.499578,0.462871,0.588397,0.939623,1,...,0.009138,0.047978,0.349531,0.650469,0.502397,0.008448,0.058961,0.430195,0.000221,0.999779


In [None]:
#Observation: running the categorical encoder on X_predict is useless as X_predict has only numerical values.

In [71]:
from package_folder.preprocessor import MinMaxScalerTransformer
minMaxScalerTransformer = MinMaxScalerTransformer()
minMaxScalerTransformer.fit(X)
X_predict_encoded_scaled=minMaxScalerTransformer.transform(X_predict_encoded)

✅ Continuous variables scaled between 0 and 1


In [72]:
X_predict_encoded_scaled

Unnamed: 0,loan_amount,rate_of_interest,Interest_rate_spread,property_value,income,Credit_Score,LTV,dtir1,loan_limit_cf,loan_limit_ncf,...,age_<25,age_>74,submission_of_application_not_inst,submission_of_application_to_inst,Region_North,Region_North-East,Region_central,Region_south,Security_Type_Indriect,Security_Type_direct
0,0.268322,0.419955,0.408185,0.27477,100.0,0.499578,0.462871,0.588397,0.939623,1,...,0.009138,0.047978,0.349531,0.650469,0.502397,0.008448,0.058961,0.430195,0.000221,0.999779


In [None]:
#Observation: running the MinMaxScalerTransformer on X_predict does not scale the income value as expected. ???

In [67]:
#Run predict on our "applicant" (i.e. X_predict_encoded_scaled)
prediction=model.predict(X_predict_encoded_scaled)

In [68]:
prediction

array([0.])

# 2. Data quality check on preprocessed data

In [85]:
#Imports
from package_folder import preprocessor_light

## 2.1. Using preprocessor light

In [104]:
ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
raw_data_path = os.path.join(ROOT_PATH, 'raw_data', 'loan_preprocessed_shared_manually_by_Gilian.csv')
print(raw_data_path)

/home/yann/code/YannAll/automated_loan_review_project/raw_data/loan_preprocessed_shared_manually_by_Gilian.csv


In [111]:
df=pd.read_csv(raw_data_path)

In [110]:
sorted(list(df.columns))

['Credit_Score',
 'Credit_Worthiness_l1',
 'Credit_Worthiness_l2',
 'Gender_Female',
 'Gender_Joint',
 'Gender_Male',
 'Gender_Sex Not Available',
 'Interest_rate_spread',
 'LTV',
 'Neg_ammortization_neg_amm',
 'Neg_ammortization_not_neg',
 'Region_North',
 'Region_North-East',
 'Region_central',
 'Region_south',
 'Secured_by_home',
 'Secured_by_land',
 'Security_Type_Indriect',
 'Security_Type_direct',
 'Status',
 'age_25-34',
 'age_35-44',
 'age_45-54',
 'age_55-64',
 'age_65-74',
 'age_<25',
 'age_>74',
 'approv_in_adv_nopre',
 'approv_in_adv_pre',
 'business_or_commercial_b/c',
 'business_or_commercial_nob/c',
 'co-applicant_credit_type_CIB',
 'co-applicant_credit_type_EXP',
 'construction_type_mh',
 'construction_type_sb',
 'credit_type_CIB',
 'credit_type_CRIF',
 'credit_type_EQUI',
 'credit_type_EXP',
 'dtir1',
 'income',
 'interest_only_int_only',
 'interest_only_not_int',
 'loan_amount',
 'loan_limit_cf',
 'loan_limit_ncf',
 'loan_purpose_p1',
 'loan_purpose_p2',
 'loan_purpos

# END OF THE NOTEBOOK