# Import libraries

In [1]:
# General
import os
import pathlib
import pickle

# Analysis
import numpy as np
import pandas as pd

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# Autoreload
%load_ext autoreload
%autoreload 2

In [2]:
# Import preprocess_light functions
from package_folder.preprocessor_light import * #load_loan_data

# Get processed X, y and fitted preprocessed 

In [3]:
# Creating a file containing the processed values (including X and y), saving it to the directory raw_data > loan_preprocessed and get the preprocessor
preprocessor = process_data()
preprocessor

🔍 Checking for file at path: /home/nicolas/code/YannAll/automated_loan_review_project/raw_data/Loan_Default.csv
✅ Data loaded successfully
✅ Data cleaned
✅ Columns ['year', 'ID'] dropped
✅ Missing values in categorical variables imputed
✅ Categorical variables encoded successfully, including 'term'
✅ Missing values imputed with Simple Imputer (mean), remaining NaNs filled with 0
✅ Outliers removed based on IQR threshold
✅ Continuous variables scaled between 0 and 1
✅ Transformed data saved successfully at /home/nicolas/code/YannAll/automated_loan_review_project/raw_data/loan_preprocessed.csv


In [4]:
# Get the path to the loan_preprocessed.csv file that contains processed X and y
ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
raw_data_path = os.path.join(ROOT_PATH, 'raw_data', 'loan_preprocessed.csv')

In [5]:
# Creating a dataframe with the processed X and y
data_processed = pd.read_csv(raw_data_path)
print(data_processed.shape)
data_processed.head(3)

(141441, 96)


Unnamed: 0,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,property_value,income,Credit_Score,LTV,Status,dtir1,...,age_<25,age_>74,submission_of_application_not_inst,submission_of_application_to_inst,Region_North,Region_North-East,Region_central,Region_south,Security_Type_Indriect,Security_Type_direct
0,0.087719,0.512993,0.506896,0.272279,0.068027,0.082621,0.645,0.677815,1.0,0.714286,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.166667,0.512993,0.506896,0.272279,0.326458,0.236467,0.13,0.491878,1.0,0.584517,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.342105,0.66,0.426611,0.050235,0.333333,0.450142,0.835,0.543927,0.0,0.732143,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [6]:
# Creating X
X = data_processed.drop(columns=["Status"])
print(X.shape)
X.head(3)

(141441, 95)


Unnamed: 0,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,property_value,income,Credit_Score,LTV,dtir1,loan_limit_cf,...,age_<25,age_>74,submission_of_application_not_inst,submission_of_application_to_inst,Region_North,Region_North-East,Region_central,Region_south,Security_Type_Indriect,Security_Type_direct
0,0.087719,0.512993,0.506896,0.272279,0.068027,0.082621,0.645,0.677815,0.714286,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.166667,0.512993,0.506896,0.272279,0.326458,0.236467,0.13,0.491878,0.584517,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.342105,0.66,0.426611,0.050235,0.333333,0.450142,0.835,0.543927,0.732143,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [7]:
# Creating y
y = data_processed[["Status"]]
print(y.shape)
y.head(3)

(141441, 1)


Unnamed: 0,Status
0,1.0
1,1.0
2,0.0


# Instantiating, fitting and saving the model

In [8]:
# Instantiate the model
model = LogisticRegression()

In [9]:
# Train the model on the full dataset
model.fit(X, y)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# Save the pretrain model
with open('../models/mvp_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# User inputs

In [11]:
loan_limit = "cf" # categorical feature
income = 5760.0 # numerical feature
age = "45-54" # categorical feature

# Building an input dataframe

In [12]:
# Get the path to the Loan_Default.csv file (raw data))
ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
raw_data_path = os.path.join(ROOT_PATH, 'raw_data', 'Loan_Default.csv')

In [13]:
data_raw = pd.read_csv(raw_data_path)
print(data_raw.shape)
data_raw.head(3)

(148670, 34)


Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0


In [14]:
X_user = pd.DataFrame(data_raw.iloc[0, :]).transpose()#.drop(columns="Status")
X_user["loan_limit"] = loan_limit
X_user["income"] = income
X_user["age"] = age
X_user

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,45-54,to_inst,98.728814,south,direct,1,45.0


# Predict

In [15]:
X_user_processed = preprocessor.transform(X_user).drop(columns="Status")


⚠️ Small dataset detected, skipping column removal
✅ Columns ['year', 'ID'] dropped
✅ Missing values in categorical variables imputed
✅ Categorical variables encoded successfully, including 'term'
✅ Missing values imputed with Simple Imputer (mean), remaining NaNs filled with 0
⚠️ Small dataset detected, skipping outlier removal to avoid excessive data loss
✅ Continuous variables scaled between 0 and 1


In [16]:
X_user_processed

Unnamed: 0,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,property_value,income,Credit_Score,LTV,dtir1,loan_limit_cf,...,age_<25,age_>74,submission_of_application_not_inst,submission_of_application_to_inst,Region_North,Region_North-East,Region_central,Region_south,Security_Type_Indriect,Security_Type_direct
0,0.087719,0.512993,0.506896,0.272279,0.068027,0.273504,0.645,0.677815,0.714286,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [17]:
prediction = model.predict(X_user_processed)
prediction

array([0.])

# Building functions

## Creating input dataframe

In [18]:
def creating_full_dataframe_from_inputs(loan_limit, income, age):
    # Get the path to the Loan_Default.csv file (raw data))
    ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
    raw_data_path = os.path.join(ROOT_PATH, 'raw_data', 'Loan_Default.csv')

    # Convert the Loan_Default.file into a DataFrame
    data_raw = pd.read_csv(raw_data_path)

    # Creating the input Dataframe
    X_user = pd.DataFrame(data_raw.iloc[0, :]).transpose()#.drop(columns="Status")
    X_user["loan_limit"] = loan_limit
    X_user["income"] = income
    X_user["age"] = age

    print("✅ Input dataframe created successfully")

    return X_user

In [19]:
creating_full_dataframe_from_inputs("cf", 5760.0, "45-54")

✅ Input dataframe created successfully


Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,45-54,to_inst,98.728814,south,direct,1,45.0


## Creating my_prediction_function

In [20]:
def my_prediction_function(loan_limit, income, age):
    # Create the input dataframe
    X_user = creating_full_dataframe_from_inputs(loan_limit, income, age)

    # Load the preprocessor and transform the input dataframe
    preprocessor = process_data()
    X_user_processed = preprocessor.transform(X_user).drop(columns="Status")

    # Load the model from the pretrain model pickle file
    ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
    model_path = os.path.join(ROOT_PATH, 'models', 'mvp_model.pkl')
    print(f"Path of the model.pkl:\n{model_path}\n")
    with open(model_path, 'rb') as file:
        model = pickle.load(file)

    # Predict
    prediction = model.predict(X_user_processed)
    print(f"Prediction: {prediction[0]}")

    print("✅ Prediction done succesfully")

    return prediction

In [None]:
# Test if the function work
my_prediction_function("cf", 5760.0, "45-54")

✅ Input dataframe created successfully
🔍 Checking for file at path: /home/nicolas/code/YannAll/automated_loan_review_project/raw_data/Loan_Default.csv
✅ Data loaded successfully
✅ Data cleaned
✅ Columns ['year', 'ID'] dropped
✅ Missing values in categorical variables imputed
✅ Categorical variables encoded successfully, including 'term'
✅ Missing values imputed with Simple Imputer (mean), remaining NaNs filled with 0
✅ Outliers removed based on IQR threshold
✅ Continuous variables scaled between 0 and 1
✅ Transformed data saved successfully at /home/nicolas/code/YannAll/automated_loan_review_project/raw_data/loan_preprocessed.csv
⚠️ Small dataset detected, skipping column removal
✅ Columns ['year', 'ID'] dropped
✅ Missing values in categorical variables imputed
✅ Categorical variables encoded successfully, including 'term'
✅ Missing values imputed with Simple Imputer (mean), remaining NaNs filled with 0
⚠️ Small dataset detected, skipping outlier removal to avoid excessive data loss
✅ 

array([0.])

# END OF THE NOTEBOOK