# Import libraries

In [1]:
# General
import os
import pathlib
import pickle

# Analysis
import numpy as np
import pandas as pd

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# Autoreload
%load_ext autoreload
%autoreload 2

In [116]:
# Import the project's data preprocessing helpers
from package_folder.preprocessorG import * #load_loan_data

# Get clean X and y

## Locally (without using the preprocessor.py)

### Import data

In [168]:
# Project root = parent of the directory the notebook is executed from
ROOT_PATH = pathlib.Path.cwd().resolve().parent
# Location of the raw loan dataset, kept as a plain string
raw_data_path = str(ROOT_PATH / 'raw_data' / 'Loan_Default.csv')
print(raw_data_path)

/home/nicolas/code/YannAll/automated_loan_review_project/raw_data/Loan_Default.csv


In [169]:
# Load the full dataset from the CSV file located at raw_data_path
data = pd.read_csv(raw_data_path)
# Preview the first three rows
data.head(3)

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0


### Create a lightweight sample

To speed up model development, only a random sample of 1,000 rows is used.

In [170]:
# Draw a reproducible random sample: the fixed seed makes a re-run select the same rows,
# and the size is capped so the cell still works if the dataset has fewer than 1000 rows
data_light = data.sample(n=min(1000, len(data)), random_state=42)
data_light.head(3)

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
35835,60725,2019,ncf,Male,nopre,type1,p1,l1,nopc,nob/c,...,CIB,845,CIB,25-34,not_inst,79.826958,south,direct,0,36.0
128183,153073,2019,ncf,Joint,pre,type1,p1,l1,nopc,nob/c,...,EQUI,746,EXP,45-54,not_inst,,south,direct,1,
89732,114622,2019,cf,Joint,nopre,type1,p3,l1,nopc,nob/c,...,CRIF,669,EXP,65-74,to_inst,68.54067,North,direct,0,47.0


### Define the features (X) and the target (y)

In [171]:
# Raw input columns used as model features
feature_columns = ["age", "income", "loan_limit"]
X = data_light[feature_columns]
X.head(3)

Unnamed: 0,age,income,loan_limit
35835,25-34,13140.0,ncf
128183,45-54,14400.0,ncf
89732,65-74,4860.0,cf


In [172]:
# Target column, kept as a single-column DataFrame
target_columns = ["Status"]
y = data_light[target_columns]
y.head(3)

Unnamed: 0,Status
35835,0
128183,1
89732,0


### Preprocess the data

In [173]:
# Import from preprocessor2
def create_preprocessor(data):
    """Build an unfitted ColumnTransformer tailored to the columns of `data`.

    Numerical columns are mean-imputed and standardised; categorical columns are
    one-hot encoded (categories unseen at fit time are ignored at transform time).
    No imputation is applied to categorical columns: missing values reach the
    encoder as-is.

    Arguments:
    - data: pandas DataFrame whose column dtypes decide how each column is handled

    Returns:
    - preprocessor: the unfitted ColumnTransformer
    - categorical_features: list of column names routed to the one-hot encoder
    - numerical_features: list of column names routed to the imputer + scaler
    """
    # Select by dtype family rather than exact dtype: the previous int64/float64 and
    # object-only filters left out e.g. int32/float32 and 'category' columns, which the
    # ColumnTransformer then dropped without any warning.
    categorical_features = data.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_features = data.select_dtypes(include='number').columns.tolist()

    # Numerical columns: fill missing values with the column mean, then standardise
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # Categorical columns: dense one-hot encoding
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Route each group of columns to its transformer; the step names 'num' and 'cat'
    # are used later to look the fitted transformers up again
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

    return preprocessor, categorical_features, numerical_features

In [174]:
# Build the (still unfitted) preprocessor and keep the lists of column names that
# create_preprocessor classified as categorical and numerical
preprocessor, categorical_features, numerical_features = create_preprocessor(X)

In [175]:
# Fit the preprocessor on X and replace X with the transformed feature matrix.
# NOTE: X is overwritten here, so this cell is not idempotent; rebuild X from data_light
# before running it again.
X = preprocessor.fit_transform(X)

In [176]:
# Recover the output column names: numerical columns keep their names, followed by the
# names generated by the fitted one-hot encoder
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
encoded_names = onehot_encoder.get_feature_names_out(categorical_features)
transformed_columns = [*numerical_features, *encoded_names]

In [177]:
# Convert the transformed data into a DataFrame labelled with transformed_columns.
# NOTE: X is rebound once more here (transformer output -> DataFrame).
X = pd.DataFrame(X, columns=transformed_columns)
# Preview the first three rows
X.head(3)

Unnamed: 0,income,age_25-34,age_35-44,age_45-54,age_55-64,age_65-74,age_<25,age_>74,loan_limit_cf,loan_limit_ncf,loan_limit_nan
0,1.074253,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.297719,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.394233,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


## Using preprocessor.py

In [140]:
# Run the packaged preprocessing pipeline (process_data is not defined in this notebook;
# presumably it comes from the star import of package_folder.preprocessorG).
# NOTE(review): this rebinds `preprocessor`, replacing the ColumnTransformer returned by
# create_preprocessor(X) in the "Locally" section. my_prediction_function later calls
# preprocessor.transform(...), so confirm that the object returned here supports it
# and matches the features the model is trained on. TODO confirm
preprocessor = process_data()

🔍 Checking for file at path: /home/nicolas/code/YannAll/automated_loan_review_project/raw_data/Loan_Default.csv
✅ Data loaded successfully
🔍 Checking for file at path: /home/nicolas/code/YannAll/automated_loan_review_project/raw_data/Loan_Default.csv
✅ Data loaded successfully
✅ Data cleaned
✅ Categorical variables encoded successfully, including 'term'
✅ Outliers removed based on 3 * IQR threshold
✅ Columns 'year' and 'ID' dropped
✅ Missing values imputed with KNN Imputer
✅ Tree-based imputation models fitted
✅ Missing values imputed with tree-based models
✅ Continuous variables scaled between 0 and 1
✅ Transformed data saved successfully at /home/nicolas/code/YannAll/automated_loan_review_project/raw_data/loan_preprocessed.csv


In [127]:
# Project root = parent of the directory the notebook is executed from
ROOT_PATH = pathlib.Path.cwd().resolve().parent
# Location of the preprocessed dataset, kept as a plain string
raw_data_path = str(ROOT_PATH / 'raw_data' / 'loan_preprocessed.csv')

In [128]:
# Load the preprocessed dataset from raw_data_path.
# NOTE: this rebinds `data`, which held the raw Loan_Default.csv frame in the "Locally" section.
data = pd.read_csv(raw_data_path)
# Preview the first three rows
data.head(3)

Unnamed: 0,income,Status,age_25-34,age_35-44,age_45-54,age_55-64,age_65-74,age_<25,age_>74,age_nan,loan_limit_cf,loan_limit_ncf,loan_limit_nan
0,0.030623,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.087645,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.166843,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Define the features (X) and the target (y)

In [130]:
# Features = every column except the target
X = data.drop("Status", axis="columns")
X.head(3)

Unnamed: 0,income,age_25-34,age_35-44,age_45-54,age_55-64,age_65-74,age_<25,age_>74,age_nan,loan_limit_cf,loan_limit_ncf,loan_limit_nan
0,0.030623,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.087645,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.166843,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [131]:
# Target column, kept as a single-column DataFrame
target_columns = ["Status"]
y = data[target_columns]
y.head(3)

Unnamed: 0,Status
0,1.0
1,1.0
2,0.0


# Instantiating, fitting and saving the model

In [178]:
# Instantiate a logistic-regression classifier with scikit-learn's default hyperparameters
model = LogisticRegression()

In [179]:
# Train the model on the prepared features X and target y.
# y is a single-column DataFrame; flatten it to 1-D so scikit-learn does not emit the
# "column-vector y was passed when a 1d array was expected" conversion warning.
model.fit(X, np.ravel(y))

  y = column_or_1d(y, warn=True)


In [180]:
# Save the trained model under <parent of the working directory>/models, the same
# location my_prediction_function loads it from
model_dir = pathlib.Path().resolve().parent / 'models'
model_dir.mkdir(parents=True, exist_ok=True)  # create the folder on a fresh checkout
with open(model_dir / 'mvp_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# Predict

In [None]:
def my_prediction_function(age, income, loan_limit, preprocessor, model_path=None):
    """Predict for a single applicant using a pretrained model loaded from disk.

    Arguments:
    - age: age bracket of the applicant (e.g. "35-44")
    - income: income of the applicant
    - loan_limit: loan limit category (e.g. "ncf")
    - preprocessor: fitted object exposing `transform`; it receives a one-row
      DataFrame with the columns "age", "income" and "loan_limit"
    - model_path: optional path of the pickled model. When omitted, the model is
      read from <parent of the current working directory>/models/mvp_model.pkl

    Returns:
    - the value returned by `model.predict` for the single preprocessed row
    """
    print(f"""Arguments taken into account:
        - age: {age}
        - income: {income}
        - loan limit: {loan_limit}
        - preprocessor: {preprocessor}""")

    # Resolve the default model location only when the caller did not supply one
    if model_path is None:
        ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
        model_path = os.path.join(ROOT_PATH, 'models', 'mvp_model.pkl')
    print(f"Path of the model.pkl:\n{model_path}\n")

    # NOTE: unpickling can execute arbitrary code; only load model files you trust
    with open(model_path, 'rb') as file:
        model = pickle.load(file)

    # Build a one-row dataframe with the inputs
    X = pd.DataFrame({
        "age": age,
        "income": income,
        "loan_limit": loan_limit}, index = [0])
    print(f"Data before preprocessing:\n{X}\n")

    # Transform the features with the already-fitted preprocessor (no re-fitting here)
    X_preprocessed = preprocessor.transform(X)
    print(f"Data after preprocessing:\n{X_preprocessed}\n")

    # Use the model to predict the given inputs
    prediction = model.predict(X_preprocessed)
    print(f"Prediction: {prediction}")

    print("✅ Prediction done succesfully")

    return prediction

In [183]:
# Example call: age bracket "35-44", income 13680.0, loan limit "ncf", using whatever
# object `preprocessor` is currently bound to in the kernel
my_prediction_function("35-44", 13680.0, "ncf", preprocessor)

Arguments taken into account:
        - age: 35-44
        - income: 13680.0
        - loan limit: ncf
        - preprocessor: ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['income']),
                                ('cat',
                                 Pipeline(steps=[('onehot',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False))]),
                                 ['age', 'loan_limit'])])
Path of the model.pkl:
/home/nicolas/code/YannAll/automated_loan_review_project/models/mvp_model.pkl

Data before preprocessing:
     age   income loan_limit
0  35-44  13680.0        ncf

Data after preprocessing:
[[1.17002434 0.         1.         0.         0.         0.
  0.         0.  



array([0])

# END OF THE NOTEBOOK