# Import libraries

In [1]:
# General
import os
import pathlib
import pickle

# Analysis
import numpy as np
import pandas as pd

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# Autoreload
%load_ext autoreload
%autoreload 2

In [2]:
# Import project preprocessd data
from package_folder.preprocessorG import * #load_loan_data

ModuleNotFoundError: No module named 'package_folder.preprocessorG'

# Get clean X and y

## Locally (without using the preprocessor.py)

### Import data

In [3]:
ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
raw_data_path = os.path.join(ROOT_PATH, 'raw_data', 'Loan_Default.csv')
print(raw_data_path)

/home/yann/code/YannAll/automated_loan_review_project/raw_data/Loan_Default.csv


In [4]:
# Full dataset from the csv file
data = pd.read_csv(raw_data_path)
data.head(3)

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0


### Have light data

In order to develop faster the model, only 1000 rows are selected.

In [28]:
data_light = data.sample(1000)
data_light.head(3)

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
93262,118152,2019,cf,Joint,nopre,type1,p3,l1,nopc,nob/c,...,CRIF,513,EXP,65-74,to_inst,68.939394,south,direct,0,60.0
11124,36014,2019,cf,Joint,nopre,type1,p3,l1,nopc,nob/c,...,CRIF,701,EXP,45-54,not_inst,46.868979,North,direct,0,39.0
101407,126297,2019,ncf,Male,nopre,type2,p1,l1,nopc,b/c,...,CIB,524,CIB,45-54,not_inst,96.266234,North,direct,0,45.0


### Define the features (X) and the target (y)

In [29]:
X = data_light[["age", "income", "loan_limit"]]
X.head(3)

Unnamed: 0,age,income,loan_limit
93262,65-74,2340.0,cf
11124,45-54,15360.0,cf
101407,45-54,3660.0,ncf


In [172]:
y = data_light[["Status"]]
y.head(3)

Unnamed: 0,Status
35835,0
128183,1
89732,0


### Preprocess the data

In [30]:
# Import from preprocessor2
def create_preprocessor(data):
    # Define categorical and numerical columns
    categorical_features = data.select_dtypes(include=['object']).columns.tolist()
    numerical_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # Define transformers for numerical and categorical features
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Combine transformers into a column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

    return preprocessor, categorical_features, numerical_features

In [31]:
preprocessor, categorical_features, numerical_features = create_preprocessor(X)

In [175]:
# X . fit and transform
X = preprocessor.fit_transform(X)

In [176]:
# Get feature names from the preprocessor
transformed_columns = numerical_features + list(preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features))

In [177]:
# Convert the transformed data into a DataFrame
X = pd.DataFrame(X, columns=transformed_columns)
X.head(3)

Unnamed: 0,income,age_25-34,age_35-44,age_45-54,age_55-64,age_65-74,age_<25,age_>74,loan_limit_cf,loan_limit_ncf,loan_limit_nan
0,1.074253,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.297719,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.394233,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


## Using preprocessor.py

In [140]:
preprocessor = process_data()

🔍 Checking for file at path: /home/nicolas/code/YannAll/automated_loan_review_project/raw_data/Loan_Default.csv
✅ Data loaded successfully
🔍 Checking for file at path: /home/nicolas/code/YannAll/automated_loan_review_project/raw_data/Loan_Default.csv
✅ Data loaded successfully
✅ Data cleaned
✅ Categorical variables encoded successfully, including 'term'
✅ Outliers removed based on 3 * IQR threshold
✅ Columns 'year' and 'ID' dropped
✅ Missing values imputed with KNN Imputer
✅ Tree-based imputation models fitted
✅ Missing values imputed with tree-based models
✅ Continuous variables scaled between 0 and 1
✅ Transformed data saved successfully at /home/nicolas/code/YannAll/automated_loan_review_project/raw_data/loan_preprocessed.csv


In [127]:
ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
raw_data_path = os.path.join(ROOT_PATH, 'raw_data', 'loan_preprocessed.csv')

In [128]:
data = pd.read_csv(raw_data_path)
data.head(3)

Unnamed: 0,income,Status,age_25-34,age_35-44,age_45-54,age_55-64,age_65-74,age_<25,age_>74,age_nan,loan_limit_cf,loan_limit_ncf,loan_limit_nan
0,0.030623,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.087645,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.166843,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Define the features (X) and the target (y)

In [32]:
X = data.drop(columns="Status")
X.head(3)

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,income,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,1740.0,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,4980.0,EQUI,552,EXP,55-64,to_inst,,North,direct,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,9480.0,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,46.0


In [33]:
y = data[["Status"]]
y.head(3)

Unnamed: 0,Status
0,1
1,1
2,0


# Instantiating, fitting and saving the model

In [34]:
# Instantiate the model
model = LogisticRegression()

In [35]:
# Train the model on the full dataset
model.fit(X, y)

ValueError: could not convert string to float: 'cf'

In [180]:
# Save the pretrain model
with open('../models/mvp_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# Predict

In [None]:
def my_prediction_function(age, income, loan_limit, preprocessor):
    """Prediction function using a pretrained model loaded from disk

    Arguments:
    - age
    - income
    - loan_limit
    - preprocessor
    """
    print(f"""Arguments taken into account:
        - age: {age}
        - income: {income}
        - loan limit: {loan_limit}
        - preprocessor: {preprocessor}""")

    # Load the model from the pretrain model pickle file
    ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
    model_path = os.path.join(ROOT_PATH, 'models', 'mvp_model.pkl')
    print(f"Path of the model.pkl:\n{model_path}\n")
    with open(model_path, 'rb') as file:
        model = pickle.load(file)

    # Build a dataframe with the inputs
    X_predict=data.drop(columns='Status')
    # X = pd.DataFrame({
    #     "age": age,
    #     "income": income,
    #     "loan_limit": loan_limit}, index = [0])
    # print(f"Data before preprocessing:\n{X}\n")

    #Encode categorical features
    categorical_encoder = CategoricalEncoder()
    categorical_encoder.fit(X)

    # Transform the features
    #X_preprocessed = preprocessor.transform(X)
    #print(f"Data after preprocessing:\n{X_preprocessed}\n")

    # Use the model to predict the given inputs
    prediction = model.predict(X_preprocessed)
    print(f"Prediction: {prediction}")

    print("✅ Prediction done succesfully")

    return prediction

In [183]:
my_prediction_function("35-44", 13680.0, "ncf", preprocessor)

Arguments taken into account:
        - age: 35-44
        - income: 13680.0
        - loan limit: ncf
        - preprocessor: ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['income']),
                                ('cat',
                                 Pipeline(steps=[('onehot',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False))]),
                                 ['age', 'loan_limit'])])
Path of the model.pkl:
/home/nicolas/code/YannAll/automated_loan_review_project/models/mvp_model.pkl

Data before preprocessing:
     age   income loan_limit
0  35-44  13680.0        ncf

Data after preprocessing:
[[1.17002434 0.         1.         0.         0.         0.
  0.         0.  



array([0])

In [10]:
# Build a dataframe with the inputs given by the user. For the other features we simply take the first row values
X_predict=data.drop(columns='Status').iloc[[0]]
X_predict['age']="35-44"
X_predict['income']=13680.0
X_predict['loan_limit']="ncf"
X_predict

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,income,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,dtir1
0,24890,2019,ncf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,13680.0,EXP,758,CIB,35-44,to_inst,98.728814,south,direct,45.0


In [21]:
from package_folder.preprocessor import CategoricalEncoder
categorical_encoder = CategoricalEncoder()
X_predict_fit=categorical_encoder.fit(X_predict)
X_predict_transformed=categorical_encoder.transform(X_predict)
X_predict_transformed

✅ Categorical variables encoded successfully, including 'term'


Unnamed: 0,ID,year,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,property_value,income,Credit_Score,LTV,...,construction_type_sb,occupancy_type_pr,Secured_by_home,total_units_1U,credit_type_EXP,co-applicant_credit_type_CIB,age_35-44,submission_of_application_to_inst,Region_south,Security_Type_direct
0,24890,2019,116500,,,,118000.0,13680.0,758,98.728814,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
X_predict_transformed.isnull().sum()

ID                                   0
year                                 0
loan_amount                          0
rate_of_interest                     1
Interest_rate_spread                 1
Upfront_charges                      1
property_value                       0
income                               0
Credit_Score                         0
LTV                                  0
dtir1                                0
loan_limit_ncf                       0
Gender_Sex Not Available             0
approv_in_adv_nopre                  0
loan_type_type1                      0
loan_purpose_p1                      0
Credit_Worthiness_l1                 0
open_credit_nopc                     0
business_or_commercial_nob/c         0
term_360.0                           0
Neg_ammortization_not_neg            0
interest_only_not_int                0
lump_sum_payment_not_lpsm            0
construction_type_sb                 0
occupancy_type_pr                    0
Secured_by_home          

In [24]:
#Drop columns with missing values according to Yann's feature selection 1
X_predict_transformed=X_predict_transformed.drop(columns=['rate_of_interest','Interest_rate_spread','Upfront_charges'])
X_predict_transformed

Unnamed: 0,ID,year,loan_amount,property_value,income,Credit_Score,LTV,dtir1,loan_limit_ncf,Gender_Sex Not Available,...,construction_type_sb,occupancy_type_pr,Secured_by_home,total_units_1U,credit_type_EXP,co-applicant_credit_type_CIB,age_35-44,submission_of_application_to_inst,Region_south,Security_Type_direct
0,24890,2019,116500,118000.0,13680.0,758,98.728814,45.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
#Run predict on our "applicant" (i.e. X_predict_transformed)



# END OF THE NOTEBOOK