# Import libraries

In [1]:
# General
import os
import pathlib
import pickle

# Analysis
import numpy as np
import pandas as pd

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# Autoreload
%load_ext autoreload
%autoreload 2

In [None]:
# Import projetc preprocessd data
from package_folder.preprocessor2 import * #load_loan_data

# Import the preprocess data

In [2]:
ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
raw_data_path = os.path.join(ROOT_PATH, 'raw_data', 'Loan_Default.csv')
print(raw_data_path)

/home/nicolas/code/YannAll/automated_loan_review_project/raw_data/Loan_Default.csv


In [3]:
# Full dataset from the csv file
data = pd.read_csv(raw_data_path)
data.head(3)

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0


In [None]:

data_light = data.sample(1000)
data_light.head(3)

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
144287,169177,2019,cf,Sex Not Available,nopre,type1,p3,l1,nopc,nob/c,...,CIB,650,CIB,35-44,to_inst,78.846154,south,direct,0,39.0
10689,35579,2019,cf,Joint,nopre,type1,p4,l1,nopc,nob/c,...,CIB,515,EXP,65-74,not_inst,60.47486,North,direct,0,27.0
91447,116337,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,EXP,621,CIB,25-34,to_inst,57.440476,North,direct,0,31.0


# Logistic regression model (for the prediction)

## Define the features (X) and the target (y)

In [59]:
X = data_light[["age", "income", "loan_limit"]]
X.head(10)

Unnamed: 0,age,income,loan_limit
144287,35-44,6120.0,cf
10689,65-74,7920.0,cf
91447,25-34,4200.0,cf
21106,25-34,3000.0,cf
30594,55-64,2640.0,cf
75254,45-54,3780.0,cf
143760,45-54,9180.0,cf
48417,65-74,16800.0,ncf
104614,55-64,5160.0,cf
106488,35-44,6300.0,cf


In [58]:
y = data_light[["Status"]]
y.head(10)

Unnamed: 0,Status
144287,0
10689,0
91447,0
21106,0
30594,1
75254,0
143760,0
48417,0
104614,1
106488,0


## Preprocessed the data

In [7]:
# Import from preprocessor2
def create_preprocessor(data):
    # Define categorical and numerical columns
    categorical_features = data.select_dtypes(include=['object']).columns.tolist()
    numerical_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # Define transformers for numerical and categorical features
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Combine transformers into a column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

    return preprocessor, categorical_features, numerical_features

In [8]:
preprocessor, categorical_features, numerical_features = create_preprocessor(X)

In [9]:
# X . fit and transform
X_preprocessed = preprocessor.fit_transform(X)

In [10]:
# Get feature names from the preprocessor
transformed_columns = numerical_features + list(preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features))

In [11]:
# Convert the transformed data into a DataFrame
X_preprocessed = pd.DataFrame(X_preprocessed, columns=transformed_columns)
X_preprocessed.head(3)

Unnamed: 0,income,age_25-34,age_35-44,age_45-54,age_55-64,age_65-74,age_<25,age_>74,loan_limit_cf,loan_limit_ncf,loan_limit_nan
0,-0.13742,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.086787,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,-0.376575,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Instantiating, fitting and saving the model

In [12]:
# Instantiate the model
model = LogisticRegression()

In [13]:
# Train the model on the full dataset
model.fit(X_preprocessed, y)

  y = column_or_1d(y, warn=True)


In [15]:
# Save the pretrain model
with open('../models/mvp_model.pkl', 'wb') as file:
    pickle.dump(model, file)

## Predict

In [66]:
def df_with_3_features_only(df_full):
    df_3_features = df_full[["age", "income", "loan_limit"]]
    return df_3_features

In [None]:
def my_prediction_function(age, income, loan_limit, preprocessor):
    """Prediction function using a pretrained model loaded from disk

    Arguments:
    - age
    - income
    - loan_limit
    - preprocessor
    """
    print(f"""Arguments taken into account:
        - age: {age}
        - income: {income}
        - loan limit: {loan_limit}
        - preprocessor: {preprocessor}""")

    # Load the model from the pickle file
    ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
    model_path = os.path.join(ROOT_PATH, 'models', 'mvp_model.pkl')
    print(f"Path of the model.pkl:\n{model_path}\n")
    with open(model_path, 'rb') as file:
        model = pickle.load(file)

    # Build a dataframe with the inputs
    X_pred = pd.DataFrame({
        "age": age,
        "income": income,
        "loan_limit": loan_limit}, index = [0])
    print(f"Data before preprocessing:\n{X_pred}\n")

    # Transform the features
    X_preprocessed = preprocessor.transform(X_pred)
    print(f"Data after preprocessing:\n{X_preprocessed}\n")

    # Use the model to predict the given inputs
    prediction = model.predict(X_preprocessed)
    print(f"Prediction: {prediction}")

    print("✅ Prediction done succesfully")

    return prediction

In [65]:
my_prediction_function("35-44", 13680.0, "ncf", preprocessor)

Arguments taken into account:
        - age: 35-44
        - income: 13680.0
        - loan limit: ncf
        - preprocessor: ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['income']),
                                ('cat',
                                 Pipeline(steps=[('onehot',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False))]),
                                 ['age', 'loan_limit'])])
Path of the model.pkl:
/home/nicolas/code/YannAll/automated_loan_review_project/models/mvp_model.pkl

Data before preprocessing:
     age   income loan_limit
0  35-44  13680.0        ncf

Data after preprocessing:
[[0.80425046 0.         1.         0.         0.         0.
  0.         0.  



array([0])

In [39]:
    # Load the model from the pickle file
    ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
    model_path = os.path.join(ROOT_PATH, 'models', 'mvp_model.pkl')
    print(f"Path of the model.pkl:\n{model_path}\n")
    with open(model_path, 'rb') as file:
        model = pickle.load(file)

Path of the model.pkl:
/home/nicolas/code/YannAll/automated_loan_review_project/models/mvp_model.pkl



In [60]:
age = "55-64"
income = 5160.0
loan_limit = "cf"

In [61]:
    # Build a dataframe with the inputs
    X_pred = pd.DataFrame({
        "age": age,
        "income": income,
        "loan_limit": loan_limit}, index = [0])
    print(f"Data before preprocessing:\n{X_pred}\n")

Data before preprocessing:
     age  income loan_limit
0  55-64  5160.0         cf



In [62]:
    # Transform the features
    X_preprocessed = preprocessor.transform(X_pred)
    X_preprocessed
    # print(f"Data after preprocessing:\n{X_preprocessed}\n")


array([[-0.25699742,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ]])

In [63]:
    # Use the model to predict the given inputs
    prediction = model.predict(X_preprocessed)
    print(f"Prediction: {prediction}")

Prediction: [0]




In [None]:
X_pred = pd.DataFrame({
        "age" : age,
        "income" : income,
        "loan_limit" : loan_limit}, index = [0])

X_pred

In [None]:
X_pred = preprocessor.transform(X_pred)
X_pred

In [None]:
X_pred = preprocessor.fit_transform(X)

# END OF THE NOTEBOOK

In [21]:
a = [1, 2]
print(len(a))

2


In [None]:
# Call functions to process the data
data = load_loan_data()
data = clean_data(data)
data = encode_categorical(data)
# data = knn_impute(data)
# data = tree_imputation(data)

# Create and fit the preprocessor
# preprocessor, categorical_features, numerical_features = create_preprocessor(data)

# Fit and transform the data using the preprocessor
# transformed_data = preprocessor.fit_transform(data)

# Get feature names from the preprocessor
# transformed_columns = numerical_features + list(preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features))

# Convert the transformed data into a DataFrame
# transformed_df = pd.DataFrame(transformed_data, columns=transformed_columns)

# Save the transformed DataFrame
output_path = os.path.join(pathlib.Path().resolve(), 'loan_preprocessed.csv')
data.to_csv(output_path, index=False)
# transformed_df.to_csv(output_path, index=False)
print(f"✅ Transformed data saved successfully at {output_path}")

In [None]:
ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
raw_data_path = os.path.join(ROOT_PATH, 'notebooks', 'loan_preprocessed.csv')
print(raw_data_path)

In [None]:
data = pd.read_csv(raw_data_path)
data.head(3)

## Define the features (X) and the target (y)

In [None]:
X = data[["age", "income", "loan_limit"]]

# X = data.drop(columns='Status')
X.head(3)

In [None]:
y = data['Status']
y.head(3)

In [None]:
# Instantiate the model
model = LogisticRegression(max_iter=100000)

# Train the model on the full dataset
model.fit(X, y)

# Save the pretrain model
with open('../models/mvp_model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [None]:
X_test_pred = X.iloc[0, :]

In [None]:
model.predict()

In [None]:
# Call functions to process the data
data = load_loan_data()
data = clean_data(data)
data = encode_categorical(data)
# data = knn_impute(data)
# data = tree_imputation(data)

# Create and fit the preprocessor
# preprocessor, categorical_features, numerical_features = create_preprocessor(data)

# Fit and transform the data using the preprocessor
# transformed_data = preprocessor.fit_transform(data)

# Get feature names from the preprocessor
# transformed_columns = numerical_features + list(preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features))

# Convert the transformed data into a DataFrame
# transformed_df = pd.DataFrame(transformed_data, columns=transformed_columns)

# Save the transformed DataFrame
output_path = os.path.join(pathlib.Path().resolve(), 'loan_preprocessed.csv')
data.to_csv(output_path, index=False)
# transformed_df.to_csv(output_path, index=False)
print(f"✅ Transformed data saved successfully at {output_path}")


In [None]:
data.head(5)

In [None]:
os.path.join(pathlib.Path().resolve())

## Prediction function

In [None]:
ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
raw_data_path = os.path.join(ROOT_PATH, 'raw_data', 'Loan_Default.csv')
print(raw_data_path)

In [None]:
def my_prediction_function(age, income, loan_limit):
    """Prediction function using a pretrained model loaded from disk

    Arguments:
    - age
    - income
    - loan_limit
    """
    # Load the model from the pickle file
    ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
    model_path = os.path.join(ROOT_PATH, 'models', 'mvp_model.pkl')
    with open(model_path, 'rb') as file:
        model = pickle.load(file)

    # Use the model to predict the given inputs
    X_pred = [age, income, loan_limit]
    prediction = model.predict([X_pred])

    return prediction


In [None]:
# Imputer

# Logistic regression model (for the score)

Miscel

In [None]:
X_user_input
model.predict()

In [None]:

LogisticRegression()
# Export the pipeline as a pickle file

model.fit(X, y)

model.score(X_test,y_test)

with open('../models/mvp_model.pkl', 'wb') as file:
    pickle.dump(pipeline, file)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
import pickle

# Load the preprocess dataset
load_loan_data()

X, y = load_iris(return_X_y=True)

# Create a simple pipeline
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('logistic_regression', LogisticRegression())
])

# Fit the pipeline
pipeline.fit(X, y)


# Export the pipeline as a pickle file
with open('../models/mvp_model.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [None]:
df = load_loan_data()
df

In [None]:
# Call
data = load_loan_data()
data = clean_data(data)
data = encode_categorical(data)
# data = knn_impute(data)
# data = tree_imputation(data)
# preprocessor = create_preprocessor()

In [None]:
data.columns

In [None]:
ROOT_PATH = pathlib.Path().resolve().parent # Get the parent directory of the current working directory
raw_data_path = os.path.join(ROOT_PATH, 'raw_data', 'Loan_Default.csv')
print(raw_data_path)

In [None]:
from package_folder.preprocessor import load_loan_data
load_loan_data()

In [None]:
preprocessor.py

In [None]:
# # Other method:
# ROOT_PATH = os.path.dirname(os.path.dirname(__file__))
# raw_data_path = os.path.join(ROOT_PATH, 'raw_data', 'Loan_Default.csv')
# raw_data_path

In [None]:
df = pd.read_csv(raw_data_path)
df.head(3)

## General EDA

In [None]:
df.shape

In [None]:
# Display DataFrame info
df_info = df.info(verbose=True)

In [None]:
# Non-null value in %
df.count()/len(df)*100

In [None]:
df.describe()

In [None]:
columns_categorical = df.select_dtypes(include='object').columns

In [None]:
columns_numerical = df.select_dtypes(include=['int64','float64']).columns

## Age

In [None]:
age_dist = pd.DataFrame(df.groupby('age').nunique()['ID']/len(df)*100).sort_values('ID', ascending=False)
age_dist

In [None]:
fig, ax = plt.subplots()
plt.bar(pd.DataFrame(df.groupby('age').nunique()['ID']/len(df)*100).sort_values('ID', ascending=False).index,
        pd.DataFrame(df.groupby('age').nunique()['ID']/len(df)*100).sort_values('ID', ascending=False)['ID'])

ax.set_title('Age distribution')

## Income

In [None]:
sns.histplot(data = df, x = 'income', kde=True)

In [None]:
# Plot the results
fig, ax = plt.subplots()

ax.hist(df['income'], bins=1000)
ax.set_title('Income distribution')
ax.set_xlim(0, 25000)
plt.show()

## Loan limit

In [None]:
loan_limit_dist = pd.DataFrame(df.groupby('loan_limit').nunique()['ID']/len(df)*100).sort_values('ID', ascending=False)
loan_limit_dist

In [None]:
fig, ax = plt.subplots()
plt.bar(pd.DataFrame(df.groupby('loan_limit').nunique()['ID']/len(df)*100).sort_values('ID', ascending=False).index,
        pd.DataFrame(df.groupby('loan_limit').nunique()['ID']/len(df)*100).sort_values('ID', ascending=False)['ID'])

ax.set_title('Loan limit distribution')

# Balancing

In [None]:
df.hist(column='income', bins=[0, 3720, 5760, 8520, max(df['income'])])

In [None]:
# fig, ax = plt.subplots()
df.hist(column='income', bins=[0, 2000, 3720, 4500, 5760, 7000, 8520, 15000, max(df['income'])])


In [None]:
df['ID'].nunique() == len(df)

In [None]:
a = pd.DataFrame(df.groupby('Gender').nunique()['ID']/len(df)*100).sort_values('ID', ascending=False)
a

In [None]:
plt.bar(pd.DataFrame(df.groupby('Gender').nunique()['ID']/len(df)*100).sort_values('ID', ascending=False).index,
        pd.DataFrame(df.groupby('Gender').nunique()['ID']/len(df)*100).sort_values('ID', ascending=False)['ID'])

In [None]:
max(df['income'])

In [None]:
a.index

In [None]:
df.groupby('Gender').nunique().index
pd.DataFrame(df.groupby('Gender').nunique()['ID']/len(df)*100).sort_values('ID', ascending=False).index

In [None]:
sns.barplot(x=pd.DataFrame(df.groupby('Gender').nunique()['ID']/len(df)*100).sort_values('ID', ascending=False).index,
            y=df.groupby('Gender').nunique()['ID']/len(df)*100,
            data=df)

In [None]:
fig, ax = plt.subplots(round(columns_categorical//3, ), 3)
for name_column in columns_categorical:
    ax.plt.bar(pd.DataFrame(df.groupby(name_column).nunique()['ID']/len(df)*100).sort_values('ID', ascending=False).index,
        pd.DataFrame(df.groupby(name_column).nunique()['ID']/len(df)*100).sort_values('ID', ascending=False)['ID'])

In [None]:
plt.bar(pd.DataFrame(df.groupby('Gender').nunique()['ID']/len(df)*100).sort_values('ID', ascending=False).index,
        pd.DataFrame(df.groupby('Gender').nunique()['ID']/len(df)*100).sort_values('ID', ascending=False)['ID'])

In [None]:
a = pd.DataFrame(df.groupby('Gender').nunique()['ID']/len(df)*100).sort_values('ID', ascending=False)
a

In [None]:
df['ID'].nunique() == len(df)

In [None]:
df.sort_values('ID’, ascending=False)