This notebook builds a function that converts user inputs into a DataFrame that can be used to make a prediction.

Now, it is possible:
- to have up to 17 parameters as an input (instead of only 3 before)
- to have missing inputs

The possible parameters are:
- loan_limit,
- Gender,
- open_credit,
- business_or_commercial,
- loan_amount,
- term,
- interest_only,
- lump_sum_payment,
- property_value,
- construction_type,
- occupancy_type,
- Secured_by,
- total_units,
- income,
- age,
- Region,
- Security_Type

# Import

In [247]:
# General
import os
import pathlib
import pickle

# Analysis
import numpy as np
import pandas as pd

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# Machine learning
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


# Autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Loading the dataset and creating a DataFrame

## Loading the data

In [248]:
# Project root: the parent of the directory the notebook is executed from
ROOT_PATH = pathlib.Path().resolve().parents[0]

# Location of the raw CSV inside the project, kept as a plain string
raw_data_path = str(ROOT_PATH / 'raw_data' / 'Loan_Default.csv')

# Show the resolved path so a wrong working directory is spotted immediately
print('Raw_data_path: ', raw_data_path)

Raw_data_path:  /home/nicolas/code/YannAll/automated_loan_review_project/raw_data/Loan_Default.csv


## Creating the DataFrame

In [249]:
# Read the Loan_Default CSV file into a DataFrame and preview its first rows
data_raw = pd.read_csv(filepath_or_buffer=raw_data_path)
data_raw.head(n=3)

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,term,Neg_ammortization,interest_only,lump_sum_payment,property_value,construction_type,occupancy_type,Secured_by,total_units,income,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,116500,,,,360.0,not_neg,not_int,not_lpsm,118000.0,sb,pr,home,1U,1740.0,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,206500,,,,360.0,not_neg,not_int,lpsm,,sb,pr,home,1U,4980.0,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,406500,4.56,0.2,595.0,360.0,neg_amm,not_int,not_lpsm,508000.0,sb,pr,home,1U,9480.0,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0


# Creating, fitting and saving a preprocessor for input data

In [250]:
# Column groups fed to the two imputers. They are derived from the dtypes of
# `data_raw` so the cell runs on a fresh kernel (the names were previously
# never defined in this notebook): numeric columns vs. everything else.
num_col = data_raw.select_dtypes(include='number').columns.tolist()
cat_col = data_raw.select_dtypes(exclude='number').columns.tolist()

# Numerical transformer: missing values are replaced by the column median
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="median"))
])

# Categorical transformer: missing values are replaced by the most frequent value
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="most_frequent"))
])

# Apply each transformer to its own column group. The step names
# ('num_transformer', 'cat_transformer') become the prefixes of the
# output feature names.
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, num_col),
    ('cat_transformer', cat_transformer, cat_col)
])

preprocessor

In [251]:
# Fit the preprocessor on the raw data so each imputer learns its per-column
# fill value ("median" for num_col, "most_frequent" for cat_col).
preprocessor.fit(data_raw)

In [252]:
# Save the fitted preprocessor under <project root>/models. The folder is
# built the same way as the path used to load the pickle later on (parent
# of the working directory), and is created first so the dump does not
# fail when it is missing.
models_dir = pathlib.Path().resolve().parent / 'models'
models_dir.mkdir(parents=True, exist_ok=True)
with open(models_dir / 'preprocessor_fitted_input.pkl', 'wb') as file:
    pickle.dump(preprocessor, file)

# Creating a base input dataframe

## Loading the fitted preprocessor

In [253]:
# Rebuild the project root (parent of the current working directory) and
# the location of the pickled, fitted preprocessor
ROOT_PATH = pathlib.Path().resolve().parents[0]
preprocessor_input_path = str(ROOT_PATH / 'models' / 'preprocessor_fitted_input.pkl')
print(f"Path of the preprocessor.pkl:\n{preprocessor_input_path}\n")

# NOTE: unpickling executes code stored in the file - only load pickles
# produced by this project.
with open(preprocessor_input_path, mode='rb') as pkl_file:
    preprocessor = pickle.load(pkl_file)

Path of the preprocessor.pkl:
/home/nicolas/code/YannAll/automated_loan_review_project/models/preprocessor_fitted_input.pkl



## Imputing the dataframe

In [255]:
# One all-NaN row with the same columns as the raw data; every cell is
# missing so the preprocessor fills each one with its learned default
nan_row = np.full((1, len(data_raw.columns)), np.nan)
df_nan = pd.DataFrame(data=nan_row, index=np.arange(1), columns=data_raw.columns)
df_nan

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,term,Neg_ammortization,interest_only,lump_sum_payment,property_value,construction_type,occupancy_type,Secured_by,total_units,income,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [256]:
# Impute the all-NaN row with the fitted preprocessor.
# NOTE: despite the `df_` prefix, transform() returns a NumPy object array here
# (see the output below), not a DataFrame; column names are not attached yet.
df_base = preprocessor.transform(df_nan)
df_base

array([[99224.5, 2019.0, 296500.0, 3.99, 0.3904, 2596.45, 360.0,
        418000.0, 5760.0, 699.0, 75.13586957, 0.0, 39.0, 'cf', 'Male',
        'nopre', 'type1', 'p3', 'l1', 'nopc', 'nob/c', 'not_neg',
        'not_int', 'not_lpsm', 'sb', 'pr', 'home', '1U', 'CIB', 'CIB',
        '45-54', 'to_inst', 'North', 'direct']], dtype=object)

In [257]:
# Impute the all-NaN row and wrap the result in a one-row DataFrame whose
# columns carry the preprocessor's (prefixed) output feature names
df_base = pd.DataFrame(
    data=preprocessor.transform(df_nan),
    index=np.arange(1),
    columns=preprocessor.get_feature_names_out(),
)
df_base

Unnamed: 0,num_transformer__ID,num_transformer__year,num_transformer__loan_amount,num_transformer__rate_of_interest,num_transformer__Interest_rate_spread,num_transformer__Upfront_charges,num_transformer__term,num_transformer__property_value,num_transformer__income,num_transformer__Credit_Score,num_transformer__LTV,num_transformer__Status,num_transformer__dtir1,cat_transformer__loan_limit,cat_transformer__Gender,cat_transformer__approv_in_adv,cat_transformer__loan_type,cat_transformer__loan_purpose,cat_transformer__Credit_Worthiness,cat_transformer__open_credit,cat_transformer__business_or_commercial,cat_transformer__Neg_ammortization,cat_transformer__interest_only,cat_transformer__lump_sum_payment,cat_transformer__construction_type,cat_transformer__occupancy_type,cat_transformer__Secured_by,cat_transformer__total_units,cat_transformer__credit_type,cat_transformer__co-applicant_credit_type,cat_transformer__age,cat_transformer__submission_of_application,cat_transformer__Region,cat_transformer__Security_Type
0,99224.5,2019.0,296500.0,3.99,0.3904,2596.45,360.0,418000.0,5760.0,699.0,75.13587,0.0,39.0,cf,Male,nopre,type1,p3,l1,nopc,nob/c,not_neg,not_int,not_lpsm,sb,pr,home,1U,CIB,CIB,45-54,to_inst,North,direct


In [258]:
# Strip the 'num_transformer__' / 'cat_transformer__' prefix added by the
# ColumnTransformer. Matching the prefix explicitly (instead of slicing a
# fixed 17 characters) keeps the cell idempotent: re-running it leaves
# already-clean column names untouched instead of truncating them.
df_base.columns = df_base.columns.str.replace(r'^(?:num|cat)_transformer__', '', regex=True)
df_base

Unnamed: 0,ID,year,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,term,property_value,income,Credit_Score,LTV,Status,dtir1,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,Neg_ammortization,interest_only,lump_sum_payment,construction_type,occupancy_type,Secured_by,total_units,credit_type,co-applicant_credit_type,age,submission_of_application,Region,Security_Type
0,99224.5,2019.0,296500.0,3.99,0.3904,2596.45,360.0,418000.0,5760.0,699.0,75.13587,0.0,39.0,cf,Male,nopre,type1,p3,l1,nopc,nob/c,not_neg,not_int,not_lpsm,sb,pr,home,1U,CIB,CIB,45-54,to_inst,North,direct


## Overwriting the default values with the user's values in the DataFrame

In [259]:
# Sample user inputs: only `age` is provided, the other two are left missing
loan_limit, income, age = None, None, 32

In [260]:
# Gather the inputs in a mapping keyed by the matching DataFrame column name.
# NOTE: the variable name shadows the builtin `dict`; it is kept because the
# following cell reads it under this name.
dict = {'loan_limit': loan_limit, 'income': income, 'age': age}
dict

{'loan_limit': None, 'income': None, 'age': 32}

In [261]:
# Turn the inputs mapping into a one-row DataFrame (row label 0)
df_input = pd.DataFrame(data=dict, index=np.arange(1))
df_input

Unnamed: 0,loan_limit,income,age
0,,,32


In [262]:
# Names of the columns supplied by the user, as a plain Python list
column_names = df_input.columns.tolist()
column_names

['loan_limit', 'income', 'age']

In [263]:
# Replace the imputed defaults in df_base by the user's values.
# pd.notna() skips both None and NaN, so a missing input can never wipe out
# an imputed default (an `is not None` test lets NaN through).
for column_name in column_names:
    user_value = df_input.at[0, column_name]
    if pd.notna(user_value):
        df_base.at[0, column_name] = user_value

df_base

Unnamed: 0,ID,year,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,term,property_value,income,Credit_Score,LTV,Status,dtir1,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,Neg_ammortization,interest_only,lump_sum_payment,construction_type,occupancy_type,Secured_by,total_units,credit_type,co-applicant_credit_type,age,submission_of_application,Region,Security_Type
0,99224.5,2019.0,296500.0,3.99,0.3904,2596.45,360.0,418000.0,5760.0,699.0,75.13587,0.0,39.0,cf,Male,nopre,type1,p3,l1,nopc,nob/c,not_neg,not_int,not_lpsm,sb,pr,home,1U,CIB,CIB,32,to_inst,North,direct


# Creating a base input dataframe in a function

In [264]:
def create_df_from_inputs(loan_limit=None, Gender=None, open_credit=None,
                          business_or_commercial=None, loan_amount=None,
                          term=None, interest_only=None, lump_sum_payment=None,
                          property_value=None, construction_type=None, occupancy_type=None,
                          Secured_by=None, total_units=None, income=None, age=None, Region=None,
                          Security_Type=None, preprocessor=None):
    """Build a one-row DataFrame of model inputs, filling the gaps with defaults.

    An all-NaN row is imputed by the fitted preprocessor, which yields a
    default value for every column; each value supplied by the caller then
    overwrites the default of the column that has the same name.

    Parameters
    ----------
    loan_limit, Gender, open_credit, business_or_commercial, loan_amount, term,
    interest_only, lump_sum_payment, property_value, construction_type,
    occupancy_type, Secured_by, total_units, income, age, Region, Security_Type
        Optional user inputs, each named after the column it overwrites.
        ``None`` (or NaN) means "not provided": the imputed default is kept.
    preprocessor : optional
        Already-fitted preprocessor exposing ``get_feature_names_out()`` and
        ``transform()``. When ``None`` (default) it is unpickled from
        ``<parent of the working directory>/models/preprocessor_fitted_input.pkl``.
        Passing it avoids re-reading the pickle on every call.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with one column per preprocessor output
        feature, named without the ``<transformer>__`` prefix.
    """
    if preprocessor is None:
        # Get the parent directory of the current working directory
        root_path = pathlib.Path().resolve().parent
        preprocessor_input_path = os.path.join(root_path, 'models', 'preprocessor_fitted_input.pkl')
        # NOTE: pickle.load executes code stored in the file; only load
        # pickles produced by this project.
        with open(preprocessor_input_path, 'rb') as file:
            preprocessor = pickle.load(file)

    # Output feature names look like '<transformer>__<column>'. Splitting on
    # the first '__' recovers the column name without relying on the prefix
    # being exactly 17 characters long.
    columns_names = [str(feature_name).split('__', 1)[-1]
                     for feature_name in preprocessor.get_feature_names_out()]

    # Creating a NaN dataframe ready to be imputed
    df_nan = pd.DataFrame(np.nan, index=np.arange(1), columns=columns_names)

    # Impute the NaN row: every column receives its learned default value
    df_base = pd.DataFrame(preprocessor.transform(df_nan), index=np.arange(1), columns=columns_names)

    # User inputs keyed by the column they overwrite
    user_inputs = {
        'loan_limit': loan_limit,
        'Gender': Gender,
        'open_credit': open_credit,
        'business_or_commercial': business_or_commercial,
        'loan_amount': loan_amount,
        'term': term,
        'interest_only': interest_only,
        'lump_sum_payment': lump_sum_payment,
        'property_value': property_value,
        'construction_type': construction_type,
        'occupancy_type': occupancy_type,
        'Secured_by': Secured_by,
        'total_units': total_units,
        'income': income,
        'age': age,
        'Region': Region,
        'Security_Type': Security_Type
    }

    # Replace df_base values by the provided inputs; missing inputs
    # (None or NaN) keep the imputed default
    for column_name, value in user_inputs.items():
        if value is None or (np.isscalar(value) and pd.isna(value)):
            continue
        df_base.at[0, column_name] = value

    return df_base

In [265]:
# Try the function out: show every column and override three of the defaults
pd.options.display.max_columns = None
df_user_input_consolidated = create_df_from_inputs(
    loan_limit="azerty",
    age="45-54",
    income=34545,
)
df_user_input_consolidated

Unnamed: 0,ID,year,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,term,property_value,income,Credit_Score,LTV,Status,dtir1,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,Neg_ammortization,interest_only,lump_sum_payment,construction_type,occupancy_type,Secured_by,total_units,credit_type,co-applicant_credit_type,age,submission_of_application,Region,Security_Type
0,99224.5,2019.0,296500.0,3.99,0.3904,2596.45,360.0,418000.0,34545,699.0,75.13587,0.0,39.0,azerty,Male,nopre,type1,p3,l1,nopc,nob/c,not_neg,not_int,not_lpsm,sb,pr,home,1U,CIB,CIB,45-54,to_inst,North,direct


# END OF THE NOTEBOOK