## Loan Prediction (Stacking)

## Step 1: Import Libraries + Load Dataset

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler # Scaling only for logisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy

loan= pd.read_csv("loan_data.csv")
print(loan.isnull().sum())
loan.head()

Loan_ID               0
Gender                5
Married               0
Dependents            8
Education             0
Self_Employed        21
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     11
Credit_History       30
Property_Area         0
Loan_Status           0
dtype: int64


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


## Step 2: Handling

In [2]:
# Catagorical features
loan["Gender"]= loan["Gender"].fillna(loan["Gender"].mode()[0])
loan["Dependents"]= loan["Dependents"].fillna(loan["Dependents"].mode()[0])
loan["Self_Employed"]= loan["Self_Employed"].fillna(loan["Self_Employed"].mode()[0])

# Numerical features
loan["Loan_Amount_Term"]= loan["Loan_Amount_Term"].fillna(loan["Loan_Amount_Term"].median())
loan["Credit_History"]= loan["Credit_History"].fillna(loan["Credit_History"].median())


## Step 3: Maping the Target + Training

In [3]:
loan["Loan_Status"]= loan["Loan_Status"].map({"Y": 1, "N": 0})

# Select features and Target
X= loan.drop("Loan_Status", axis=1)
y=loan["Loan_Status"]

# # Encode catagorical features
X= pd.get_dummies(X, drop_first= True)

# Train
X_train, X_test, y_train, y_test= train_test_split(X, y, stratify= y, test_size= 0.2, random_state= 42)


## Step 4: Base Models

In [4]:

scaler= StandardScaler()
X_train_lr= scaler.fit_transform(X_train)
X_test_lr= scaler.transform(X_test)

models= [("lr", LogisticRegression(max_iter= 1000, random_state= 42)),
         ("rf", RandomForestClassifier(n_estimators= 100, random_state= 42)),
         ("xg", XGBClassifier(eval_metric= "logloss", random_state= 42))]

train_predictions=[]
test_predictions=[]

for name, model in models:
    if name == "lr":  # only for lr

        model.fit(X_train_lr, y_train)
        train_pre= model.predict_proba(X_train_lr)[:,1]
        test_pre= model.predict_proba(X_test_lr)[:,1]

    else:

        model.fit(X_train, y_train)
        # [:,1] -> probability of class 1 of every model
        train_pre= model.predict_proba(X_train)[:,1]
        test_pre= model.predict_proba(X_test)[:,1]

    train_predictions.append(train_pre)
    test_predictions.append(test_pre)

# this is probabilities of these 3 models for 1
# now the probabilities become the input of meta model

## Step 5: Stacking

In [5]:
# now the prediction list is in 1D input but meta model needs 2D input (np.column_stack)
X_train_stack= numpy.column_stack(train_predictions)
X_test_stack= numpy.column_stack(test_predictions) # feature of meta model

stack_model= LogisticRegression(max_iter= 1000, random_state= 42)
stack_model.fit(X_train_stack, y_train)
stack_pre= stack_model.predict_proba(X_test_stack)

## Step 6: roc_auc_score

In [6]:

for name, model in models:
    if name == "lr":
        acc= roc_auc_score(y_test, model.predict_proba(X_test_lr)[:,1])
    else:    
        acc= roc_auc_score(y_test, model.predict_proba(X_test)[:,1])
    print("roc_auc_score of ", name, "is: ", acc)
    
stack_acc= roc_auc_score(y_test, stack_pre[:,1])
print("roc_auc_score of stacking model: ", stack_acc)

roc_auc_score of  lr is:  0.8983471074380165
roc_auc_score of  rf is:  0.8545454545454545
roc_auc_score of  xg is:  0.7909090909090908
roc_auc_score of stacking model:  0.8958677685950412
