# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score, recall_score, roc_auc_score, classification_report, roc_curve
import scipy.stats as stats
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

# Load Data

### Data Source:
https://www.kaggle.com/datasets/devzohaib/eligibility-prediction-for-loan?resource=download

In [None]:
loan_data = pd.read_csv('/kaggle/input/eligibility-prediction-for-loan/Loan_Data.csv')

Problem Statement:

Dream Housing Finance company deals in all kinds of home loans. They want to be able to predict whether a prospective customer will be eligible for a loan or not, and therefore, have some targeted marketing campaigns. To achieve this, they would be looking for a model that most accurately predicts eligible customers. Thus, the model that most accurately predicts 'Y' as 'Y' should be selected. That is, the model with the highest ACCURACY.

In [None]:
loan_data.info()

In [None]:
loan_data.head()

Certain columns with missing data:
- Gender
- Married
- Dependents
- Self_Employed
- LoanAmount
- Loan_Amount_Term
- Credit_History

Mix of numerical and categorical variables:
- Numerical Variables:
    - ApplicantIncome
    - CoapplicantIncome
    - LoanAmount
    - Loan_Amount_Term
- Categorical Variables:
    - Dependents
    - Gender
    - Married
    - Education
    - Self_Employed
    - Credit_History
    - Property_Area

Target variable is Loan_Status

In [None]:
# drop Index
data = loan_data.drop(columns = ['Loan_ID'], inplace = False)

In [None]:
data.nunique()

# Exploratory Data Analysis (EDA)

In [None]:
data.describe().T

Possibility of some outliers. However, most of the variables in real life will show some outliers so we should let the model learn to predict on outliers

# Numerical Features - Univariate Analysis

## Boxplot & Histogram

### ApplicantIncome

In [None]:
sns.boxplot(x=data['ApplicantIncome'])

In [None]:
import matplotlib.pyplot as plt
hist = plt.hist(x=data['ApplicantIncome'], density = True)
# Draw a vertical line in the histogram to visualize mean value of the numerical feature (NaNs will be ignored when calculating the mean)
plt.axvline(data['ApplicantIncome'].mean(), color = 'red', linestyle='--')
# Draw another vertical line in the histogram to visualize median value of the numerical feature (NaNs will be ignored when calculating the median)
plt.axvline(data['ApplicantIncome'].median(), color = 'black', linestyle='-')

Observations:
- Positively skewed distribution
- Outliers on right hand side, but expected with Income
- Mean and Median close enough

### CoapplicantIncome

In [None]:
sns.boxplot(x=data['CoapplicantIncome'])

In [None]:
hist = plt.hist(x=data['CoapplicantIncome'], density = True)
# Draw a vertical line in the histogram to visualize mean value of the numerical feature (NaNs will be ignored when calculating the mean)
plt.axvline(data['CoapplicantIncome'].mean(), color = 'red', linestyle='--')
# Draw another vertical line in the histogram to visualize median value of the numerical feature (NaNs will be ignored when calculating the median)
plt.axvline(data['CoapplicantIncome'].median(), color = 'black', linestyle='-')

Observations:
- Positively skewed distribution
- Outliers on right hand side, but expected with Income
- Mean and Median close enough

### LoanAmount

In [None]:
sns.boxplot(x=data['LoanAmount'])

In [None]:
hist = plt.hist(x=data['LoanAmount'], density = True)
# Draw a vertical line in the histogram to visualize mean value of the numerical feature (NaNs will be ignored when calculating the mean)
plt.axvline(data['LoanAmount'].mean(), color = 'red', linestyle='--')
# Draw another vertical line in the histogram to visualize median value of the numerical feature (NaNs will be ignored when calculating the median)
plt.axvline(data['LoanAmount'].median(), color = 'black', linestyle='-')

Observations:
- Slightly positively skewed distribution
- Outliers on right hand side, which may need to be treated
- Mean and Median close enough

# Categorical Features - Univariate Analysis

### Dependents

In [None]:
data['Dependents'].value_counts(normalize = True).plot.bar()
plt.xticks(rotation = 0)

Observation:
- Almost 60% of population has 0 dependents

### Gender

In [None]:
data['Gender'].value_counts(normalize = True).plot.bar()
plt.xticks(rotation = 0)

Observation:
- Almost 80% of population is Male

### Married

In [None]:
data['Married'].value_counts(normalize = True).plot.bar()
plt.xticks(rotation = 0)

Observation:
- A little over 60% of population is Married

### Education

In [None]:
data['Education'].value_counts(normalize = True).plot.bar()
plt.xticks(rotation = 0)

Observation:
- Almost 80% of population is Graduate

### Self_Employed

In [None]:
data['Self_Employed'].value_counts(normalize = True).plot.bar()
plt.xticks(rotation = 0)

Observation:
- Almost 80% of population is salaried

### Credit_History

In [None]:
data['Credit_History'].value_counts(normalize = True).plot.bar()
plt.xticks(rotation = 0)

Observation:
- Almost 80% of population has a credit history

### Property_Area

In [None]:
data['Property_Area'].value_counts(normalize = True).plot.bar()
plt.xticks(rotation = 0)

Observation:
- Majority of population resides in Urban / Semi-Urban areas

# Bivariate / Multivariate Analysis

In [None]:
sns.pairplot(loan_data, diag_kind= 'kde', hue = 'Loan_Status')

In [None]:
# Identify Correlation (Pearson) between the numeric features only.
# The categorical columns are still strings at this point; pandas >= 2.0 raises on
# non-numeric data instead of silently dropping it, so select numeric columns explicitly.
data.select_dtypes(include='number').corr()

In [None]:
# Spearman rank correlation, restricted to numeric columns (string columns would raise on pandas >= 2.0)
data.select_dtypes(include='number').corr(method = 'spearman')

In [None]:
# Plot the Pearson correlation matrix of the numeric features.
# Non-numeric columns are excluded explicitly because pandas >= 2.0 no longer drops them silently.
plt.figure(figsize=(15,10))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True, linewidths=.5, vmin = -1, vmax = 1, fmt = '.2g')

In [None]:
# Plot the Spearman correlation matrix of the numeric features
# (string columns are excluded explicitly; pandas >= 2.0 raises on them).
plt.figure(figsize=(15,10))
sns.heatmap(data.select_dtypes(include='number').corr(method = 'spearman'), annot=True, linewidths=.5, vmin = -1, vmax = 1, fmt = '.2g')

Very high correlation between 'ApplicantIncome' and 'LoanAmount' - therefore, one of them needs to be dropped to prevent independent variable multicollinearity.

This is verified by VIF calculation shown below

In [None]:
data.isnull().sum()

# Data Preprocessing

## Imputing null values

In [None]:
# Imputing null for categorical variables with mode
data['Dependents'] = data['Dependents'].fillna(data['Dependents'].mode()[0])
data['Gender'] = data['Gender'].fillna(data['Gender'].mode()[0])
data['Married'] = data['Married'].fillna(data['Married'].mode()[0])
data['Education'] = data['Education'].fillna(data['Education'].mode()[0])
data['Self_Employed'] = data['Self_Employed'].fillna(data['Self_Employed'].mode()[0])
data['Credit_History'] = data['Credit_History'].fillna(data['Credit_History'].mode()[0])

In [None]:
# Imputing null for numerical variables with median - mean was not chosen because of outliers in the data.
# The result is assigned back (same style as the categorical imputation) instead of calling
# fillna(inplace=True) on a column selection: the in-place form works on an intermediate object,
# is deprecated in pandas 2.x and does nothing under Copy-on-Write.
data['LoanAmount'] = data['LoanAmount'].fillna(data['LoanAmount'].median())
data['Loan_Amount_Term'] = data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].median())

In [None]:
data.isnull().sum()

## Detecting multicollinearity

In [None]:
# Calculating VIF for all numerical variables
from statsmodels.stats.outliers_influence import variance_inflation_factor

x = data[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']]

vif = pd.DataFrame()
vif["features"] = x.columns
vif["vif_Factor"] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
vif

In [None]:
# Calculating VIF for all numerical variables except LoanAmount
from statsmodels.stats.outliers_influence import variance_inflation_factor

x = data[['ApplicantIncome', 'CoapplicantIncome', 'Loan_Amount_Term']]

vif = pd.DataFrame()
vif["features"] = x.columns
vif["vif_Factor"] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
vif

Since the VIF factor has significantly reduced, it is best to drop LoanAmount from the set of independent variables

In [None]:
# Drop LoanAmount from data set.
# Rebind the name instead of mutating in place, and tolerate an already-missing column,
# so that re-running this cell does not raise a KeyError.
data = data.drop(columns = ['LoanAmount'], errors = 'ignore')

### Balancing Dataset

In [None]:
pip install imblearn

In [None]:
import imblearn

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
ros = RandomOverSampler(random_state=0)

In [None]:
x = data.drop(columns = ['Loan_Status'])
y = data['Loan_Status']

In [None]:
# Oversample the features/target with the RandomOverSampler so the Loan_Status classes are balanced.
# NOTE(review): this resamples the full data set before the train/test split performed later in the
# notebook, so copies of the same row can presumably end up in both partitions and inflate the test
# scores - consider resampling the training partition only. TODO confirm.
x, y = ros.fit_resample(x, y)

In [None]:
ax = sns.countplot(x=y)

In [None]:
df1=pd.concat([x,y],axis=1)
df1.head()

## Outlier Treatment

There are outliers in ApplicantIncome, CoapplicantIncome and Loan_Amount_Term. However, in real life, we would expect these variables to display some outliers. Therefore, it is important not to treat them

## Data Prep for Modeling

In [None]:
# One-hot encode every categorical column.
# `columns = cat_cols` is passed explicitly because Credit_History appears to be stored as a numeric
# 0/1 flag: by default get_dummies only encodes object/category columns and would return it unchanged
# under its original name. Concatenating that onto df1 creates a second 'Credit_History' column, and
# dropping `cat_cols` afterwards then removes both copies - i.e. the feature would silently vanish
# from the model inputs. Encoding it here yields distinct 'Credit_History_<value>' columns instead.
cat_cols = ['Dependents','Gender','Married','Education','Self_Employed','Credit_History','Property_Area']
dummies = pd.get_dummies(df1[cat_cols], columns = cat_cols)
# Show a preview only rather than rendering the whole frame
dummies.head()

In [None]:
# Concat dummies into data frame (column-wise, aligned on the row index)
df1 = pd.concat([df1, dummies], axis = 1)

In [None]:
df1.shape

In [None]:
# Drop original categorical columns from data frame
df1 = df1.drop(cat_cols, axis = 1)

In [None]:
df1.shape

In [None]:
# determine whether the target column is balanced or not
y.value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
df1['Loan_Status']

In [None]:
labelencoder = LabelEncoder()

In [None]:
# Convert the target variable into binary format
y = labelencoder.fit_transform(df1['Loan_Status'])

In [None]:
y

In [None]:
# segregate the target variable
x = df1.drop(columns = ['Loan_Status'])

# splitting data into training and test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 42, stratify = y)

In [None]:
x_train.shape

In [None]:
x_test.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

# Developing Classification Models

In [None]:
def get_metrics_score(clf, flag = True):
    '''
    Evaluate a fitted binary classifier on the notebook's train/test split.

    Reads the module-level ``x_train``, ``x_test``, ``y_train`` and ``y_test``.

    Params:
    -------
    clf: a fitted classifier exposing ``predict`` (and, ideally, ``predict_proba``)
    flag: when truthy (the default), every metric is printed in addition to being returned

    Returns:
    --------
    list with eight numeric scores, in this order: train accuracy, test accuracy,
    train recall, test recall, train precision, test precision, test F1 score, test ROC AUC
    '''

    # predict on both the training and test sets
    pred_train = clf.predict(x_train)
    pred_test = clf.predict(x_test)
    # calculate Accuracy
    train_acc = accuracy_score(y_train, pred_train)
    test_acc = accuracy_score(y_test, pred_test)
    # calculate Recall
    train_recall = recall_score(y_train, pred_train)
    test_recall = recall_score(y_test, pred_test)
    # calculate Precision
    train_precision = precision_score(y_train, pred_train)
    test_precision = precision_score(y_test, pred_test)
    # calculate F1 score
    F1_Score = f1_score(y_test, pred_test)
    # calculate ROC_AUC_score from a continuous score whenever the model provides one;
    # hard 0/1 predictions collapse the ROC curve to a single operating point
    if hasattr(clf, 'predict_proba'):
        test_scores = clf.predict_proba(x_test)[:, 1]
    else:
        test_scores = pred_test
    Roc_Auc_score = roc_auc_score(y_test, test_scores)

    # collect the computed VALUES (previously the metric functions f1_score / roc_auc_score
    # themselves were appended, which put function objects into the comparison table)
    score_list = [train_acc, test_acc, train_recall, test_recall,
                  train_precision, test_precision, F1_Score, Roc_Auc_score]

    # The metrics are printed only when the flag is set. The default value is True.
    if flag:
        print("Accuracy on training set : ", train_acc)
        print("Accuracy on test set : ", test_acc)
        print("Recall on training set : ", train_recall)
        print("Recall on test set : ", test_recall)
        print("Precision on training set : ", train_precision)
        print("Precision on test set : ", test_precision)
        print("F1_Score : ", F1_Score)
        print("Roc_Auc_score : ", Roc_Auc_score)

    return score_list # returning the list with train and test scores

# RandomForest Classifier

In [None]:
rf = RandomForestClassifier(random_state = 42, class_weight = 'balanced')
rf.fit(x_train, y_train)
predictions_rf = rf.predict(x_test)

In [None]:
predictions_rf.shape

In [None]:
def draw_cm(actual,predicted):
    """Show the confusion matrix of `actual` versus `predicted` labels as an annotated heatmap."""
    matrix = confusion_matrix(actual, predicted)
    class_labels = [0, 1]
    # sns.heatmap draws on (and returns) the current axes, so the labels are set on that object
    axes = sns.heatmap(matrix, annot=True, fmt='.2f',
                       xticklabels=class_labels, yticklabels=class_labels)
    axes.set_ylabel('observed')
    axes.set_xlabel('Predicted')
    plt.show()

In [None]:
draw_cm(y_test, predictions_rf)

In [None]:
print(classification_report(y_test, predictions_rf))

In [None]:
rf_scores = get_metrics_score(rf)

# Bagging Classifier on Random Forest

In [None]:
baggingrf = BaggingClassifier(rf, random_state = 42)
baggingrf.fit(x_train, y_train)
predictions_baggingrf = baggingrf.predict(x_test)

In [None]:
draw_cm(y_test, predictions_baggingrf)

In [None]:
print(classification_report(y_test, predictions_baggingrf))

In [None]:
baggingrf_scores = get_metrics_score(baggingrf)

# Logistic Regression Classifier

In [None]:
lr = LogisticRegression(random_state = 42, class_weight = None)
lr.fit(x_train, y_train)
predictions_lr = lr.predict(x_test)

In [None]:
predictions_lr.shape

In [None]:
draw_cm(y_test, predictions_lr)

In [None]:
print(classification_report(y_test, predictions_lr))

In [None]:
logistic_regression_scores = get_metrics_score(lr)

# KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier(n_neighbors = 10, algorithm = 'kd_tree')

In [None]:
knn.fit(x_train, y_train)
predictions_knn = knn.predict(x_test)

In [None]:
predictions_knn.shape

In [None]:
draw_cm(y_test, predictions_knn)

In [None]:
print(classification_report(y_test, predictions_knn))

In [None]:
knn_scores = get_metrics_score(knn)

# Bagging Classifier on KNN

In [None]:
baggingknn = BaggingClassifier(knn, random_state = 42)
baggingknn.fit(x_train, y_train)
predictions_bknn = baggingknn.predict(x_test)

In [None]:
print(classification_report(y_test, predictions_bknn))

In [None]:
baggingknn_scores = get_metrics_score(baggingknn)

In [None]:
draw_cm(y_test, predictions_bknn)

# Boosting

## AdaBoost

In [None]:
abc = AdaBoostClassifier(random_state = 42)
abc.fit(x_train, y_train)
predictions_abc = abc.predict(x_test)

In [None]:
draw_cm(y_test, predictions_abc)

In [None]:
print(classification_report(y_test, predictions_abc))

In [None]:
abc_scores = get_metrics_score(abc)

## XGBoost Classifier

In [None]:
xgb = XGBClassifier(random_state = 42, scale_pos_weight = 1)
xgb.fit(x_train, y_train)
predictions_xgb = xgb.predict(x_test)

In [None]:
# draw_cm is already defined earlier in the notebook (Random Forest section); the identical
# re-definition that used to sit in this cell was removed so the helper has a single definition.
draw_cm(y_test, predictions_xgb)

In [None]:
print(classification_report(y_test, predictions_xgb))

In [None]:
xgb_scores = get_metrics_score(xgb)

# Choosing the best models

In [None]:
pd.DataFrame(data = {'Random Forest': rf_scores, 'Bagging Random Forest': baggingrf_scores, 'Logistic Regression': logistic_regression_scores, 'KNN': knn_scores, 'Bagging KNN': baggingknn_scores,'AdaBoost': abc_scores, 'XGBoost': xgb_scores}, index = ['Accuracy - Train', 'Accuracy - Test', 'Recall - Train', 'Recall - Test', 'Precision - Train', 'Precision - Test', 'F1', 'ROC'])

Based on the above chart, Random Forest and XGBoost by far outperform all other models, with the former edging slightly ahead on Accuracy while the latter being superior based on Precision_Test. Also, with a close enough Accuracy on both train and test data, the model does not appear to be overfitting.

We will now Cross-Validate both these models to see the results

# Cross Validation

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

clf = RandomForestClassifier(random_state=42)

# Shuffle and stratify the folds: x / y come straight out of the oversampler and are not shuffled,
# so contiguous, unshuffled KFold slices can have very different class mixes from the full data.
# A fixed random_state keeps the folds reproducible.
k_folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

# default scoring for a classifier is accuracy
scores = cross_val_score(clf, x, y, cv = k_folds)

print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

clf = XGBClassifier(random_state=42)

# Shuffle and stratify the folds: x / y come straight out of the oversampler and are not shuffled,
# so contiguous, unshuffled KFold slices can have very different class mixes from the full data.
# A fixed random_state keeps the folds reproducible.
k_folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

# default scoring for a classifier is accuracy
scores = cross_val_score(clf, x, y, cv = k_folds)

print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))

XGBoost takes a very slight edge after Cross Validation of the results.

However, since the difference in CV is not significant, we will rely on our primary criteria, i.e., Accuracy - based on higher accuracy, Random Forest is our selected model

# ROC Curve Plot

### RandomForest

In [None]:
def plot_roc_curve(y_test, y_pred):
    """
    Draw the ROC curve (true positive rate against false positive rate) on the current axes.

    y_test: true binary labels
    y_pred: scores or predictions for the positive class, handed to roc_curve unchanged
    """
    # the thresholds returned by roc_curve are not needed for the plot
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, y_pred)
    axes = plt.gca()
    axes.plot(false_pos_rate, true_pos_rate)
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')

In [None]:
# ROC analysis needs a continuous score, so use the predicted probability of the positive class
# rather than the hard 0/1 predictions (which reduce the curve to a single operating point).
proba_rf = rf.predict_proba(x_test)[:, 1]
plot_roc_curve(y_test, proba_rf)
print(f'model 1 AUC score: {roc_auc_score(y_test, proba_rf)}')

### XGBoost

In [None]:
# ROC analysis needs a continuous score, so use the predicted probability of the positive class
# rather than the hard 0/1 predictions (which reduce the curve to a single operating point).
proba_xgb = xgb.predict_proba(x_test)[:, 1]
plot_roc_curve(y_test, proba_xgb)
# labelled "model 2" so the output can be told apart from the Random Forest ("model 1") line
print(f'model 2 AUC score: {roc_auc_score(y_test, proba_xgb)}')

### FEATURE IMPORTANCE

### Random Forest

In [None]:

importances=rf.feature_importances_


In [None]:
# Sort the feature importance in descending order
sorted_indices = np.argsort(importances)[::-1]

# Take the labels from the training matrix, whose columns the importances are aligned with.
# df1 still contains the Loan_Status target column, so indexing df1.columns with these
# positions attaches the wrong name to every feature located after it.
feat_labels = x_train.columns

for rank, idx in enumerate(sorted_indices, start = 1):
    print("%2d) %-*s %f" % (rank, 30, feat_labels[idx], importances[idx]))

In [None]:
plt.title('Feature Importance')
plt.bar(range(x_train.shape[1]), importances[sorted_indices], align='center')
plt.xticks(range(x_train.shape[1]), x_train.columns[sorted_indices], rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Based on the above analysis, Random Forest is the best model with the highest accuracy.