In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import missingno as mso
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from imblearn.over_sampling import SMOTE
from scipy import stats
from scipy.stats import pearsonr
from scipy.stats import ttest_ind
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler


In [None]:
df = pd.read_csv("../input/loan-predication/train_u6lujuX_CVtuZ9i (1).csv")
df.head()

In [None]:
print(df.shape)

In [None]:
df.Gender.value_counts(dropna=False)

In [None]:
df.Married.value_counts(dropna=False)

In [None]:
df.Education.value_counts(dropna=False)

In [None]:
df.Self_Employed.value_counts(dropna=False)

In [None]:
df.Credit_History.value_counts(dropna=False)

In [None]:
df.Property_Area.value_counts(dropna=False)

In [None]:
df.Loan_Status.value_counts(dropna=False)

In [None]:
# Bar chart of the number of rows in each Loan_Status class.
sns.countplot(data=df, x="Loan_Status", palette="YlOrBr")
plt.show()

In [None]:
df.Loan_Amount_Term.value_counts(dropna=False)

In [None]:
df[['ApplicantIncome','CoapplicantIncome','LoanAmount']].describe()

In [None]:
#Heatmap plotted
# The frame still holds string columns at this point (they are only encoded in a
# later cell), and DataFrame.corr() raises on non-numeric data in pandas >= 2.0,
# so the correlation is computed over the numeric columns only.
plt.figure(figsize=(10,7))
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True, cmap='inferno');

The heatmap shows a clear positive correlation between `LoanAmount` and `ApplicantIncome`.

In [None]:
# Stacked bar chart: marital status counts within each gender.
gender_by_married = pd.crosstab(df["Gender"], df["Married"])
ax = gender_by_married.plot(
    kind="bar",
    stacked=True,
    figsize=(5,5),
    color=['#f64f59','#12c2e9'],
    rot=0,  # keep the category labels horizontal
)
ax.set_title('Gender vs Married')
ax.set_xlabel('Gender')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Stacked bar chart: credit-history counts within each self-employment group.
employment_by_credit = pd.crosstab(df["Self_Employed"], df["Credit_History"])
ax = employment_by_credit.plot(
    kind="bar",
    stacked=True,
    figsize=(5,5),
    color=['#544a7d','#ffd452'],
    rot=0,  # keep the category labels horizontal
)
ax.set_title('Self Employed vs Credit History')
ax.set_xlabel('Self Employed')
ax.set_ylabel('Frequency')
# Legend entries follow the crosstab column order of Credit_History.
ax.legend(["Bad Credit", "Good Credit"])
plt.show()

In [None]:
# Stacked bar chart: loan-status counts within each property area.
area_by_status = pd.crosstab(df["Property_Area"], df["Loan_Status"])
ax = area_by_status.plot(
    kind="bar",
    stacked=True,
    figsize=(5,5),
    color=['#333333','#dd1818'],
    rot=0,  # keep the category labels horizontal
)
ax.set_title('Property Area vs Loan Status')
ax.set_xlabel('Property Area')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
#Boxplots created to identify and remove outliers within specified IQR of the data
sns.boxplot(x="Loan_Status", y="ApplicantIncome", data=df, palette="mako");

In [None]:
sns.boxplot(x="CoapplicantIncome", y="Loan_Status", data=df, palette="rocket");

In [None]:
sns.boxplot(x="Loan_Status", y="LoanAmount", data=df, palette="YlOrBr");

In [None]:
df.isnull().sum()

In [None]:
#unnecessary variable dropped
df = df.drop(['Loan_ID'], axis = 1)

In [None]:
#categorical missing values imputed with mode
df['Gender'].fillna(df['Gender'].mode()[0],inplace=True)
df['Married'].fillna(df['Married'].mode()[0],inplace=True)
df['Dependents'].fillna(df['Dependents'].mode()[0],inplace=True)
df['Self_Employed'].fillna(df['Self_Employed'].mode()[0],inplace=True)
df['Credit_History'].fillna(df['Credit_History'].mode()[0],inplace=True)
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0],inplace=True)

In [None]:
#numerical missing values imputed with mean
df['LoanAmount'].fillna(df['LoanAmount'].mean(),inplace=True)

In [None]:
#one hot encoding performed
# dtype=int pins the indicator columns to 0/1 integers on every pandas version.
# pandas >= 2.0 defaults to bool, which is not guaranteed to work with the
# quantile / IQR arithmetic applied to the frame in the outlier-removal cell.
df = pd.get_dummies(df, dtype=int)

# Drop columns
# For each two-level variable only one indicator is kept; its complement is redundant.
df = df.drop(columns=['Gender_Female', 'Married_No', 'Education_Not Graduate',
                      'Self_Employed_No', 'Loan_Status_N'])

# Rename columns name
# The kept indicators get the original variable names back.
new = {'Gender_Male': 'Gender', 'Married_Yes': 'Married',
       'Education_Graduate': 'Education', 'Self_Employed_Yes': 'Self_Employed',
       'Loan_Status_Y': 'Loan_Status'}

df = df.rename(columns=new)

In [None]:
#removal of outliers done based on the boxplots above
# The 1.5*IQR rule is applied only to the continuous columns that were inspected
# in the boxplots. Applying it to every column also tests the 0/1 indicator
# columns and Loan_Amount_Term, where Q1 == Q3 is common; IQR is then 0 and every
# row outside the majority value is discarded as an "outlier".
outlier_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
Q1 = df[outlier_cols].quantile(0.25)
Q3 = df[outlier_cols].quantile(0.75)
IQR = Q3 - Q1

below_fence = df[outlier_cols] < (Q1 - 1.5 * IQR)
above_fence = df[outlier_cols] > (Q3 + 1.5 * IQR)
is_outlier = (below_fence | above_fence).any(axis=1)

# .copy() yields an independent frame so later column assignments do not
# trigger SettingWithCopyWarning.
df = df[~is_outlier].copy()

In [None]:
# Square Root Transformation done to normalize the data; part of feature standardization
# assign() builds a new frame instead of setting attributes on df, which at this
# point is the result of a boolean filter; attribute assignment on such a frame
# raises SettingWithCopyWarning.
df = df.assign(
    ApplicantIncome=np.sqrt(df.ApplicantIncome),
    CoapplicantIncome=np.sqrt(df.CoapplicantIncome),
    LoanAmount=np.sqrt(df.LoanAmount),
)

In [None]:
X = df.drop(["Loan_Status"], axis=1)
y = df["Loan_Status"]

In [None]:
#SMOTE resampling done to handle imbalance in the dataset
X, y = SMOTE().fit_resample(X, y)

In [None]:
sns.set_theme(style="darkgrid")
sns.countplot(y=y, data=df, palette="coolwarm")
plt.ylabel('Loan Status')
plt.xlabel('Total')
plt.show()

In [None]:
#features are normalized
X = MinMaxScaler().fit_transform(X)

In [None]:
#features split according to an 80-20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
#first model's implementation, logistic regression carried out
LRclassifier = LogisticRegression(solver='saga', max_iter=500, random_state=1)
LRclassifier.fit(X_train, y_train)

# Predict the held-out split and report per-class metrics and the raw confusion matrix.
y_pred = LRclassifier.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# accuracy_score comes from the notebook's import cell rather than a mid-notebook
# import. Arguments are passed as (y_true, y_pred), matching the K-NN cell.
LRAcc = accuracy_score(y_test, y_pred)
print('LR accuracy: {:.2f}%'.format(LRAcc*100))

In [None]:
# Annotated heatmap of the logistic-regression confusion matrix.
conf_matrix = confusion_matrix(y_test, y_pred)
class_names = ['Rejected', 'Approved']  # tick labels for classes 0 and 1

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix for Logistic Regression')
plt.show()


In [None]:
#second model's implementation, knn carried out
k_value = 5

# Fit k-nearest-neighbours on the training split, then predict the held-out split.
knn_classifier = KNeighborsClassifier(n_neighbors=k_value)
knn_classifier.fit(X_train, y_train)
y_pred_knn = knn_classifier.predict(X_test)

# Overall accuracy on the held-out split.
knn_acc = accuracy_score(y_test, y_pred_knn)
print('K-NN accuracy: {:.2f}%'.format(knn_acc * 100))

# Raw confusion matrix, printed and then drawn as an annotated heatmap.
conf_matrix_knn = confusion_matrix(y_test, y_pred_knn)
print("Confusion Matrix:\n", conf_matrix_knn)

knn_class_names = ['Rejected', 'Approved']  # tick labels for classes 0 and 1
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix_knn, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=knn_class_names, yticklabels=knn_class_names, ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title(f'Confusion Matrix for K-NN with k={k_value}')
plt.show()