<a href="https://colab.research.google.com/github/anujott-codes/Credit-Loan-Approval/blob/main/Credit_Approval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries

In [None]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, accuracy_score
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import warnings
warnings.filterwarnings('ignore')

## Credit Card Approval

### Import dataset

In [None]:
#using credit dataset from kaggle derived from UCI
#NOTE(review): the path lives under /content/drive, so presumably Google Drive has to be
#mounted before this cell runs - no mount call is visible in this notebook; confirm
credit_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Machine Learning/Project/Credit Card and Loan Approval/credit_approval_dataset.csv")
#preview the first rows (a notebook displays the last expression of a cell)
credit_df.head()

### EDA

In [None]:
#summary of the dataset: column names, non-null counts and dtypes
credit_df.info()

In [None]:
#count missing (NaN) values in each column
credit_df.isna().sum()

In [None]:
#count rows that are exact duplicates of an earlier row
credit_df.duplicated().sum()

In [None]:
#number of distinct values in each column
credit_df.nunique()

In [None]:
#summary statistics of the dataset
credit_df.describe()

In [None]:
#split the column names by dtype: object columns are treated as categorical,
#everything else as numerical
cat_features = list(credit_df.select_dtypes(include='O').columns)
num_features = list(credit_df.select_dtypes(exclude='O').columns)

#report both groups together with their sizes
print("----- Numerical Features -----", num_features, f"Total : {len(num_features)}", sep="\n")
print("\n")
print("----- Categorical Features -----", cat_features, f"Total : {len(cat_features)}", sep="\n")

In [None]:
#function to plot distributions
def plot(df, col ,num):
  """Draw a histogram with a KDE overlay for one column into a 4x4 grid.

  df  : DataFrame holding the data
  col : name of the column to plot
  num : 1-based position of the subplot inside the 4x4 grid
  """
  axis = plt.subplot(4, 4, num)
  sns.histplot(x=col, data=df, color='skyblue', kde=True, ax=axis)
  axis.set_title(f"{col} Distribution")
  #the title already names the column, so the x label is blanked out
  axis.set_xlabel(" ")

In [None]:
#one histogram per numerical feature, laid out on a single 16x16 figure
plt.figure(figsize=(16,16))
for position, feature in enumerate(num_features, start=1):
  plot(credit_df, feature, position)

plt.tight_layout()
plt.show()

Debt, Income and Credit Score have already been scaled.

In [None]:
#function for pie chart
def pie_plot(col, num, df=credit_df):
    """Draw a pie chart of the value frequencies of one column into a 3x3 grid.

    col : name of the column whose value counts are plotted
    num : 1-based position of the subplot inside the 3x3 grid
    df  : DataFrame to read from; the default is bound to credit_df when this
          function is defined
    """
    axis = plt.subplot(3, 3, num)
    counts = df[col].value_counts()
    axis.pie(
        counts,
        labels=counts.index,
        autopct='%1.1f%%',
        colors=sns.color_palette('pastel'),
    )
    axis.set_title(f"{col} Distribution")


In [None]:
#pie chart for binary distributions
binary_features = ['Gender','Married','PriorDefault','BankCustomer','Employed','DriversLicense','Approved']
plt.figure(figsize=(14,14))
for position, feature in enumerate(binary_features, start=1):
  pie_plot(feature, position)
plt.show()

In [None]:
#one pie chart per categorical (object dtype) feature
plt.figure(figsize=(16,16))
for position, feature in enumerate(cat_features, start=1):
  pie_plot(feature, position)

plt.tight_layout()
plt.show()

In [None]:
#check collinearity and feature correlation
#correlations are computed on the non-object columns only
numeric_df = credit_df.select_dtypes(exclude='O')
correlation_matrix = numeric_df.corr()

plt.figure(figsize=(12,10))
sns.heatmap(data=correlation_matrix, annot=True, cmap='viridis')
plt.show()

In [None]:
#pairwise relationships between the columns of the dataset
sns.pairplot(credit_df)
plt.show()

In [None]:
#dropping less informative features
#drop() returns a new frame, so credit_df itself is left untouched
final_df = credit_df.drop(columns=['Ethnicity','ZipCode'])
final_df.head()

In [None]:
#(rows, columns) remaining after the two columns were dropped
final_df.shape

In [None]:
#continuous distributions
#these columns are the ones box-plotted below and standardised by the preprocessor
continuous_features = ['Age','Debt','YearsEmployed','CreditScore','Income']

In [None]:
#function for box plot to analyse outliers
def boxplot(col,num,df=final_df):
  """Draw a horizontal box plot of one column into a 3x3 grid to inspect outliers.

  col : name of the column to plot
  num : 1-based position of the subplot inside the 3x3 grid
  df  : DataFrame to read from; the default is bound to final_df when this
        function is defined

  Side effect: switches the global seaborn style to 'dark'.
  """
  #the style must be set before the axes is created: an axes picks up the
  #theme at creation time, so setting it afterwards left the first subplot
  #of the grid with the previous theme
  sns.set_style('dark')
  plt.subplot(3,3,num)
  sns.boxplot(data = df,x = col, color='purple')
  plt.title(f'{col} Boxplot Distribution')

In [None]:
#one box plot per continuous feature to spot outliers
plt.figure(figsize=(12,12))
for position, feature in enumerate(continuous_features, start=1):
  boxplot(feature, position)

plt.tight_layout()
plt.show()

## Data Split

In [None]:
#target column and the remaining feature columns
y = final_df['Approved']
X = final_df.drop(columns=['Approved'])

In [None]:
#preview the feature matrix
X.head()

In [None]:
#80/20 train/test split, stratified on the target and seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
#preview the training features
X_train.head()

In [None]:
#preview the test features
X_test.head()

In [None]:
#setup preprocessor
#object-dtype columns of the training features are the ones to one-hot encode
categorical_features = X_train.select_dtypes(include='O').columns.to_list()
#'Approved' is the target, not an input feature. The list is rebuilt instead of
#calling .remove(), which mutates it in place and raises ValueError as soon as
#this cell is executed a second time
binary_features = [name for name in binary_features if name != 'Approved']

#binary columns pass through unchanged, continuous columns are standardised and
#categorical columns are one-hot encoded (categories unseen during fit are ignored)
preprocessor = ColumnTransformer(
    transformers=[
        ('bin','passthrough',binary_features),
        ('num',StandardScaler(),continuous_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ]
)

#fit on the training split only, then apply the fitted transformer to the test split
#NOTE(review): the cross-validation cells further down run on X_train_preprocessed, so
#every fold is scaled/encoded with statistics of the whole training split - consider a
#Pipeline if strictly leak-free CV scores are needed
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [None]:
#saving preprocessor for deployment
#written to 'preprocessor.pkl', a relative path resolved against the current working directory
joblib.dump(preprocessor,'preprocessor.pkl')

In [None]:
#Logistic Regression
#shuffled, seeded 5-fold stratified splitter
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lr = LogisticRegression(random_state=42)

#accuracy on each of the 5 folds of the preprocessed training data
lr_scores = cross_val_score(
    lr,
    X_train_preprocessed,
    y_train,
    scoring='accuracy',
    cv=skf,
)

print("Accuracy scores for each fold:", lr_scores)
print("Mean accuracy:", np.mean(lr_scores))
print("Standard deviation:", np.std(lr_scores))



In [None]:
#Support Vector Classifier
#probability=True so that predict_proba is available for the ROC curve later on
svc = SVC(probability=True, random_state=42)

#accuracy on each fold of the skf splitter
svc_scores = cross_val_score(
    svc,
    X_train_preprocessed,
    y_train,
    scoring='accuracy',
    cv=skf,
)

print("Accuracy scores for each fold:", svc_scores)
print("Mean accuracy:", np.mean(svc_scores))
print("Standard deviation:", np.std(svc_scores))

In [None]:
#K-Nearest Neighbours with the library defaults
knn = KNeighborsClassifier()

#accuracy on each fold of the skf splitter
knn_scores = cross_val_score(
    knn,
    X_train_preprocessed,
    y_train,
    scoring='accuracy',
    cv=skf,
)

print("Accuracy scores for each fold:", knn_scores)
print("Mean accuracy:", np.mean(knn_scores))
print("Standard deviation:", np.std(knn_scores))

In [None]:
#Gaussian Naive Bayes
nb = GaussianNB()

#accuracy on each fold of the skf splitter
nb_scores = cross_val_score(
    nb,
    X_train_preprocessed,
    y_train,
    scoring='accuracy',
    cv=skf,
)

print("Accuracy scores for each fold:", nb_scores)
print("Mean accuracy:", np.mean(nb_scores))
print("Standard deviation:", np.std(nb_scores))

In [None]:
#Random Forest
#the splitter is re-created here with the same settings as in the logistic regression cell
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf = RandomForestClassifier(random_state=42)

#accuracy on each of the 5 folds
rf_scores = cross_val_score(
    rf,
    X_train_preprocessed,
    y_train,
    scoring='accuracy',
    cv=skf,
)

print("Accuracy scores for each fold:", rf_scores)
print("Mean accuracy:", np.mean(rf_scores))
print("Standard deviation:", np.std(rf_scores))

In [None]:
#GradientBoostingClassifier
#seed aligned with the random_state=42 used by every other estimator and
#splitter in this notebook (it was 22 here)
gb = GradientBoostingClassifier(random_state=42)

# Perform 5-fold CV
gb_scores = cross_val_score(gb, X_train_preprocessed, y_train, cv=skf, scoring='accuracy')

print("Accuracy scores for each fold:", gb_scores)
print("Mean accuracy:", gb_scores.mean())
print("Standard deviation:", gb_scores.std())

In [None]:
#XGBClassifier
#use_label_encoder is no longer passed: the parameter is deprecated and ignored by
#current XGBoost releases, where supplying it only triggers a warning
xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    random_state=42,
    eval_metric='logloss',
    objective='binary:logistic'
)

# Perform 5-fold CV
xgb_scores = cross_val_score(xgb, X_train_preprocessed, y_train, cv=skf, scoring='accuracy')

print("Accuracy scores for each fold:", xgb_scores)
print("Mean accuracy:", xgb_scores.mean())
print("Standard deviation:", xgb_scores.std())

LogisticRegression, SVC, Random Forests perform well

## Evaluation on Testing Set

In [None]:
#Logistic Regression: fit on the full training split
lr.fit(X_train_preprocessed, y_train)

#predictions on the training and testing sets
ypred_train_lr = lr.predict(X_train_preprocessed)
ypred_test_lr = lr.predict(X_test_preprocessed)

#classification reports for both sets
clf_report_train_lr = classification_report(y_train, ypred_train_lr)
clf_report_test_lr = classification_report(y_test, ypred_test_lr)

print("----- Classification report for Training Set -----")
print(clf_report_train_lr)
print("----- Classification report for Testing Set -----")
print(clf_report_test_lr)

In [None]:
#Support Vector Classifier: fit on the full training split
svc.fit(X_train_preprocessed, y_train)

#predictions on the training and testing sets
ypred_train_svc = svc.predict(X_train_preprocessed)
ypred_test_svc = svc.predict(X_test_preprocessed)

#classification reports for both sets
clf_report_train_svc = classification_report(y_train, ypred_train_svc)
clf_report_test_svc = classification_report(y_test, ypred_test_svc)

print("----- Classification report for Training Set -----")
print(clf_report_train_svc)
print("----- Classification report for Testing Set -----")
print(clf_report_test_svc)


In [None]:
#Random Forest Classifier: fit on the full training split
rf.fit(X_train_preprocessed, y_train)

#predictions on the training and testing sets
ypred_train_rf = rf.predict(X_train_preprocessed)
ypred_test_rf = rf.predict(X_test_preprocessed)

#classification reports for both sets
clf_report_train_rf = classification_report(y_train, ypred_train_rf)
clf_report_test_rf = classification_report(y_test, ypred_test_rf)

print("----- Classification report for Training Set -----")
print(clf_report_train_rf)
print("----- Classification report for Testing Set -----")
print(clf_report_test_rf)

SVC Performs the Best

## Confusion Matrix, ROC_AUC curve and Evaluation metrics

In [None]:
#confusion matrix of the SVC predictions on the testing set
cm = confusion_matrix(y_test, ypred_test_svc)

plt.figure(figsize=(6,5))
#NOTE(review): tick labels assume the first class is "rejected" and the second "approved" - confirm
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Reject','Approve'],
    yticklabels=['Reject','Approve'],
)
plt.title('Confusion Matrix - SVC')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
#testing-set classification report of the SVC, printed under its own banner
print("------ Classification Report SVC ------", clf_report_test_svc, sep="\n")

In [None]:
#ROC-AUC curve
#the second predict_proba column is used as the score of the positive class
yprob_svc = svc.predict_proba(X_test_preprocessed)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, yprob_svc)
auc_score = roc_auc_score(y_test, yprob_svc)

plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, color='red', label=f"SVC (AUC = {auc_score:.3f})")
#dashed diagonal as the reference line of a random classifier
plt.plot([0,1],[0,1],'k--')
plt.title('ROC Curve - SVC')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

In [None]:
#saving model
#the SVC is written to 'model.pkl', a relative path resolved against the current working directory
joblib.dump(svc,'model.pkl')