# Bank Loan Prediction - Supervised Learning
### Name: Burhan Hadi Butt
### Enrollment: 03-134211-008
### Class: BSCS - 8A

## Import Libraries

In [1]:
# Import the necessary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Load the Dataset

In [2]:
# Load the training and testing data
dataset = pd.read_csv('/content/drive/MyDrive/Datasets/DM/Assignment 2/bankloan.csv')

## Data Exploration

In [3]:
# First few rows of the dataset
print(dataset.head())

   ID  Age  Experience  Income  ZIP.Code  Family  CCAvg  Education  Mortgage  \
0   1   25           1      49     91107       4    1.6          1         0   
1   2   45          19      34     90089       3    1.5          1         0   
2   3   39          15      11     94720       1    1.0          1         0   
3   4   35           9     100     94112       1    2.7          2         0   
4   5   35           8      45     91330       4    1.0          2         0   

   Personal.Loan  Securities.Account  CD.Account  Online  CreditCard  
0              0                   1           0       0           0  
1              0                   1           0       0           0  
2              0                   0           0       0           0  
3              0                   0           0       0           0  
4              0                   0           0       0           1  


In [4]:
# Information about the dataset
print(dataset.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP.Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal.Loan       5000 non-null   int64  
 10  Securities.Account  5000 non-null   int64  
 11  CD.Account          5000 non-null   int64  
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 547.0 KB
None


In [5]:
# Summary statistics
print(dataset.describe())

                ID          Age   Experience       Income      ZIP.Code  \
count  5000.000000  5000.000000  5000.000000  5000.000000   5000.000000   
mean   2500.500000    45.338400    20.104600    73.774200  93152.503000   
std    1443.520003    11.463166    11.467954    46.033729   2121.852197   
min       1.000000    23.000000    -3.000000     8.000000   9307.000000   
25%    1250.750000    35.000000    10.000000    39.000000  91911.000000   
50%    2500.500000    45.000000    20.000000    64.000000  93437.000000   
75%    3750.250000    55.000000    30.000000    98.000000  94608.000000   
max    5000.000000    67.000000    43.000000   224.000000  96651.000000   

            Family        CCAvg    Education     Mortgage  Personal.Loan  \
count  5000.000000  5000.000000  5000.000000  5000.000000    5000.000000   
mean      2.396400     1.937938     1.881000    56.498800       0.096000   
std       1.147663     1.747659     0.839869   101.713802       0.294621   
min       1.000000  

## Manual Feature Selection

### Feature Selection

In [6]:
# Input Features
X_mfs = dataset[['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage', 'Securities.Account', 'CD.Account', 'Online', 'CreditCard']]

In [7]:
# Output Features
y_mfs = dataset['Personal.Loan']

### Dataset Preprocessing

In [8]:
# Split the data into training and testing sets
X_train_mfs, X_test_mfs, y_train_mfs, y_test_mfs = train_test_split(X_mfs, y_mfs, test_size=0.3, random_state=42)

In [9]:
# Standardize the data
scaler = StandardScaler()
X_train_scaled_mfs = scaler.fit_transform(X_train_mfs)
X_test_scaled_mfs = scaler.transform(X_test_mfs)

### Model Training and Testing

In [10]:
# Classifier 1: Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_scaled_mfs, y_train_mfs)
y_pred_dt_mfs = dt_model.predict(X_test_scaled_mfs)

# Classifier 2: Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_scaled_mfs, y_train_mfs)
y_pred_rf_mfs = rf_model.predict(X_test_scaled_mfs)

# Classifier 3: Support Vector Machine (SVM)
svm_model = SVC()
svm_model.fit(X_train_scaled_mfs, y_train_mfs)
y_pred_svm_mfs = svm_model.predict(X_test_scaled_mfs)

# Classifier 4: K-Nearest Neighbors (KNN)
knn_model = KNeighborsClassifier()
knn_model.fit(X_train_scaled_mfs, y_train_mfs)
y_pred_knn_mfs = knn_model.predict(X_test_scaled_mfs)

# Classifier 5: Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_scaled_mfs, y_train_mfs)
y_pred_gb_mfs = gb_model.predict(X_test_scaled_mfs)

### Model Evaluation

In [11]:
# Accuracy Score
accuracy_dt_mfs = accuracy_score(y_test_mfs, y_pred_dt_mfs)
accuracy_rf_mfs = accuracy_score(y_test_mfs, y_pred_rf_mfs)
accuracy_svm_mfs = accuracy_score(y_test_mfs, y_pred_svm_mfs)
accuracy_knn_mfs = accuracy_score(y_test_mfs, y_pred_knn_mfs)
accuracy_gb_mfs = accuracy_score(y_test_mfs, y_pred_gb_mfs)

In [12]:
# Display Accuracy Scores
print("Accuracy Scores - Manual Feature Extraction\n")
print(f"Decision Tree: {accuracy_dt_mfs:.2f}")
print(f"Random Forest: {accuracy_rf_mfs:.2f}")
print(f"Support Vector Machine (SVM): {accuracy_svm_mfs:.2f}")
print(f"K-Nearest Neighbors (KNN): {accuracy_knn_mfs:.2f}")
print(f"Gradient Boosting Classifier: {accuracy_gb_mfs:.2f}")

Accuracy Scores - Manual Feature Extraction

Decision Tree: 0.98
Random Forest: 0.99
Support Vector Machine (SVM): 0.97
K-Nearest Neighbors (KNN): 0.96
Gradient Boosting Classifier: 0.99


## Principal Component Analysis

### Datset Preprocessing

In [13]:
# Seperate input and output features
X = dataset.drop(columns=['Personal.Loan'])
y = dataset['Personal.Loan']

In [14]:
# Split the data into training and testing sets
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X, y, test_size=0.3, random_state=42)

In [15]:
# Standardize the data
scaler = StandardScaler()
X_train_scaled_pca = scaler.fit_transform(X_train_pca)
X_test_scaled_pca = scaler.transform(X_test_pca)

### Automatic Feature Extraction

In [16]:
# Principal Component Analysis
pca = PCA(n_components=10)
X_train_pca_extracted = pca.fit_transform(X_train_scaled_pca)
X_test_pca_extracted = pca.transform(X_test_scaled_pca)

### Model Training and Testing

In [17]:
# Classifier 1: Decision Tree
dt_model.fit(X_train_pca_extracted, y_train_pca)
y_pred_dt_pca = dt_model.predict(X_test_pca_extracted)

# Classifier 2: Random Forest
rf_model.fit(X_train_pca_extracted, y_train_pca)
y_pred_rf_pca = rf_model.predict(X_test_pca_extracted)

# Classifier 3: Support Vector Machine (SVM)
svm_model.fit(X_train_pca_extracted, y_train_pca)
y_pred_svm_pca = svm_model.predict(X_test_pca_extracted)

# Classifier 4: K-Nearest Neighbors (KNN)
knn_model.fit(X_train_pca_extracted, y_train_pca)
y_pred_knn_pca = knn_model.predict(X_test_pca_extracted)

# Classifier 5: Gradient Boosting Classifier
gb_model.fit(X_train_pca_extracted, y_train_pca)
y_pred_gb_pca = gb_model.predict(X_test_pca_extracted)

### Model Evaluation

In [18]:
# Accuracy Score
accuracy_dt_pca = accuracy_score(y_test_pca, y_pred_dt_pca)
accuracy_rf_pca = accuracy_score(y_test_pca, y_pred_rf_pca)
accuracy_svm_pca = accuracy_score(y_test_pca, y_pred_svm_pca)
accuracy_knn_pca = accuracy_score(y_test_pca, y_pred_knn_pca)
accuracy_gb_pca = accuracy_score(y_test_pca, y_pred_gb_pca)

In [19]:
# Display Accuracy Scores
print("Accuracy Scores - Principal Component Analysis\n")
print(f"Decision Tree: {accuracy_dt_pca:.2f}")
print(f"Random Forest: {accuracy_rf_pca:.2f}")
print(f"Support Vector Machine (SVM): {accuracy_svm_pca:.2f}")
print(f"K-Nearest Neighbors (KNN): {accuracy_knn_pca:.2f}")
print(f"Gradient Boosting Classifier: {accuracy_gb_pca:.2f}")

Accuracy Scores - Principal Component Analysis

Decision Tree: 0.92
Random Forest: 0.95
Support Vector Machine (SVM): 0.96
K-Nearest Neighbors (KNN): 0.94
Gradient Boosting Classifier: 0.95


## Recursive Feature Elimination

### Data Preprocessing

In [20]:
# Split the data into training and testing sets
X_train_rfe, X_test_rfe, y_train_rfe, y_test_rfe = train_test_split(X, y, test_size=0.3, random_state=42)

In [21]:
# Standardize the data
scaler = StandardScaler()
X_train_scaled_rfe = scaler.fit_transform(X_train_rfe)
X_test_scaled_rfe = scaler.transform(X_test_rfe)

## Automatic Feature Extraction

In [22]:
# Recursive Feature Elimination
log_reg = LogisticRegression(max_iter=1000, random_state=42)
rfe = RFE(estimator=log_reg, n_features_to_select=5)
X_train_rfe_extracted = rfe.fit_transform(X_train_scaled_rfe, y_train_rfe)
X_test_rfe_extracted = rfe.transform(X_test_scaled_rfe)

### Model Training and Testing

In [23]:
# Classifier 1: Decision Tree
dt_model.fit(X_train_rfe_extracted, y_train_rfe)
y_pred_dt_rfe = dt_model.predict(X_test_rfe_extracted)

# Classifier 2: Random Forest
rf_model.fit(X_train_rfe_extracted, y_train_rfe)
y_pred_rf_rfe = rf_model.predict(X_test_rfe_extracted)

# Classifier 3: Support Vector Machine (SVM)
svm_model.fit(X_train_rfe_extracted, y_train_rfe)
y_pred_svm_rfe = svm_model.predict(X_test_rfe_extracted)

# Classifier 4: K-Nearest Neighbors (KNN)
knn_model.fit(X_train_rfe_extracted, y_train_rfe)
y_pred_knn_rfe = knn_model.predict(X_test_rfe_extracted)

# Classifier 5: Gradient Boosting Classifier
gb_model.fit(X_train_rfe_extracted, y_train_rfe)
y_pred_gb_rfe = gb_model.predict(X_test_rfe_extracted)

### Model Evaluation

In [24]:
# Accuracy Score
accuracy_dt_rfe = accuracy_score(y_test_rfe, y_pred_dt_rfe)
accuracy_rf_rfe = accuracy_score(y_test_rfe, y_pred_rf_rfe)
accuracy_svm_rfe = accuracy_score(y_test_rfe, y_pred_svm_rfe)
accuracy_knn_rfe = accuracy_score(y_test_rfe, y_pred_knn_rfe)
accuracy_gb_rfe = accuracy_score(y_test_rfe, y_pred_gb_rfe)

In [25]:
# Display Accuracy Scores
print("Accuracy Scores - Recursive Feature Elimination\n")
print(f"Decision Tree: {accuracy_dt_rfe:.2f}")
print(f"Random Forest: {accuracy_rf_rfe:.2f}")
print(f"Support Vector Machine (SVM): {accuracy_svm_rfe:.2f}")
print(f"K-Nearest Neighbors (KNN): {accuracy_knn_rfe:.2f}")
print(f"Gradient Boosting Classifier: {accuracy_gb_rfe:.2f}")

Accuracy Scores - Recursive Feature Elimination

Decision Tree: 0.98
Random Forest: 0.98
Support Vector Machine (SVM): 0.98
K-Nearest Neighbors (KNN): 0.98
Gradient Boosting Classifier: 0.98


### Results

*   Manual Selection produced the highest overall accuracy, but it might be tailored and may involve bias from choosing features.
*   PCA effectively reduced dimensionality with a minimal drop in performance, beneficial for simplifying the model.
*   RFE demonstrated high accuracy while automating the selection process, ensuring that the chosen features contribute significantly to model performance.

### Conclusion
The results indicate that manual feature selection can yield exceptional model performance when domain knowledge is available. However, RFE stands out as a strong automated technique for feature extraction, offering high accuracy with minimal manual intervention. PCA, while slightly less accurate, provides a practical solution for dimensionality reduction, balancing performance and computational efficiency. Each method has its benefits, and the choice depends on the specific requirements of the problem, such as the need for model interpretability or computational resources.