In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline

data = pd.read_csv("loan_approval_dataset.csv")
data

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4264,4265,5,Graduate,Yes,1000000,2300000,12,317,2800000,500000,3300000,800000,Rejected
4265,4266,0,Not Graduate,Yes,3300000,11300000,20,559,4200000,2900000,11000000,1900000,Approved
4266,4267,2,Not Graduate,No,6500000,23900000,18,457,1200000,12400000,18100000,7300000,Rejected
4267,4268,1,Not Graduate,No,4100000,12800000,8,780,8200000,700000,14100000,5800000,Approved


In [13]:
data.dropna(inplace=True)
data

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,0,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,0
1,2,0,1,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,1
2,3,3,0,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,1
3,4,3,0,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,1
4,5,5,1,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4264,4265,5,0,1,1000000,2300000,12,317,2800000,500000,3300000,800000,1
4265,4266,0,1,1,3300000,11300000,20,559,4200000,2900000,11000000,1900000,0
4266,4267,2,1,0,6500000,23900000,18,457,1200000,12400000,18100000,7300000,1
4267,4268,1,1,0,4100000,12800000,8,780,8200000,700000,14100000,5800000,0


In [15]:
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])
    
data[column]

0       0
1       1
2       1
3       1
4       1
       ..
4264    1
4265    0
4266    1
4267    0
4268    0
Name:  loan_status, Length: 4269, dtype: int32

In [18]:
X = data.drop(' loan_status',axis=1)
X

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,1,2,0,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000
1,2,0,1,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000
2,3,3,0,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000
3,4,3,0,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000
4,5,5,1,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000
...,...,...,...,...,...,...,...,...,...,...,...,...
4264,4265,5,0,1,1000000,2300000,12,317,2800000,500000,3300000,800000
4265,4266,0,1,1,3300000,11300000,20,559,4200000,2900000,11000000,1900000
4266,4267,2,1,0,6500000,23900000,18,457,1200000,12400000,18100000,7300000
4267,4268,1,1,0,4100000,12800000,8,780,8200000,700000,14100000,5800000


In [19]:
y = data[' loan_status']
y

0       0
1       1
2       1
3       1
4       1
       ..
4264    1
4265    0
4266    1
4267    0
4268    0
Name:  loan_status, Length: 4269, dtype: int32

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [22]:
X_train

array([[-0.38682838,  1.51250774,  1.00263891, ...,  2.04678575,
         0.07808278,  1.16041374],
       [-0.80430304, -1.43500078,  1.00263891, ...,  1.22311091,
         2.49843196,  0.88201987],
       [-1.59840356, -0.84549907, -0.99736803, ..., -0.8818359 ,
        -1.33923881, -1.31419838],
       ...,
       [ 0.7708264 ,  0.92300603,  1.00263891, ...,  1.29175048,
         1.47359943,  0.13963624],
       [ 1.32636998, -0.25599737,  1.00263891, ..., -0.83607619,
         0.50327926,  1.4388076 ],
       [-1.0526637 ,  0.92300603, -0.99736803, ..., -0.28695963,
         1.03750048, -0.10782497]])

In [23]:
X_test

array([[-0.36395305,  1.51250774, -0.99736803, ...,  1.15447134,
         0.05627784,  0.72735662],
       [-0.79695025, -0.25599737, -0.99736803, ...,  1.04007205,
         0.28522978,  0.54176071],
       [-1.50363436,  0.33350433, -0.99736803, ...,  2.57302245,
         1.45179448,  0.51082806],
       ...,
       [-0.54368774,  1.51250774,  1.00263891, ...,  0.49095549,
        -0.37982112,  0.88201987],
       [-1.20952306,  0.92300603,  1.00263891, ..., -0.83607619,
        -1.0121646 , -1.09766982],
       [-0.84515183, -0.84549907,  1.00263891, ...,  0.14775765,
        -0.59787059, -0.97393921]])

In [25]:
from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(f_classif, k=10)
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

In [26]:
X_train

array([[-0.38682838,  1.51250774,  1.00950021, ...,  2.04678575,
         0.07808278,  1.16041374],
       [-0.80430304, -1.43500078,  1.61417128, ...,  1.22311091,
         2.49843196,  0.88201987],
       [-1.59840356, -0.84549907, -1.51589075, ..., -0.8818359 ,
        -1.33923881, -1.31419838],
       ...,
       [ 0.7708264 ,  0.92300603,  1.25848241, ...,  1.29175048,
         1.47359943,  0.13963624],
       [ 1.32636998, -0.25599737,  0.68938023, ..., -0.83607619,
         0.50327926,  1.4388076 ],
       [-1.0526637 ,  0.92300603,  0.40482913, ..., -0.28695963,
         1.03750048, -0.10782497]])

In [27]:
X_test

array([[-0.36395305,  1.51250774,  0.12027804, ...,  1.15447134,
         0.05627784,  0.72735662],
       [-0.79695025, -0.25599737,  0.29812247, ...,  1.04007205,
         0.28522978,  0.54176071],
       [-1.50363436,  0.33350433,  1.61417128, ...,  2.57302245,
         1.45179448,  0.51082806],
       ...,
       [-0.54368774,  1.51250774,  0.19141581, ...,  0.49095549,
        -0.37982112,  0.88201987],
       [-1.20952306,  0.92300603, -1.01792634, ..., -0.83607619,
        -1.0121646 , -1.09766982],
       [-0.84515183, -0.84549907, -0.76894413, ...,  0.14775765,
        -0.59787059, -0.97393921]])

In [30]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [31]:
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)

In [33]:
lr_l1 = LogisticRegression(penalty='l1', solver='liblinear', random_state=42) #L1 regularizartion

In [34]:
lr_l2 = LogisticRegression(penalty='l2', solver='liblinear', random_state=42) #L2 regularization

In [35]:
voting_clf = VotingClassifier(estimators=[
    ('rf', rf),
    ('gb', gb),
    ('lr_l1', lr_l1),
    ('lr_l2', lr_l2)
], voting='hard')

In [36]:
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('rf', RandomForestClassifier(random_state=42)),
                             ('gb',
                              GradientBoostingClassifier(random_state=42)),
                             ('lr_l1',
                              LogisticRegression(penalty='l1', random_state=42,
                                                 solver='liblinear')),
                             ('lr_l2',
                              LogisticRegression(random_state=42,
                                                 solver='liblinear'))])

In [38]:
y_pred = voting_clf.predict(X_test)

In [39]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

conf_matrix = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix:\n{conf_matrix}')

class_report = classification_report(y_test, y_pred)
print(f'Classification Report:\n{class_report}')

Accuracy: 0.94
Confusion Matrix:
[[530   6]
 [ 49 269]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95       536
           1       0.98      0.85      0.91       318

    accuracy                           0.94       854
   macro avg       0.95      0.92      0.93       854
weighted avg       0.94      0.94      0.93       854

