In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [6]:
df=pd.read_csv("train_clean_data.csv",index_col=0)
df.head()

Unnamed: 0,Loan_ID,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,gender_Male,married_Yes,education_Not Graduate,property_area_Semiurban,property_area_Urban,self_employed_Yes,Loan_status_Y
1,LP001003,1,4583,1508.0,128.0,360.0,1.0,1,1,0,0,0,0,0
2,LP001005,0,3000,0.0,66.0,360.0,1.0,1,1,0,0,1,1,1
3,LP001006,0,2583,2358.0,120.0,360.0,1.0,1,1,1,0,1,0,1
4,LP001008,0,6000,0.0,141.0,360.0,1.0,1,0,0,0,1,0,1
5,LP001011,2,5417,4196.0,267.0,360.0,1.0,1,1,0,0,1,1,1


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 517 entries, 1 to 613
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Loan_ID                  517 non-null    object 
 1   Dependents               517 non-null    object 
 2   ApplicantIncome          517 non-null    int64  
 3   CoapplicantIncome        517 non-null    float64
 4   LoanAmount               517 non-null    float64
 5   Loan_Amount_Term         517 non-null    float64
 6   Credit_History           517 non-null    float64
 7   gender_Male              517 non-null    int64  
 8   married_Yes              517 non-null    int64  
 9   education_Not Graduate   517 non-null    int64  
 10  property_area_Semiurban  517 non-null    int64  
 11  property_area_Urban      517 non-null    int64  
 12  self_employed_Yes        517 non-null    int64  
 13  Loan_status_Y            517 non-null    int64  
dtypes: float64(4), int64(8), object

In [8]:
df.shape

(517, 14)

In [9]:
df['Loan_status_Y'].value_counts()

Loan_status_Y
1    360
0    157
Name: count, dtype: int64

In [10]:
X=df.drop(['Loan_ID','Dependents','Loan_status_Y'],axis=1)
y=df['Loan_status_Y']

In [11]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.33,stratify=y,random_state=42)

Synthetic Minority Over-Sampling Technique from the imblearn library to balance an imbalanced datset
Interploating:- "finding a point between two known points"

In [12]:
from imblearn.over_sampling import SMOTE
smote=SMOTE(random_state=42)
X_train_res,y_train_res=smote.fit_resample(X_train,y_train)

In [13]:
X_train_res.shape

(482, 11)

In [14]:
y_train_res.shape

(482,)

In [15]:
clf1=DecisionTreeClassifier(class_weight='balanced')
clf2=GaussianNB()
clf3=LogisticRegression(solver='liblinear',max_iter=200,class_weight='balanced')

In [16]:
eclf1=VotingClassifier(estimators=[('DT',clf1),('GNB',clf2),('LR',clf3)],voting='hard')
eclf1=eclf1.fit(X_train_res,y_train_res)
predictions=eclf1.predict(X_test)
predictions

array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0])

In [17]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))

              precision    recall  f1-score   support

           0       0.70      0.62      0.65        52
           1       0.84      0.88      0.86       119

    accuracy                           0.80       171
   macro avg       0.77      0.75      0.76       171
weighted avg       0.80      0.80      0.80       171

[[ 32  20]
 [ 14 105]]


In [18]:
eclf2=VotingClassifier(estimators=[('DT',clf1),('GNB',clf2),('LR',clf3)],voting='soft')
eclf2=eclf2.fit(X_train,y_train)
prediction=eclf2.predict(X_test)
prediction

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0])

In [19]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test,prediction))
print(confusion_matrix(y_test,prediction))

              precision    recall  f1-score   support

           0       0.67      0.60      0.63        52
           1       0.83      0.87      0.85       119

    accuracy                           0.79       171
   macro avg       0.75      0.74      0.74       171
weighted avg       0.78      0.79      0.79       171

[[ 31  21]
 [ 15 104]]


In [20]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
base_model1=DecisionTreeClassifier()
bagging=BaggingClassifier(estimator=base_model1,n_estimators=200,random_state=0)
bagging.fit(X_train_res,y_train_res)
y_pred_bagging=bagging.predict(X_test)
print("Bagging Accuracy:",accuracy_score(y_test,y_pred_bagging))

Bagging Accuracy: 0.7660818713450293


In [21]:
print(classification_report(y_test,y_pred_bagging))
print(confusion_matrix(y_test,y_pred_bagging))

              precision    recall  f1-score   support

           0       0.63      0.56      0.59        52
           1       0.82      0.86      0.84       119

    accuracy                           0.77       171
   macro avg       0.72      0.71      0.71       171
weighted avg       0.76      0.77      0.76       171

[[ 29  23]
 [ 17 102]]


In [23]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score,classification_report
boosting=AdaBoostClassifier(n_estimators=1200,random_state=0)
boosting.fit(X_train_res,y_train_res)
y_pred_boosting=boosting.predict(X_test)
print("Boosting(AdaBoost) accuracy:",accuracy_score(y_test,y_pred_boosting))

Boosting(AdaBoost) accuracy: 0.7953216374269005


In [25]:
print("Report:\n", classification_report(y_test, y_pred_boosting))


Report:
               precision    recall  f1-score   support

           0       0.68      0.62      0.65        52
           1       0.84      0.87      0.86       119

    accuracy                           0.80       171
   macro avg       0.76      0.74      0.75       171
weighted avg       0.79      0.80      0.79       171



RandomForest

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X_train_res, y_train_res)
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

Random Forest Accuracy: 0.7953216374269005


In [27]:
print("Report:\n", classification_report(y_test, y_pred_rf))


Report:
               precision    recall  f1-score   support

           0       0.70      0.58      0.63        52
           1       0.83      0.89      0.86       119

    accuracy                           0.80       171
   macro avg       0.76      0.73      0.74       171
weighted avg       0.79      0.80      0.79       171

