In [1]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import PowerTransformer
from sklearn.impute import KNNImputer 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
import pickle
import warnings 
warnings.filterwarnings('ignore')

## 1. Problem Statement

## 2. Data gathering

In [2]:
df = pd.read_csv('loan-train.csv')
df


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [3]:
df['Loan_Status'].value_counts()

Loan_Status
Y    422
N    192
Name: count, dtype: int64

In [4]:
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [5]:
df= df.drop(['Loan_ID','Gender','Married'],axis=1)

In [6]:
lab_enc = LabelEncoder()
df['Dependents']=lab_enc.fit_transform(df['Dependents'])
df['Education']=lab_enc.fit_transform(df['Education'])
df['Self_Employed']=lab_enc.fit_transform(df['Self_Employed'])
df['Property_Area']=lab_enc.fit_transform(df['Property_Area'])
df['Loan_Status']=lab_enc.fit_transform(df['Loan_Status'])

In [7]:
df


Unnamed: 0,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,0,0,5849,0.0,,360.0,1.0,2,1
1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,0,0,0,6000,0.0,141.0,360.0,1.0,2,1
...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,2900,0.0,71.0,360.0,1.0,0,1
610,3,0,0,4106,0.0,40.0,180.0,1.0,0,1
611,1,0,0,8072,240.0,253.0,360.0,1.0,2,1
612,2,0,0,7583,0.0,187.0,360.0,1.0,2,1


In [8]:
df.isna().sum()

Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [9]:
knn_data = KNNImputer(n_neighbors=3)

array = knn_data.fit_transform(df)
df = pd.DataFrame(array,columns=df.columns)
df

Unnamed: 0,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0.0,0.0,0.0,5849.0,0.0,138.0,360.0,1.0,2.0,1.0
1,1.0,0.0,0.0,4583.0,1508.0,128.0,360.0,1.0,0.0,0.0
2,0.0,0.0,1.0,3000.0,0.0,66.0,360.0,1.0,2.0,1.0
3,0.0,1.0,0.0,2583.0,2358.0,120.0,360.0,1.0,2.0,1.0
4,0.0,0.0,0.0,6000.0,0.0,141.0,360.0,1.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...
609,0.0,0.0,0.0,2900.0,0.0,71.0,360.0,1.0,0.0,1.0
610,3.0,0.0,0.0,4106.0,0.0,40.0,180.0,1.0,0.0,1.0
611,1.0,0.0,0.0,8072.0,240.0,253.0,360.0,1.0,2.0,1.0
612,2.0,0.0,0.0,7583.0,0.0,187.0,360.0,1.0,2.0,1.0


In [10]:
df.isna().sum()

Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

## 4. feature selection

In [11]:
x = df.drop('Loan_Status',axis=1)
y= df['Loan_Status']
y.value_counts()

Loan_Status
1.0    422
0.0    192
Name: count, dtype: int64

## 5. Model selection

In [12]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=11,stratify=y)

In [13]:
log_reg=LogisticRegression()
log_reg.fit(x_train,y_train)

In [14]:
y_pred_train = log_reg.predict(x_train)

In [15]:
accuracy_score(y_train,y_pred_train)

0.7963340122199593

In [16]:
clf_report = classification_report(y_train,y_pred_train)
print("clf report:\n ",clf_report)

clf report:
                precision    recall  f1-score   support

         0.0       0.86      0.42      0.56       154
         1.0       0.78      0.97      0.87       337

    accuracy                           0.80       491
   macro avg       0.82      0.69      0.71       491
weighted avg       0.81      0.80      0.77       491



In [17]:
y_pred_test = log_reg.predict(x_test)

In [18]:
clf_report = classification_report(y_test,y_pred_test)
print("clf report:\n ",clf_report)

clf report:
                precision    recall  f1-score   support

         0.0       0.95      0.55      0.70        38
         1.0       0.83      0.99      0.90        85

    accuracy                           0.85       123
   macro avg       0.89      0.77      0.80       123
weighted avg       0.87      0.85      0.84       123



In [19]:
confusion_matrix(y_train,y_pred_train)

array([[ 64,  90],
       [ 10, 327]], dtype=int64)

In [20]:
with open('logistic.pkl','wb') as file:
     pickle.dump(log_reg,file)

In [21]:
x.columns

Index(['Dependents', 'Education', 'Self_Employed', 'ApplicantIncome',
       'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History',
       'Property_Area'],
      dtype='object')