In [116]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.neighbors import KNeighborsClassifier

import seaborn as sns



In [117]:
# Load the loan dataset from a CSV file in the working directory.
df = pd.read_csv('loan_data.csv')

In [118]:
# Preview the first two rows of the freshly loaded frame.
df.head(n=2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N


In [119]:
# (rows, columns) of the raw frame, before any cleaning.
df.shape

(614, 13)

In [120]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [121]:
# Number of missing values in each column.
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [122]:
### Drop every row that contains at least one missing value.
### The frame is modified in place, so `df` refers to the reduced data from here on.

df.dropna(axis=0, how='any', inplace=True)

In [123]:
# Re-count missing values per column to confirm none are left.
df.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [124]:
# (rows, columns) of the frame after rows with missing values were dropped.
df.shape

(480, 13)

In [125]:
# Preview the first two rows that remain after the cleaning step.
df.head(n=2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y


In [126]:
# Encode the target as integers: 'Y' -> 1, every other value -> 0.
# The dtype guard keeps the cell idempotent: once the column is numeric, a
# second run would compare integers with 'Y' and silently zero the whole target.
if not pd.api.types.is_numeric_dtype(df['Loan_Status']):
    df['Loan_Status'] = df['Loan_Status'].eq('Y').astype('int64')

In [127]:
# Preview two rows to check the integer-encoded Loan_Status column.
df.head(n=2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1


In [128]:
### Clean up the individual fields as needed

# Show the category counts. They are printed explicitly because a bare
# expression that is not the last statement of a cell is never displayed.
print(df['Dependents'].value_counts())

### Replace '3+' with a plain number: every row with three or more dependents
### carries the label '3+', so it is capped at 3.
### The column is then cast to integers so the models receive numbers rather
### than the strings '0', '1', '2', '3'.

df['Dependents'] = df['Dependents'].replace({'3+': '3'}).astype('int64')

In [129]:
### The other categorical fields (Gender, Married, Education, Self_Employed,
### Property_Area) also have to be converted to numerical values.
## Print the label counts of each one first to see which labels occur.

cols = [
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
]

for column_name in cols:
    print(df[column_name].value_counts())


Male      394
Female     86
Name: Gender, dtype: int64
Yes    311
No     169
Name: Married, dtype: int64
Graduate        383
Not Graduate     97
Name: Education, dtype: int64
No     414
Yes     66
Name: Self_Employed, dtype: int64
Semiurban    191
Urban        150
Rural        139
Name: Property_Area, dtype: int64


In [130]:
# Map every categorical label to an integer code, column by column, in place.
# Binary columns become 0/1; Property_Area is coded Rural=0, Semiurban=1, Urban=2.
df.replace(
    {
        'Gender': {'Male': 1, 'Female': 0},
        'Married': {'Yes': 1, 'No': 0},
        'Education': {'Graduate': 1, 'Not Graduate': 0},
        'Self_Employed': {'Yes': 1, 'No': 0},
        'Property_Area': {'Rural': 0, 'Semiurban': 1, 'Urban': 2},
    },
    inplace=True,
)

In [131]:
# Print the label counts again to verify the integer encoding of each column.
cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']

for encoded_column in cols:
    print(df[encoded_column].value_counts())

1    394
0     86
Name: Gender, dtype: int64
1    311
0    169
Name: Married, dtype: int64
1    383
0     97
Name: Education, dtype: int64
0    414
1     66
Name: Self_Employed, dtype: int64
1    191
2    150
0    139
Name: Property_Area, dtype: int64


In [132]:
### All the columns are properly labeled now; preview two rows to confirm.

df.head(n=2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,2,1


In [133]:
### Separate the independent variables (features) from the dependent variable (target)

# Features: every column except the Loan_ID identifier and the Loan_Status label.
X = df.drop(columns=['Loan_ID', 'Loan_Status'])
# Target: the Loan_Status column.
Y = df['Loan_Status']

In [134]:
# Emits a single empty line; the cell has no other effect.
print()




In [135]:
# Hold out 20% of the rows for testing. The split is stratified on Y and uses
# a fixed seed so it is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [136]:
### Create the model that will be trained: a support vector classifier
### with a linear kernel.

model = SVC(kernel='linear')



In [137]:
# Fit the classifier on the training split.
model.fit(X=X_train, y=Y_train)

In [138]:
### Evaluate the fitted model on the training data set

# Predict the rows the model was fitted on and score them against the true labels.
X_train_predict = model.predict(X_train)
training_data_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_predict)

print("Training data accuracy is :", training_data_accuracy)

Training data accuracy is : 0.765625


In [139]:
### Model evaluation on testing data set

# Predict the held-out rows, which the model has not seen during fitting.
X_test_predict = model.predict(X_test)

testing_data_accuracy = accuracy_score(Y_test, X_test_predict)

# The label names the test split; it previously read "Training", which
# mislabelled this score.
print("Testing data accuracy is :", testing_data_accuracy)

Training data accuracy is : 0.8229166666666666


In [140]:
# Scratch cell: builds a LogisticRegression with default settings. The instance
# is not assigned to any name, so nothing later in the notebook can use it.
LogisticRegression()

In [141]:
#### Compare several models on the same train/test split

# Display name -> unfitted estimator.
# RandomForestClassifier is seeded (same seed as the train/test split) so the
# comparison gives the same numbers on every run; without random_state its
# accuracy changes from run to run.
# NOTE(review): SVC and KNeighborsClassifier are given unscaled features here;
# consider standardising the inputs before drawing conclusions from their scores.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "Randomforest": RandomForestClassifier(n_estimators=100, random_state=2),
    "KNearestNeighbors": KNeighborsClassifier(),
}

In [142]:
# Fit each candidate on the training split and report its test-split accuracy.
# NOTE: X_test_predict is reassigned on every iteration, so after the loop it
# holds the predictions of the last model in `models`.
for model_name, estimator in models.items():
    estimator.fit(X_train, Y_train)
    X_test_predict = estimator.predict(X_test)
    accuracy = accuracy_score(Y_test, X_test_predict)

    print("Model: ", model_name)
    print("Accuracy : ", accuracy)

Model:  LogisticRegression
Accuracy :  0.8229166666666666
Model:  SVM
Accuracy :  0.6875
Model:  Randomforest
Accuracy :  0.75
Model:  KNearestNeighbors
Accuracy :  0.7083333333333334
