In [1]:
#Import Library

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler,LabelEncoder

In [2]:
#Read Data
dfloan = pd.read_csv('loanTrain.csv')
dfloan.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
#Identify columns
dfloan.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [4]:
#Identify Target column
dfloan['Loan_Status'].unique()

array(['Y', 'N'], dtype=object)

In [5]:
dfloan['Loan_Status'].value_counts()

Loan_Status
Y    422
N    192
Name: count, dtype: int64

In [6]:
#Shape of data
dfloan.shape

(614, 13)

In [7]:
#number of Unique items in each column
dfloan.nunique()

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [8]:
#Create a copy of original data
df = dfloan.copy()
df.head(3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y


In [9]:
#Drop duplicate data columns
df[df.duplicated()]

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status


In [10]:
#Identify missing values in each data column
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [11]:
#Identify numerical data columns & Non-Numerical data columns

num_cols = list(df.select_dtypes('number').columns)
non_num_cols = list(df.select_dtypes(exclude = 'number').columns)

print(num_cols)
print()
print(non_num_cols)

['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']


In [12]:
#Impute using mostfrequent value Simple imputer
cat_imputer = SimpleImputer(strategy = 'most_frequent')
df[non_num_cols] = cat_imputer.fit_transform(df[non_num_cols])

In [13]:
#Impute using mean value value Simple imputer
num_imputer = SimpleImputer(strategy = 'mean')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

In [14]:
df.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [15]:
#Sum up Applicant income and co-applicant income
df['ApplicantIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']

In [16]:
df.head(3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849.0,0.0,146.412162,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,6091.0,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.0,360.0,1.0,Urban,Y


In [17]:
#Remove unnecessary data columns

df1 = df.drop(['Loan_ID','CoapplicantIncome'] , axis = 'columns')
df1.head(3)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849.0,146.412162,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,6091.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000.0,66.0,360.0,1.0,Urban,Y


In [18]:
#Perform Encoding
df1['Dependents'].unique()
df1['Education'].unique()
df1['Property_Area'].unique()

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [19]:
print(non_num_cols)
non_num_cols.remove('Loan_ID')

['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']


In [20]:
print(non_num_cols)

['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']


In [21]:
for col in non_num_cols:
    le = LabelEncoder()
    df1[col] = le.fit_transform(df1[col])

In [22]:
df1.head(3)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,0,5849.0,146.412162,360.0,1.0,2,1
1,1,1,1,0,0,6091.0,128.0,360.0,1.0,0,0
2,1,1,0,0,1,3000.0,66.0,360.0,1.0,2,1


In [23]:
#Log transformation of numerical columns
num_cols
num_cols.remove('CoapplicantIncome')
num_cols.remove('Credit_History')
print(num_cols)

['ApplicantIncome', 'LoanAmount', 'Loan_Amount_Term']


In [24]:
df1[num_cols] = np.log(df1[num_cols])

In [25]:
df1.head(3)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,0,8.674026,4.986426,5.886104,1.0,2,1
1,1,1,1,0,0,8.714568,4.85203,5.886104,1.0,0,0
2,1,1,0,0,1,8.006368,4.189655,5.886104,1.0,2,1


In [26]:
#Seperate features and labels for creating ML Model

X = df1.drop('Loan_Status' , axis = 'columns')
y = df1['Loan_Status']

In [27]:
y.value_counts()

Loan_Status
1    422
0    192
Name: count, dtype: int64

In [28]:
#Split into train & test
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(X,y,train_size=0.78,random_state=0)

In [29]:
xtrain

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
315,1,1,1,1,0,8.524963,4.709530,5.192957,1.000000,2
318,0,0,1,0,0,8.172164,4.718499,5.886104,0.842199,1
132,1,0,0,0,0,7.907652,4.248495,5.886104,1.000000,1
566,1,0,0,0,0,8.111628,4.248495,5.886104,1.000000,2
279,0,1,0,1,0,8.318742,4.820282,5.886104,0.842199,0
...,...,...,...,...,...,...,...,...,...,...
277,1,1,0,0,0,8.390041,4.382027,5.886104,1.000000,2
9,1,1,1,0,0,10.077819,5.855072,5.886104,1.000000,1
359,1,1,3,0,0,9.028099,5.298317,5.886104,1.000000,1
192,1,1,0,1,0,8.705000,5.075174,5.886104,1.000000,2


In [30]:
#Perform data scaling
from sklearn.preprocessing import MinMaxScaler
scaler =  MinMaxScaler()

scaler.fit(xtrain)

xtrainScaled = scaler.transform(xtrain)
xtestScaled = scaler.transform(xtest)

In [31]:
#Train your model

from sklearn.linear_model import LogisticRegression

model1 = LogisticRegression()

model1.fit(xtrainScaled,ytrain)

In [32]:
#Evaluate model performance

print("Accuracy on training data" , model1.score(xtrainScaled,ytrain))
print("Accuracy on test data" , model1.score(xtestScaled,ytest))

Accuracy on training data 0.8075313807531381
Accuracy on test data 0.8161764705882353


In [35]:
#Serialization or Save your Model

import joblib
joblib.dump(model1,"loan_model.pkl")

['loan_model.pkl']

In [37]:
print(model1.coef_)

[[-0.10333467  0.33792768  0.27051517 -0.40697909  0.03335306  0.20880313
  -0.95670688  0.15723909  3.30157767  0.25685782]]


In [36]:
#Deserialization or Load model
import joblib
loaded_model = joblib.load("loan_model.pkl")

In [38]:
print(loaded_model.coef_)

[[-0.10333467  0.33792768  0.27051517 -0.40697909  0.03335306  0.20880313
  -0.95670688  0.15723909  3.30157767  0.25685782]]
