In [1]:
# Import Dependencies
import warnings
# Installs a blanket "ignore" filter: warnings raised by any later cell
# (pandas, scikit-learn, ...) will not be displayed.
# NOTE(review): this can hide real problems such as convergence or data-shape
# warnings from the model cells; consider narrowing it to specific categories.
warnings.simplefilter('ignore')

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os


In [2]:
# Load the raw loan-application records from disk.
# The path is relative, so it resolves against the kernel's working directory.
filepath = "data/home_loan_applications.csv"
df = pd.read_csv(filepath_or_buffer=filepath)

# Preview the first ten rows to sanity-check the load.
df.head(n=10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


# Data Cleaning and Preprocessing

In [4]:
# Remove unnecessary columns: the row identifier plus two fields that are not
# used as model inputs. `df` itself is left untouched; `loan` is a new frame.
loan = df.drop(columns=["Loan_ID", "Loan_Amount_Term", "Education"])
loan.head()

Unnamed: 0,Gender,Married,Dependents,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Credit_History,Property_Area,Loan_Status
0,Male,No,0,No,5849,0.0,,1.0,Urban,Y
1,Male,Yes,1,No,4583,1508.0,128.0,1.0,Rural,N
2,Male,Yes,0,Yes,3000,0.0,66.0,1.0,Urban,Y
3,Male,Yes,0,No,2583,2358.0,120.0,1.0,Urban,Y
4,Male,No,0,No,6000,0.0,141.0,1.0,Urban,Y


In [5]:
# Drop rows containing n/a
# Only missing values are removed here; the text columns (Gender, Married,
# Property_Area, Loan_Status, ...) are deliberately kept as strings so they can
# be one-hot encoded later.
# A previous `loan.apply(pd.to_numeric, errors="coerce")` assignment was removed:
# its result was overwritten by the line below (dead code), and had it actually
# been used it would have coerced every text value to NaN, causing dropna() to
# discard all rows.
clean_df = loan.dropna()

clean_df

Unnamed: 0,Gender,Married,Dependents,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Credit_History,Property_Area,Loan_Status
1,Male,Yes,1,No,4583,1508.0,128.0,1.0,Rural,N
2,Male,Yes,0,Yes,3000,0.0,66.0,1.0,Urban,Y
3,Male,Yes,0,No,2583,2358.0,120.0,1.0,Urban,Y
4,Male,No,0,No,6000,0.0,141.0,1.0,Urban,Y
5,Male,Yes,2,Yes,5417,4196.0,267.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,No,2900,0.0,71.0,1.0,Rural,Y
610,Male,Yes,3+,No,4106,0.0,40.0,1.0,Rural,Y
611,Male,Yes,1,No,8072,240.0,253.0,1.0,Urban,Y
612,Male,Yes,2,No,7583,0.0,187.0,1.0,Urban,Y


In [6]:
# Assigning X (data) and y (target)

# Columns used as model inputs; everything except the target `Loan_Status`.
feature_columns = [
    "Gender",
    "Married",
    "Dependents",
    "Self_Employed",
    "ApplicantIncome",
    "CoapplicantIncome",
    "LoanAmount",
    "Credit_History",
    "Property_Area",
]

X = clean_df[feature_columns]
# Target labels as a column vector of shape (n_rows, 1).
y = clean_df["Loan_Status"].to_numpy().reshape(-1, 1)
print(X.shape, y.shape)

(492, 9) (492, 1)


Dummy Encoding (Binary Encoded Data)

In [7]:
# Work on an independent copy so the cleaned frame is not affected.
data = clean_df.copy(deep=True)

# One-hot encode every text column (including the Loan_Status target) into
# indicator columns; numeric columns pass through unchanged.
data_binary_encoded = pd.get_dummies(data=data)
data_binary_encoded.head(n=10)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_N,Loan_Status_Y
1,4583,1508.0,128.0,1.0,0,1,0,1,0,1,0,0,1,0,1,0,0,1,0
2,3000,0.0,66.0,1.0,0,1,0,1,1,0,0,0,0,1,0,0,1,0,1
3,2583,2358.0,120.0,1.0,0,1,0,1,1,0,0,0,1,0,0,0,1,0,1
4,6000,0.0,141.0,1.0,0,1,1,0,1,0,0,0,1,0,0,0,1,0,1
5,5417,4196.0,267.0,1.0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,1
6,2333,1516.0,95.0,1.0,0,1,0,1,1,0,0,0,1,0,0,0,1,0,1
7,3036,2504.0,158.0,0.0,0,1,0,1,0,0,0,1,1,0,0,1,0,1,0
8,4006,1526.0,168.0,1.0,0,1,0,1,0,0,1,0,1,0,0,0,1,0,1
9,12841,10968.0,349.0,1.0,0,1,0,1,0,1,0,0,1,0,0,1,0,1,0
10,3200,700.0,70.0,1.0,0,1,0,1,0,0,1,0,1,0,0,0,1,0,1


Splitting the Data for training and testing


In [8]:
from sklearn.model_selection import train_test_split

# One-hot encode the categorical feature columns; numeric columns are kept as-is.
X = pd.get_dummies(data=X)

# Hold out 25% of the rows for testing (the library default, written out here),
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

X_train.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
143,2698,2034.0,122.0,1.0,0,1,0,1,1,0,0,0,1,0,0,1,0
434,3750,0.0,100.0,1.0,0,1,1,0,1,0,0,0,1,0,0,0,1
169,8000,0.0,200.0,1.0,0,1,0,1,0,0,1,0,1,0,0,1,0
239,3315,0.0,96.0,1.0,0,1,0,1,0,1,0,0,1,0,0,1,0
396,3180,0.0,71.0,0.0,1,0,1,0,1,0,0,0,1,0,0,0,1


Creating a Logistic Regression Model


In [9]:
from sklearn.linear_model import LogisticRegression
# Build the estimator with the library's default hyperparameters.
# NOTE(review): the income/loan features are on very different scales and are
# not standardized anywhere visible — confirm the solver converges with defaults.
classifier = LogisticRegression()
# Bare expression so the notebook renders the estimator's repr.
classifier

LogisticRegression()

In [10]:
# Fit on the training split. `y_train` is an (n, 1) column vector (it was built
# with reshape(-1, 1)), while scikit-learn expects a 1-D target of shape
# (n_samples,); flatten it here so fit() gets the expected shape instead of
# relying on an implicit conversion that raises a DataConversionWarning.
classifier.fit(X_train, y_train.ravel())

LogisticRegression()

In [11]:
# Mean accuracy of the fitted classifier on each split.
train_score = classifier.score(X_train, y_train)
test_score = classifier.score(X_test, y_test)

print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.8265582655826558
Testing Data Score: 0.7723577235772358


In [12]:
# Predict labels for the held-out rows and show the first ten next to the
# true labels for a quick eyeball comparison.
predictions = classifier.predict(X_test)

first_predicted = predictions[:10]
first_actual = y_test[:10].tolist()

print(f"First 10 Predictions:   {first_predicted}")
print(f"First 10 Actual labels: {first_actual}")

First 10 Predictions:   ['Y' 'N' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y']
First 10 Actual labels: [['Y'], ['N'], ['N'], ['N'], ['Y'], ['Y'], ['Y'], ['Y'], ['Y'], ['N']]
