In [49]:
# Import Dependencies
import warnings
warnings.simplefilter('ignore')

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os


In [50]:
# Read the csv file
filepath = "data/home_loan_applications.csv"
df = pd.read_csv(filepath)
df.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


# Data Cleaning and Preprocessing

In [51]:
# Remove unneccesary columns
loan = df.drop(["Loan_ID","Loan_Amount_Term"], axis = 1)
loan.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,1.0,Urban,Y


## Dummy Encoding (Binary Encoded Data)

In [52]:
data = loan.copy()

data_binary_encoded = pd.get_dummies(data)
data_binary_encoded.head(10)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,...,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_N,Loan_Status_Y
0,5849,0.0,,1.0,0,1,1,0,1,0,...,0,1,0,1,0,0,0,1,0,1
1,4583,1508.0,128.0,1.0,0,1,0,1,0,1,...,0,1,0,1,0,1,0,0,1,0
2,3000,0.0,66.0,1.0,0,1,0,1,1,0,...,0,1,0,0,1,0,0,1,0,1
3,2583,2358.0,120.0,1.0,0,1,0,1,1,0,...,0,0,1,1,0,0,0,1,0,1
4,6000,0.0,141.0,1.0,0,1,1,0,1,0,...,0,1,0,1,0,0,0,1,0,1
5,5417,4196.0,267.0,1.0,0,1,0,1,0,0,...,0,1,0,0,1,0,0,1,0,1
6,2333,1516.0,95.0,1.0,0,1,0,1,1,0,...,0,0,1,1,0,0,0,1,0,1
7,3036,2504.0,158.0,0.0,0,1,0,1,0,0,...,1,1,0,1,0,0,1,0,1,0
8,4006,1526.0,168.0,1.0,0,1,0,1,0,0,...,0,1,0,1,0,0,0,1,0,1
9,12841,10968.0,349.0,1.0,0,1,0,1,0,1,...,0,1,0,1,0,0,1,0,1,0


In [53]:
# Assigning X (data) and y (target)

X = loan[["Gender",
           "Married",
           "Dependents",
           "Education",
           "Self_Employed",
           "ApplicantIncome",
           "CoapplicantIncome",
           "LoanAmount",
           "Credit_History",
           "Property_Area"]]
y = loan["Loan_Status"].values.reshape(-1,1)
print(X.shape, y.shape)

(614, 10) (614, 1)


## Scaling and Normalization

In [56]:
from sklearn.model_selection import train_test_split

X = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
92,3273,1820.0,81.0,1.0,0,1,0,1,0,0,1,0,0,1,1,0,0,0,1
304,4000,2500.0,140.0,1.0,0,1,1,0,1,0,0,0,1,0,1,0,1,0,0
68,7100,0.0,125.0,1.0,0,1,0,1,0,0,0,1,0,1,0,1,0,0,1
15,4950,0.0,125.0,1.0,0,1,1,0,1,0,0,0,1,0,1,0,0,0,1
211,3430,1250.0,128.0,0.0,0,1,0,1,0,0,0,1,1,0,1,0,0,1,0


In [57]:
# StandardScalar

from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

ValueError: could not convert string to float: 'Y'

In [58]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

NameError: name 'y_scaler' is not defined

In [None]:
# Assigning X (data) and y (target)

# X = df.drop(["Loan_ID","Loan_Status"], axis=1)
# y = df["Loan_Status"]
# print(X.shape, y.shape)

In [None]:
# Splitting the Data into train and test samples
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Creating a logistic Regression Model
# from sklearn.linear_model import LogisticRegression
# classifier = LogisticRegression()
# classifier

In [None]:
# Fitting the model using the train sample
# classifier.fit(X_train, y_train)