In [1]:
# Import Dependencies
import warnings
warnings.simplefilter('ignore')

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os


In [2]:
df = pd.read_csv(os.path.join("https://amy27-bucket.s3.ap-southeast-2.amazonaws.com/home_loan_applications.csv"))
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# Data Cleaning and Preprocessing

In [3]:
# Combining income columns
sum_column = df["ApplicantIncome"] + df["CoapplicantIncome"]
df["Total_Income"] = sum_column
df.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,5849.0
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y,9613.0
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y,3849.0
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N,5540.0
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y,5532.0
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N,23809.0


In [4]:
# Remove unneccesary columns
df = df.drop(["Loan_ID","Loan_Amount_Term","Education","ApplicantIncome","CoapplicantIncome"], axis = 1)
df.head()

Unnamed: 0,Gender,Married,Dependents,Self_Employed,LoanAmount,Credit_History,Property_Area,Loan_Status,Total_Income
0,Male,No,0,No,,1.0,Urban,Y,5849.0
1,Male,Yes,1,No,128.0,1.0,Rural,N,6091.0
2,Male,Yes,0,Yes,66.0,1.0,Urban,Y,3000.0
3,Male,Yes,0,No,120.0,1.0,Urban,Y,4941.0
4,Male,No,0,No,141.0,1.0,Urban,Y,6000.0


In [5]:
# Drop rows containing n/a
clean_df = df.apply(pd.to_numeric, errors = "coerce")
clean_df = df.dropna()

clean_df.head()

Unnamed: 0,Gender,Married,Dependents,Self_Employed,LoanAmount,Credit_History,Property_Area,Loan_Status,Total_Income
1,Male,Yes,1,No,128.0,1.0,Rural,N,6091.0
2,Male,Yes,0,Yes,66.0,1.0,Urban,Y,3000.0
3,Male,Yes,0,No,120.0,1.0,Urban,Y,4941.0
4,Male,No,0,No,141.0,1.0,Urban,Y,6000.0
5,Male,Yes,2,Yes,267.0,1.0,Urban,Y,9613.0


In [6]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()

encoder.fit(clean_df[[
    "Gender",
    "Married",
    "Dependents",
    "Self_Employed",
    "Property_Area"
]])

OneHotEncoder()

In [7]:
encoded_data = encoder.transform(
    clean_df[["Gender",
        "Married",
        "Dependents",
        "Self_Employed",
        "Property_Area"]]
)
encoded_df = pd.DataFrame(columns=encoder.get_feature_names(["Gender",
    "Married",
    "Dependents",
    "Self_Employed",
    "Property_Area"]),data=encoded_data.toarray()).set_index(keys=clean_df.index)
encoded_df
loan_df = pd.concat([encoded_df, clean_df[["LoanAmount", "Credit_History", "Total_Income", "Loan_Status"]]], axis=1)
loan_df

Unnamed: 0,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,LoanAmount,Credit_History,Total_Income,Loan_Status
1,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,128.0,1.0,6091.0,N
2,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,66.0,1.0,3000.0,Y
3,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,120.0,1.0,4941.0,Y
4,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,141.0,1.0,6000.0,Y
5,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,267.0,1.0,9613.0,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,71.0,1.0,2900.0,Y
610,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,40.0,1.0,4106.0,Y
611,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,253.0,1.0,8312.0,Y
612,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,187.0,1.0,7583.0,Y


In [8]:
# Drop dummy columns
final_df = loan_df.drop(columns=["Gender_Female","Married_No","Self_Employed_No"])
final_df.head()

Unnamed: 0,Gender_Male,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,LoanAmount,Credit_History,Total_Income,Loan_Status
1,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,128.0,1.0,6091.0,N
2,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,66.0,1.0,3000.0,Y
3,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,120.0,1.0,4941.0,Y
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,141.0,1.0,6000.0,Y
5,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,267.0,1.0,9613.0,Y


In [9]:
# Assigning X (data) and y (target)

X = final_df[["Gender_Male",
              "Married_Yes",
              "Dependents_0",
              "Dependents_1",
              "Dependents_2",
              "Dependents_3+",
              "Self_Employed_Yes",
              "Property_Area_Rural",
              "Property_Area_Semiurban",
              "Property_Area_Urban",
              "LoanAmount",
              "Credit_History",
              "Total_Income"]]

y = pd.get_dummies(final_df["Loan_Status"])[["Y"]].values.reshape(-1,1)

print(X.shape, y.shape)

(492, 13) (492, 1)


Splitting the Data for training and testing


In [10]:
from sklearn.model_selection import train_test_split

X = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train.head()

Unnamed: 0,Gender_Male,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,LoanAmount,Credit_History,Total_Income
143,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,122.0,1.0,4732.0
434,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,100.0,1.0,3750.0
169,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,200.0,1.0,8000.0
239,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,96.0,1.0,3315.0
396,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,71.0,0.0,3180.0


Creating a Logistic Regression Model - Classification Model


In [11]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [12]:
classifier.fit(X_train, y_train)

LogisticRegression()

In [13]:
# Accuracy score
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Testing Data Score: 0.8048780487804879


In [14]:
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [1 0 1 0 1 1 1 1 1 1]
First 10 Actual labels: [[1], [0], [0], [0], [1], [1], [1], [1], [1], [0]]


In [15]:
# Saving model
import joblib

In [16]:
joblib.dump(classifier, "../app/static/py/classifier.sav")
joblib.dump(encoder, "../app/static/py/encoder.sav")

['../app/static/py/encoder.sav']

In [17]:
# Function to predict result based on user input

def predict(user_inputs):
    classifier = joblib.load("../app/static/py/classifier.sav")
    encoder=joblib.load("../app/static/py/encoder.sav")

    gender = user_inputs["gender"]
    married = user_inputs["married"]
    dependents = user_inputs["dependents"]
    self_employed = user_inputs["self_employed"]
    loan_amount = user_inputs["loan_amount"]
    credit_history = user_inputs["credit history"]
    property_area = user_inputs["property_area"]
    total_income = user_inputs["total_income"]

    decoded_data = pd.DataFrame({
        "Gender": [gender], 
        "Married": [married],
        "Dependents": [dependents], 
        "Self_Employed": [self_employed], 
        "Property_Area": [property_area]
    })

    encoded_data = encoder.transform(decoded_data)
    encoded_df = pd.DataFrame(columns=encoder.get_feature_names(["Gender",
        "Married",
        "Dependents",
        "Self_Employed",
        "Property_Area"]),data=encoded_data.toarray())

    df = pd.DataFrame({
        "LoanAmount": [loan_amount],
        "Credit_History": [credit_history], 
        "Total_Income": [total_income]
    })

    merge_df = pd.concat([encoded_df, df], axis=1)
    predict_df = merge_df[["Gender_Male",
              "Married_Yes",
              "Dependents_0",
              "Dependents_1",
              "Dependents_2",
              "Dependents_3+",
              "Self_Employed_Yes",
              "Property_Area_Rural",
              "Property_Area_Semiurban",
              "Property_Area_Urban",
              "LoanAmount",
              "Credit_History",
              "Total_Income"]]

    
    loan_status = classifier.predict(predict_df)
    return loan_status

In [18]:
user_inputs = {
    "gender":"Female",
    "married":"Yes",
    "dependents":"1",
    "self_employed":"No",
    "loan_amount":1200.00,
    "credit history":1.0,
    "property_area": "Urban",
    "total_income":2900.0
}

predict(user_inputs)[0]

0