In [1]:
import pandas as pd
import numpy as np

from category_encoders import OneHotEncoder
import skimpy as sk
import pytimetk as tk

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Suppress warnings.
# `Warning` is the base class of every warning category, so this filter
# silences all of them for the rest of the session.
# NOTE(review): this also hides potentially useful notices (e.g. deprecation
# or solver-convergence warnings) -- consider narrowing the category.
import warnings
warnings.simplefilter(action="ignore", category=Warning)

In [2]:
# Read the prepared dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../data/prepared_data.csv")
df.head(n=5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
# Per-column count of missing values (`isna` is the canonical name of `isnull`).
df.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [4]:
# Exploratory preview: one-hot encode the whole frame to inspect the columns
# the encoder produces (use_cat_names=True names them "<column>_<category>").
# NOTE(review): `df` is passed unmodified, so the Loan_Status label column is
# presumably encoded as well; `encoded_df` is not referenced by the later cells
# visible here -- the model further down builds its own encoder in a pipeline.
ohe = OneHotEncoder(use_cat_names=True)

encoded_df = ohe.fit_transform(df)
# NOTE(review): `.glimpse()` is not a built-in pandas method -- presumably it is
# registered by the `pytimetk` import at the top; confirm.
encoded_df.glimpse()

<class 'pandas.core.frame.DataFrame'>: 520 rows of 22 columns
Gender_Male:              int64             [1, 1, 1, 1, 1, 1, 1, 1, 1,  ...
Gender_Female:            int64             [0, 0, 0, 0, 0, 0, 0, 0, 0,  ...
Married_No:               int64             [1, 0, 0, 0, 1, 0, 0, 0, 0,  ...
Married_Yes:              int64             [0, 1, 1, 1, 0, 1, 1, 1, 1,  ...
Dependents_0:             int64             [1, 0, 1, 1, 1, 1, 0, 0, 0,  ...
Dependents_1:             int64             [0, 1, 0, 0, 0, 0, 0, 0, 0,  ...
Dependents_3+:            int64             [0, 0, 0, 0, 0, 0, 1, 0, 0,  ...
Dependents_2:             int64             [0, 0, 0, 0, 0, 0, 0, 1, 1,  ...
Education_Graduate:       int64             [1, 1, 1, 0, 1, 0, 1, 1, 1,  ...
Education_Not Graduate:   int64             [0, 0, 0, 1, 0, 1, 0, 0, 0,  ...
Self_Employed_No:         int64             [1, 1, 0, 1, 1, 1, 1, 1, 1,  ...
Self_Employed_Yes:        int64             [0, 0, 1, 0, 0, 0, 0, 0, 0,  ...
ApplicantIncom

In [5]:
# Separate the label column from the feature columns.
target = "Loan_Status"

# `drop` returns a new frame by default, so `df` itself is left untouched.
y = df[target]
x = df.drop(columns=[target])

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
# NOTE(review): the split is not stratified on `y` -- consider `stratify=y`
# if the class balance should be preserved in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Baseline accuracy: the share of the most frequent class in the training
# labels, i.e. the score of a model that always predicts that class.
class_shares = y_train.value_counts(normalize=True)
acc_baseline = class_shares.max()
acc_baseline

0.7019230769230769

In [8]:
# Build and fit the model in one expression: categorical features are one-hot
# encoded inside the pipeline, so the encoder is fitted on the training rows
# only. `fit` returns the pipeline itself, which is then echoed as the cell
# output.
model_lr = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    LogisticRegression(max_iter=1000),
).fit(x_train, y_train)
model_lr

In [9]:
# Mean accuracy on the rows the model was fitted on and on the held-out rows.
# Note: the "Validation" label below refers to the test split (x_test, y_test).
lr_train_acc = model_lr.score(X=x_train, y=y_train)
lr_test_acc = model_lr.score(X=x_test, y=y_test)

print("Logistic Regression, Training Accuracy Score:", lr_train_acc)
print("Logistic Regression, Validation Accuracy Score:", lr_test_acc)

Logistic Regression, Training Accuracy Score: 0.8269230769230769
Logistic Regression, Validation Accuracy Score: 0.8173076923076923


In [10]:
# Predicted labels for the held-out rows, one entry per row of x_test
# in the same order.
y_pred = model_lr.predict(x_test)
y_pred

array(['Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y'],
      dtype=object)

In [11]:
# Accuracy computed directly from the predictions; this is the same quantity
# as the held-out score reported by `model_lr.score` above.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.8173076923076923

In [23]:
# Wrap the prediction array in a single-column frame.
# The frame gets a fresh 0..n-1 index: rows follow the order of x_test but do
# not carry x_test's original index labels.
y_pred_df = pd.DataFrame({"Predictions": y_pred})

In [24]:
# Preview the first five predictions.
y_pred_df.head()

Unnamed: 0,Predictions
0,Y
1,Y
2,N
3,N
4,Y
