In [1]:
import pandas as pd
import numpy as np

from category_encoders import OneHotEncoder
import skimpy as sk
import pytimetk as tk

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing  import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

import joblib

# Suppress warnings
import warnings
warnings.simplefilter(action="ignore", category=Warning)

In [2]:
df = pd.read_csv("../data/prepared_data.csv")
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
df.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [4]:
ohe = OneHotEncoder(use_cat_names=True)

encoded_df = ohe.fit_transform(df)
encoded_df.glimpse()

<class 'pandas.core.frame.DataFrame'>: 520 rows of 22 columns
Gender_Male:              int64             [1, 1, 1, 1, 1, 1, 1, 1, 1,  ...
Gender_Female:            int64             [0, 0, 0, 0, 0, 0, 0, 0, 0,  ...
Married_No:               int64             [1, 0, 0, 0, 1, 0, 0, 0, 0,  ...
Married_Yes:              int64             [0, 1, 1, 1, 0, 1, 1, 1, 1,  ...
Dependents_0:             int64             [1, 0, 1, 1, 1, 1, 0, 0, 0,  ...
Dependents_1:             int64             [0, 1, 0, 0, 0, 0, 0, 0, 0,  ...
Dependents_3+:            int64             [0, 0, 0, 0, 0, 0, 1, 0, 0,  ...
Dependents_2:             int64             [0, 0, 0, 0, 0, 0, 0, 1, 1,  ...
Education_Graduate:       int64             [1, 1, 1, 0, 1, 0, 1, 1, 1,  ...
Education_Not Graduate:   int64             [0, 0, 0, 1, 0, 1, 0, 0, 0,  ...
Self_Employed_No:         int64             [1, 1, 0, 1, 1, 1, 1, 1, 1,  ...
Self_Employed_Yes:        int64             [0, 0, 1, 0, 0, 0, 0, 0, 0,  ...
ApplicantIncom

In [5]:
target = "Loan_Status"

x = df.drop(
    columns=[target],
    inplace=False
)
y = df[target]

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [7]:
acc_baseline = y_train.value_counts(normalize=True).max()
acc_baseline

0.7019230769230769

In [8]:
model_lr = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    LogisticRegression(max_iter=1000)
)

model_lr.fit(x_train, y_train)

In [9]:
lr_train_acc = model_lr.score(x_train, y_train)
lr_test_acc = model_lr.score(x_test, y_test)

print("Logistic Regression, Training Accuracy Score:", lr_train_acc)
print("Logistic Regression, Validation Accuracy Score:", lr_test_acc)


Logistic Regression, Training Accuracy Score: 0.8269230769230769
Logistic Regression, Validation Accuracy Score: 0.8173076923076923


In [10]:
y_pred = model_lr.predict(x_test)
y_pred

array(['Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y'],
      dtype=object)

In [11]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8173076923076923

In [12]:
y_pred_df = pd.DataFrame(y_pred, columns=["Predictions"])

In [13]:
y_pred_df.head()

Unnamed: 0,Predictions
0,Y
1,Y
2,N
3,N
4,Y


In [14]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1 score:", f1_score(y_test, y_pred, average='macro'))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8173076923076923
Precision: 0.8321858864027538
Recall: 0.7432773109243698
F1 score: 0.7651812240047534

Classification Report:
               precision    recall  f1-score   support

           N       0.86      0.53      0.65        34
           Y       0.81      0.96      0.88        70

    accuracy                           0.82       104
   macro avg       0.83      0.74      0.77       104
weighted avg       0.82      0.82      0.80       104

Confusion Matrix:
 [[18 16]
 [ 3 67]]


In [15]:
# Cross-validation
scores = cross_val_score(model_lr, x_train, y_train, cv=5)

# Print the scores
print("Cross-validation scores:", scores)
print("Average cross-validation score:", np.mean(scores))

# Calculate the mean and standard deviation of the scores
mean_score = scores.mean()
std_dev = scores.std()

print("\nMean score:", mean_score)
print("Standard deviation:", std_dev)

Cross-validation scores: [0.82142857 0.79518072 0.81927711 0.84337349 0.85542169]
Average cross-validation score: 0.8269363166953528

Mean score: 0.8269363166953528
Standard deviation: 0.020875888344597568


In [16]:
# hyper parameter tuning with GridSearchCV
param_grid = {
    'logisticregression__penalty': ['l1', 'l2'],
    'logisticregression__C': [0.1, 1, 10]
}

grid_search = GridSearchCV(model_lr, param_grid, cv=5)
grid_search.fit(x_train, y_train)

print('Grid Search:')
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# hyper parameter tuning with RandomizedSearchCV
random_search = RandomizedSearchCV(model_lr, param_grid, cv=5, n_iter=10)
random_search.fit(x_train, y_train)

print('\nRandomized Search:')
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Grid Search:
Best Parameters: {'logisticregression__C': 1, 'logisticregression__penalty': 'l2'}
Best Score: 0.8269363166953528

Randomized Search:
Best Parameters: {'logisticregression__penalty': 'l2', 'logisticregression__C': 1}
Best Score: 0.8269363166953528


In [19]:
# Save Model
joblib.dump(model_lr, '../artifacts/final_model.sav')

['../artifacts/final_model.sav']

### Extra Models

In [None]:
target = "Loan_Status"

x = df.drop(
    columns=[target],
    inplace=False
)
y = df[target]

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

##### Model 1


In [None]:
model_t1 = make_pipeline(

    OneHotEncoder(use_cat_names=True), # encode cat features
    SimpleImputer(), # imputation
    StandardScaler(), 
    LogisticRegression()
    ) # build model

# fit the model
model_t1.fit(x_train, y_train)

In [None]:
lr_train_acc = model_t1.score(x_train, y_train)
lr_test_acc = model_t1.score(x_test, y_test)

print("Logistic Regression, Training Accuracy Score:", lr_train_acc)
print("Logistic Regression, Validation Accuracy Score:", lr_test_acc)

Logistic Regression, Training Accuracy Score: 0.8269230769230769
Logistic Regression, Validation Accuracy Score: 0.8173076923076923


In [None]:
y_pred = model_t1.predict(x_test)
# print(y_pred)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1 score:", f1_score(y_test, y_pred, average='macro'))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8173076923076923
Precision: 0.8321858864027538
Recall: 0.7432773109243698
F1 score: 0.7651812240047534

Classification Report:
               precision    recall  f1-score   support

           N       0.86      0.53      0.65        34
           Y       0.81      0.96      0.88        70

    accuracy                           0.82       104
   macro avg       0.83      0.74      0.77       104
weighted avg       0.82      0.82      0.80       104

Confusion Matrix:
 [[18 16]
 [ 3 67]]


In [None]:
# Cross-validation
scores = cross_val_score(model_t1, x_train, y_train, cv=5)

# Print the scores
print("Cross-validation scores:", scores)
print("Average cross-validation score:", np.mean(scores))

# Calculate the mean and standard deviation of the scores
mean_score = scores.mean()
std_dev = scores.std()

print("\nMean score:", mean_score)
print("Standard deviation:", std_dev)

Cross-validation scores: [0.82142857 0.79518072 0.8313253  0.84337349 0.84337349]
Average cross-validation score: 0.8269363166953528

Mean score: 0.8269363166953528
Standard deviation: 0.017879473005005992


In [None]:
# hyper parameter tuning with GridSearchCV
param_grid = {
    'logisticregression__penalty': ['l1', 'l2'],
    'logisticregression__C': [0.1, 1, 10]
}

grid_search = GridSearchCV(model_t1, param_grid, cv=5)
grid_search.fit(x_train, y_train)

print('Grid Search:')
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# hyper parameter tuning with RandomizedSearchCV
random_search = RandomizedSearchCV(model_lr, param_grid, cv=5, n_iter=10)
random_search.fit(x_train, y_train)

print('\nRandomized Search:')
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Grid Search:
Best Parameters: {'logisticregression__C': 0.1, 'logisticregression__penalty': 'l2'}
Best Score: 0.8293459552495698

Randomized Search:
Best Parameters: {'logisticregression__penalty': 'l2', 'logisticregression__C': 1}
Best Score: 0.8269363166953528


#### Model 2

In [None]:
model_t2 = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    StandardScaler(),
    LogisticRegression(max_iter=1000)
)

model_t2.fit(x_train, y_train)

In [None]:
y_pred = model_t2.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1 score:", f1_score(y_test, y_pred, average='macro'))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8173076923076923
Precision: 0.8321858864027538
Recall: 0.7432773109243698
F1 score: 0.7651812240047534

Classification Report:
               precision    recall  f1-score   support

           N       0.86      0.53      0.65        34
           Y       0.81      0.96      0.88        70

    accuracy                           0.82       104
   macro avg       0.83      0.74      0.77       104
weighted avg       0.82      0.82      0.80       104

Confusion Matrix:
 [[18 16]
 [ 3 67]]


In [None]:
# Cross-validation
scores = cross_val_score(model_t2, x_train, y_train, cv=5)

# Print the scores
print("Cross-validation scores:", scores)
print("Average cross-validation score:", np.mean(scores))

# Calculate the mean and standard deviation of the scores
mean_score = scores.mean()
std_dev = scores.std()

print("\nMean score:", mean_score)
print("Standard deviation:", std_dev)

Cross-validation scores: [0.82142857 0.79518072 0.8313253  0.84337349 0.84337349]
Average cross-validation score: 0.8269363166953528

Mean score: 0.8269363166953528
Standard deviation: 0.017879473005005992


In [None]:
# hyper parameter tuning with GridSearchCV
param_grid = {
    'logisticregression__penalty': ['l1', 'l2'],
    'logisticregression__C': [0.1, 1, 10]
}

grid_search = GridSearchCV(model_t2, param_grid, cv=5)
grid_search.fit(x_train, y_train)

print('Grid Search:')
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# hyper parameter tuning with RandomizedSearchCV
random_search = RandomizedSearchCV(model_lr, param_grid, cv=5, n_iter=10)
random_search.fit(x_train, y_train)

print('\nRandomized Search:')
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Grid Search:
Best Parameters: {'logisticregression__C': 0.1, 'logisticregression__penalty': 'l2'}
Best Score: 0.8293459552495698

Randomized Search:
Best Parameters: {'logisticregression__penalty': 'l2', 'logisticregression__C': 1}
Best Score: 0.8269363166953528


#### Model 3

In [None]:
model_t3 = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    PCA(n_components=0.95),  # retain 95% of the variance
    LogisticRegression(max_iter=1000)
)

model_t3.fit(x_train, y_train)

In [None]:
y_pred = model_t3.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1 score:", f1_score(y_test, y_pred, average='macro'))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.6730769230769231
Precision: 0.33653846153846156
Recall: 0.5
F1 score: 0.40229885057471265

Classification Report:
               precision    recall  f1-score   support

           N       0.00      0.00      0.00        34
           Y       0.67      1.00      0.80        70

    accuracy                           0.67       104
   macro avg       0.34      0.50      0.40       104
weighted avg       0.45      0.67      0.54       104

Confusion Matrix:
 [[ 0 34]
 [ 0 70]]


In [None]:
# Cross-validation
scores = cross_val_score(model_t3, x_train, y_train, cv=5)

# Print the scores
print("Cross-validation scores:", scores)
print("Average cross-validation score:", np.mean(scores))

# Calculate the mean and standard deviation of the scores
mean_score = scores.mean()
std_dev = scores.std()

print("\nMean score:", mean_score)
print("Standard deviation:", std_dev)

Cross-validation scores: [0.70238095 0.71084337 0.69879518 0.69879518 0.69879518]
Average cross-validation score: 0.7019219736087206

Mean score: 0.7019219736087206
Standard deviation: 0.004671884816187243


In [None]:
# hyper parameter tuning with GridSearchCV
param_grid = {
    'logisticregression__penalty': ['l1', 'l2'],
    'logisticregression__C': [0.1, 1, 10]
}

grid_search = GridSearchCV(model_t3, param_grid, cv=5)
grid_search.fit(x_train, y_train)

print('Grid Search:')
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# hyper parameter tuning with RandomizedSearchCV
random_search = RandomizedSearchCV(model_lr, param_grid, cv=5, n_iter=10)
random_search.fit(x_train, y_train)

print('\nRandomized Search:')
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Grid Search:
Best Parameters: {'logisticregression__C': 0.1, 'logisticregression__penalty': 'l2'}
Best Score: 0.7019219736087206

Randomized Search:
Best Parameters: {'logisticregression__penalty': 'l2', 'logisticregression__C': 1}
Best Score: 0.8269363166953528


#### Model 4

In [None]:
model_t4 = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    RFE(LogisticRegression(max_iter=1000), n_features_to_select=10),
    LogisticRegression(max_iter=1000)
)

model_t4.fit(x_train, y_train)

In [None]:
lr_train_acc = model_t4.score(x_train, y_train)
lr_test_acc = model_t4.score(x_test, y_test)

print("Logistic Regression, Training Accuracy Score:", lr_train_acc)
print("Logistic Regression, Validation Accuracy Score:", lr_test_acc)


Logistic Regression, Training Accuracy Score: 0.8293269230769231
Logistic Regression, Validation Accuracy Score: 0.8173076923076923


In [None]:
y_pred = model_t4.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1 score:", f1_score(y_test, y_pred, average='macro'))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8173076923076923
Precision: 0.8321858864027538
Recall: 0.7432773109243698
F1 score: 0.7651812240047534

Classification Report:
               precision    recall  f1-score   support

           N       0.86      0.53      0.65        34
           Y       0.81      0.96      0.88        70

    accuracy                           0.82       104
   macro avg       0.83      0.74      0.77       104
weighted avg       0.82      0.82      0.80       104

Confusion Matrix:
 [[18 16]
 [ 3 67]]


In [None]:
# Cross-validation
scores = cross_val_score(model_t4, x_train, y_train, cv=5)

# Print the scores
print("Cross-validation scores:", scores)
print("Average cross-validation score:", np.mean(scores))

# Calculate the mean and standard deviation of the scores
mean_score = scores.mean()
std_dev = scores.std()

print("\nMean score:", mean_score)
print("Standard deviation:", std_dev)

Cross-validation scores: [0.82142857 0.80722892 0.81927711 0.8313253  0.85542169]
Average cross-validation score: 0.8269363166953528

Mean score: 0.8269363166953528
Standard deviation: 0.01617442349257164


In [None]:
# hyper parameter tuning with GridSearchCV
param_grid = {
    'logisticregression__penalty': ['l1', 'l2'],
    'logisticregression__C': [0.1, 1, 10]
}

grid_search = GridSearchCV(model_t4, param_grid, cv=5)
grid_search.fit(x_train, y_train)

print('Grid Search:')
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# hyper parameter tuning with RandomizedSearchCV
random_search = RandomizedSearchCV(model_lr, param_grid, cv=5, n_iter=10)
random_search.fit(x_train, y_train)

print('\nRandomized Search:')
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Grid Search:
Best Parameters: {'logisticregression__C': 1, 'logisticregression__penalty': 'l2'}
Best Score: 0.8269363166953528

Randomized Search:
Best Parameters: {'logisticregression__penalty': 'l2', 'logisticregression__C': 1}
Best Score: 0.8269363166953528


#### Model 5

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'logisticregression__penalty': ['l1', 'l2'],
    'logisticregression__C': [0.1, 1, 10]
}

model_t5 = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    LogisticRegression(max_iter=1000)
)

grid_search = GridSearchCV(model_t5, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(x_train, y_train)

In [None]:
y_pred = grid_search.predict(x_test)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1 score:", f1_score(y_test, y_pred, average='macro'))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8173076923076923
Precision: 0.8321858864027538
Recall: 0.7432773109243698
F1 score: 0.7651812240047534

Classification Report:
               precision    recall  f1-score   support

           N       0.86      0.53      0.65        34
           Y       0.81      0.96      0.88        70

    accuracy                           0.82       104
   macro avg       0.83      0.74      0.77       104
weighted avg       0.82      0.82      0.80       104

Confusion Matrix:
 [[18 16]
 [ 3 67]]


In [None]:
# Cross-validation
scores = cross_val_score(grid_search, x_train, y_train, cv=5, scoring='f1_macro')
print("Cross-validation scores: ", scores)
print("Average cross-validation score:", np.mean(scores))

# Calculate the mean and standard deviation of the scores
mean_score = scores.mean()
std_dev = scores.std()

print("\nMean score:", mean_score)
print("Standard deviation:", std_dev)



Cross-validation scores:  [0.74004539 0.68076923 0.74863719 0.78215223 0.79523026]
Average cross-validation score: 0.7493668606761352

Mean score: 0.7493668606761352
Standard deviation: 0.03992840912783925


In [None]:
grid_search = GridSearchCV(model_t5, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Best parameters:  {'logisticregression__C': 1, 'logisticregression__penalty': 'l2'}
Best score:  0.7516644867142193


In [None]:
y_pred = grid_search.best_estimator_.predict(x_test)
print("F1 score: ", f1_score(y_test, y_pred, average='macro'))

F1 score:  0.7651812240047534


In [None]:
# hyper parameter tuning with RandomizedSearchCV
random_search = RandomizedSearchCV(model_lr, param_grid, cv=5, n_iter=10)
random_search.fit(x_train, y_train)

print('\nRandomized Search:')
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)


Randomized Search:
Best Parameters: {'logisticregression__penalty': 'l2', 'logisticregression__C': 1}
Best Score: 0.8269363166953528
