In [121]:
import numpy as np
import pandas as pd

# Read datasets
heartDF = pd.read_csv("./data/heart_statlog_cleveland_hungary_final.csv")
wineDF = pd.read_csv("./data/winequality-red.csv")

# Preprocess datasets
wineDF['label'] = wineDF['quality'].apply(lambda x: 'good' if x > 6 else 'bad')
wineDF.drop('quality', axis=1)

wine_Y = wineDF['label'].values
wine_X = wineDF.drop('label', axis=1).values

heart_Y = heartDF['target'].values
heart_X = heartDF.drop('target', axis=1).values

In [122]:
# Standardize datasets
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
wine_X = scaler.fit_transform(wine_X)
heart_X = scaler.fit_transform(heart_X)

In [123]:
# Get the dataset split for training the models
from sklearn.model_selection import train_test_split
X_train_wine, X_test_wine, y_train_wine, y_test_wine = train_test_split(wine_X, wine_Y, test_size=0.3, stratify=wine_Y)
X_train_heart, X_test_heart, y_train_heart, y_test_heart = train_test_split(heart_X, heart_Y, test_size=0.3, stratify=heart_Y)

In [124]:
# Get the best hyperparameters
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import warnings
from sklearn.exceptions import ConvergenceWarning, FitFailedWarning

def get_best_params_LogisticRegresion(X_train, X_test, y_train, y_test, count=5):
    # Define parameter grid
    param_grid = {
        'C': np.logspace(-3,3, 7),  # Regularization parameter
        'penalty': ['l1', 'l2', 'elasticnet', None],  # Penalty options
        'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga', 'newton-cholesky'],  # Algorithm to use in the optimization problem
        'max_iter': [100, 500, 1000]  # Maximum number of iterations
    }

    # Create a Logistic Regression model
    logreg = LogisticRegression()
    best_model = None
    best_params = dict()
    for i in range(0, count):
        # Create GridSearchCV
        grid_search = GridSearchCV(logreg, param_grid, cv=10, n_jobs=4)
        # Suppress warnings
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning)
            warnings.filterwarnings("ignore", category=FitFailedWarning)
            warnings.filterwarnings("ignore", category=UserWarning)
            # Fit the grid search to the data
            grid_search.fit(X_train, y_train)

        # Print the best parameters found
        print(f"[{i}] Best Parameters:", grid_search.best_params_)

        # Get the best model
        if best_model is None or grid_search.best_score_ > best_model.score(X_test, y_test):
            best_model = grid_search.best_estimator_
            best_params = grid_search.best_params_

        # Evaluate the best model on the test set
        # y_pred = best_model.predict(X_test)
        # print(classification_report(y_test, y_pred))
    return best_params


In [129]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def train_logreg_model(hyperparameters, X_train, X_test, y_train, y_test, count=5):
  if hyperparameters['penalty'] == None:
    hyperparameters.pop('C')

  for i in range(0, count):
    print(f"[{i}] Run:")
    # Create the model with given hyperparameters and train it
    logreg = LogisticRegression(**hyperparameters)
    logreg.fit(X_train, y_train)

    # Make predictions on test data
    y_pred = logreg.predict(X_test)

    # Get the confusion matrix
    cm = confusion_matrix(y_test, y_pred).ravel()

    print("Confusion Matrix: ")
    print(cm)
    print(classification_report(y_test, y_pred))
    


In [126]:
# Get best hyper-parameters for wine dataset and heart dataset
print("For Wine Quality Dataset:")
wine_best_params = get_best_params_LogisticRegresion(X_train_wine, X_test_wine, y_train_wine, y_test_wine)
print("\nFor Heart Disease:")
heart_best_params = get_best_params_LogisticRegresion(X_train_heart, X_test_heart, y_train_heart, y_test_heart)


For Wine Quality Dataset:
[0] Best Parameters: {'C': 0.001, 'max_iter': 100, 'penalty': None, 'solver': 'newton-cg'}
[1] Best Parameters: {'C': 0.001, 'max_iter': 100, 'penalty': None, 'solver': 'newton-cg'}
[2] Best Parameters: {'C': 0.001, 'max_iter': 100, 'penalty': None, 'solver': 'newton-cg'}
[3] Best Parameters: {'C': 0.001, 'max_iter': 100, 'penalty': None, 'solver': 'newton-cg'}
[4] Best Parameters: {'C': 0.001, 'max_iter': 100, 'penalty': None, 'solver': 'newton-cg'}

For Heart Disease:
[0] Best Parameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
[1] Best Parameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
[2] Best Parameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
[3] Best Parameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
[4] Best Parameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}


In [128]:
# Train the models
train_logreg_model(wine_best_params, X_train_wine, X_test_wine, y_train_wine, y_test_wine)
train_logreg_model(heart_best_params, X_train_heart, X_test_heart, y_train_heart, y_test_heart)

{'C': 0.001, 'max_iter': 100, 'penalty': None, 'solver': 'newton-cg'}
[0] Run:
Confusion Matrix: 
[415   0   0  65]
              precision    recall  f1-score   support

         bad       1.00      1.00      1.00       415
        good       1.00      1.00      1.00        65

    accuracy                           1.00       480
   macro avg       1.00      1.00      1.00       480
weighted avg       1.00      1.00      1.00       480

[1] Run:
Confusion Matrix: 
[415   0   0  65]
              precision    recall  f1-score   support

         bad       1.00      1.00      1.00       415
        good       1.00      1.00      1.00        65

    accuracy                           1.00       480
   macro avg       1.00      1.00      1.00       480
weighted avg       1.00      1.00      1.00       480

[2] Run:
Confusion Matrix: 
[415   0   0  65]
              precision    recall  f1-score   support

         bad       1.00      1.00      1.00       415
        good       1.00      