# Logistic Regression

In [None]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from ml import *

## Wine Quality Dataset

In [None]:
# Preprocess dataset
wineDF = pd.read_csv("./data/winequality-red.csv")

# Binarise the target: a quality score above 6 is 'good', anything else 'bad'.
wineDF['label'] = wineDF['quality'].apply(lambda x: 'good' if x > 6 else 'bad')

# DataFrame.drop returns a new frame rather than modifying in place, so the
# result must be assigned back. Otherwise 'quality' (the column the label is
# derived from) stays in the feature matrix and leaks the target to the model.
wineDF = wineDF.drop('quality', axis=1)

# Split into target vector and feature matrix (every column except 'label').
wine_y = wineDF['label'].values
wine_x = wineDF.drop('label', axis=1).values

In [None]:
# Standardize dataset: learn the per-column scaling on the wine features,
# then replace the feature matrix with its standardized version.
# NOTE(review): the scaler is fitted on the full matrix before the train/test
# splits are drawn in the evaluation cell, so held-out rows contribute to the
# scaling statistics.
scaler = StandardScaler().fit(wine_x)
wine_x = scaler.transform(wine_x)

In [None]:
# Evaluate on five stratified 80/20 hold-out splits and average the metrics.
# train_test_split is called without a random_state, so the splits (and the
# reported numbers) differ from run to run.
# NOTE(review): `plt` is not imported in this notebook's import cell;
# presumably it is provided by `from ml import *` - confirm.
all_metrics = []

for split_no in range(1, 6):
    print(f"Split {split_no}:")

    # Stratify on the label so both partitions keep the class proportions.
    split = train_test_split(
        wine_x,
        wine_y,
        test_size=0.2,
        stratify=wine_y,
    )
    x_train, x_test, y_train, y_test = split

    model = build_log_reg_model(x_train, y_train)

    metrics, report, cm = fit_and_predict(model, x_train, x_test, y_train, y_test)
    all_metrics.append(metrics)

    print(report)
    plot_confusion_matrix(cm, plt.cm.Blues)

print("Average metrics:")
print_avg_metrics(all_metrics)

## Heart Disease Dataset

In [None]:
# Preprocess dataset
def _diagnosis_name(flag):
    """Map a truthy target value to 'heart disease' and a falsy one to 'normal'."""
    if flag:
        return 'heart disease'
    return 'normal'


heartDF = pd.read_csv("./data/heart_statlog_cleveland_hungary_final.csv")

# Replace the target column with readable class names.
heartDF['target'] = heartDF['target'].apply(_diagnosis_name)

# Feature matrix is every column except 'target'; the target becomes the label vector.
heart_x = heartDF.drop(columns='target').values
heart_y = heartDF['target'].values

In [None]:
# Standardize dataset: learn the per-column scaling on the heart features,
# then replace the feature matrix with its standardized version.
# NOTE(review): the scaler is fitted on the full matrix before the train/test
# splits are drawn in the evaluation cell, so held-out rows contribute to the
# scaling statistics.
scaler = StandardScaler().fit(heart_x)
heart_x = scaler.transform(heart_x)

In [None]:
# Evaluate on five stratified 80/20 hold-out splits and average the metrics.
# train_test_split is called without a random_state, so the splits (and the
# reported numbers) differ from run to run.
# NOTE(review): `plt` is not imported in this notebook's import cell;
# presumably it is provided by `from ml import *` - confirm.
all_metrics = []

for split_no in range(1, 6):
    print(f"Split {split_no}:")

    # Stratify on the label so both partitions keep the class proportions.
    split = train_test_split(
        heart_x,
        heart_y,
        test_size=0.2,
        stratify=heart_y,
    )
    x_train, x_test, y_train, y_test = split

    model = build_log_reg_model(x_train, y_train)

    metrics, report, cm = fit_and_predict(model, x_train, x_test, y_train, y_test)
    all_metrics.append(metrics)

    print(report)
    plot_confusion_matrix(cm, plt.cm.Oranges)

print("Average metrics:")
print_avg_metrics(all_metrics)