# Wine Quality and Heart Disease Datasets - Experiment

In [None]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from nn import *

In [None]:
def run_model(name, classifier, params, cmap):
    """Fit one classifier on the current split, report its metrics, and return them.

    The data is not passed in: the function reads the module-level names
    ``X_train``, ``X_test``, ``y_train`` and ``y_test``, so those must be
    assigned before it is called.

    Args:
        name: Heading printed before the metrics.
        classifier: Callable invoked as ``classifier(**params)`` to build the model.
        params: Keyword arguments forwarded to ``classifier``.
        cmap: Colormap handed to ``plot_confusion_matrix``.

    Returns:
        The metrics object produced by ``get_metrics``, still containing the
        'TPR' and 'FPR' entries that are left out of the printed summary.
    """
    print(f"\n- {name}")

    estimator = classifier(**params)
    predictions, probabilities = fit_and_predict(
        estimator, X_train, X_test, y_train, y_test
    )
    metrics, confusion = get_metrics(y_test, predictions, probabilities)

    plot_confusion_matrix(confusion, cmap)

    # Work on a copy so the caller still receives 'TPR' / 'FPR'; they are
    # only hidden from the printout.
    printable = metrics.copy()
    for hidden_key in ('TPR', 'FPR'):
        printable.pop(hidden_key)

    for metric, value in printable.items():
        print(f"    - {metric}: {value}")

    return metrics

## Wine Quality Dataset

In [None]:
# Preprocess dataset
wineDF = pd.read_csv("./data/winequality-red.csv")

# Binary target: 1 when the quality score is above 6, otherwise 0.
wineDF['label'] = wineDF['quality'].apply(lambda x: 1 if x > 6 else 0)

# DataFrame.drop returns a new frame rather than modifying in place, so the
# result must be reassigned. Otherwise 'quality' (the column the label is
# derived from) stays in the frame and leaks the target into the features.
wineDF = wineDF.drop('quality', axis=1)

wine_y = wineDF['label'].values
wine_X = wineDF.drop('label', axis=1).values

In [None]:
# Standardize dataset
# NOTE(review): the scaler is fitted on the full feature matrix, before the
# train/test splits are drawn further down, so test rows contribute to the
# scaling statistics.
scaler = StandardScaler()
scaler.fit(wine_X)
wine_X = scaler.transform(wine_X)

In [None]:
# Obtained from grid search
# Hyper-parameters per model name; each entry replaces that model's 'params'.
wine_grid_params = {
    'Logistic Regression': {
        'C': 0.046415888336127774,
        'max_iter': 100,
        'penalty': 'l1',
        'solver': 'liblinear',
    },
    'Decision Tree': {'max_depth': None, 'criterion': 'entropy'},
    'K-Nearest Neighbors': {'n_neighbors': 3},
    'Neural Network': {
        'activation': 'tanh',
        'hidden_layer_sizes': (50,),
        'learning_rate_init': 0.001,
        'max_iter': 200,
        'solver': 'adam',
    },
}

for model_name, best_params in wine_grid_params.items():
    models[model_name]['params'] = best_params

In [None]:
# Cross validate
# Five stratified 80/20 hold-out splits; every model is refitted on each one.
all_metrics = {}          # metrics of the most recent split, keyed by model name
wine_split_metrics = {}   # metrics of every split, keyed by model name

for i in range(5):
    print(f"\nSplit {i+1}:")

    # run_model reads X_train / X_test / y_train / y_test as globals.
    # Seeding each split with its index makes reruns draw the same partitions.
    (X_train, X_test, y_train, y_test) = train_test_split(
        wine_X, wine_y, test_size=0.2, stratify=wine_y, random_state=i
    )

    for key, model in models.items():
        metrics = run_model(key, model['classifier'], model['params'], plt.cm.Blues)
        all_metrics[key] = metrics
        # Keep the per-split result too; all_metrics alone is overwritten on
        # every iteration and would only ever reflect the last split.
        wine_split_metrics.setdefault(key, []).append(metrics)

plt.figure()

# One ROC curve per model per split. Only the first curve of each model gets
# a legend entry; labels starting with an underscore are skipped by legend().
for name, per_split in wine_split_metrics.items():
    for split_idx, split_result in enumerate(per_split):
        plt.plot(
            split_result['FPR'],
            split_result['TPR'],
            linestyle='-',
            color=models[name]['color'],
            alpha=0.6,
            label=name if split_idx == 0 else '_nolegend_',
        )

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Multiple Models')
plt.legend()
plt.show()

## Heart Disease Dataset

In [None]:
# Preprocess dataset
heartDF = pd.read_csv("./data/heart_statlog_cleveland_hungary_final.csv")

# Features are every column except 'target'; 'target' itself is the label.
heart_X = heartDF.drop(columns=['target']).values
heart_y = heartDF['target'].values

In [None]:
# Standardize dataset
# NOTE(review): the scaler is fitted on the full feature matrix, before the
# train/test splits are drawn further down, so test rows contribute to the
# scaling statistics.
scaler = StandardScaler()
scaler.fit(heart_X)
heart_X = scaler.transform(heart_X)

In [None]:
# Obtained from grid search
# Hyper-parameters per model name; each entry replaces that model's 'params'.
heart_grid_params = {
    'Logistic Regression': {
        'C': 0.046415888336127774,
        'max_iter': 100,
        'penalty': 'l1',
        'solver': 'liblinear',
    },
    'Decision Tree': {'max_depth': 20, 'criterion': 'entropy'},
    'K-Nearest Neighbors': {'n_neighbors': 10},
    'Neural Network': {
        'activation': 'relu',
        'hidden_layer_sizes': (100,),
        'max_iter': 300,
        'solver': 'lbfgs',
    },
}

for model_name, best_params in heart_grid_params.items():
    models[model_name]['params'] = best_params

In [None]:
# Cross validate
# Five stratified 80/20 hold-out splits; every model is refitted on each one.
all_metrics = {}           # metrics of the most recent split, keyed by model name
heart_split_metrics = {}   # metrics of every split, keyed by model name

for i in range(5):
    print(f"\nSplit {i+1}:")

    # run_model reads X_train / X_test / y_train / y_test as globals.
    # Seeding each split with its index makes reruns draw the same partitions.
    (X_train, X_test, y_train, y_test) = train_test_split(
        heart_X, heart_y, test_size=0.2, stratify=heart_y, random_state=i
    )

    for key, model in models.items():
        metrics = run_model(key, model['classifier'], model['params'], plt.cm.Oranges)
        all_metrics[key] = metrics
        # Keep the per-split result too; all_metrics alone is overwritten on
        # every iteration and would only ever reflect the last split.
        heart_split_metrics.setdefault(key, []).append(metrics)

plt.figure()

# One ROC curve per model per split. Only the first curve of each model gets
# a legend entry; labels starting with an underscore are skipped by legend().
for name, per_split in heart_split_metrics.items():
    for split_idx, split_result in enumerate(per_split):
        plt.plot(
            split_result['FPR'],
            split_result['TPR'],
            linestyle='-',
            color=models[name]['color'],
            alpha=0.6,
            label=name if split_idx == 0 else '_nolegend_',
        )

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Multiple Models')
plt.legend()
plt.show()