# Evaluation

In [None]:
# Imports 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, PrecisionRecallDisplay
import os
import sys

sys.path.append(os.path.abspath("../scripts"))
from data_loader import DataLoader

import joblib
import pandas as pd

In [None]:
# Load the train / validation / test splits provided by the project's DataLoader
data_loader = DataLoader()
X_train, y_train = data_loader.training_data
X_val, y_val = data_loader.validation_data
X_test, y_test = data_loader.test_data

# Sanity check: report the shape of every feature matrix and label vector
for split_label, split_array in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_val", X_val),
    ("y_val", y_val),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{split_label} shape: {split_array.shape}")

In [None]:
# --- Locations of the serialized models (relative paths) ---
lr_model_filename = "../models/logistic_regression/lr_model_sampling_20241124_155301.pkl"
svm_model_filename = "../models/support_vector_machine/svm_model_halving-grid_full-dataset_accuracy_20241125_230119.pkl"
# NOTE(review): the next two files carry the "lr_model_sampling" prefix although they
# live in the knn / nearest_centroid folders - confirm they are the intended artifacts.
knn_filename = "../models/knn/lr_model_sampling_20241126_150545.pkl"
nc_filename = "../models/nearest_centroid/lr_model_sampling_20241126_181731.pkl"
bl_model_filename = "../models/baseline/bl_model_majority20241126_120730.pkl"

# --- Deserialize every estimator ---
# joblib.load unpickles arbitrary Python objects: only load artifacts from a trusted source.
logistic_regression = joblib.load(lr_model_filename)
svm = joblib.load(svm_model_filename)
knn = joblib.load(knn_filename)
nc = joblib.load(nc_filename)
bl_majority = joblib.load(bl_model_filename)

# Not loaded yet: naive bayes, decision tree

# --- Registry: display name -> estimator; the cells below iterate over it in this order ---
model_dict = {
    "Logistic Regression": logistic_regression,
    "Support Vector Machine": svm,
    "Baseline Majority": bl_majority,
    "K-Nearest Neighbors": knn,
    "Nearest Centroid": nc,
    # Add other models here
}

In [None]:
# Print test-set metrics for every registered model:
# classification report, accuracy, specificity and the confusion matrix.
for model_name, model in model_dict.items():
    y_test_pred = model.predict(X_test)

    # zero_division=0 reports the same value as the default ("warn" also yields 0.0)
    # but silences the UndefinedMetricWarning raised when a class is never predicted
    # (presumably the case for the majority-class baseline).
    report = classification_report(y_test, y_test_pred, digits=4, zero_division=0)
    accuracy = accuracy_score(y_test, y_test_pred)

    # Build the confusion matrix once and reuse it for both specificity and display.
    cm = confusion_matrix(y_test, y_test_pred)
    # Unpacking into four values only works for a binary (2x2) matrix.
    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn + fp)  # recall of the negative class

    print(f"\n=== Model: {model_name} ===\n")
    print("Classification Report:\n", report)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print("\nConfusion Matrix:\n", cm)

In [None]:
# Display the dictionary form of the report (output_dict=True) to inspect its keys.
# NOTE(review): y_test_pred is not defined in this cell - it is whatever the last
# iteration of a previously executed loop left behind, so the output covers a single
# model and depends on cell execution order.
classification_report(y_test, y_test_pred, digits=4, output_dict=True)

In [None]:
# Column order of the summary table (also applied when no rows were collected).
metric_columns = [
    'Model',
    'Accuracy',
    'Overall_Precision',
    'Overall_Recall',
    'Overall_F1',
    'Precision_Negative',
    'Precision_Positive',
    'Specificity',
    'Recall_Negative',
    'Recall_Positive',
    'F1-Score_Negative',
    'F1-Score_Positive',
]

metrics_list = []  # One dict per model; converted to a DataFrame in a single step below

for model_name, model in model_dict.items():
    y_test_pred = model.predict(X_test)

    # zero_division=0 keeps the reported value (0.0) while silencing the
    # UndefinedMetricWarning emitted when a class is never predicted.
    report = classification_report(
        y_test, y_test_pred, digits=4, output_dict=True, zero_division=0
    )
    # Unpacking into four values only works for a binary (2x2) confusion matrix.
    tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()

    # Report keys are the stringified class labels.
    # Assumes the labels are the floats 0.0 (negative) and 1.0 (positive) - TODO confirm.
    negative = report['0.0']
    positive = report['1.0']
    overall = report['weighted avg']  # support-weighted average over both classes

    metrics_list.append({
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_test_pred),
        'Overall_Precision': overall['precision'],
        'Overall_Recall': overall['recall'],
        'Overall_F1': overall['f1-score'],
        'Precision_Negative': negative['precision'],
        'Precision_Positive': positive['precision'],
        'Specificity': tn / (tn + fp),  # same quantity as the negative-class recall
        'Recall_Negative': negative['recall'],
        'Recall_Positive': positive['recall'],
        'F1-Score_Negative': negative['f1-score'],
        'F1-Score_Positive': positive['f1-score'],
    })

# Build the table once from the collected rows and round numeric values to 4 decimals.
metrics_df = pd.DataFrame(metrics_list, columns=metric_columns).round(4)

metrics_df


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Draw one confusion-matrix heatmap per model, computed on the test set.
for model_name, model in model_dict.items():
    y_test_pred = model.predict(X_test)

    fig, ax = plt.subplots(figsize=(8, 6))
    # from_predictions computes the matrix and takes the tick labels from the labels
    # present in y_test / y_test_pred, so rows and columns always match their labels.
    ConfusionMatrixDisplay.from_predictions(
        y_test,
        y_test_pred,
        ax=ax,
        cmap=plt.cm.Blues,
        values_format='d',
    )
    ax.set_title(f'Confusion Matrix for {model_name}')
    plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt

# Precision-recall plot per model, built from the hard class predictions
# (model.predict), not from continuous scores.
for model_name, model in model_dict.items():
    y_test_pred = model.predict(X_test)

    # Curve points and the average-precision summary
    precision, recall, thresholds = precision_recall_curve(y_test, y_test_pred)
    average_precision = average_precision_score(y_test, y_test_pred)

    # One figure per model, drawn through the explicit Figure/Axes API
    fig, ax = plt.subplots(figsize=(10, 8))
    curve_label = f'{model_name} (AP = {average_precision:.2f})'
    ax.plot(recall, precision, marker='.', label=curve_label)
    ax.set_xlabel('Recall', fontsize=12)
    ax.set_ylabel('Precision', fontsize=12)
    ax.set_title(f'Precision-Recall Curve for {model_name}', fontsize=14)
    ax.legend(fontsize=12)
    ax.grid(alpha=0.6, linestyle='--')
    fig.tight_layout()
    plt.show()

In [None]:
# Precision-recall plot per model, built from continuous scores
# (positive-class probabilities, or decision values as a fallback).
for model_name, model in model_dict.items():
    if hasattr(model, "predict_proba"):
        y_test_proba = model.predict_proba(X_test)[:, 1]  # Column 1 = second class in model.classes_
    elif hasattr(model, "decision_function"):
        # Fallback to decision_function if predict_proba is unavailable
        y_test_proba = model.decision_function(X_test)
    else:
        # Skip explicitly instead of swallowing every exception: genuine errors
        # raised while scoring now surface rather than silently dropping the model.
        print(f"Skipping {model_name}: no predict_proba or decision_function available")
        continue

    # Calculate precision, recall, and thresholds
    precision, recall, thresholds = precision_recall_curve(y_test, y_test_proba)
    average_precision = average_precision_score(y_test, y_test_proba)

    # Plot the precision-recall curve
    plt.figure(figsize=(10, 8))
    plt.plot(recall, precision, marker='.', label=f'{model_name} (AP = {average_precision:.2f})')
    plt.xlabel('Recall', fontsize=12)
    plt.ylabel('Precision', fontsize=12)
    plt.title(f'Precision-Recall Curve for {model_name}', fontsize=14)
    plt.legend(fontsize=12)
    plt.grid(alpha=0.6, linestyle='--')
    plt.tight_layout()
    plt.show()


In [None]:
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html#sphx-glr-auto-examples-model-selection-plot-precision-recall-py
# Precision-recall display per model, built from the hard class predictions.
for model_name, model in model_dict.items():
    y_test_pred = model.predict(X_test)

    # Named pr_display (not `display`) so IPython's built-in display() function
    # is not shadowed for the rest of the notebook session.
    pr_display = PrecisionRecallDisplay.from_predictions(
        y_test, y_test_pred,
        name=model_name,
        plot_chance_level=True  # Adds a baseline for random performance
    )

    # Customize the plot
    pr_display.ax_.set_title(f"2-Class Precision-Recall Curve for {model_name}")
    pr_display.ax_.grid(alpha=0.6, linestyle='--')  # Add grid
    pr_display.ax_.set_xlabel('Recall', fontsize=12)
    pr_display.ax_.set_ylabel('Precision', fontsize=12)
    pr_display.figure_.set_size_inches(10, 8)  # Adjust figure size
    pr_display.ax_.legend(fontsize=12)

    # Display the plot
    plt.show()

In [None]:
# Precision-recall display per model, built from continuous scores
# (positive-class probabilities, or decision values as a fallback).
for model_name, model in model_dict.items():
    if hasattr(model, "predict_proba"):
        y_test_proba = model.predict_proba(X_test)[:, 1]  # Column 1 = second class in model.classes_
    elif hasattr(model, "decision_function"):
        # Fallback to decision_function if predict_proba is unavailable
        y_test_proba = model.decision_function(X_test)
    else:
        # Skip explicitly instead of swallowing every exception: genuine errors
        # raised while scoring now surface rather than silently dropping the model.
        print(f"Skipping {model_name}: no predict_proba or decision_function available")
        continue

    # Named pr_display (not `display`) so IPython's built-in display() function
    # is not shadowed for the rest of the notebook session.
    pr_display = PrecisionRecallDisplay.from_predictions(
        y_test, y_test_proba,  # Use scores instead of binary predictions
        name=model_name,
        plot_chance_level=True  # Adds a baseline for random performance
    )

    # Customize the plot
    pr_display.ax_.set_title(f"2-Class Precision-Recall Curve for {model_name}")
    pr_display.ax_.grid(alpha=0.6, linestyle='--')  # Add grid
    pr_display.ax_.set_xlabel('Recall', fontsize=12)
    pr_display.ax_.set_ylabel('Precision', fontsize=12)
    pr_display.figure_.set_size_inches(10, 8)  # Adjust figure size
    pr_display.ax_.legend(fontsize=12)

    # Display the plot
    plt.show()


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

import matplotlib.pyplot as plt

# ROC curve per model, built from continuous scores
# (positive-class probabilities, or decision values as a fallback).
for model_name, model in model_dict.items():
    if hasattr(model, "predict_proba"):
        y_test_proba = model.predict_proba(X_test)[:, 1]  # Column 1 = second class in model.classes_
    elif hasattr(model, "decision_function"):
        # Fallback to decision_function if predict_proba is unavailable
        y_test_proba = model.decision_function(X_test)
    else:
        # A model exposing neither method cannot be ranked by score; skip it
        # rather than letting an AttributeError abort the remaining models.
        print(f"Skipping {model_name}: no predict_proba or decision_function available")
        continue

    # Calculate ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_test_proba)
    roc_auc = roc_auc_score(y_test, y_test_proba)

    # Plot the ROC curve
    plt.figure(figsize=(10, 8))
    plt.plot(fpr, tpr, marker='.', label=f'{model_name} (AUC = {roc_auc:.2f})')
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title(f'ROC Curve for {model_name}', fontsize=14)
    plt.legend(fontsize=12)
    plt.grid(alpha=0.6, linestyle='--')
    plt.tight_layout()
    plt.show()