In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

import warnings
warnings.filterwarnings('ignore')


In [2]:
def create_baseline_model(X_train, X_test, y_train, y_test):
    """
    Create and evaluate a baseline SVM model
    """

    baseline_svm = SVC(kernel='linear', random_state=42)


    baseline_svm.fit(X_train, y_train)

    y_pred = baseline_svm.predict(X_test)

    baseline_accuracy = accuracy_score(y_test, y_pred)
    baseline_f1 = f1_score(y_test, y_pred)

    print("Baseline Model Performance:")
    print(f"Accuracy: {baseline_accuracy:.4f}")
    print(f"F1-score: {baseline_f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    return baseline_accuracy, baseline_f1

In [3]:
def perform_grid_search(X_train, y_train):
    """
    Perform grid search for hyperparameter tuning with separate grids for different kernels
    """
    # Define separate parameter grids for different kernels
    linear_param_grid = {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100]
    }

    rbf_param_grid = {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.01, 0.1]
    }

    poly_param_grid = {
        'kernel': ['poly'],
        'C': [0.1, 1, 10],
        'degree': [2, 3],
        'gamma': ['scale', 'auto']
    }

    # Initialize best score tracking
    best_score = -1
    best_params = None
    best_estimator = None

    # Test each kernel's parameter grid separately
    for param_grid in [linear_param_grid, rbf_param_grid, poly_param_grid]:
        print(f"\nTesting parameters for {param_grid['kernel'][0]} kernel...")

        # Initialize GridSearchCV
        grid_search = GridSearchCV(
            estimator=SVC(random_state=42),
            param_grid=param_grid,
            cv=5,
            scoring='f1',
            n_jobs=-1,
            verbose=1
        )

        # Fit GridSearchCV
        grid_search.fit(X_train, y_train)

        # Update best parameters if current kernel performs better
        if grid_search.best_score_ > best_score:
            best_score = grid_search.best_score_
            best_params = grid_search.best_params_
            best_estimator = grid_search.best_estimator_

        print(f"Best parameters for {param_grid['kernel'][0]}: {grid_search.best_params_}")
        print(f"Best F1-score for {param_grid['kernel'][0]}: {grid_search.best_score_:.4f}")

    print("\nOverall Best Results:")
    print(f"Best parameters: {best_params}")
    print(f"Best cross-validation F1-score: {best_score:.4f}")

    return best_estimator, best_params

In [4]:
def evaluate_optimized_model(best_model, X_test, y_test):
    """
    Evaluate the optimized model
    """

    y_pred = best_model.predict(X_test)

    optimized_accuracy = accuracy_score(y_test, y_pred)
    optimized_f1 = f1_score(y_test, y_pred)

    print("\nOptimized Model Performance:")
    print(f"Accuracy: {optimized_accuracy:.4f}")
    print(f"F1-score: {optimized_f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    return optimized_accuracy, optimized_f1

In [5]:
df = pd.read_csv("full_processed_heart_disease_cleveland.csv")

X = df.drop(["target"],axis=1)
y = df["target"]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [6]:
# Create and evaluate baseline model
baseline_accuracy, baseline_f1 = create_baseline_model(X_train, X_test, y_train, y_test)

Baseline Model Performance:
Accuracy: 0.8852
F1-score: 0.8889

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.90      0.88        29
           1       0.90      0.88      0.89        32

    accuracy                           0.89        61
   macro avg       0.88      0.89      0.89        61
weighted avg       0.89      0.89      0.89        61



In [7]:
# Perform grid search

best_model, best_params = perform_grid_search(X_train, y_train)


Testing parameters for linear kernel...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best parameters for linear: {'C': 1, 'kernel': 'linear'}
Best F1-score for linear: 0.8099

Testing parameters for rbf kernel...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters for rbf: {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}
Best F1-score for rbf: 0.7395

Testing parameters for poly kernel...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters for poly: {'C': 1, 'degree': 3, 'gamma': 'auto', 'kernel': 'poly'}
Best F1-score for poly: 0.7322

Overall Best Results:
Best parameters: {'C': 1, 'kernel': 'linear'}
Best cross-validation F1-score: 0.8099


In [8]:
# Evaluate optimized model
optimized_accuracy, optimized_f1 = evaluate_optimized_model(best_model, X_test, y_test)


Optimized Model Performance:
Accuracy: 0.8852
F1-score: 0.8889

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.90      0.88        29
           1       0.90      0.88      0.89        32

    accuracy                           0.89        61
   macro avg       0.88      0.89      0.89        61
weighted avg       0.89      0.89      0.89        61



In [9]:
print("\nPerformance Improvement Summary:")
print(f"Accuracy improvement: {(optimized_accuracy - baseline_accuracy) * 100:.2f}%")
print(f"F1-score improvement: {(optimized_f1 - baseline_f1) * 100:.2f}%")


Performance Improvement Summary:
Accuracy improvement: 0.00%
F1-score improvement: 0.00%
