# Alternative Models: Eager Learners II

<hr>

* Support Vector Machine
    * SVC
    * NuSVC
    * LinearSVC

In [1]:
# Import needed libraries and modules
from codecarbon import EmissionsTracker
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn import svm
import optuna
from optuna.samplers import GPSampler
from sklearn.pipeline import Pipeline
import json

# Fetch dataset from UCI Repository (dataset id=45) through the ucimlrepo client
from ucimlrepo import fetch_ucirepo
heart_disease = fetch_ucirepo(id=45)
# Working DataFrame: the `data.original` table of the fetched dataset.
# The pre-processing cell reads both the feature columns and the 'num' target from it.
df = heart_disease.data.original

In [2]:
# ---------------------------------------------------------------------------- #
#                                PRE-PROCESSING                                #
# ---------------------------------------------------------------------------- #

##### SETTINGS #####
PC_Features = True     # When True, a PCA step is inserted between preprocessing and the classifier
Random_Seed = 82024    # Seed passed as random_state to the estimators
K_Folds = 10           # Number of cross-validation folds
Max_Iterations = 200   # NOTE(review): not referenced by any cell visible in this notebook
####################

# Drop missing values and rebuild a contiguous index. The explicit copy makes
# `df` an independent frame, so the in-place target edit below cannot raise a
# chained-assignment warning or touch the originally fetched table.
df = df.dropna().reset_index(drop=True).copy()

# Binarize target: every non-zero 'num' value becomes 1
df.loc[df['num'] != 0, 'num'] = 1

# Define features and target vectors. The target is removed by name rather than
# by position so the split stays correct whatever the column order is.
X = df.drop(columns='num')
y = df['num']

# Separate numeric features (to be standardized) from categorical ones (to be one-hot encoded)
int_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
cat_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Define preprocessing. handle_unknown='ignore' keeps a cross-validation fold
# from failing when its test split contains a category that was absent from
# the corresponding training split (such a value is encoded as all zeros).
preprocessor = ColumnTransformer(
    transformers=[
        ('int', StandardScaler(), int_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ])

<hr>

## Support Vector Machines:

Support Vector Machines (SVMs) are machine learning algorithms that work by finding a hyperplane that separates the data into classes.

**The advantages of support vector machines are:**

* Effective in high dimensional spaces.

* Still effective in cases where number of dimensions is greater than the number of samples.

* Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.

* Versatile: different Kernel functions can be specified for the decision function as in Gaussian Process Models.

**The disadvantages of support vector machines include:**

* If the number of features is much greater than the number of samples, avoiding over-fitting when choosing the kernel function and the regularization term is crucial.

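* SVMs do not directly provide probability estimates; in scikit-learn they are obtained through an expensive internal cross-validation (enabled with `probability=True`).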

<hr>

### SVC classification SVM implementation:

In [3]:
# ---------------------------------------------------------------------------- #
#                                     MODEL                                    #
# ---------------------------------------------------------------------------- #

# Initiate CodeCarbon to track emissions
tracker = EmissionsTracker('SVC SVM model', log_level='warning')
tracker.start()

# Everything measured runs inside try/finally so the tracker is always stopped,
# even if cross-validation raises. A tracker left running makes later
# EmissionsTracker instances refuse to start ("Another instance of codecarbon
# is already running").
try:
    # Create model with default hyperparameters
    model = svm.SVC(random_state=Random_Seed, probability=True)

    # Build the pipeline; the PCA step is only inserted when requested
    steps = [('preprocessor', preprocessor)]
    if PC_Features:
        steps.append(('pca', PCA(n_components=12)))
    steps.append(('SVM-SVC-classifier', model))

    pipeline = Pipeline(steps)

    # Perform cross-validation
    cv_results = cross_validate(pipeline, X, y, scoring=['accuracy', 'roc_auc'], cv=K_Folds)

    # Calculate and display results (mean and standard deviation across folds)
    acc = np.mean(cv_results['test_accuracy'])
    acc_std = np.std(cv_results['test_accuracy'])
    roc_auc = np.mean(cv_results['test_roc_auc'])
    roc_auc_std = np.std(cv_results['test_roc_auc'])

    print(f"Accuracy: {acc:.4f} ± {acc_std:.4f}")
    print(f"AUC-ROC: {roc_auc:.4f} ± {roc_auc_std:.4f}")
finally:
    # Stop emission tracking
    _ = tracker.stop()

[codecarbon ERROR @ 10:14:43] Error: Another instance of codecarbon is already running. Turn off the other instance to be able to run this one. Exiting.




Accuracy: 0.8314 ± 0.0624
AUC-ROC: 0.8991 ± 0.0426


In [4]:
# Save evaluation to an external file
file = 'scores.csv'

results = pd.DataFrame({
    'SVM-SVC-model': [acc, acc_std, roc_auc, roc_auc_std]
}, index = ['Accuracy', 'Accuracy STD', 'AUC-ROC', 'AUC-ROC STD'])

# Merge with the scores saved by previous cells/notebooks, if any
if os.path.exists(file):
    temp = pd.read_csv(file, index_col=0)
    # Remove the column written by an earlier run of this cell, so re-running
    # replaces this model's scores instead of appending a duplicate column
    temp = temp.drop(columns=results.columns, errors='ignore')
    results = pd.concat([temp, results], axis=1)

# Export
results.to_csv(file)

In [5]:
# ---------------------------------------------------------------------------- #
#                                 OPTIMIZATION                                 #
# ---------------------------------------------------------------------------- #

# Initiate CodeCarbon to track emissions.
# This tracker is intentionally left running on success: it is stopped by the
# cell that evaluates the optimized model further down.
tracker = EmissionsTracker('SVC SVM optimization', log_level='warning')
tracker.start()

# Objective function
def objective(trial):
    """Score one Optuna trial for the SVC pipeline.

    Samples C, kernel and gamma from `trial`, builds the same pipeline as the
    baseline cell (preprocessing, optional PCA, classifier) and returns the
    mean AUC-ROC over `K_Folds` cross-validation folds.
    """
    # Define parameters
    C = trial.suggest_float('C', 1e-5, 1e2, log=True)  # Range for the regularization parameter C (log scale)
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])  # Kernel selection
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])  # Options for the gamma parameter

    # Create model
    temp_model = svm.SVC(C=C, kernel=kernel, gamma=gamma)

    # Build the pipeline; the PCA step is only inserted when requested
    steps = [('preprocessor', preprocessor)]
    if PC_Features:
        steps.append(('pca', PCA(n_components=12)))
    steps.append(('SVM-SVC-classifier', temp_model))

    pipeline = Pipeline(steps)

    # Perform cross-validation
    roc_auc = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=K_Folds).mean()

    return roc_auc

# Optimizing with Optuna. The sampler is seeded so that the search, and
# therefore the selected hyperparameters, can be repeated on a re-run.
study = optuna.create_study(direction='maximize', sampler=GPSampler(seed=Random_Seed))
try:
    study.optimize(objective, n_trials=100)
except Exception:
    # The cell that normally stops the tracker will not be reached after a
    # failure, so release the tracker here before propagating the error
    tracker.stop()
    raise

[codecarbon ERROR @ 10:14:43] Error: Another instance of codecarbon is already running. Turn off the other instance to be able to run this one. Exiting.
  study = optuna.create_study(direction='maximize', sampler=GPSampler())
[I 2024-10-31 10:14:44,919] A new study created in memory with name: no-name-cf706076-d956-4d44-a7a7-f7a38610a4bf
[I 2024-10-31 10:14:45,068] Trial 0 finished with value: 0.8325206043956044 and parameters: {'C': 85.73814790564408, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value: 0.8325206043956044.
[I 2024-10-31 10:14:45,190] Trial 1 finished with value: 0.8902129120879121 and parameters: {'C': 0.002875451111332801, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 1 with value: 0.8902129120879121.
[I 2024-10-31 10:14:45,307] Trial 2 finished with value: 0.8830700549450551 and parameters: {'C': 0.0024703870756712745, 'kernel': 'poly', 'gamma': 'scale'}. Best is trial 1 with value: 0.8902129120879121.
[I 2024-10-31 10:14:45,435] Trial 3 finished with 

In [6]:
# Show best model
trial = study.best_trial
print('AUC-ROC: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

# Save best trial parameters to a JSON file.
# The output directory is created first so the export does not fail with
# FileNotFoundError when 'params/' does not exist yet.
os.makedirs('params', exist_ok=True)
with open('params/SVM-SVC-params.json', 'w') as f:
    json.dump(trial.params, f)

AUC-ROC: 0.9167925824175824
Best hyperparameters: {'C': 2.140216894772957, 'kernel': 'sigmoid', 'gamma': 'auto'}


In [7]:
# Visualizing optimization: history of the objective value (mean CV AUC-ROC) over the trials
optuna.visualization.plot_optimization_history(study)

In [8]:
# Slice plot of the study: objective value against each tuned hyperparameter
optuna.visualization.plot_slice(study)

In [9]:
# The emissions tracker started in the optimization cell is still running here.
# try/finally guarantees it is stopped even if the evaluation below raises, so
# it cannot block the trackers created by later cells.
try:
    # Create optimized model from the best trial's hyperparameters
    best_params = trial.params
    model = svm.SVC(random_state=Random_Seed, probability=True, **best_params)

    # Build the pipeline; the PCA step is only inserted when requested
    steps = [('preprocessor', preprocessor)]
    if PC_Features:
        steps.append(('pca', PCA(n_components=12)))
    steps.append(('SVM-SVC-classifier', model))

    pipeline = Pipeline(steps)

    # Perform cross-validation
    cv_results = cross_validate(pipeline, X, y, scoring=['accuracy', 'roc_auc'], cv=K_Folds)

    # Calculate and display results (mean and standard deviation across folds)
    acc = np.mean(cv_results['test_accuracy'])
    acc_std = np.std(cv_results['test_accuracy'])
    roc_auc = np.mean(cv_results['test_roc_auc'])
    roc_auc_std = np.std(cv_results['test_roc_auc'])

    print(f"Accuracy: {acc:.4f} ± {acc_std:.4f}")
    print(f"AUC-ROC: {roc_auc:.4f} ± {roc_auc_std:.4f}")
finally:
    # Stop emission tracking
    _ = tracker.stop()



Accuracy: 0.8416 ± 0.0547
AUC-ROC: 0.9168 ± 0.0356


In [10]:
# Save evaluation to an external file
file = 'scores.csv'

results = pd.DataFrame({
    'SVM-SVC-optimization': [acc, acc_std, roc_auc, roc_auc_std]
}, index = ['Accuracy', 'Accuracy STD', 'AUC-ROC', 'AUC-ROC STD'])

# Merge with the scores saved by previous cells/notebooks, if any
if os.path.exists(file):
    temp = pd.read_csv(file, index_col=0)
    # Remove the column written by an earlier run of this cell, so re-running
    # replaces this model's scores instead of appending a duplicate column
    temp = temp.drop(columns=results.columns, errors='ignore')
    results = pd.concat([temp, results], axis=1)

# Export
results.to_csv(file)

<hr>

### NuSVC classification SVM implementation:

In [11]:
# ---------------------------------------------------------------------------- #
#                                     MODEL                                    #
# ---------------------------------------------------------------------------- #

# Initiate CodeCarbon to track emissions
tracker = EmissionsTracker('NuSVC SVM model', log_level='warning')
tracker.start()

# Everything measured runs inside try/finally so the tracker is always stopped,
# even if cross-validation raises. A tracker left running makes later
# EmissionsTracker instances refuse to start ("Another instance of codecarbon
# is already running").
try:
    # Create model with default hyperparameters
    model = svm.NuSVC(random_state=Random_Seed, probability=True)

    # Build the pipeline; the PCA step is only inserted when requested
    steps = [('preprocessor', preprocessor)]
    if PC_Features:
        steps.append(('pca', PCA(n_components=12)))
    steps.append(('SVM-NuSVC-classifier', model))

    pipeline = Pipeline(steps)

    # Perform cross-validation
    cv_results = cross_validate(pipeline, X, y, scoring=['accuracy', 'roc_auc'], cv=K_Folds)

    # Calculate and display results (mean and standard deviation across folds)
    acc = np.mean(cv_results['test_accuracy'])
    acc_std = np.std(cv_results['test_accuracy'])
    roc_auc = np.mean(cv_results['test_roc_auc'])
    roc_auc_std = np.std(cv_results['test_roc_auc'])

    print(f"Accuracy: {acc:.4f} ± {acc_std:.4f}")
    print(f"AUC-ROC: {roc_auc:.4f} ± {roc_auc_std:.4f}")
finally:
    # Stop emission tracking
    _ = tracker.stop()

[codecarbon ERROR @ 10:15:26] Error: Another instance of codecarbon is already running. Turn off the other instance to be able to run this one. Exiting.


Accuracy: 0.8282 ± 0.0611
AUC-ROC: 0.9009 ± 0.0440


In [12]:
# Save evaluation to an external file
file = 'scores.csv'

results = pd.DataFrame({
    'SVM-NuSVC-model': [acc, acc_std, roc_auc, roc_auc_std]
}, index = ['Accuracy', 'Accuracy STD', 'AUC-ROC', 'AUC-ROC STD'])

# Merge with the scores saved by previous cells/notebooks, if any
if os.path.exists(file):
    temp = pd.read_csv(file, index_col=0)
    # Remove the column written by an earlier run of this cell, so re-running
    # replaces this model's scores instead of appending a duplicate column
    temp = temp.drop(columns=results.columns, errors='ignore')
    results = pd.concat([temp, results], axis=1)

# Export
results.to_csv(file)

In [13]:
# ---------------------------------------------------------------------------- #
#                                 OPTIMIZATION                                 #
# ---------------------------------------------------------------------------- #

# Initiate CodeCarbon to track emissions.
# This tracker is intentionally left running on success: it is stopped by the
# cell that evaluates the optimized model further down.
tracker = EmissionsTracker('NuSVC SVM optimization', log_level='warning')
tracker.start()

# Objective function
def objective(trial):
    """Score one Optuna trial for the NuSVC pipeline.

    Samples nu, kernel and gamma from `trial`, builds the same pipeline as the
    baseline cell (preprocessing, optional PCA, classifier) and returns the
    mean AUC-ROC over `K_Folds` cross-validation folds.
    """
    # Define parameters
    nu = trial.suggest_float('nu', 0.1, 0.9, log=True)  # nu is a parameter between 0 and 1
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])  # Kernel type
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])  # Gamma for the non-linear kernels

    # Create model
    temp_model = svm.NuSVC(nu=nu, kernel=kernel, gamma=gamma)

    # Build the pipeline; the PCA step is only inserted when requested
    steps = [('preprocessor', preprocessor)]
    if PC_Features:
        steps.append(('pca', PCA(n_components=12)))
    steps.append(('SVM-NuSVC-classifier', temp_model))

    pipeline = Pipeline(steps)

    # Perform cross-validation
    roc_auc = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=K_Folds).mean()

    return roc_auc

# Optimizing with Optuna. The sampler is seeded so that the search, and
# therefore the selected hyperparameters, can be repeated on a re-run.
study = optuna.create_study(direction='maximize', sampler=GPSampler(seed=Random_Seed))
try:
    study.optimize(objective, n_trials=100)
except Exception:
    # The cell that normally stops the tracker will not be reached after a
    # failure, so release the tracker here before propagating the error
    tracker.stop()
    raise

[codecarbon ERROR @ 10:15:26] Error: Another instance of codecarbon is already running. Turn off the other instance to be able to run this one. Exiting.

GPSampler is experimental (supported from v3.6.0). The interface can change in the future.

[I 2024-10-31 10:15:26,393] A new study created in memory with name: no-name-59d3bc31-ccb2-4659-ae1e-ad43320546ed
[I 2024-10-31 10:15:26,543] Trial 0 finished with value: 0.1510989010989011 and parameters: {'nu': 0.18265542863524883, 'kernel': 'sigmoid', 'gamma': 'auto'}. Best is trial 0 with value: 0.1510989010989011.
[I 2024-10-31 10:15:26,676] Trial 1 finished with value: 0.9004807692307694 and parameters: {'nu': 0.7398538242087201, 'kernel': 'poly', 'gamma': 'scale'}. Best is trial 1 with value: 0.9004807692307694.
[I 2024-10-31 10:15:26,826] Trial 2 finished with value: 0.1237293956043956 and parameters: {'nu': 0.21031925082988956, 'kernel': 'sigmoid', 'gamma': 'scale'}. Best is trial 1 with value: 0.9004807692307694.
[I 2024-10-31 10:15:2

In [14]:
# Show best model
trial = study.best_trial
print('AUC-ROC: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

# Save best trial parameters to a JSON file.
# The output directory is created first so the export does not fail with
# FileNotFoundError when 'params/' does not exist yet.
os.makedirs('params', exist_ok=True)
with open('params/SVM-NuSVC-params.json', 'w') as f:
    json.dump(trial.params, f)

AUC-ROC: 0.9153846153846155
Best hyperparameters: {'nu': 0.48359699088201413, 'kernel': 'linear', 'gamma': 'scale'}


In [15]:
# Visualizing optimization: history of the objective value (mean CV AUC-ROC) over the trials
optuna.visualization.plot_optimization_history(study)

In [16]:
# Slice plot of the study: objective value against each tuned hyperparameter
optuna.visualization.plot_slice(study)

In [17]:
# The emissions tracker started in the optimization cell is still running here.
# try/finally guarantees it is stopped even if the evaluation below raises, so
# it cannot block the trackers created by later cells.
try:
    # Create optimized model from the best trial's hyperparameters
    best_params = trial.params
    model = svm.NuSVC(random_state=Random_Seed, probability=True, **best_params)

    # Build the pipeline; the PCA step is only inserted when requested
    steps = [('preprocessor', preprocessor)]
    if PC_Features:
        steps.append(('pca', PCA(n_components=12)))
    steps.append(('SVM-NuSVC-classifier', model))

    pipeline = Pipeline(steps)

    # Perform cross-validation
    cv_results = cross_validate(pipeline, X, y, scoring=['accuracy', 'roc_auc'], cv=K_Folds)

    # Calculate and display results (mean and standard deviation across folds)
    acc = np.mean(cv_results['test_accuracy'])
    acc_std = np.std(cv_results['test_accuracy'])
    roc_auc = np.mean(cv_results['test_roc_auc'])
    roc_auc_std = np.std(cv_results['test_roc_auc'])

    print(f"Accuracy: {acc:.4f} ± {acc_std:.4f}")
    print(f"AUC-ROC: {roc_auc:.4f} ± {roc_auc_std:.4f}")
finally:
    # Stop emission tracking
    _ = tracker.stop()



Accuracy: 0.8416 ± 0.0567
AUC-ROC: 0.9154 ± 0.0345


In [18]:
# Save evaluation to an external file
file = 'scores.csv'

results = pd.DataFrame({
    'SVM-NuSVC-optimization': [acc, acc_std, roc_auc, roc_auc_std]
}, index = ['Accuracy', 'Accuracy STD', 'AUC-ROC', 'AUC-ROC STD'])

# Merge with the scores saved by previous cells/notebooks, if any
if os.path.exists(file):
    temp = pd.read_csv(file, index_col=0)
    # Remove the column written by an earlier run of this cell, so re-running
    # replaces this model's scores instead of appending a duplicate column
    temp = temp.drop(columns=results.columns, errors='ignore')
    results = pd.concat([temp, results], axis=1)

# Export
results.to_csv(file)

<hr>

### LinearSVC classification SVM implementation:

In [19]:
# ---------------------------------------------------------------------------- #
#                                     MODEL                                    #
# ---------------------------------------------------------------------------- #

# Initiate CodeCarbon to track emissions
tracker = EmissionsTracker('LinearSVC SVM model', log_level='warning')
tracker.start()

# Everything measured runs inside try/finally so the tracker is always stopped,
# even if cross-validation raises. A tracker left running makes later
# EmissionsTracker instances refuse to start ("Another instance of codecarbon
# is already running").
try:
    # Create model with default hyperparameters
    model = svm.LinearSVC(random_state=Random_Seed, dual=False)

    # Build the pipeline; the PCA step is only inserted when requested
    steps = [('preprocessor', preprocessor)]
    if PC_Features:
        steps.append(('pca', PCA(n_components=12)))
    steps.append(('SVM-LinearSVC-classifier', model))

    pipeline = Pipeline(steps)

    # Perform cross-validation
    cv_results = cross_validate(pipeline, X, y, scoring=['accuracy', 'roc_auc'], cv=K_Folds)

    # Calculate and display results (mean and standard deviation across folds)
    acc = np.mean(cv_results['test_accuracy'])
    acc_std = np.std(cv_results['test_accuracy'])
    roc_auc = np.mean(cv_results['test_roc_auc'])
    roc_auc_std = np.std(cv_results['test_roc_auc'])

    print(f"Accuracy: {acc:.4f} ± {acc_std:.4f}")
    print(f"AUC-ROC: {roc_auc:.4f} ± {roc_auc_std:.4f}")
finally:
    # Stop emission tracking
    _ = tracker.stop()

[codecarbon ERROR @ 10:16:04] Error: Another instance of codecarbon is already running. Turn off the other instance to be able to run this one. Exiting.


Accuracy: 0.8448 ± 0.0512
AUC-ROC: 0.9124 ± 0.0336


In [20]:
# Save evaluation to an external file
file = 'scores.csv'

results = pd.DataFrame({
    'SVM-LinearSVC-model': [acc, acc_std, roc_auc, roc_auc_std]
}, index = ['Accuracy', 'Accuracy STD', 'AUC-ROC', 'AUC-ROC STD'])

# Merge with the scores saved by previous cells/notebooks, if any
if os.path.exists(file):
    temp = pd.read_csv(file, index_col=0)
    # Remove the column written by an earlier run of this cell, so re-running
    # replaces this model's scores instead of appending a duplicate column
    temp = temp.drop(columns=results.columns, errors='ignore')
    results = pd.concat([temp, results], axis=1)

# Export
results.to_csv(file)

In [21]:
# ---------------------------------------------------------------------------- #
#                                 OPTIMIZATION                                 #
# ---------------------------------------------------------------------------- #

# Initiate CodeCarbon to track emissions.
# This tracker is intentionally left running on success: it is stopped by the
# cell that evaluates the optimized model further down.
tracker = EmissionsTracker('LinearSVC SVM optimization', log_level='warning')
tracker.start()

# Objective function
def objective(trial):
    """Score one Optuna trial for the LinearSVC pipeline.

    Samples C from `trial`, builds the same pipeline as the baseline cell
    (preprocessing, optional PCA, classifier) and returns the mean AUC-ROC
    over `K_Folds` cross-validation folds.
    """
    # Define parameters: regularization parameter C, sampled on a log scale
    C = trial.suggest_float('C', 1e-5, 1e2, log=True)

    # Create model
    temp_model = svm.LinearSVC(C=C, dual=False)

    # Build the pipeline; the PCA step is only inserted when requested
    steps = [('preprocessor', preprocessor)]
    if PC_Features:
        steps.append(('pca', PCA(n_components=12)))
    steps.append(('SVM-LinearSVC-classifier', temp_model))

    pipeline = Pipeline(steps)

    # Perform cross-validation
    roc_auc = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=K_Folds).mean()

    return roc_auc

# Optimizing with Optuna. The sampler is seeded so that the search, and
# therefore the selected hyperparameters, can be repeated on a re-run.
study = optuna.create_study(direction='maximize', sampler=GPSampler(seed=Random_Seed))
try:
    study.optimize(objective, n_trials=100)
except Exception:
    # The cell that normally stops the tracker will not be reached after a
    # failure, so release the tracker here before propagating the error
    tracker.stop()
    raise

[codecarbon ERROR @ 10:16:05] Error: Another instance of codecarbon is already running. Turn off the other instance to be able to run this one. Exiting.

GPSampler is experimental (supported from v3.6.0). The interface can change in the future.

[I 2024-10-31 10:16:05,124] A new study created in memory with name: no-name-c5cb75bf-53f7-46fd-8a79-7d88840b1a69


[I 2024-10-31 10:16:05,276] Trial 0 finished with value: 0.9128434065934066 and parameters: {'C': 7.676750739447821}. Best is trial 0 with value: 0.9128434065934066.
[I 2024-10-31 10:16:05,392] Trial 1 finished with value: 0.9123969780219781 and parameters: {'C': 1.1426618248805984}. Best is trial 0 with value: 0.9128434065934066.
[I 2024-10-31 10:16:05,509] Trial 2 finished with value: 0.9072458791208792 and parameters: {'C': 0.0024009916192333563}. Best is trial 0 with value: 0.9128434065934066.
[I 2024-10-31 10:16:05,631] Trial 3 finished with value: 0.9134271978021978 and parameters: {'C': 0.026995103800403597}. Best is trial 3 with value: 0.9134271978021978.
[I 2024-10-31 10:16:05,750] Trial 4 finished with value: 0.9152472527472527 and parameters: {'C': 0.07445655649076541}. Best is trial 4 with value: 0.9152472527472527.
[I 2024-10-31 10:16:05,887] Trial 5 finished with value: 0.9129464285714285 and parameters: {'C': 0.02608462541663076}. Best is trial 4 with value: 0.9152472527

In [22]:
# Show best model
trial = study.best_trial
print('AUC-ROC: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

# Save best trial parameters to a JSON file.
# The output directory is created first so the export does not fail with
# FileNotFoundError when 'params/' does not exist yet.
os.makedirs('params', exist_ok=True)
with open('params/SVM-LinearSVC-params.json', 'w') as f:
    json.dump(trial.params, f)

AUC-ROC: 0.9152472527472527
Best hyperparameters: {'C': 0.07445655649076541}


In [23]:
# Visualizing optimization: history of the objective value (mean CV AUC-ROC) over the trials
optuna.visualization.plot_optimization_history(study)

In [24]:
# Slice plot of the study: objective value against the tuned hyperparameter
optuna.visualization.plot_slice(study)

In [25]:
# The emissions tracker started in the optimization cell is still running here.
# try/finally guarantees it is stopped even if the evaluation below raises, so
# it is not left running after this notebook's last measured step.
try:
    # Create optimized model from the best trial's hyperparameters
    best_params = trial.params
    model = svm.LinearSVC(random_state=Random_Seed, dual=False, **best_params)

    # Build the pipeline; the PCA step is only inserted when requested
    steps = [('preprocessor', preprocessor)]
    if PC_Features:
        steps.append(('pca', PCA(n_components=12)))
    steps.append(('SVM-LinearSVC-classifier', model))

    pipeline = Pipeline(steps)

    # Perform cross-validation
    cv_results = cross_validate(pipeline, X, y, scoring=['accuracy', 'roc_auc'], cv=K_Folds)

    # Calculate and display results (mean and standard deviation across folds)
    acc = np.mean(cv_results['test_accuracy'])
    acc_std = np.std(cv_results['test_accuracy'])
    roc_auc = np.mean(cv_results['test_roc_auc'])
    roc_auc_std = np.std(cv_results['test_roc_auc'])

    print(f"Accuracy: {acc:.4f} ± {acc_std:.4f}")
    print(f"AUC-ROC: {roc_auc:.4f} ± {roc_auc_std:.4f}")
finally:
    # Stop emission tracking
    _ = tracker.stop()



Accuracy: 0.8515 ± 0.0576
AUC-ROC: 0.9152 ± 0.0345


In [26]:
# Save evaluation to an external file
file = 'scores.csv'

results = pd.DataFrame({
    'SVM-LinearSVC-optimization': [acc, acc_std, roc_auc, roc_auc_std]
}, index = ['Accuracy', 'Accuracy STD', 'AUC-ROC', 'AUC-ROC STD'])

# Merge with the scores saved by previous cells/notebooks, if any
if os.path.exists(file):
    temp = pd.read_csv(file, index_col=0)
    # Remove the column written by an earlier run of this cell, so re-running
    # replaces this model's scores instead of appending a duplicate column
    temp = temp.drop(columns=results.columns, errors='ignore')
    results = pd.concat([temp, results], axis=1)

# Export
results.to_csv(file)

### References:
* https://scikit-learn.org/stable/modules/svm.html