# Alternative Models: Eager Learners II

<hr>

* Support Vector Machine
    * SVC
    * NuSVC
    * LinearSVC

In [1]:
# Import needed libraries and modules
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import roc_auc_score, accuracy_score
import torch
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree
import optuna
from optuna.samplers import GPSampler
import sklearn
import sklearn.ensemble
from sklearn.pipeline import Pipeline

# Fetch dataset from UCI Repository
from ucimlrepo import fetch_ucirepo
# id=45 selects the dataset to download (presumably UCI "Heart Disease", judging by
# the variable name -- verify against the repository).
heart_disease = fetch_ucirepo(id=45)
# Keep the original table; the pre-processing cell below slices the features and the
# 'num' target column out of it.
df = heart_disease.data.original

In [2]:
# ---------------------------------------------------------------------------- #
#                                PRE-PROCESSING                                #
# ---------------------------------------------------------------------------- #

# --------------------------------- SETTINGS --------------------------------- #
Normalize = True          # standardize numeric columns / one-hot encode categorical ones
PC_Features = True        # project the encoded features onto principal components
Test_Size = 0.2           # fraction of rows held out for the test split
Random_Seed = 82024       # seed shared by the split and the models
Torch = False             # convert the split arrays to torch tensors
Cross_Validation = True   # later cells evaluate with cross-validation instead of the split

# Remove rows with missing values and renumber the index from 0 so that the
# positional indices produced by the split below are also valid .loc labels.
df = df.dropna().reset_index(drop=True)

# Binarize the target: every non-zero 'num' becomes 1.
df.loc[df['num'] != 0, 'num'] = 1

# Features are every column except the last one (presumably 'num' is the last
# column of the table -- verify); the target is the 'num' column.
X = df.iloc[:, :-1]
y = df['num']

# The encoding step runs when either flag is set, so PCA is always applied to
# standardized / one-hot encoded features.
# NOTE(review): the transformers (and the PCA below) are fitted on all rows,
# before the train/test split and before any cross-validation.
if Normalize or PC_Features:
    int_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
    cat_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

    preprocessor = ColumnTransformer(
        transformers=[
            ('int', StandardScaler(), int_features),
            ('cat', OneHotEncoder(), cat_features),
        ]
    )
    X = preprocessor.fit_transform(X)
else:
    X = X.values

# Optional dimensionality reduction to 12 principal components.
if PC_Features:
    pca = PCA(n_components=12)
    X = pca.fit_transform(X)

# Split row positions rather than the data itself, then index X and y with them.
index = list(range(len(y)))
train_index, test_index = train_test_split(
    index,
    test_size=Test_Size,
    random_state=Random_Seed,
)

train_X = X[train_index]
test_X = X[test_index]
train_y = y.loc[train_index].values
test_y = y.loc[test_index].values

# Optional conversion to torch tensors; targets are cast to float64.
if Torch:
    train_X = torch.tensor(train_X)
    train_y = torch.tensor(train_y).double()
    test_X = torch.tensor(test_X)
    test_y = torch.tensor(test_y).double()

<hr>

## Support Vector Machines:

Support Vector Machines (SVMs) are machine learning algorithms that work by finding a hyperplane that separates the data into classes.

**The advantages of support vector machines are:**

* Effective in high dimensional spaces.

* Still effective in cases where the number of dimensions is greater than the number of samples.

* Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.

* Versatile: different Kernel functions can be specified for the decision function as in Gaussian Process Models.

**The disadvantages of support vector machines include:**

* If the number of features is much greater than the number of samples, avoiding over-fitting when choosing the kernel function and the regularization term is crucial.

* SVMs do not directly provide probability estimates.

<hr>

### SVC classification SVM implementation:

In [3]:
# ---------------------------------------------------------------------------- #
#                                     MODEL                                    #
# ---------------------------------------------------------------------------- #

# Baseline SVC with default hyperparameters; probability=True is needed for
# the predict_proba call in the hold-out branch.
model = svm.SVC(probability=True, random_state=Random_Seed)

if not Cross_Validation:
    # Hold-out evaluation: fit on the training split, score on the test split.
    model.fit(train_X, train_y)

    y_pred = model.predict(test_X)
    pred_probs = model.predict_proba(test_X)

    acc = accuracy_score(test_y, y_pred)
    # Column 1 holds the probability of the positive class.
    roc_auc = roc_auc_score(test_y, pred_probs[:, 1])
else:
    # Cross-validated evaluation on the full feature matrix; report fold means.
    cv_results = cross_validate(model, X, y, scoring=['accuracy', 'roc_auc'])
    acc = cv_results['test_accuracy'].mean()
    roc_auc = cv_results['test_roc_auc'].mean()

print(f"Accuracy: {acc:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")

Accuracy: 0.8281
AUC-ROC: 0.8987


In [4]:
# ---------------------------------------------------------------------------- #
#                                 OPTIMIZATION                                 #
# ---------------------------------------------------------------------------- #

# Objective function
def objective(trial):
    """Score one SVC configuration proposed by Optuna.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``C``, ``kernel`` and ``gamma``.

    Returns
    -------
    float
        Mean cross-validated ROC-AUC of the sampled SVC on the notebook-level
        ``X`` and ``y``.
    """
    # Regularization parameter C, sampled on a log scale over [1e-5, 1e2].
    C = trial.suggest_float('C', 1e-5, 1e2, log=True)
    # Kernel type.
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    # Kernel coefficient option; sampled for every trial regardless of kernel.
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])

    # Build the SVC from the suggested hyperparameters.
    clf = sklearn.svm.SVC(C=C, kernel=kernel, gamma=gamma)

    # The value being maximized is the mean cross-validation ROC-AUC.
    return sklearn.model_selection.cross_val_score(clf, X, y, scoring='roc_auc').mean()


# Run Optuna with the Gaussian-process sampler. The sampler is seeded with the
# notebook's Random_Seed so that the search is reproducible across reruns.
study = optuna.create_study(direction='maximize', sampler=GPSampler(seed=Random_Seed))
study.optimize(objective, n_trials=100)

  study = optuna.create_study(direction='maximize', sampler=GPSampler()) #Criando a otimização, o GPSamples é um processo gaussiano
[I 2024-10-21 17:59:32,025] A new study created in memory with name: no-name-7b056338-c058-4f1d-9ecf-cca19a165813
[I 2024-10-21 17:59:32,046] Trial 0 finished with value: 0.9110615079365079 and parameters: {'C': 0.23500206190885128, 'kernel': 'linear', 'gamma': 'scale'}. Best is trial 0 with value: 0.9110615079365079.
[I 2024-10-21 17:59:32,065] Trial 1 finished with value: 0.895056216931217 and parameters: {'C': 4.822875673756542, 'kernel': 'sigmoid', 'gamma': 'scale'}. Best is trial 0 with value: 0.9110615079365079.
[I 2024-10-21 17:59:32,087] Trial 2 finished with value: 0.884292328042328 and parameters: {'C': 0.0011547400716194068, 'kernel': 'poly', 'gamma': 'scale'}. Best is trial 0 with value: 0.9110615079365079.
[I 2024-10-21 17:59:32,108] Trial 3 finished with value: 0.890162037037037 and parameters: {'C': 0.0005247205660969763, 'kernel': 'linear',

In [5]:
# Show the best trial found by the study.
# `trial` is kept at notebook level: a later cell rebuilds the model from trial.params.
trial = study.best_trial
print('ROC_AUC: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

ROC_AUC: 0.9137235449735449
Best hyperparameters: {'C': 1.8133998891442105, 'kernel': 'linear', 'gamma': 'auto'}


In [6]:
# Visualize the optimization history of the study
optuna.visualization.plot_optimization_history(study)

In [7]:
# Slice plot of the study: objective value against each sampled hyperparameter
optuna.visualization.plot_slice(study)

In [8]:
# Rebuild the SVC with the best hyperparameters found by the study.
best_params = trial.params
model = svm.SVC(**best_params, probability=True, random_state=Random_Seed)

if not Cross_Validation:
    # Hold-out evaluation: fit on the training split, score on the test split.
    model.fit(train_X, train_y)

    y_pred = model.predict(test_X)
    pred_probs = model.predict_proba(test_X)

    acc = accuracy_score(test_y, y_pred)
    # Column 1 holds the probability of the positive class.
    roc_auc = roc_auc_score(test_y, pred_probs[:, 1])
else:
    # Cross-validated evaluation on the full feature matrix; report fold means.
    cv_results = cross_validate(model, X, y, scoring=['accuracy', 'roc_auc'])
    acc = cv_results['test_accuracy'].mean()
    roc_auc = cv_results['test_roc_auc'].mean()

print(f"Accuracy: {acc:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")

Accuracy: 0.8214
AUC-ROC: 0.9137


<hr>

### NuSVC classification SVM implementation:

In [9]:
# ---------------------------------------------------------------------------- #
#                                     MODEL                                    #
# ---------------------------------------------------------------------------- #

# Baseline NuSVC with default hyperparameters; probability=True is needed for
# the predict_proba call in the hold-out branch.
model = svm.NuSVC(probability=True, random_state=Random_Seed)

if not Cross_Validation:
    # Hold-out evaluation: fit on the training split, score on the test split.
    model.fit(train_X, train_y)

    y_pred = model.predict(test_X)
    pred_probs = model.predict_proba(test_X)

    acc = accuracy_score(test_y, y_pred)
    # Column 1 holds the probability of the positive class.
    roc_auc = roc_auc_score(test_y, pred_probs[:, 1])
else:
    # Cross-validated evaluation on the full feature matrix; report fold means.
    cv_results = cross_validate(model, X, y, scoring=['accuracy', 'roc_auc'])
    acc = cv_results['test_accuracy'].mean()
    roc_auc = cv_results['test_roc_auc'].mean()

print(f"Accuracy: {acc:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")

Accuracy: 0.8315
AUC-ROC: 0.9010


In [10]:
# ---------------------------------------------------------------------------- #
#                                 OPTIMIZATION                                 #
# ---------------------------------------------------------------------------- #

# Objective function
def objective(trial):
    """Score one NuSVC configuration proposed by Optuna.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``nu``, ``kernel`` and ``gamma``.

    Returns
    -------
    float
        Mean cross-validated ROC-AUC of the sampled NuSVC on the notebook-level
        ``X`` and ``y``, or ``0.0`` when the configuration cannot be fitted
        (for example when ``nu`` is infeasible for the data).
    """
    # Hyperparameter search space.
    nu = trial.suggest_float('nu', 0.1, 0.9, log=True)  # nu must lie between 0 and 1
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])  # kernel type
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])  # kernel coefficient option

    # NuSVC built from the suggested hyperparameters.
    clf = sklearn.svm.NuSVC(nu=nu, kernel=kernel, gamma=gamma)

    try:
        # error_score='raise' makes a failed fit surface as an exception instead
        # of a NaN fold score, so the except clause below actually sees it.
        fold_scores = sklearn.model_selection.cross_val_score(
            clf, X, y, scoring='roc_auc', error_score='raise'
        )
    except ValueError:
        # Infeasible configuration: return the lowest possible ROC-AUC. A finite
        # penalty keeps the objective values bounded for the sampler.
        return 0.0

    # The value being maximized is the mean cross-validation ROC-AUC.
    return fold_scores.mean()

# Run Optuna with the Gaussian-process sampler. The sampler is seeded with the
# notebook's Random_Seed so that the search is reproducible across reruns.
study = optuna.create_study(direction='maximize', sampler=GPSampler(seed=Random_Seed))
study.optimize(objective, n_trials=100)


GPSampler is experimental (supported from v3.6.0). The interface can change in the future.

[I 2024-10-21 18:00:00,949] A new study created in memory with name: no-name-e4bd6fce-709b-4240-bf2b-f7f1637a78f3
[I 2024-10-21 18:00:02,383] Trial 0 finished with value: 0.9110284391534392 and parameters: {'nu': 0.3504516101661246, 'kernel': 'linear', 'gamma': 'auto'}. Best is trial 0 with value: 0.9110284391534392.
[I 2024-10-21 18:00:02,993] Trial 1 finished with value: 0.9105406746031747 and parameters: {'nu': 0.5600524597545288, 'kernel': 'linear', 'gamma': 'auto'}. Best is trial 0 with value: 0.9110284391534392.
[I 2024-10-21 18:00:03,566] Trial 2 finished with value: 0.9137070105820106 and parameters: {'nu': 0.3466952049830671, 'kernel': 'linear', 'gamma': 'scale'}. Best is trial 2 with value: 0.9137070105820106.
[I 2024-10-21 18:00:03,597] Trial 3 finished with value: 0.19114583333333335 and parameters: {'nu': 0.10044994106962656, 'kernel': 'sigmoid', 'gamma': 'scale'}. Best is trial 2 

In [11]:
# Show the best trial found by the study.
# `trial` is kept at notebook level: a later cell rebuilds the model from trial.params.
trial = study.best_trial
print('ROC_AUC: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

ROC_AUC: 0.9187417328042328
Best hyperparameters: {'nu': 0.31593168620773676, 'kernel': 'linear', 'gamma': 'scale'}


In [12]:
# Visualize the optimization history of the study
optuna.visualization.plot_optimization_history(study)

In [13]:
# Slice plot of the study: objective value against each sampled hyperparameter
optuna.visualization.plot_slice(study)

In [14]:
# Rebuild the NuSVC with the best hyperparameters found by the study.
best_params = trial.params
model = svm.NuSVC(**best_params, probability=True, random_state=Random_Seed)

if not Cross_Validation:
    # Hold-out evaluation: fit on the training split, score on the test split.
    model.fit(train_X, train_y)

    y_pred = model.predict(test_X)
    pred_probs = model.predict_proba(test_X)

    acc = accuracy_score(test_y, y_pred)
    # Column 1 holds the probability of the positive class.
    roc_auc = roc_auc_score(test_y, pred_probs[:, 1])
else:
    # Cross-validated evaluation on the full feature matrix; report fold means.
    cv_results = cross_validate(model, X, y, scoring=['accuracy', 'roc_auc'])
    acc = cv_results['test_accuracy'].mean()
    roc_auc = cv_results['test_roc_auc'].mean()

print(f"Accuracy: {acc:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")

Accuracy: 0.8415
AUC-ROC: 0.9187


<hr>

### LinearSVC classification SVM implementation:

In [15]:
# ---------------------------------------------------------------------------- #
#                                     MODEL                                    #
# ---------------------------------------------------------------------------- #

# Create model
model = svm.LinearSVC(random_state=Random_Seed, dual=False)

if Cross_Validation:
    # Cross-validated evaluation on the full feature matrix; report fold means.
    cv_results = cross_validate(model, X, y, scoring=['accuracy', 'roc_auc'])
    acc = cv_results['test_accuracy'].mean()
    roc_auc = cv_results['test_roc_auc'].mean()
else:
    # Train model
    model.fit(train_X, train_y)

    # Test model
    y_pred = model.predict(test_X)
    # LinearSVC has no predict_proba, but ROC-AUC only needs a ranking score,
    # so the decision_function output is used instead of probabilities.
    decision_scores = model.decision_function(test_X)

    # Evaluation
    acc = accuracy_score(test_y, y_pred)
    # Previously roc_auc was left unassigned in this branch, so the print below
    # showed a stale value from an earlier cell (or raised NameError).
    roc_auc = roc_auc_score(test_y, decision_scores)

print(f"Accuracy: {acc:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")

Accuracy: 0.8415
AUC-ROC: 0.9191


In [16]:
# ---------------------------------------------------------------------------- #
#                                 OPTIMIZATION                                 #
# ---------------------------------------------------------------------------- #

# Objective function
def objective(trial):
    """Score one LinearSVC configuration proposed by Optuna.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``C`` and ``max_iter``.

    Returns
    -------
    float
        Mean cross-validated ROC-AUC of a StandardScaler + LinearSVC pipeline
        on the notebook-level ``X`` and ``y``.
    """
    # Hyperparameter search space.
    C = trial.suggest_float('C', 1e-5, 1e2, log=True)  # regularization parameter, log scale
    max_iter = trial.suggest_int('max_iter', 100, 10000)  # maximum number of solver iterations

    # Pipeline: standardize the inputs, then fit the LinearSVC. The scaler is
    # re-fitted inside every cross-validation fold because it is part of the estimator.
    clf = Pipeline([
        ('scaler', StandardScaler()),
        ('svc', sklearn.svm.LinearSVC(C=C, max_iter=max_iter, dual=False))
    ])

    # The value being maximized is the mean cross-validation ROC-AUC.
    return sklearn.model_selection.cross_val_score(clf, X, y, scoring='roc_auc').mean()

# Run Optuna with the Gaussian-process sampler. The sampler is seeded with the
# notebook's Random_Seed so that the search is reproducible across reruns.
study = optuna.create_study(direction='maximize', sampler=GPSampler(seed=Random_Seed))
study.optimize(objective, n_trials=100)


GPSampler is experimental (supported from v3.6.0). The interface can change in the future.

[I 2024-10-21 18:00:25,633] A new study created in memory with name: no-name-72609d84-c33b-413c-a693-6f2e7f70ab97
[I 2024-10-21 18:00:25,660] Trial 0 finished with value: 0.9151041666666666 and parameters: {'C': 0.012013871765207159, 'max_iter': 2928}. Best is trial 0 with value: 0.9151041666666666.
[I 2024-10-21 18:00:25,684] Trial 1 finished with value: 0.9117063492063492 and parameters: {'C': 0.00032738922793800666, 'max_iter': 7479}. Best is trial 0 with value: 0.9151041666666666.
[I 2024-10-21 18:00:25,703] Trial 2 finished with value: 0.9188161375661377 and parameters: {'C': 0.2542216530408756, 'max_iter': 7756}. Best is trial 2 with value: 0.9188161375661377.
[I 2024-10-21 18:00:25,726] Trial 3 finished with value: 0.911698082010582 and parameters: {'C': 0.00048809234031923544, 'max_iter': 1381}. Best is trial 2 with value: 0.9188161375661377.
[I 2024-10-21 18:00:25,747] Trial 4 finished

In [17]:
# Show the best trial found by the study.
# `trial` is kept at notebook level: a later cell rebuilds the model from trial.params.
trial = study.best_trial
print('AUC-ROC: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

AUC-ROC: 0.9190558862433862
Best hyperparameters: {'C': 3.061495617875694, 'max_iter': 5679}


In [18]:
# Visualize the optimization history of the study
optuna.visualization.plot_optimization_history(study)

In [19]:
# Slice plot of the study: objective value against each sampled hyperparameter
optuna.visualization.plot_slice(study)

In [20]:
# Rebuild the model with the best hyperparameters found by the study.
# The search scored a StandardScaler + LinearSVC pipeline, so the same pipeline
# is evaluated here; plugging the tuned C / max_iter into a bare LinearSVC would
# evaluate a different model from the one that was optimized.
best_params = trial.params
model = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', svm.LinearSVC(random_state=Random_Seed, dual=False, **best_params)),
])

if Cross_Validation:
    # Cross-validated evaluation on the full feature matrix; report fold means.
    cv_results = cross_validate(model, X, y, scoring=['accuracy', 'roc_auc'])
    acc = cv_results['test_accuracy'].mean()
    roc_auc = cv_results['test_roc_auc'].mean()
else:
    # Train model
    model.fit(train_X, train_y)

    # Test model
    y_pred = model.predict(test_X)
    # LinearSVC has no predict_proba, but ROC-AUC only needs a ranking score,
    # so the decision_function output is used instead of probabilities.
    decision_scores = model.decision_function(test_X)

    # Evaluation
    acc = accuracy_score(test_y, y_pred)
    # Previously roc_auc was left unassigned in this branch, so the print below
    # showed a stale value from an earlier cell (or raised NameError).
    roc_auc = roc_auc_score(test_y, decision_scores)

print(f"Accuracy: {acc:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")

Accuracy: 0.8415
AUC-ROC: 0.9191


### References:
* https://scikit-learn.org/stable/modules/svm.html