# Support Vector Machine

## Import and Export

### Importing data

In [6]:
# Imports 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC

import os
import sys

sys.path.append(os.path.abspath("../scripts"))
from data_loader import DataLoader

In [7]:
data_loader = DataLoader()
X_train, y_train = data_loader.training_data
X_val, y_val = data_loader.validation_data
X_test, y_test = data_loader.test_data

In [8]:
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")
# (data == value_to_count) creates a boolean Series where True corresponds to occurrences of the specific value.
# .sum() counts the True values (since True is equivalent to 1 in Python).
print(f"Number of negative samples in training set: {(y_train == 0.0).sum()}")
print(f"Number of positive samples in training set: {(y_train == 1.0).sum()}")

X_train shape: (177576, 21)
y_train shape: (177576,)
X_val shape: (25875, 21)
y_val shape: (25875,)
X_test shape: (50229, 21)
y_test shape: (50229,)
Number of negative samples in training set: 149592
Number of positive samples in training set: 27984


### Exporting models

In [9]:
import joblib
from datetime import datetime

# Save model to pkl file for later reuse
def save_model (model, model_name):
    # Get the current timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Save the best model to a file with a timestamp
    model_filename = f'../models/support_vector_machine/svm_model_{model_name}_{timestamp}.pkl'
    joblib.dump(model, model_filename)

    print(f"Initial model saved to '{model_filename}'")

## Initial exploration

Here, we just try out the Support Vector Machine to get an initial feeling how it performs and to have something to improve upon in the following sections using Resampling, Hyperparameter Tuning, and PCA.

In [None]:
# Initialize the support vector machine model
model_initial = SVC(C=1.0, kernel='rbf', verbose=True)

# Train the model on the preprocessed training data
model_initial.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = model_initial.predict(X_val)

# Evaluate the model's performance
accuracy = accuracy_score(y_val, y_val_pred)
report = classification_report(y_val, y_val_pred)

print(f"Validation Accuracy: {accuracy}")
print("Classification Report:\n", report)

This is basically the exact same result as for the majority classifier.

Because it has such a long runtime (), we save it. Of course, we hope that we do not have to reuse it because we improve it in the following cells. But better safe than sorry. 

In [None]:
# save model to pkl file for later reuse
import joblib
from datetime import datetime

# Get the current timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save the best model to a file with a timestamp
model_filename = f'../models/support_vector_machine/svm_model_initial_{timestamp}.pkl'
joblib.dump(model_initial, model_filename)

print(f"Initial model saved to '{model_filename}'")

## Resampling Methods

### Random Undersampling

In [5]:
# test random undersampling
X_train_undersampling_random, y_train_undersampling_random = data_loader.training_data_undersampling_random
X_val, y_val = data_loader.validation_data
X_test, y_test = data_loader.test_data

print(f"X_train_undersampling_random shape: {X_train_undersampling_random.shape}")
print(f"y_train_undersampling_random shape: {y_train_undersampling_random.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

# (data == value_to_count) creates a boolean Series where True corresponds to occurrences of the specific value.
# .sum() counts the True values (since True is equivalent to 1 in Python).
print(f"Number of negative samples in undersampled training set: {(y_train_undersampling_random == 0.0).sum()}")
print(f"Number of positive samples in undersampled training set: {(y_train_undersampling_random == 1.0).sum()}")

X_train_undersampling_random shape: (55968, 21)
y_train_undersampling_random shape: (55968,)
X_val shape: (25875, 21)
y_val shape: (25875,)
X_test shape: (50229, 21)
y_test shape: (50229,)
Number of negative samples in undersampled training set: 27984
Number of positive samples in undersampled training set: 27984


After the undersampling, we have as many positive as negative examples in our training data. This more balanced training set might be useful to improve the recall on the positive class, i.e., diabetes (previously, the minority class).

By random undersampling, we reduced the original training dataset containing 177567 examples to 55968 examples, which is approximately a third of the original. 

In [8]:
# Initialize the support vector machine model
model_undersampling = SVC(C=1.0, kernel='rbf', verbose=True)

# Train the model on the preprocessed training data
model_undersampling.fit(X_train_undersampling_random, y_train_undersampling_random)

# Make predictions on the validation set
y_val_pred_undersampling = model_undersampling.predict(X_val)



[LibSVM]

In [None]:
# Evaluate the model's performance
accuracy_undersampling = accuracy_score(y_val, y_val_pred_undersampling)
report_undersampling = classification_report(y_val, y_val_pred_undersampling)

print(f"Validation Accuracy: {accuracy_undersampling}")
print("Classification Report:\n", report_undersampling)

print(f"Number of support vectors for each class: {model_undersampling.n_support_}")

Validation Accuracy: 0.6922125603864734
Classification Report:
               precision    recall  f1-score   support

         0.0       0.95      0.67      0.79     21797
         1.0       0.32      0.81      0.45      4078

    accuracy                           0.69     25875
   macro avg       0.63      0.74      0.62     25875
weighted avg       0.85      0.69      0.73     25875

[16269 16310]


The reduction of data points to a third drastically improves the runtime from over 40 minutes (TODO!) to 3 minutes (in our setting). This makes sense since the training time of a Support Vector Machine generally has a cubic runtime O(n^3), i.e., it grows cubically with the number n of training examples (source: https://stackoverflow.com/questions/18165213/how-much-time-does-it-take-to-train-a-svm-classifier).

SVM on undersampled data yields a better recall for the positive class than SVM on the full training data. But as expected for higher recall, but the overall accuracy suffers.

In [None]:
save_model(model_undersampling, "undersampling")

Initial model saved to '../models/support_vector_machine/svm_model_undersampling_20241125_105832.pkl'


Paper for theory why Undersampling is so good for SVMs: https://www.sciencedirect.com/science/article/pii/S1474667016429952

### Another Undersampling

In [None]:
# TODO: one or two other undersampling methods so that I can compare?
# ideas:
# - slighter/softer undersampling (i.e., majority class has just 2x more examples) -> slightly more examples
# - undersampling the SMOTE tomek
# - more ideas: https://imbalanced-learn.org/stable/references/under_sampling.html

### Random Oversampling

Testing random oversampling for SVMs would make training time even longer. For comparison, the original training dataset has 177567 samples and our undersampled dataset 63964 samples (i.e., approximately a third of the original). Our oversampled dataset (the random version and also the SMOTE version) has 299184 samples. Thus, is not performed for SVMs here.

### SMOTE Oversampling

Testing SMOTE oversampling for SVMs would make training time even longer. For comparison, the original training dataset has 177567 samples and our undersampled dataset 63964 samples (i.e., approximately a third of the original). Our oversampled dataset (the random version and also the SMOTE version) has 299184 samples. Thus, is not performed for SVMs here.

### SMOTE Tomek

In [12]:
X_train_oversampling_smote_tomek, y_train_oversampling_smote_tomek = data_loader.training_data_resampling_smote_tomek
X_val, y_val = data_loader.validation_data
X_test, y_test = data_loader.test_data

print(f"X_train_oversampling_smote shape: {X_train_oversampling_smote_tomek.shape}")
print(f"y_train_oversampling_smote shape: {y_train_oversampling_smote_tomek.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")
print(f"Number of negative samples in SMOTE Tomek training set: {(y_train_oversampling_smote_tomek == 0.0).sum()}")
print(f"Number of positive samples in SMOTE Tomek training set: {(y_train_oversampling_smote_tomek == 1.0).sum()}")

X_train_oversampling_smote shape: (298548, 21)
y_train_oversampling_smote shape: (298548,)
X_val shape: (25875, 21)
y_val shape: (25875,)
X_test shape: (50229, 21)
y_test shape: (50229,)
Number of negative samples in SMOTE Tomek training set: 149274
Number of positive samples in SMOTE Tomek training set: 149274


After the SMOTE Tomek sampling, we have as many positive as negative examples in our training data, as after the Random Undersampling. Here again, we hope that this more balanced training set might be useful to improve the recall on the positive class (i.e., diabetes) (previously, the minority class) compared with the original dataset. 

By SMOTE Tomek sampling, we increased the original training dataset containing 177567 examples to 298548 examples, which is roughly 1.5 times the size of the original dataset. 

In [None]:
# TODO

In [None]:
# Initialize the support vector machine model
model_smote_tomek = SVC(C=1.0, kernel='rbf', verbose=True)

# Train the model on the preprocessed training data
model_smote_tomek.fit(X_train_oversampling_smote_tomek, y_train_oversampling_smote_tomek)

# Make predictions on the validation set
y_val_pred_smote_tomek = model_smote_tomek.predict(X_val)

# Evaluate the model's performance
accuracy_smote_tomek = accuracy_score(y_val, y_val_pred_smote_tomek)
report_smote_tomek = classification_report(y_val, y_val_pred_smote_tomek)

print(f"Validation Accuracy: {accuracy_smote_tomek}")
print("Classification Report:\n", report_smote_tomek)

[LibSVM]

In [None]:
save_model(model_smote_tomek, "smote_tomek")

### Conclusion

In [None]:
# TODO: compare inparticular original, random undersampling, and SMOTE Tomek

## Hyperparameter Tuning

We tune the following hyperparameters: the type of kernel (e.g., linear, rbf, polynomial, sigmoid), the regularization parameter (C), and kernel-specific parameters like gamma for the RBF kernel and the degree for polynomial kernels.

The best model of randomized search yields a worse accuracy than the best model of grid search. Also the f1-score is worse. Thus, it seems to make sense to go with the model identified by grid search.

### Halving Grid Search

Due to time complexity, hyperparameter tuning is only feasible with the halving grid search.

### F1 as objective

In [None]:
# Hyperparameter tuning with Halving Grid Search
from sklearn.experimental import enable_halving_search_cv
from imblearn.pipeline import Pipeline
from sklearn.model_selection import HalvingGridSearchCV
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, CondensedNearestNeighbour

pipeline = Pipeline([
    ('resampler', None),  # Placeholder for resampling method
    ('classifier', SVC(max_iter=10000, random_state=42, probability=True))  # Model
])

param_grid_toy = {
    'classifier__kernel': ['linear'],  # Kernel type
    'classifier__C': [0.1],  # Regularization strength
    # 'classifier__penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Regularization type  # not possible for SVM
    # 'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga', 'sag'],  # Optimization algorithm  # not possible for SVM
    'classifier__tol': [1e-3],  # Tolerance for stopping criteria  
    'resampler': [RandomUnderSampler(random_state=42)] # many other options: TomekLinks(), CondensedNearestNeighbour(), ...
}
param_grid_small = {
    'classifier__kernel': ['linear'],  # Kernel type
    'classifier__C': [0.01, 0.1, 1, 10],  # Regularization strength
    'resampler': [RandomUnderSampler(random_state=42), RandomUnderSampler(random_state=42, sampling_strategy=0.5)] # many other options: TomekLinks(), CondensedNearestNeighbour(), ...
}
param_grid = [
    {
        'classifier__kernel': ['linear'],  # Kernel type
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
        'classifier__tol': [1e-3, 1e-2, 1e-1],  # Tolerance for stopping criteria 
        'resampler': [RandomUnderSampler(random_state=42), RandomUnderSampler(random_state=42, sampling_strategy=0.5)] # many other options: TomekLinks(), CondensedNearestNeighbour(), ...
    }, 
    {
        'classifier__kernel': ['poly'],  # Kernel type
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
        'classifier__degree': [2, 3, 4, 5], # Degree of the polynomial kernel function (only for ‘poly’)
        'classifier__gamma': ['scale', 1, 0.1, 0.01, 0.001, 0.0001],  # Kernel coefficient (only for ‘rbf’, ‘poly’ and ‘sigmoid’)
        'classifier__coef0': [-1.0, 0.0, 1.0],  # Independent term in kernel function (only for ‘poly’ and ‘sigmoid’)
        'classifier__tol': [1e-3, 1e-2, 1e-1],  # Tolerance for stopping criteria 
        'resampler': [RandomUnderSampler(random_state=42), RandomUnderSampler(random_state=42, sampling_strategy=0.5)] # many other options: TomekLinks(), CondensedNearestNeighbour(), ...
    }, 
    {
        'classifier__kernel': ['rbf'],  # Kernel type
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
        'classifier__gamma': ['scale', 1, 0.1, 0.01, 0.001, 0.0001],  # Kernel coefficient  # Kernel coefficient (only for ‘rbf’, ‘poly’ and ‘sigmoid’)
        'classifier__tol': [1e-3, 1e-2, 1e-1],  # Tolerance for stopping criteria  # TODO if time allows
        'resampler': [RandomUnderSampler(random_state=42), RandomUnderSampler(random_state=42, sampling_strategy=0.5)] # many other options: TomekLinks(), CondensedNearestNeighbour(), ...
    }
]

# Set up HalvingGridSearchCV
halving_grid_search = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_small,  # Parameter grid remains the same  # TODO
    cv=5,  # 5-fold cross-validation
    scoring='f1',  # we want to optimize recall  # TODO decide with team
    n_jobs=-1,  # Use all processors
    verbose=1  # To track progress
)

# Fit the random search on training data
halving_grid_search.fit(X_train, y_train)

# Get the best parameters and score
print("Best Parameters:", halving_grid_search.best_params_)
print("Best Cross-Validation F1-Score:", halving_grid_search.best_score_)

n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 59192
max_resources_: 177576
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 8
n_resources: 59192
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 1
n_candidates: 3
n_resources: 177576
Fitting 5 folds for each of 3 candidates, totalling 15 fits


KeyboardInterrupt: 

In [45]:
report_halving_GS = classification_report(y_val, halving_grid_search.predict(X_val))
print("Classification Report:\n", report_halving_GS)


Classification Report:
               precision    recall  f1-score   support

         0.0       0.73      0.00      0.00     21797
         1.0       0.16      1.00      0.27      4078

    accuracy                           0.16     25875
   macro avg       0.44      0.50      0.14     25875
weighted avg       0.64      0.16      0.05     25875



In [None]:
best_model_halving_grid = halving_grid_search.best_estimator_
save_model(best_model_halving_grid, "halving-grid")

Initial model saved to '../models/support_vector_machine/svm_model_halving-grid_small_20241125_184350.pkl'


### Accuracy as objective

Let's try with scoring='accuracy' and see whether the results are more balanced.

In [None]:
# Hyperparameter tuning with Halving Grid Search
from sklearn.experimental import enable_halving_search_cv
from imblearn.pipeline import Pipeline
from sklearn.model_selection import HalvingGridSearchCV
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, CondensedNearestNeighbour
from imblearn.combine import SMOTETomek

pipeline = Pipeline([
    ('resampler', None),  # Placeholder for resampling method
    ('classifier', SVC(max_iter=10000, random_state=42, probability=True))  # Model
])

param_grid_toy = {
    'classifier__kernel': ['linear'],  # Kernel type
    'classifier__C': [0.1],  # Regularization strength
    # 'classifier__penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Regularization type  # not possible for SVM
    # 'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga', 'sag'],  # Optimization algorithm  # not possible for SVM
    'classifier__tol': [1e-3],  # Tolerance for stopping criteria 
    'resampler': [RandomUnderSampler(random_state=42)] # many other options: TomekLinks(), CondensedNearestNeighbour(), ...
}
param_grid_small = {
    'classifier__kernel': ['linear'],  # Kernel type
    'classifier__C': [0.01, 0.1, 1, 10],  # Regularization strength
    'resampler': [RandomUnderSampler(random_state=42), RandomUnderSampler(random_state=42, sampling_strategy=0.5)] # many other options: TomekLinks(), CondensedNearestNeighbour(), ...
}
param_grid = [
    {
        'classifier__kernel': ['linear'],  # Kernel type
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
        'classifier__tol': [1e-3, 1e-2, 1e-1],  # Tolerance for stopping criteria 
        'resampler': [None,  # original, imbalanced dataset
                      RandomUnderSampler(random_state=42), 
                      RandomUnderSampler(random_state=42, sampling_strategy=0.5), # sampling_strategy = number of minoority class instances / number of majority class instances
                      RandomUnderSampler(random_state=42, sampling_strategy=0.25), 
                      RandomOverSampler(random_state=42), 
                      SMOTE(random_state=42), 
                      SMOTETomek(random_state=42)] 
    }, 
    {
        'classifier__kernel': ['poly'],  # Kernel type
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
        'classifier__degree': [2, 3, 4, 5], # Degree of the polynomial kernel function (only for ‘poly’)
        'classifier__gamma': ['scale', 1, 0.1, 0.01, 0.001, 0.0001],  # Kernel coefficient (only for ‘rbf’, ‘poly’ and ‘sigmoid’)
        'classifier__coef0': [-1.0, 0.0, 1.0],  # Independent term in kernel function (only for ‘poly’ and ‘sigmoid’)
        'classifier__tol': [1e-3, 1e-2, 1e-1],  # Tolerance for stopping criteria  
        'resampler': [None,  # original, imbalanced dataset
                      RandomUnderSampler(random_state=42), 
                      RandomUnderSampler(random_state=42, sampling_strategy=0.5), # sampling_strategy = number of minoority class instances / number of majority class instances
                      RandomUnderSampler(random_state=42, sampling_strategy=0.25), 
                      RandomOverSampler(random_state=42), 
                      SMOTE(random_state=42), 
                      SMOTETomek(random_state=42)] 
    }, 
    {
        'classifier__kernel': ['rbf'],  # Kernel type
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
        'classifier__gamma': ['scale', 1, 0.1, 0.01, 0.001, 0.0001],  # Kernel coefficient  # Kernel coefficient (only for ‘rbf’, ‘poly’ and ‘sigmoid’)
        'classifier__tol': [1e-3, 1e-2, 1e-1],  # Tolerance for stopping criteria 
        'resampler': [None,  # original, imbalanced dataset
                      RandomUnderSampler(random_state=42), 
                      RandomUnderSampler(random_state=42, sampling_strategy=0.5), # sampling_strategy = number of minoority class instances / number of majority class instances
                      RandomUnderSampler(random_state=42, sampling_strategy=0.25), 
                      RandomOverSampler(random_state=42), 
                      SMOTE(random_state=42), 
                      SMOTETomek(random_state=42)] 
    }
]

# Set up HalvingGridSearchCV
halving_grid_search = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,  # Parameter grid remains the same  
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',  # we want to optimize recall  # TODO decide with team
    n_jobs=-1,  # Use all processors
    verbose=1  # To track progress
)

# Fit the random search on training data
halving_grid_search.fit(X_train, y_train)

# Get the best parameters and score
print("Best Parameters:", halving_grid_search.best_params_)
print("Best Cross-Validation Accuracy:", halving_grid_search.best_score_)

n_iterations: 9
n_required_iterations: 9
n_possible_iterations: 9
min_resources_: 27
max_resources_: 177576
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 9954
n_resources: 27
Fitting 5 folds for each of 9954 candidates, totalling 49770 fits


14220 fits failed out of a total of 49770.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2844 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Salome Heckenthaler\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Salome Heckenthaler\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Salome Heckenthaler\anaconda3\Lib\site-packages\imblearn\pipeline.py", line 329, in fit
    Xt, yt = self._fit(X, y, routed_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^

----------
iter: 1
n_candidates: 3318
n_resources: 81
Fitting 5 folds for each of 3318 candidates, totalling 16590 fits


1105 fits failed out of a total of 16590.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1105 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Salome Heckenthaler\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Salome Heckenthaler\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Salome Heckenthaler\anaconda3\Lib\site-packages\imblearn\pipeline.py", line 329, in fit
    Xt, yt = self._fit(X, y, routed_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^

----------
iter: 2
n_candidates: 1106
n_resources: 243
Fitting 5 folds for each of 1106 candidates, totalling 5530 fits




----------
iter: 3
n_candidates: 369
n_resources: 729
Fitting 5 folds for each of 369 candidates, totalling 1845 fits




----------
iter: 4
n_candidates: 123
n_resources: 2187
Fitting 5 folds for each of 123 candidates, totalling 615 fits




----------
iter: 5
n_candidates: 41
n_resources: 6561
Fitting 5 folds for each of 41 candidates, totalling 205 fits




----------
iter: 6
n_candidates: 14
n_resources: 19683
Fitting 5 folds for each of 14 candidates, totalling 70 fits




----------
iter: 7
n_candidates: 5
n_resources: 59049
Fitting 5 folds for each of 5 candidates, totalling 25 fits




----------
iter: 8
n_candidates: 2
n_resources: 177147
Fitting 5 folds for each of 2 candidates, totalling 10 fits




Best Parameters: {'classifier__C': 10, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf', 'classifier__tol': 0.1, 'resampler': None}
Best Cross-Validation Accuracy: 0.48605709126489416




In [57]:
report_halving_GS = classification_report(y_val, halving_grid_search.predict(X_val))
print("Classification Report:\n", report_halving_GS)

Classification Report:
               precision    recall  f1-score   support

         0.0       0.75      0.20      0.32     21797
         1.0       0.13      0.63      0.22      4078

    accuracy                           0.27     25875
   macro avg       0.44      0.42      0.27     25875
weighted avg       0.65      0.27      0.30     25875



In [58]:
best_model_halving_grid = halving_grid_search.best_estimator_
save_model(best_model_halving_grid, "halving-grid_even-oversampling_accuracy")

Initial model saved to '../models/support_vector_machine/svm_model_halving-grid_even-oversampling_accuracy_20241126_003933.pkl'


# Retrain best model with probabilities

Activate probability outputs, i.e., SVC(probability=True). We do this to gain more insight into the model during evaluation (e.g., better precision recall curves). Since this increases the training time significantly, and does not change the performance of the model, this is not done during Hyperparameter Tuning, but now, just for the best model resulting from the Hyperparameter Tuning. 

In [None]:
# Hyperparameter tuning with Halving Grid Search
from sklearn.experimental import enable_halving_search_cv
from imblearn.pipeline import Pipeline
from sklearn.model_selection import HalvingGridSearchCV
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, CondensedNearestNeighbour
from imblearn.combine import SMOTETomek

pipeline_probability = Pipeline([
    ('resampler', None),  # Placeholder for resampling method
    ('classifier', SVC(max_iter=10000, random_state=42, probability=True))  # Model, now with probabilities
])

param_grid_probability = {
    'classifier__C': [0.01], 
    'classifier__coef0': [-1.0], 
    'classifier__degree': [5], 
    'classifier__gamma': [0.0001], 
    'classifier__kernel': ['poly'], 
    'classifier__tol': [0.1], 
    'resampler': [RandomUnderSampler(random_state=42)] 
    }

# Set up HalvingGridSearchCV
halving_grid_search_probability = HalvingGridSearchCV(
    estimator=pipeline_probability,
    param_grid=param_grid_probability,  # Parameter grid remains the same 
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',  # we want to optimize recall  # TODO decide with team
    n_jobs=-1,  # Use all processors
    verbose=1  # To track progress
)

# Fit the random search on training data
halving_grid_search_probability.fit(X_train, y_train)

# Get the best parameters and score
print("Best Parameters:", halving_grid_search_probability.best_params_)
print("Best Cross-Validation Accuracy:", halving_grid_search_probability.best_score_)

n_iterations: 1
n_required_iterations: 1
n_possible_iterations: 1
min_resources_: 177576
max_resources_: 177576
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1
n_resources: 177576
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters: {'classifier__C': 0.01, 'classifier__coef0': -1.0, 'classifier__degree': 5, 'classifier__gamma': 0.0001, 'classifier__kernel': 'poly', 'classifier__tol': 0.1, 'resampler': RandomUnderSampler(random_state=42)}
Best Cross-Validation Accuracy: 0.23306622137471456




In [14]:
report_halving_GS_probability = classification_report(y_val, halving_grid_search_probability.predict(X_val))
print("Classification Report:\n", report_halving_GS_probability)

Classification Report:
               precision    recall  f1-score   support

         0.0       0.96      0.06      0.11     21797
         1.0       0.16      0.99      0.28      4078

    accuracy                           0.21     25875
   macro avg       0.56      0.52      0.20     25875
weighted avg       0.84      0.21      0.14     25875



In [15]:
model_with_probability = halving_grid_search_probability.best_estimator_
save_model(model_with_probability, "_with-probability_undersampling_")

Initial model saved to '../models/support_vector_machine/svm_model__with-probability_undersampling__20241127_123528.pkl'


Now with the full training dataset:

In [16]:
# Hyperparameter tuning with Halving Grid Search
from sklearn.experimental import enable_halving_search_cv
from imblearn.pipeline import Pipeline
from sklearn.model_selection import HalvingGridSearchCV
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, CondensedNearestNeighbour
from imblearn.combine import SMOTETomek

pipeline_probability = Pipeline([
    ('resampler', None),  # Placeholder for resampling method
    ('classifier', SVC(max_iter=10000, random_state=42, probability=True))  # Model, now with probabilities
])

param_grid_probability = {
    'classifier__C': [0.01], 
    'classifier__coef0': [-1.0], 
    'classifier__degree': [5], 
    'classifier__gamma': [0.0001], 
    'classifier__kernel': ['poly'], 
    'classifier__tol': [0.1], 
    'resampler': [None] 
    }

# Set up HalvingGridSearchCV
halving_grid_search_probability = HalvingGridSearchCV(
    estimator=pipeline_probability,
    param_grid=param_grid_probability,  # Parameter grid remains the same  
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',  # we want to optimize recall  # TODO decide with team
    n_jobs=-1,  # Use all processors
    verbose=1  # To track progress
)

# Fit the random search on training data
halving_grid_search_probability.fit(X_train, y_train)

# Get the best parameters and score
print("Best Parameters:", halving_grid_search_probability.best_params_)
print("Best Cross-Validation Accuracy:", halving_grid_search_probability.best_score_)

n_iterations: 1
n_required_iterations: 1
n_possible_iterations: 1
min_resources_: 177576
max_resources_: 177576
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1
n_resources: 177576
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters: {'classifier__C': 0.01, 'classifier__coef0': -1.0, 'classifier__degree': 5, 'classifier__gamma': 0.0001, 'classifier__kernel': 'poly', 'classifier__tol': 0.1, 'resampler': None}
Best Cross-Validation Accuracy: 0.5165336620011021




In [17]:
report_halving_GS_probability = classification_report(y_val, halving_grid_search_probability.predict(X_val))
print("Classification Report:\n", report_halving_GS_probability)

Classification Report:
               precision    recall  f1-score   support

         0.0       0.95      0.47      0.63     21797
         1.0       0.24      0.88      0.37      4078

    accuracy                           0.54     25875
   macro avg       0.60      0.67      0.50     25875
weighted avg       0.84      0.54      0.59     25875



In [18]:
model_with_probability = halving_grid_search_probability.best_estimator_
save_model(model_with_probability, "_with-probability_full-data_")

Initial model saved to '../models/support_vector_machine/svm_model__with-probability_full-data__20241127_155642.pkl'


# Dimensionality Reduction with PCA

In [6]:
import pandas as pd

# Load the PCA datasets
train_pca = pd.read_csv("../data/pca/dataset_train_pca.csv")
val_pca = pd.read_csv("../data/pca/dataset_val_pca.csv")
test_pca = pd.read_csv("../data/pca/dataset_test_pca.csv")

In [7]:
# Split the PCA datasets into features and target
X_train_pca = train_pca.drop(columns=["Diabetes"])
y_train_pca = train_pca["Diabetes"]

X_val_pca = val_pca.drop(columns=["Diabetes"])
y_val_pca = val_pca["Diabetes"]

X_test_pca = test_pca.drop(columns=["Diabetes"])
y_test_pca = test_pca["Diabetes"]

In [None]:
# Initialize the classifier
model_pca = SVC(C=100, kernel='linear')  # result of grid search TODO

# Train the model on the preprocessed training data
model_pca.fit(X_train_pca, y_train_pca)

# Make predictions on the validation set
y_val_pred = model_pca.predict(X_val_pca)

# Calculate the accuracy
accuracy = accuracy_score(y_val_pca, y_val_pred)
print(f"Validation Accuracy: {accuracy}")
print(classification_report(y_val_pca, y_val_pred))


PCA on all components has an infeasible time complexity for Support Vector Machines (at least for Salome's computer without a GPU). It needs more than 12 hours and was automatically cut off.

Using only the best n components for classification

In [None]:
num_components = 2  # TODO 5
X_train_best_components = X_train_pca.iloc[:, :num_components]
X_val_best_components = X_val_pca.iloc[:, :num_components]

# Initialize the classifier
model_pca_n = SVC(C=100, kernel='linear', verbose=True)  # result of grid search TODO

# Train the classifier
model_pca_n.fit(X_train_best_components, y_train_pca)

# Make predictions on the validation set
y_val_pred = model_pca_n.predict(X_val_best_components)

# Calculate the accuracy
accuracy = accuracy_score(y_val_pca, y_val_pred)
print(f"Validation Accuracy: {accuracy}")
print(classification_report(y_val_pca, y_val_pred))

In [None]:
# TODO also here: undersampling?