Classify the spam data using support vector machines with the Gaussian (or RBF) kernel.
Choose thes parameter $C$ and $\gamma$ by cross-validation.

In [3]:
import numpy as np
import pandas as pd

from sklearn.metrics import zero_one_loss
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Download the spam data

In [2]:
# Read the data into a pandas data frame
df = pd.read_csv('../data/spam.dat', sep=' ', header=None)

# Extract the response variable Y from the data frame
# and convert it to a numpy array
Y = df[df.columns[-1]].to_numpy()

# Extract all 57 covariates into a numpy array X
X = df[df.columns[:-1]].to_numpy()

# Define the model pipeline and train a single model

In [3]:
# We train the model with sklearn's default paramters
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("svc", SVC(kernel="rbf")),
])
fitted_pipeline = pipeline.fit(X, Y)
empirical_error_rate =  zero_one_loss(Y, fitted_pipeline.predict(X))
true_error_rate_cv_estimate = 1 - cross_val_score(pipeline, X, Y, cv=5).mean()
print(
    f"Misclassification rate: {empirical_error_rate:.3}\n"
    f"Cross-validation estimate of the true error rate: {true_error_rate_cv_estimate:.3}"
)

Misclassification rate: 0.0526
Cross-validation estimate of the true error rate: 0.0767


# List the parameters of the pipeline
This is used to identify the exact syntax to use when performing the hyperparameter search.

Here we note see that `svc__C` and `svc__gamma` are the parameters of interest.

In [23]:
pipeline.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler()), ('svc', SVC())],
 'transform_input': None,
 'verbose': False,
 'scaler': StandardScaler(),
 'svc': SVC(),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'svc__C': 1.0,
 'svc__break_ties': False,
 'svc__cache_size': 200,
 'svc__class_weight': None,
 'svc__coef0': 0.0,
 'svc__decision_function_shape': 'ovr',
 'svc__degree': 3,
 'svc__gamma': 'scale',
 'svc__kernel': 'rbf',
 'svc__max_iter': -1,
 'svc__probability': False,
 'svc__random_state': None,
 'svc__shrinking': True,
 'svc__tol': 0.001,
 'svc__verbose': False}

# Perform the hyperparameter search
(We started searching among $C , \gamma \in (0.01, 1, 100)$
and refined the search range to arrive at the search range below.)

In [26]:
param_grid = [{
    'svc__C': [1e2, 1e3, 1e4],
    'svc__gamma': [1e-4, 1e-3],
}]
grid_search = GridSearchCV(pipeline, param_grid)
grid_search.fit(X, Y)

cv_results = pd.DataFrame(grid_search.cv_results_)
print("Here are the cross-validation estimates of the true error rate for each parameter pair:")
print(cv_results.sort_values(by="rank_test_score")[["param_svc__C", "param_svc__gamma", "mean_test_score"]])
print(
    "\nThe best parameter pair among these is\n"
    f"C = {grid_search.best_params_['svc__C']}\n"
    f"gamma = {grid_search.best_params_['svc__gamma']}"
)
best_model = grid_search.best_estimator_

Here are the cross-validation estimates of the true error rate for each parameter pair:
   param_svc__C  param_svc__gamma  mean_test_score
1         100.0            0.0010         0.930448
4       10000.0            0.0001         0.928491
3        1000.0            0.0010         0.926535
2        1000.0            0.0001         0.925014
0         100.0            0.0001         0.918060
5       10000.0            0.0010         0.905884

The best parameter pair among these is
C = 100.0
gamma = 0.001


# Sanity check

In [36]:
empirical_error_rate =  zero_one_loss(Y, best_model.predict(X))
true_error_rate_cv_estimate = 1 - cross_val_score(best_model, X, Y, cv=5).mean()
print(
    "Sanity check (using the best estimator):\n"
    f"Misclassification rate: {empirical_error_rate:.3}\n"
    f"Cross-validation estimate of the true error rate: {true_error_rate_cv_estimate:.3}"
)

Sanity check (using the best estimator):
Misclassification rate: 0.0539
Cross-validation estimate of the true error rate: 0.0696
