Classify the spam data using support vector machines
with the kernel $K ( x, \tilde x ) = {( 1 + \langle x, \tilde x\rangle )}^p$.
Choose $p$ by cross-validation.

In [1]:
import numpy as np
import pandas as pd

from collections import namedtuple
from sklearn.metrics import zero_one_loss
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
# Load the whitespace-separated spam data set; the file carries no header row,
# so pandas labels the columns with consecutive integers.
df = pd.read_csv('../data/spam.dat', sep=' ', header=None)

# The response variable Y is stored in the last column of the frame.
Y = df.iloc[:, -1].to_numpy()

# Every remaining column (the 57 covariates) forms the design matrix X.
X = df.iloc[:, :-1].to_numpy()

In [29]:
# Record describing one fitted model: the pipeline trained on the full data set,
# its misclassification rate on that same data, and the cross-validation
# estimate of its true error rate.
FittedPolySCV = namedtuple(
    "FittedPolySCV",
    "fitted_pipeline empirical_error_rate true_error_rate_cv_estimate",
)

# Sweep the polynomial degree p = 2, ..., 7, fitting one standardised SVC per degree.
# NOTE(review): gamma is left at SVC's default, so the fitted kernel may be
# (gamma * <x, x~> + 1)^p rather than exactly (1 + <x, x~>)^p -- confirm against
# the scikit-learn SVC documentation; pass gamma=1 if the literal kernel is needed.
results = []
for p in range(2, 8):
    pipeline = make_pipeline(
        StandardScaler(),
        SVC(kernel="poly", degree=p, coef0=1),
    )

    # Training (empirical) error: fit on all observations, then score the
    # predictions made on those same observations.
    fitted_pipeline = pipeline.fit(X, Y)
    training_predictions = fitted_pipeline.predict(X)
    empirical_error_rate = zero_one_loss(Y, training_predictions)

    # 5-fold cross-validation; one minus the mean fold score is used as the
    # estimate of the true error rate (presumably the score is accuracy, the
    # default for classifiers -- TODO confirm).
    fold_scores = cross_val_score(pipeline, X, Y, cv=5)
    true_error_rate_cv_estimate = 1 - fold_scores.mean()

    results.append(
        FittedPolySCV(
            fitted_pipeline=fitted_pipeline,
            empirical_error_rate=empirical_error_rate,
            true_error_rate_cv_estimate=true_error_rate_cv_estimate,
        )
    )

In [30]:
# Report out the results: for each fitted model print the kernel degree, the
# misclassification rate on the training data and the cross-validation estimate
# of the true error rate, both formatted to three significant digits.
for result in results:
    print(
        # The pipeline step is looked up with a single-quoted key: reusing the
        # f-string's own double quotes inside a replacement field only parses
        # on Python 3.12+ (PEP 701) and is a SyntaxError on earlier versions.
        f"SCV model with polynomial kernel of degree {result.fitted_pipeline['svc'].degree}\n"
        f"Misclassification rate: {result.empirical_error_rate:.3}\n"
        f"Cross-validation estimate of the true error rate: {result.true_error_rate_cv_estimate:.3}\n"
        "-------------------------------------------------------"
    )

SCV model with polynomial kernel of degree 2
Misclassification rate: 0.0524
Cross-validation estimate of the true error rate: 0.0737
-------------------------------------------------------
SCV model with polynomial kernel of degree 3
Misclassification rate: 0.0426
Cross-validation estimate of the true error rate: 0.0778
-------------------------------------------------------
SCV model with polynomial kernel of degree 4
Misclassification rate: 0.0402
Cross-validation estimate of the true error rate: 0.0835
-------------------------------------------------------
SCV model with polynomial kernel of degree 5
Misclassification rate: 0.0337
Cross-validation estimate of the true error rate: 0.0863
-------------------------------------------------------
SCV model with polynomial kernel of degree 6
Misclassification rate: 0.0285
Cross-validation estimate of the true error rate: 0.0882
-------------------------------------------------------
SCV model with polynomial kernel of degree 7
Misclassif