Classify the spam data using support vector machines
with the kernel $K ( x, \tilde x ) = {( 1 + \langle x, \tilde x\rangle )}^p$.
Choose $p$ by cross-validation.

In [1]:
import numpy as np
import pandas as pd

from collections import namedtuple
from scipy import stats
from sklearn.metrics import zero_one_loss
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

In [2]:
# Read the data into a pandas data frame
df = pd.read_csv('../data/spam.dat', sep=' ', header=None)

# Extract the response variable Y from the data frame
# and convert it to a numpy array
Y = df[df.columns[-1]].to_numpy()

# Extract all 57 covariates into a numpy array X
X = df[df.columns[:-1]].to_numpy()

In [3]:
FittedPolySCV = namedtuple("FittedPolySCV", [
    "fitted_model", "empirical_error_rate", "true_error_rate_cv_estimate"
])

def fit_poly_SCV(response, covariate, p, cv=10):
    """
    Fit a SVC using a polynomial kernel of degree p.
    Return the fitted model, the empirical error_rate, and
    a cross-validation estimate of the true error rate.
    """

    # Define the model
    model = SVC(kernel='poly', degree=p, coef0=1)

    # Normalize the data
    normalized_covariate = (covariate - covariate.mean(axis=0)) / covariate.std(axis=0)
    
    # Fit the model
    fitted_model = model.fit(normalized_covariate, response)
    
    # Compute the empirical error rate
    empirical_error_rate = zero_one_loss(response, fitted_model.predict(normalized_covariate))
    
    # Compute a cross-validation estimate of the true error rate
    if cv is not None:
        true_error_rate_cv_estimate = 1 - cross_val_score(
            model, normalized_covariate, response, cv=cv
        ).mean()
    else:
        true_error_rate_cv_estimate = None

    return FittedPolySCV(fitted_model, empirical_error_rate, true_error_rate_cv_estimate)

In [4]:
# Train several SVC models using polynomial kernels with increasing degrees
model_results = [
    fit_poly_SCV(Y, X, p=p, cv=5)
    for p in range(2, 8)
]

In [5]:
# Report out the results
for model_result in model_results:
    print(
        f"SCV model with polynomial kernel of degree {model_result.fitted_model.degree}\n"
        f"Misclassification rate: {model_result.empirical_error_rate:.3}\n"
        f"Cross-validation estimate of the true error rate: {model_result.true_error_rate_cv_estimate:.3}\n"
        "-------------------------------------------------------"
    )

SCV model with polynomial kernel of degree 2
Misclassification rate: 0.0524
Cross-validation estimate of the true error rate: 0.0741
-------------------------------------------------------
SCV model with polynomial kernel of degree 3
Misclassification rate: 0.0426
Cross-validation estimate of the true error rate: 0.0793
-------------------------------------------------------
SCV model with polynomial kernel of degree 4
Misclassification rate: 0.0402
Cross-validation estimate of the true error rate: 0.083
-------------------------------------------------------
SCV model with polynomial kernel of degree 5
Misclassification rate: 0.0337
Cross-validation estimate of the true error rate: 0.0874
-------------------------------------------------------
SCV model with polynomial kernel of degree 6
Misclassification rate: 0.0285
Cross-validation estimate of the true error rate: 0.0932
-------------------------------------------------------
SCV model with polynomial kernel of degree 7
Misclassifi

### Sanity check
We create synthetic data to verify that the SVC with quadratic kernel
beats the OSH classifier (i.e. SVC classifier with *linear* kernel)
when the decision boundary of the Bayes rule is the unit disk.

In [6]:
from sklearn.svm import LinearSVC

In [7]:
# Number of samples
n = 5000

# X is uniformly distributed in the square
# centered at the origin of side length 3.
synthetic_X = stats.uniform.rvs(loc=-1.5, scale=3, size=[n, 2])

# Y = 1 in the interior of the unit disk, and Y = 0 otherwise
synthetic_Y = (synthetic_X[:, 0]**2 + synthetic_X[:, 1]**2 < 1).astype(int)

In [8]:
# Linear SVC

model = LinearSVC()
fitted_model = model.fit(synthetic_X, synthetic_Y)
empirical_error_rate = zero_one_loss(synthetic_Y, fitted_model.predict(synthetic_X))
true_error_rate_cv_estimate = 1 - cross_val_score(model, synthetic_X, synthetic_Y, cv=5).mean()
print(
    f"Misclassification rate: {empirical_error_rate:.3}\n"
    f"Cross-validation estimate of the true error rate: {true_error_rate_cv_estimate:.3}"
)

Misclassification rate: 0.352
Cross-validation estimate of the true error rate: 0.352


In [9]:
# Polynomial SVC
model_results = [
    fit_poly_SCV(synthetic_Y, synthetic_X, p=p, cv=5)
    for p in range(2, 6)
]
for model_result in model_results:
    print(
        f"SCV model with polynomial kernel of degree {model_result.fitted_model.degree}\n"
        f"Misclassification rate: {model_result.empirical_error_rate:.3}\n"
        f"Cross-validation estimate of the true error rate: {model_result.true_error_rate_cv_estimate:.3}\n"
        "-------------------------------------------------------"
    )

SCV model with polynomial kernel of degree 2
Misclassification rate: 0.0036
Cross-validation estimate of the true error rate: 0.0054
-------------------------------------------------------
SCV model with polynomial kernel of degree 3
Misclassification rate: 0.0022
Cross-validation estimate of the true error rate: 0.0042
-------------------------------------------------------
SCV model with polynomial kernel of degree 4
Misclassification rate: 0.0014
Cross-validation estimate of the true error rate: 0.0036
-------------------------------------------------------
SCV model with polynomial kernel of degree 5
Misclassification rate: 0.0016
Cross-validation estimate of the true error rate: 0.004
-------------------------------------------------------
