# Sparse Kernel

## Support Vector Machine

In [32]:
import numpy as np
from sklearn import datasets
from sklearn import preprocessing
import matplotlib.pyplot as plt

import jax
import jax.numpy as jnp
from jaxopt import BoxOSQP

from basic.kernel.kernel import Linear

In [126]:
class BinarySVC():
    r"""Support Vector Machine binary classifier trained through its dual QP.

    The model is described by signed dual coefficients
    $\beta_i = \alpha_i t_i$, where $\alpha_i$ are the Lagrange multipliers
    and $t_i \in \{-1, +1\}$ are the targets.

    Attributes:
        kernel: object exposing ``kernel(kernel_params, X1, X2)``.
        kernel_params: parameters forwarded to ``kernel.kernel``.
        C (float): box-constraint (regularization) parameter.
        tolerance (float): solver tolerance; also the threshold below which
            a dual coefficient is treated as zero.
    """
    # Class-level defaults; ``__init__`` overrides them per instance on request.
    kernel = Linear()
    kernel_params = None
    C = 2.0
    tolerance = 1e-6

    def __init__(self, kernel=None, kernel_params=None, C=None, tolerance=None):
        """Create a classifier, optionally overriding the class defaults.

        Every argument left as ``None`` keeps the class-level default, so
        ``BinarySVC()`` behaves exactly as before.

        Args:
            kernel: kernel object exposing ``kernel(kernel_params, X1, X2)``.
            kernel_params: parameters forwarded to the kernel.
            C (float): regularization parameter.
            tolerance (float): solver / support-vector tolerance.
        """
        if kernel is not None:
            self.kernel = kernel
        if kernel_params is not None:
            self.kernel_params = kernel_params
        if C is not None:
            self.C = C
        if tolerance is not None:
            self.tolerance = tolerance

    def fit(self, X, t):
        r"""Fit the model to data by solving the dual problem with BoxOSQP.

        Dual optimization problem, with $\alpha t$ the element-wise product:
            $$ \min_{\alpha} \frac{1}{2}(\alpha t)^T K (\alpha t) - (\alpha t)^T t $$

            subject to:

            $$ 0 \leq \alpha_i \leq C $$
            $$ \sum_{i=1}^{N} \alpha_i t_i = 0 $$

        Reformulation obtained by substituting $ \beta = \alpha t $:
            $$ \min_{\beta} \frac{1}{2} \beta^T K \beta - \beta^T t $$

            subject to:

            $$ 0 \leq \beta_i \leq C \text{ if } t_i = 1 $$
            $$ -C \leq \beta_i \leq 0 \text{ if } t_i = -1 $$
            $$ \sum_{i=1}^{N} \beta_i = 0 $$

        The kernel, its parameters, ``C`` and the tolerance are read from
        the instance attributes.

        Args:
            X (jnp.array, (N, D)): input data
            t (jnp.array, (N,)): target data with entries in {-1, +1}

        Returns:
            tuple: ``(beta, sv)`` with the signed dual coefficients and the
            indices of the support vectors.
        """
        # Build the Gram matrix once and hand it to the solver as the
        # parameter of matvec_Q, instead of rebuilding it on every call.
        gram = self.kernel.kernel(self.kernel_params, X, X)

        def matvec_Q(K, beta):
            # OSQP evaluates the quadratic term as 0.5 * x^T matvec_Q(P, x),
            # so this only has to return K beta.
            return K @ beta

        def matvec_A(_, beta):
            # Two constraint groups: the box on each beta_i (identity map)
            # and the equality sum(beta) = 0.
            return beta, jnp.sum(beta)

        # Bounds must have the same structure as matvec_A's output.
        # relu selects the side of the box that matches the sign of t_i.
        lower = -jax.nn.relu(-t * self.C), 0.
        upper = jax.nn.relu(t * self.C), 0.

        # Formulate and solve the quadratic programming problem.
        hyper_params = dict(params_obj=(gram, -t), params_eq=None,
                            params_ineq=(lower, upper))
        osqp = BoxOSQP(matvec_Q=matvec_Q, matvec_A=matvec_A, tol=self.tolerance)
        params, _ = osqp.run(init_params=None, **hyper_params)
        beta = params.primal[0]

        # Samples with a non-zero coefficient are the support vectors.
        sv = self.get_support_vectors(beta)

        return beta, sv

    def get_support_vectors(self, beta):
        """Return the indices of the support vectors.

        ``beta`` is signed. ``beta_i == 0`` means the Lagrange multiplier is
        zero, so the sample does not contribute to the decision function.
        Samples whose ``|beta_i|`` exceeds ``self.tolerance`` are the support
        vectors.

        Args:
            beta (jnp.array, (N,)): signed dual coefficients.

        Returns:
            jnp.array: integer indices of the support vectors.
        """
        is_sv = jnp.abs(beta) > self.tolerance

        return jnp.where(is_sv)[0]

    def _accuracy(self):
        """Placeholder for model accuracy; not implemented yet.

        Notes for a future implementation:
            if 0 < abs(beta) < C, the slack is 0 and the sample lies on the
            margin;
            if abs(beta) = C, the sample can lie inside the margin and is
            correctly classified if its slack is <= 1, misclassified if > 1.
        """
        pass

    def predict(self, X_test, X_train, y_train, beta, sv):
        r"""Predict the class (+1 or -1) of every row of ``X_test``.

        From Eq. (7.29) and (7.37) in Bishop's book:
            $$ w = \sum_{i=1}^{N} \alpha_i t_i x_i = X^T \beta $$
            $$ w^T x = \beta^T k(X, x) $$
            $$ b = \frac{1}{N_M} \sum_{n \in M}
                   \left( t_n - \sum_{m \in S} \beta_m k(x_n, x_m) \right) $$

        where S is the set of support vectors and M the subset lying on the
        margin, i.e. with $0 < |\beta_n| < C$.

        Args:
            X_test: inputs to classify.
            X_train: training inputs used in ``fit``.
            y_train: training targets used in ``fit``.
            beta: signed dual coefficients returned by ``fit``.
            sv: support-vector indices returned by ``fit``.

        Returns:
            jnp.array: sign of the decision function for each test input.
        """
        beta_sv = beta[sv]
        X_sv = X_train[sv]

        # Kernel part of the decision function.
        # Presumably the kernel returns an (S, N_test) matrix here -- confirm
        # against the kernel implementation.
        gram_test = self.kernel.kernel(self.kernel_params, X_sv, X_test)
        wx = beta_sv @ gram_test

        # Support vectors on the margin: 0 < abs(beta) < C.
        on_margin = jnp.abs(beta_sv) < self.C - self.tolerance
        gram_sv = self.kernel.kernel(self.kernel_params, X_sv, X_sv)
        residual = y_train[sv] - gram_sv @ beta_sv

        # Average over the margin vectors using the mask itself, so that a
        # margin vector whose residual is exactly zero is still counted.
        # The masked mean keeps shapes static, which keeps this jittable.
        margin_mean = jnp.mean(residual, where=on_margin)

        # Heuristic fallback: with no margin vector the masked mean is NaN,
        # so average over all support vectors to keep predictions finite.
        b = jnp.where(jnp.any(on_margin), margin_mean, jnp.mean(residual))

        # Return the sign of wx + b: 1 or -1.
        return jnp.sign(wx + b)
        
    

In [127]:
# Experiment settings.
lam = 0.5
tol = 1e-06
num_samples = 30
num_features = 5
verbose = False

# Prepare data.
X, y = datasets.make_classification(n_samples=num_samples, n_features=num_features,
                                    n_classes=2,
                                    random_state=0)
X = preprocessing.Normalizer().fit_transform(X)
y = jnp.array(y * 2. - 1)  # Transform labels from {0, 1} to {-1., 1.}.

C = 1. / lam

# Build the classifier and apply the settings chosen above. Without these
# assignments the model silently runs on its class-level defaults and the
# values of C, tol and linear_kernel computed here are never used.
linear_kernel = Linear()
svc = BinarySVC()
svc.kernel = linear_kernel
svc.C = C
svc.tolerance = tol

# Fit: signed dual coefficients and support-vector indices.
beta, sv = svc.fit(X, y)

# Predict on the training set with the jitted predictor.
svc_predict = jax.jit(svc.predict)
y_predict = svc_predict(X_test=X,
                        X_train=X,
                        y_train=y,
                        beta=beta,
                        sv=sv)

# Sum of absolute label differences: each misclassified sample adds 2.
print(jnp.abs(y_predict - y).sum())

AttributeError: 'BinarySVC' object has no attribute 'tol'

In [105]:
from absl import app
#from absl import flags

import jax
import jax.numpy as jnp
from jaxopt import projection
from jaxopt import ProjectedGradient
from jaxopt import BoxOSQP

import numpy as onp
from sklearn import datasets
from sklearn import preprocessing
from sklearn import svm

# Tolerance passed as `tol=` to the sklearn, ProjectedGradient and BoxOSQP solvers below.
tol = 1e-06 
# Forwarded to ProjectedGradient's `verbose` argument.
verbose = False
    
def binary_kernel_svm_skl(X, y, C):
    """Solve the binary kernel SVM with scikit-learn on a precomputed linear kernel.

    Prints the fitted intercept and returns the signed dual coefficients as a
    dense vector with one entry per sample (zero for non-support vectors).
    """
    print("Solve SVM with sklearn.svm.SVC: ")

    # Linear kernel: Gram matrix X X^T.
    gram = jnp.dot(X, X.T)

    model = svm.SVC(kernel="precomputed", C=C, tol=tol)
    model.fit(gram, y)
    print(model.intercept_)

    # sklearn only stores coefficients of the support vectors; scatter them
    # back to their sample positions.
    n_samples = gram.shape[0]
    dense_coef = onp.zeros(n_samples)
    dense_coef[model.support_] = model.dual_coef_[0]
    return dense_coef


def binary_kernel_svm_pg(X, y, C):
    """Solve the binary kernel SVM dual with projected gradient descent.

    The dual problem, for the linear kernel K = X X^T, is

        minimize   0.5 beta^T K beta - beta^T y
        subject to sum(beta) = 0
                   0 <= beta_i <= C   if y_i = 1
                  -C <= beta_i <= 0   if y_i = -1

    Returns the fitted signed dual coefficients.
    """
    print("Solve SVM with Projected Gradient: ")

    n_samples = X.shape[0]
    # Weights of the linear equality constraint: a plain sum over beta.
    unit_weights = jnp.ones(n_samples)

    def dual_objective(beta, X, y):
        """Dual objective of binary kernel SVMs with intercept."""
        # K beta is evaluated as X (X^T beta) so K is never materialized.
        quadratic = jnp.dot(beta, jnp.dot(X, jnp.dot(X.T, beta)))
        return 0.5 * quadratic - jnp.dot(beta, y)

    def project(beta, C):
        # The side of the box depends on the label of each sample.
        is_positive = y == 1
        lower = jnp.where(is_positive, 0, -C)
        upper = jnp.where(is_positive, C, 0)
        return projection.projection_box_section(
            beta, (lower, upper, unit_weights, 0.0))

    solver = ProjectedGradient(fun=dual_objective,
                               projection=project,
                               tol=tol, maxiter=500, verbose=verbose)
    result = solver.run(jnp.ones(n_samples), hyperparams_proj=C, X=X, y=y)
    return result.params


def binary_kernel_svm_osqp(X, y, C):
    """Solve the binary kernel SVM dual with BoxOSQP.

    The dual problem is

        minimize   0.5 beta^T K beta - beta^T y
        subject to sum(beta) = 0
                   0 <= beta_i <= C   if y_i = 1
                  -C <= beta_i <= 0   if y_i = -1

    with K = X X^T. Returns the fitted signed dual coefficients.
    """
    print("Solve SVM with OSQP: ")

    def apply_kernel(features, coef):
        # OSQP evaluates the quadratic term as 0.5 * x^T matvec_Q(P, x), so
        # only K coef = X (X^T coef) is needed here.
        return jnp.dot(features, jnp.dot(features.T, coef))

    def apply_constraints(_, coef):
        # Two constraint groups, returned as a tuple:
        #   the box on every coefficient (identity map), and
        #   the equality sum(coef) = 0 (dot product with a vector of ones).
        # Using matvecs avoids materializing the constraint matrix.
        return coef, jnp.sum(coef)

    # Bounds mirror the structure of apply_constraints' output; relu picks
    # the side of the box matching the sign of each label.
    box_lower = -jax.nn.relu(-y * C)
    box_upper = jax.nn.relu(y * C)
    lower = (box_lower, 0.)
    upper = (box_upper, 0.)

    solver = BoxOSQP(matvec_Q=apply_kernel, matvec_A=apply_constraints, tol=tol)
    solution, _ = solver.run(init_params=None,
                             params_obj=(X, -y),
                             params_eq=None,
                             params_ineq=(lower, upper))
    return solution.primal[0]


def print_svm_result(beta, threshold=1e-4):
    """Print the dual coefficients and the indices of the support vectors.

    The coefficients are signed (the sign follows the label of each sample),
    so a sample counts as a support vector when ``|beta_i| > threshold``.
    """
    support_mask = jnp.abs(beta) > threshold
    support_idx = onp.where(support_mask)[0]
    print(f"Beta: {beta}",
          f"Support vector indices: {support_idx}",
          "",
          sep="\n")


def main():
    """Fit the same SVM with OSQP, projected gradient and sklearn, printing each result."""
    lam = 0.5
    n_samples, n_features = 30, 5

    # Synthetic two-class data set with row-normalized features.
    features, labels = datasets.make_classification(n_samples=n_samples,
                                                    n_features=n_features,
                                                    n_classes=2,
                                                    random_state=0)
    features = preprocessing.Normalizer().fit_transform(features)
    labels = jnp.array(labels * 2. - 1)  # Map labels from {0, 1} to {-1., 1.}.

    penalty = 1. / lam

    # Run the solvers in a fixed order so their outputs can be compared.
    solvers = (binary_kernel_svm_osqp, binary_kernel_svm_pg, binary_kernel_svm_skl)
    for solve in solvers:
        print_svm_result(solve(features, labels, penalty))


# Script entry point: set JAX's platform to "cpu" before running the comparison.
if __name__ == "__main__":
    jax.config.update("jax_platform_name", "cpu")
    main()

Solve SVM with OSQP: 
Beta: [ 2.8452001e-07 -1.9592164e-07  7.1804309e-01 -3.1301258e-07
  7.4410588e-07  7.0679100e-08 -4.3022047e-07 -2.0713337e-07
  5.9224438e-02  2.0000005e+00  2.0000005e+00 -1.6404691e-07
 -1.9999998e+00  2.0000000e+00 -3.8241390e-07 -1.9999998e+00
 -1.5248646e-07 -7.4367165e-07 -7.7726805e-01 -3.8845691e-08
  3.8410909e-07 -2.0431916e-08  2.2927431e-08  5.0132905e-07
 -3.4100701e-08 -4.6190539e-07 -2.0000002e+00 -1.7885380e-07
 -1.2326537e-07  5.6264736e-07]
Support vector indices: [ 2  8  9 10 12 13 15 18 26]

Solve SVM with Projected Gradient: 
Beta: [ 0.          0.          0.718046    0.          0.          0.
  0.          0.          0.05922639  2.          2.          0.
 -2.          2.          0.         -2.          0.          0.
 -0.77726483  0.          0.          0.          0.          0.
  0.          0.         -2.          0.          0.          0.        ]
Support vector indices: [ 2  8  9 10 12 13 15 18 26]

Solve SVM with sklearn.svm.SV