# SVM, Cross Validation and Grid Search

In [16]:
import numpy as np
import scipy.sparse as sp
from collections import Counter
import scipy.optimize
import pickle
import timeit

In [17]:
with open("X1.txt") as f:
    emails = f.readlines()
labels = np.loadtxt("y1.txt")

In [18]:
def tfidf(docs):
    all_words = set([a for a in " ".join(docs).split(" ") if a != ""])
    all_words_dict = {k:i for i,k in enumerate(all_words)}
    
    word_counts = [Counter([a for a in d.split(" ") if a!=""]) for d in docs]
    data = [a for wc in word_counts for a in wc.values()]
    rows = [i for i,wc in enumerate(word_counts) for a in wc.values()]
    cols = [all_words_dict[k] for wc in word_counts for k in wc.keys()]
    X = sp.coo_matrix((data, (rows,cols)), (len(docs), len(all_words)))
    
    idf = np.log(float(len(docs))/np.asarray((X>0).sum(axis=0))[0])
    
    return X*sp.diags(idf), list(all_words)

features, all_words = tfidf(emails)


## SVM classification

In [70]:
class SVM:
    def __init__(self, X, y, reg):
        """ Initialize the SVM attributes and initialize the weights vector to the zero vector. 
            Attributes: 
                X (array_like) : training data intputs
                y (vector) : 1D numpy array of training data outputs
                reg (float) : regularizer parameter
                theta : 1D numpy array of weights
        """
        self.X = X
        self.y = y
        self.reg = reg
        self.theta = np.zeros(X.shape[1])
    
    def objective(self, X, y):
        """ Calculate the objective value of the SVM. When given the training data (self.X, self.y), this is the 
            actual objective being optimized. 
            Args:
                X (array_like) : array of examples, where each row is an example
                y (array_like) : array of outputs for the training examples
            Output:
                (float) : objective value of the SVM when calculated on X,y
        """
        y_hat = sp.diags(y).dot(X).dot(self.theta)
        return(np.sum(np.maximum( 1 - y_hat , 0 )) + (self.reg/2 * sum(self.theta*self.theta) ) )
        
    
    def gradient(self):
        """ Calculate the gradient of the objective value on the training examples. 
            Output:
                (vector) : 1D numpy array containing the gradient
        """
        YX = sp.diags(self.y).dot(self.X)
        return (-YX.T.dot( YX.dot(self.theta) <= 1 ) + self.reg * self.theta)
    
    def train(self, niters=100, learning_rate=1, verbose=False):
        """ Train the support vector machine with the given parameters. 
            Args: 
                niters (int) : the number of iterations of gradient descent to run
                learning_rate (float) : the learning rate (or step size) to use when training
                verbose (bool) : an optional parameter that you can use to print useful information (like objective value)
        """
        
        for i in range(niters):
            self.theta -= (learning_rate * self.gradient())
            if verbose:
                print(self.objective(self.X, self.y))
            
    
    def predict(self, X):
        """ Predict the class of each label in X. 
            Args: 
                X (array_like) : array of examples, where each row is an example
            Output:
                (vector) : 1D numpy array containing predicted labels
        """
        pred = np.zeros(X.shape[0])
        result = X.dot(self.theta)
        for i,yi in enumerate(result):
            pred[i] = (1 if yi > 0 else -1)
        return pred
        
        pass
    


In [71]:
# Verify the correctness of your code on small examples
y0 = np.random.randint(0,2,5)*2-1
print(y0)
X0 = np.random.random((5,10))
t0 = np.random.random(10)
svm0 = SVM(X0,y0, 1e-4)
svm0.theta = t0

def obj(theta):
    return svm0.objective(X0,y0)
    pass
obj(t0)
def grad(theta):
    return svm0.gradient()
    pass
# scipy.optimize.check_grad(obj, grad, t0)

%timeit svm0.train(niters=100, learning_rate=1, verbose=False)
svm0.predict(X0)


[ 1 -1 -1 -1 -1]
23 ms ± 2.91 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


array([ 1., -1., -1., -1., -1.])

In [59]:

svm = SVM(features, labels, 1e-4)
%timeit -n 1 -r 1 svm.train(niters = 100, learning_rate = 1)
yp = svm.predict(features)
print(yp)
print(labels)
print(np.mean(labels != yp))

print(labels)


6.11 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
[ 1.  1.  1. ..., -1. -1. -1.]
[ 1.  1.  1. ..., -1. -1. -1.]
0.0
[ 1.  1.  1. ..., -1. -1. -1.]


## Model Selection: Cross validation and Parameter Grid Search

In [54]:
import math as mt

class ModelSelector:
    """ A class that performs model selection. 
        Attributes:
            blocks (list) : list of lists of indices of each block used for k-fold cross validation, e.g. blocks[i] 
            gives the indices of the examples in the ith block 
            test_block (list) : list of indices of the test block that used only for reporting results
            
    """
    def __init__(self, X, y, P, k, niters):
        """ Initialize the model selection with data and split into train/valid/test sets. Split the permutation into blocks 
            and save the block indices as an attribute to the model. 
            Args:
                X (array_like) : array of features for the datapoints
                y (vector) : 1D numpy array containing the output labels for the datapoints
                P (vector) : 1D numpy array containing a random permutation of the datapoints
                k (int) : number of folds
                niters (int) : number of iterations to train for
        """
        
        self.X = X
        self.y = y
        self.P = P
        self.k = k
        self.niters = niters
        
        blocks_ = np.array_split(P, self.k+1)
        
        self.blocks = blocks_[:-1]
        self.test_block = blocks_[-1]

        self.train_indices = np.hstack(self.blocks)
        self.test_indices = self.test_block


    def cross_validation(self, lr, reg):
        """ Given the permutation P in the class, evaluate the SVM using k-fold cross validation for the given parameters 
            over the permutation
            Args: 
                lr (float) : learning rate
                reg (float) : regularizer parameter
            Output: 
                (float) : the cross validated error rate
        """
        pred_err = np.zeros(len(self.blocks))
        for i,v_block in enumerate(self.blocks):
            t_block = np.ones(self.X.shape[0], bool)
            t_block[v_block,] = False
            t_block[self.test_indices,] = False

            train_set = self.X[t_block,:]
            train_label = self.y[t_block,]

            valid_set = self.X[v_block,:]
            valid_label = self.y[v_block,]
            
            svm_cv = SVM(train_set, train_label, reg)
            svm_cv.train(niters = self.niters, learning_rate = lr)
            
            v_pred = svm_cv.predict(valid_set)
            pred_err[i] = np.mean(v_pred != valid_label)
        
        return np.mean(pred_err)
        
        pass
    
    def grid_search(self, lrs, regs):
        """ Given two lists of parameters for learning rate and regularization parameter, perform a grid search using
            k-wise cross validation to select the best parameters. 
            Args:  
                lrs (list) : list of potential learning rates
                regs (list) : list of potential regularizers
            Output: 
                (lr, reg) : 2-tuple of the best found parameters
        """
        cv_err = 1.0
        result = (0.0,0.0)
        for i,lr in enumerate(lrs):
            for j,rg in enumerate(regs):
                thisCV_err = self.cross_validation(lr,rg)
                if(thisCV_err < cv_err):
                    result = (lr,rg)
                    cv_err = thisCV_err
        
        return(result)
        pass
    
    def test(self, lr, reg):
        """ Given parameters, calculate the error rate of the test data given the rest of the data. 
            Args: 
                lr (float) : learning rate
                reg (float) : regularizer parameter
            Output: 
                (err, svm) : tuple of the error rate of the SVM on the test data and the learned model
        """
        
        svm = SVM(self.X[self.train_indices,:], self.y[self.train_indices,], reg)
        svm.train(niters = self.niters, learning_rate = lr)
        
        test = self.X[self.test_indices,:]
        test_label = self.y[self.test_indices,]
        
        pred = svm.predict(test)
        err = np.mean(pred != test_label)
        return (err, svm)
        
        pass
    

## K-fold cross validation

In [55]:

# print(np.arange(X0.shape[0]))
print(y0)
MS0 = ModelSelector(X0, y0, np.random.permutation(X0.shape[0]), 3, 100)
MS0.cross_validation(0.1, 1e-4)


[ 1 -1  1  1 -1]


0.33333333333333331

In [51]:

# arr = np.random.permutation(features.shape[0])
mean = np.zeros(10)
for i in range(10):
    MS0 = ModelSelector(features, labels, np.random.permutation(features.shape[0]), 5, 100)
    mean[i] = MS0.cross_validation(1, 1e-4)

print(np.mean(mean))
print(np.std(mean))


0.0105351318692
0.00044501054284


## Grid search

In [56]:

MS = ModelSelector(features, labels, np.arange(features.shape[0]), 4, 100)
%timeit -n 1 -r 1 lr, reg = MS.grid_search(np.logspace(-1,1,3), np.logspace(-2,1,4))
print(lr, reg)
print(MS.test(lr,reg))


2min 43s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
