Pipeline is as follows:  
1) Data is either raw word counts or word counts capped at one  
2) Data is sent through PCA without whitening (linear SVC) or with whitening (RBF SVC)  
3) Cross validation is used to choose the best L1-regularized LinearSVC  
4) The LinearSVC defines the subspace of features used for final SVC  
5) Cross validation is used to choose the best SVC on reduced feature space

## Imports

In [None]:
from utils import get_split_data
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
import numpy as np
import time

# Unpack the four values returned by utils.get_split_data().
# NOTE(review): the names suggest train/validation features and labels --
# confirm the return order against utils.get_split_data.
X_train, y_train, X_val, y_val = get_split_data()

## Data Preprocessing

In [None]:
# Cap every word count at one by taking the element-wise minimum
# against an all-ones array of the same shape.
X_train_trunc, X_val_trunc = (
    np.minimum(counts, np.ones(counts.shape)) for counts in (X_train, X_val)
)

In [None]:
# Unwhitened PCA, fitted on the raw training data only
raw_PCA = PCA().fit(X_train)

# Project both splits with the components learned from the training split
X_train_raw_PCA = raw_PCA.transform(X_train)
X_val_raw_PCA = raw_PCA.transform(X_val)

In [None]:
# Whitened PCA, fitted on the raw training data only
raw_PCA_white = PCA(whiten=True).fit(X_train)

# Project both splits with the components learned from the training split
X_train_raw_PCA_white = raw_PCA_white.transform(X_train)
X_val_raw_PCA_white = raw_PCA_white.transform(X_val)

In [None]:
# Unwhitened PCA, fitted on the truncated training data only
trunc_PCA = PCA().fit(X_train_trunc)

# Project both truncated splits with the training-fitted components
X_train_trunc_PCA = trunc_PCA.transform(X_train_trunc)
X_val_trunc_PCA = trunc_PCA.transform(X_val_trunc)

In [None]:
# Whitened PCA, fitted on the truncated training data only
trunc_PCA_white = PCA(whiten=True).fit(X_train_trunc)

# Project both truncated splits with the training-fitted components
X_train_trunc_PCA_white = trunc_PCA_white.transform(X_train_trunc)
X_val_trunc_PCA_white = trunc_PCA_white.transform(X_val_trunc)

## Functions

In [None]:
# Candidate C values for the L1 search: 19 log-spaced points from 1e-3 to 1e3
C_tests_L1 = np.logspace(-3, 3, num=19)
# Number of cross-validation folds for the L1 search
K_fold_L1 = 10

In [None]:
def crossvalidate_L1_LinearSVC(X_t, y_t, X_v, y_v, C_params, K_folds):
    """Grid-search C for an L1-penalised LinearSVC and return the best model.

    Parameters
    ----------
    X_t, y_t : training features and labels handed to ``GridSearchCV.fit``.
    X_v, y_v : accepted but not used by this function.
    C_params : candidate values for the ``C`` hyper-parameter.
    K_folds : value passed as ``cv`` to ``GridSearchCV``.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search.

    Side effects: prints the elapsed search time, the winning parameters and
    how many entries in the first row of ``coef_`` are nonzero.
    """
    t_begin = time.time()

    # Only C varies; loss, penalty and dual are pinned to single values.
    param_grid = [{
        'loss': ['squared_hinge'],
        'penalty': ['l1'],
        'dual': [False],
        'C': C_params,
    }]
    search = GridSearchCV(LinearSVC(), param_grid, cv=K_folds, scoring='accuracy')
    search.fit(X_t, y_t)

    t_finish = time.time()

    print("L1 Cross-validation Training Time = ", (t_finish - t_begin))

    print("Best parameter set found on development set:")
    print(search.best_params_)
    print()

    winner = search.best_estimator_
    # Only the first row of coef_ is inspected.
    weights = winner.coef_[0]
    n_selected = np.count_nonzero(weights != 0)
    n_total = len(weights)
    print("Dimensionality of model: %s of %s" % (n_selected, n_total))
    print()

    return winner

In [None]:
def crossvalidate_final_LinearSVC(X_t, y_t, X_v, y_v, C_params, K_folds):
    """Grid-search C for an L2-penalised, hinge-loss LinearSVC.

    Parameters
    ----------
    X_t, y_t : training features and labels handed to ``GridSearchCV.fit``.
    X_v, y_v : held-out features and labels used to score the best model.
    C_params : candidate values for the ``C`` hyper-parameter.
    K_folds : value passed as ``cv`` to ``GridSearchCV``.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search.

    Side effects: prints the elapsed search time, the winning parameters and
    the fraction of ``X_v`` rows predicted correctly (labelled "Test Accuracy").
    """
    start = time.time()

    # LinearSVC rejects penalty='l2' with loss='hinge' when dual=False, so
    # this combination must use the dual formulation. The previous
    # dual=False setting made every fit in the search fail.
    tuned_parameters = [{
        'loss': ['hinge'],
        'penalty': ['l2'],
        'dual': [True],
        'C': C_params,
    }]
    # Perform cross validation
    clf = GridSearchCV(LinearSVC(), tuned_parameters, cv=K_folds, scoring='accuracy')
    clf.fit(X_t, y_t)

    end = time.time()

    print("Final Linear SVC Cross-validation Training Time = ", (end - start))

    print("Best parameter set found on development set:")
    print(clf.best_params_)
    print()

    bestmodel = clf.best_estimator_
    # Fraction of held-out samples whose predicted label matches y_v
    fCorr = np.mean(bestmodel.predict(X_v) == y_v)
    print("Test Accuracy = ", fCorr)
    print()

    return bestmodel

In [None]:
def crossvalidate_final_rbfSVC(X_t, y_t, X_v, y_v, C_params, K_folds):
    """Grid-search C for an RBF-kernel SVC and return the best model.

    Parameters
    ----------
    X_t, y_t : training features and labels handed to ``GridSearchCV.fit``.
    X_v, y_v : held-out features and labels used to score the best model.
    C_params : candidate values for the ``C`` hyper-parameter.
    K_folds : value passed as ``cv`` to ``GridSearchCV``.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search.

    Side effects: prints the elapsed search time, the winning parameters and
    the fraction of ``X_v`` rows predicted correctly (labelled "Test Accuracy").
    """
    t_begin = time.time()

    # Only C varies; kernel and cache_size are pinned to single values.
    param_grid = [{
        'C': C_params,
        'kernel': ['rbf'],
        'cache_size': [1000],
    }]
    search = GridSearchCV(SVC(), param_grid, cv=K_folds, scoring='accuracy')
    search.fit(X_t, y_t)

    t_finish = time.time()

    print("Final RBF SVC Cross-validation Training Time = ", (t_finish - t_begin))

    print("Best parameter set found on development set:")
    print(search.best_params_)
    print()

    winner = search.best_estimator_
    predictions = winner.predict(X_v)
    # Share of held-out samples whose predicted label equals the true label
    n_correct = sum(predictions == y_v)
    print("Test Accuracy = ", n_correct / len(y_v))
    print()

    return winner