Models trained on full training dataset.  

Pipeline is as follows:  
1) Data is word counts  
2) Data is sent through PCA with whitening  
3) Cross validation is used to choose the best L1-regularized LinearSVC  
4) The LinearSVC defines the subspace of features used for final SVC  
5) Cross validation is used to choose the best RBF SVC on reduced feature space

## Imports

In [1]:
from utils import get_unsplit_data, get_test_data
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
import numpy as np
import time
import pickle

# Load the labelled training data and the test data through the project
# helpers imported from `utils`.
X_train, y_train = get_unsplit_data()
X_test = get_test_data()

## Data Preprocessing

In [2]:
# Fit a whitening PCA on the training data only, so the test data is projected
# with exactly the same transformation and never influences the fit.
PCA_white = PCA(whiten=True).fit(X_train)

# Project both datasets into the whitened PCA space.
X_train_PCA_white = PCA_white.transform(X_train)
X_test_PCA_white = PCA_white.transform(X_test)

## Single Stage Functions

In [9]:
def crossvalidate_L1_LinearSVC(X_t, y_t, C_params, K_folds):
    """Grid-search an L1-penalised LinearSVC over the candidate ``C`` values.

    Prints the wall-clock time of the search, the mean accuracy (+/- two
    standard deviations) for every candidate, the winning parameter set, and
    how many coefficients in the first row of the winning model's ``coef_``
    are non-zero.

    Parameters
    ----------
    X_t : training feature matrix passed to ``GridSearchCV.fit``.
    y_t : training labels passed to ``GridSearchCV.fit``.
    C_params : iterable of candidate values for the ``C`` hyper-parameter.
    K_folds : value forwarded as ``cv`` to ``GridSearchCV``.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search.
    """
    t_begin = time.time()

    # Only C varies; the remaining entries pin the L1 / squared-hinge / primal
    # configuration of the LinearSVC.
    param_grid = [{
        'loss': ['squared_hinge'],
        'penalty': ['l1'],
        'dual': [False],
        'C': C_params,
    }]
    search = GridSearchCV(LinearSVC(), param_grid, cv=K_folds, scoring='accuracy')
    search.fit(X_t, y_t)

    elapsed = time.time() - t_begin
    print("L1 Cross-validation Training Time = ", elapsed)
    print()

    print("Grid scores:")
    print()
    results = search.cv_results_
    score_rows = zip(results['mean_test_score'], results['std_test_score'], results['params'])
    for avg_score, score_std, candidate in score_rows:
        print("%0.3f (+/-%0.03f) for %r" % (avg_score, score_std * 2, candidate))
    print()

    print("Best parameter set:")
    print(search.best_params_)
    winner = search.best_estimator_
    # Only the first row of coef_ is inspected when reporting sparsity.
    weights = winner.coef_[0]
    n_kept = np.sum(weights != 0)
    n_total = len(weights)
    print("Dimensionality of model: %s of %s" % (n_kept, n_total))
    print()

    return winner

In [4]:
def crossvalidate_final_rbfSVC(X_t, y_t, C_params, G_params, K_folds):
    """Grid-search an RBF-kernel SVC, then refit the winner with probabilities.

    The grid search runs over ``C`` and ``gamma`` with accuracy scoring and
    four parallel jobs. Its best parameter set is then used to train a fresh
    ``SVC`` with ``probability=True`` so the returned model supports
    ``predict_proba``. Timing, per-candidate scores, the support-vector
    count and the training accuracy are printed along the way.

    Parameters
    ----------
    X_t : training feature matrix.
    y_t : training labels.
    C_params : iterable of candidate values for ``C``.
    G_params : iterable of candidate values for ``gamma``.
    K_folds : value forwarded as ``cv`` to ``GridSearchCV``.

    Returns
    -------
    The refitted ``SVC`` trained on ``(X_t, y_t)`` with probability estimates
    enabled.
    """
    t_begin = time.time()
    param_grid = [{
        'C': C_params,
        'gamma': G_params,
        'kernel': ['rbf'],
        'cache_size': [2000],
    }]
    search = GridSearchCV(SVC(), param_grid, cv=K_folds, scoring='accuracy', n_jobs=4)
    search.fit(X_t, y_t)
    elapsed = time.time() - t_begin

    print("Final RBF SVC Cross-validation Training Time = ", elapsed)
    print()

    print("Grid scores:")
    print()
    results = search.cv_results_
    score_rows = zip(results['mean_test_score'], results['std_test_score'], results['params'])
    for avg_score, score_std, candidate in score_rows:
        print("%0.3f (+/-%0.03f) for %r" % (avg_score, score_std * 2, candidate))
    print()

    print("Best parameter set:")
    print(search.best_params_)
    print()

    print("Retraining model for decision probabilities")
    # The grid search ran without probability estimates, so the winning
    # configuration is trained again with them switched on.
    final_kwargs = dict(search.best_params_, probability=True)

    t_begin = time.time()
    model = SVC(**final_kwargs)
    model.fit(X_t, y_t)
    elapsed = time.time() - t_begin

    print("Final RBF SVC Training Time = ", elapsed)
    print()

    support_count = len(model.support_)
    n_correct = np.sum(model.predict(X_t) == y_t)
    train_accuracy = n_correct / len(y_t)
    print("Number of Support Vectors = ", support_count)
    print("Training Accuracy = ", train_accuracy)
    print()

    return model

In [5]:
def predict_test_data(final_model, X_PCA_test_reduced):
    """Return ``[labels, probabilities]`` predicted by ``final_model``.

    ``final_model`` must expose ``predict`` and ``predict_proba``; both are
    called, in that order, on ``X_PCA_test_reduced``.
    """
    return [
        final_model.predict(X_PCA_test_reduced),
        final_model.predict_proba(X_PCA_test_reduced),
    ]

In [6]:
def save_predicted_labels(y_label, file_prefix):
    """Write predicted labels to ``<file_prefix>_pred_labels.txt`` as CSV.

    The file has an ``Id,Prediction`` header row followed by one row per
    sample, where ``Id`` is the 1-based position of the sample.

    Parameters
    ----------
    y_label : array-like of predicted labels, one per sample (a plain list
        is accepted as well as a numpy array).
    file_prefix : string prepended to ``"_pred_labels.txt"`` to form the
        output path.
    """
    labels = np.asarray(y_label)
    n_rows = len(labels)
    labels = labels.reshape((n_rows, 1))
    ids = np.arange(1, n_rows + 1).reshape((n_rows, 1))
    table = np.concatenate((ids, labels), axis=1)
    # comments='' stops np.savetxt from prefixing the header with "# ",
    # which would otherwise make the first row "# Id,Prediction" rather than
    # a proper CSV header.
    np.savetxt(
        file_prefix + "_pred_labels.txt",
        table,
        fmt="%d",
        delimiter=',',
        header='Id,Prediction',
        comments='',
    )

In [7]:
def save_predicted_probs(y_probs, file_prefix):
    """Write class probabilities to ``<file_prefix>_pred_probs.txt`` as CSV.

    The file has a header row ``Id,P[0],P[1],...`` with one ``P[k]`` per
    column of ``y_probs``, followed by one row per sample. ``Id`` is the
    1-based position of the sample and is written as an integer; the
    probabilities are written with ``%f``.

    Parameters
    ----------
    y_probs : 2-D array-like with one row per sample and one column per
        class.
    file_prefix : string prepended to ``"_pred_probs.txt"`` to form the
        output path.
    """
    probs = np.asarray(y_probs)
    n_rows, n_classes = probs.shape
    ids = np.arange(1, n_rows + 1).reshape((n_rows, 1))
    table = np.concatenate((ids, probs), axis=1)
    # Derive the header and per-column formats from the number of columns so
    # the function is not limited to exactly two classes.
    header = ','.join(['Id'] + ['P[%d]' % k for k in range(n_classes)])
    column_formats = ["%d"] + ["%f"] * n_classes
    # comments='' stops np.savetxt from prefixing the header with "# ".
    np.savetxt(
        file_prefix + "_pred_probs.txt",
        table,
        fmt=column_formats,
        delimiter=',',
        header=header,
        comments='',
    )

## Full Pipelines

In [8]:
def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file even if pickling fails."""
    with open(path, "wb") as fh:
        pickle.dump(obj, fh)


def rbfSVC_pipeline(PCA_model, X_t, y_t, X_v, C_init, K_init, C_final, G_final, K_final, file_prefix):
    """Run feature selection, final RBF SVC training and test-set prediction.

    Steps:
      1. Pickle ``PCA_model`` to ``<file_prefix>_pca.p``.
      2. Cross-validate an L1-regularised LinearSVC on ``(X_t, y_t)`` and
         pickle it to ``<file_prefix>_reducing_svc.p``.
      3. Keep only the columns of ``X_t`` and ``X_v`` whose coefficient in the
         first row of that model's ``coef_`` is non-zero.
      4. Cross-validate and train the final RBF SVC on the reduced training
         data and pickle it to ``<file_prefix>_final_svc.p``.
      5. Predict labels and probabilities for the reduced ``X_v`` and save
         them via ``save_predicted_labels`` / ``save_predicted_probs``.

    Parameters
    ----------
    PCA_model : object to pickle alongside the models; it is not otherwise
        used here, so ``X_t`` and ``X_v`` are expected to be transformed
        already.
    X_t, y_t : training features and labels.
    X_v : features of the data to predict on.
    C_init, K_init : candidate ``C`` values and ``cv`` setting for the
        L1 LinearSVC search.
    C_final, G_final, K_final : candidate ``C`` values, candidate ``gamma``
        values and ``cv`` setting for the RBF SVC search.
    file_prefix : prefix for every file written by the pipeline.

    Returns
    -------
    ``[reducing_model, final_model]``.
    """
    # Save PCA model
    _dump_pickle(PCA_model, file_prefix + "_pca.p")

    # Train L1-regularized linear SVC for dimension reduction
    reducing_model = crossvalidate_L1_LinearSVC(X_t, y_t, C_init, K_init)
    _dump_pickle(reducing_model, file_prefix + "_reducing_svc.p")

    # Reduce dimension of both datasets with one shared mask so the training
    # and prediction data are guaranteed to keep the same columns.
    keep_mask = (reducing_model.coef_ != 0)[0]
    X_t_reduced = X_t[:, keep_mask]
    X_v_reduced = X_v[:, keep_mask]

    # Train final rbf SVC
    final_model = crossvalidate_final_rbfSVC(X_t_reduced, y_t, C_final, G_final, K_final)
    _dump_pickle(final_model, file_prefix + "_final_svc.p")

    # Predict test labels and probabilities, then save both
    y_label, y_prob = predict_test_data(final_model, X_v_reduced)
    save_predicted_labels(y_label, file_prefix)
    save_predicted_probs(y_prob, file_prefix)
    return [reducing_model, final_model]

## Training Models

In [None]:
# RBF SVC trained on whitened PCA of raw data

# Search settings for the L1-regularised LinearSVC (feature-selection stage).
C_i = np.logspace(-3, 0, 10)
K_i = 10

# Search settings for the final RBF SVC: C grid, gamma grid, number of folds.
C_f = np.logspace(-4, 2, 13)
G_f = np.logspace(-4, 0, 9)
K_f = 5

fileprefix = 'Final_Pipeline_'

rModel, fModel = rbfSVC_pipeline(
    PCA_white,
    X_train_PCA_white,
    y_train,
    X_test_PCA_white,
    C_i,
    K_i,
    C_f,
    G_f,
    K_f,
    fileprefix,
)

Output:
    
L1 Cross-validation Training Time =  452.21334862709045  

Best parameter set found on development set:  
{'penalty': 'l1', 'C': 0.01, 'loss': 'squared_hinge', 'dual': False}  
Dimensionality of model: 818 of 1000  

Final RBF SVC Cross-validation Training Time =  11110.732734203339  

Best parameter set found on development set:  
{'kernel': 'rbf', 'C': 10.0, 'gamma': 0.0001, 'cache_size': 2000}  

Retraining model for decision probabilities  
Final RBF SVC Training Time =  1113.8617765903473  

Number of Support Vectors =  9161  
Training Accuracy =  0.89405  

