Models trained on full training dataset.  

Pipeline is as follows:  
1) Data is either raw word counts or word counts capped at one  
2) Data is sent through PCA without whitening (linear SVC) or with whitening (RBF SVC)  
3) Cross validation is used to choose the best L1-regularized LinearSVC  
4) The LinearSVC defines the subspace of features used for final SVC  
5) Cross validation is used to choose the best SVC on reduced feature space

## Imports

In [1]:
from utils import get_unsplit_data, get_test_data
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
import numpy as np
import time
import pickle

# Load the full training features/labels and the test features via the
# project helpers in utils.
X_train, y_train = get_unsplit_data()
X_test = get_test_data()

## Data Preprocessing

In [2]:
# Cap every entry at one (element-wise minimum against an all-ones array of
# the same shape), for the training and the test matrix alike.
X_train_trunc, X_test_trunc = (
    np.minimum(counts, np.ones(counts.shape)) for counts in (X_train, X_test)
)

In [3]:
# Fit a default (non-whitening) PCA on the raw training data
raw_PCA = PCA().fit(X_train)

# Project the training and test data with the PCA fitted on the training data
X_train_raw_PCA, X_test_raw_PCA = (
    raw_PCA.transform(split) for split in (X_train, X_test)
)

In [4]:
# Fit a whitening PCA on the raw training data
raw_PCA_white = PCA(whiten=True).fit(X_train)

# Project the training and test data with the PCA fitted on the training data
X_train_raw_PCA_white, X_test_raw_PCA_white = (
    raw_PCA_white.transform(split) for split in (X_train, X_test)
)

In [5]:
# Fit a default (non-whitening) PCA on the truncated training data
trunc_PCA = PCA().fit(X_train_trunc)

# Project the truncated training and test data with that same PCA
X_train_trunc_PCA, X_test_trunc_PCA = (
    trunc_PCA.transform(split) for split in (X_train_trunc, X_test_trunc)
)

In [6]:
# Fit a whitening PCA on the truncated training data
trunc_PCA_white = PCA(whiten=True).fit(X_train_trunc)

# Project the truncated training and test data with that same PCA
X_train_trunc_PCA_white, X_test_trunc_PCA_white = (
    trunc_PCA_white.transform(split) for split in (X_train_trunc, X_test_trunc)
)

## Single Stage Functions

In [7]:
def crossvalidate_L1_LinearSVC(X_t, y_t, C_params, K_folds):
    """Grid-search C for an L1-penalized LinearSVC and return the best model.

    Parameters
    ----------
    X_t, y_t : training features and labels passed to ``GridSearchCV.fit``.
    C_params : candidate values for the ``C`` parameter.
    K_folds : value forwarded as ``cv`` to ``GridSearchCV``.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search.

    The search time, the best parameters and the number of non-zero entries in
    the first row of ``coef_`` are printed along the way.
    """
    t_begin = time.time()
    # Only C varies; the remaining entries pin the L1 / squared-hinge setup
    param_grid = [{
        'loss': ['squared_hinge'],
        'penalty': ['l1'],
        'dual': [False],
        'C': C_params,
    }]
    search = GridSearchCV(LinearSVC(), param_grid, cv=K_folds, scoring='accuracy')
    search.fit(X_t, y_t)
    elapsed = time.time() - t_begin

    print("L1 Cross-validation Training Time = ", elapsed)
    print()

    print("Best parameter set found on development set:")
    print(search.best_params_)

    selected = search.best_estimator_
    # Only the first coefficient row is inspected
    # (presumably a binary problem - TODO confirm)
    weights = selected.coef_[0]
    n_nonzero = np.count_nonzero(weights)
    n_total = len(weights)
    print("Dimensionality of model: %s of %s" % (n_nonzero, n_total))
    print()

    return selected

In [8]:
def crossvalidate_final_LinearSVC(X_t, y_t, C_params, K_folds):
    """Grid-search C for a linear-kernel SVC, then refit with probabilities.

    Parameters
    ----------
    X_t, y_t : training features and labels.
    C_params : candidate values for the ``C`` parameter.
    K_folds : value forwarded as ``cv`` to ``GridSearchCV``.

    Returns
    -------
    An ``SVC`` fitted on ``(X_t, y_t)`` with the best parameters found and
    ``probability=True``.

    Timings, the best parameters, the support-vector count and the training
    accuracy are printed along the way.
    """
    cv_begin = time.time()
    # Only C varies; kernel and cache size are fixed single-value entries
    search_grid = [{'C': C_params, 'kernel': ['linear'], 'cache_size': [2000]}]
    search = GridSearchCV(SVC(), search_grid, cv=K_folds, scoring='accuracy')
    search.fit(X_t, y_t)
    cv_elapsed = time.time() - cv_begin

    print("Linear SVC Cross-validation Training Time = ", cv_elapsed)
    print()

    print("Best parameter set found on development set:")
    print(search.best_params_)
    print()

    print("Retraining model for decision probabilities")
    # Same winning parameters, plus probability estimates for the final fit
    final_kwargs = {**search.best_params_, 'probability': True}

    fit_begin = time.time()
    model = SVC(**final_kwargs)
    model.fit(X_t, y_t)
    fit_elapsed = time.time() - fit_begin

    print("Final Linear SVC Training Time = ", fit_elapsed)
    print()

    n_support = len(model.support_)
    # Fraction of training samples whose predicted label matches y_t
    n_correct = np.sum(model.predict(X_t) == y_t)
    train_accuracy = n_correct/len(y_t)
    print("Number of Support Vectors = ", n_support)
    print("Training Accuracy = ", train_accuracy)
    print()

    return model

In [9]:
def crossvalidate_final_rbfSVC(X_t, y_t, C_params, G_params, K_folds):
    """Grid-search C and gamma for an RBF-kernel SVC, then refit with probabilities.

    Parameters
    ----------
    X_t, y_t : training features and labels.
    C_params : candidate values for the ``C`` parameter.
    G_params : candidate values for the ``gamma`` parameter.
    K_folds : value forwarded as ``cv`` to ``GridSearchCV``.

    Returns
    -------
    An ``SVC`` fitted on ``(X_t, y_t)`` with the best parameters found and
    ``probability=True``.

    Timings, the best parameters, the support-vector count and the training
    accuracy are printed along the way.
    """
    cv_begin = time.time()
    # C and gamma vary; kernel and cache size are fixed single-value entries
    search_grid = [{
        'C': C_params,
        'gamma': G_params,
        'kernel': ['rbf'],
        'cache_size': [2000],
    }]
    search = GridSearchCV(SVC(), search_grid, cv=K_folds, scoring='accuracy')
    search.fit(X_t, y_t)
    cv_elapsed = time.time() - cv_begin

    print("Final RBF SVC Cross-validation Training Time = ", cv_elapsed)
    print()

    print("Best parameter set found on development set:")
    print(search.best_params_)
    print()

    print("Retraining model for decision probabilities")
    # Same winning parameters, plus probability estimates for the final fit
    final_kwargs = {**search.best_params_, 'probability': True}

    fit_begin = time.time()
    model = SVC(**final_kwargs)
    model.fit(X_t, y_t)
    fit_elapsed = time.time() - fit_begin

    print("Final RBF SVC Training Time = ", fit_elapsed)
    print()

    n_support = len(model.support_)
    # Fraction of training samples whose predicted label matches y_t
    n_correct = np.sum(model.predict(X_t) == y_t)
    train_accuracy = n_correct/len(y_t)
    print("Number of Support Vectors = ", n_support)
    print("Training Accuracy = ", train_accuracy)
    print()

    return model

In [10]:
def predict_test_data(final_model, X_PCA_test_reduced):
    """Return ``[labels, probabilities]`` predicted by ``final_model``.

    ``final_model`` must provide ``predict`` and ``predict_proba``; both are
    called, in that order, on ``X_PCA_test_reduced``.
    """
    predictions = [
        final_model.predict(X_PCA_test_reduced),        # predicted labels
        final_model.predict_proba(X_PCA_test_reduced),  # prediction probabilities
    ]
    return predictions

In [21]:
def save_predicted_labels(y_label, file_prefix):
    """Write predicted labels to ``<file_prefix>_pred_labels.txt``.

    Each row is ``<1-based id>,<label>``, formatted as integers.

    NOTE: ``np.savetxt`` prefixes the header with its default comment marker,
    so the first line of the file is ``# Id,Prediction``.

    Parameters
    ----------
    y_label : array of predicted labels; must be reshapeable to one column.
    file_prefix : path prefix of the output file.
    """
    n_rows = len(y_label)
    # 1-based row ids in the first column, labels in the second
    ids = np.arange(1, n_rows + 1).reshape((n_rows, 1))
    labels = y_label.reshape((n_rows, 1))
    table = np.hstack((ids, labels))
    np.savetxt(file_prefix + "_pred_labels.txt", table, fmt="%d", delimiter=',', header='Id,Prediction')

In [22]:
def save_predicted_probs(y_probs, file_prefix):
    """Write class probabilities to ``<file_prefix>_pred_probs.txt``.

    Each row is ``<1-based id>,<p0>,<p1>,...`` with the id formatted as an
    integer and each probability with ``%f``. The column formats and the
    header are derived from the number of columns in ``y_probs``, so any
    class count is accepted; for two columns the output is identical to the
    previous hard-coded ``Id,P[0],P[1]`` layout.

    NOTE: ``np.savetxt`` prefixes the header with its default comment marker,
    so the first line of the file starts with ``# ``.

    Parameters
    ----------
    y_probs : 2-D array-like of shape (n_samples, n_classes).
    file_prefix : path prefix of the output file.
    """
    y_probs = np.asarray(y_probs)
    n_rows, n_classes = y_probs.shape

    # 1-based row ids in the first column, probabilities in the rest
    ids = np.arange(1, n_rows + 1).reshape((n_rows, 1))
    result = np.concatenate((ids, y_probs), axis=1)

    # One "%f" and one "P[k]" header entry per probability column
    column_formats = ["%d"] + ["%f"] * n_classes
    header = ",".join(["Id"] + ["P[%d]" % k for k in range(n_classes)])

    np.savetxt(file_prefix + "_pred_probs.txt", result, fmt=column_formats, delimiter=',', header=header)

## Full Pipelines

In [13]:
def linearSVC_pipeline(PCA_model, X_t, y_t, X_v, C_init, K_init, C_final, K_final, file_prefix):
    """Run L1 feature selection followed by a final linear-kernel SVC.

    Steps:
      1. Pickle ``PCA_model`` to ``<file_prefix>_pca.p``.
      2. Cross-validate an L1-regularized LinearSVC on ``(X_t, y_t)`` and
         pickle it to ``<file_prefix>_reducing_svc.p``.
      3. Keep only the columns whose coefficient in the first row of the
         reducing model's ``coef_`` is non-zero, in both ``X_t`` and ``X_v``.
      4. Cross-validate the final linear SVC on the reduced training data and
         pickle it to ``<file_prefix>_final_svc.p``.
      5. Predict labels and probabilities for the reduced ``X_v`` and save
         them with ``save_predicted_labels`` / ``save_predicted_probs``.

    Parameters
    ----------
    PCA_model : object to pickle alongside the models (not otherwise used).
    X_t, y_t : training features (already PCA-transformed by the caller) and labels.
    X_v : features to predict on, in the same feature space as ``X_t``.
    C_init, K_init : C grid and ``cv`` value for the reducing L1 model.
    C_final, K_final : C grid and ``cv`` value for the final SVC.
    file_prefix : path prefix for every file written.

    Returns
    -------
    ``[reducing_model, final_model]``
    """
    # Save PCA model (context managers guarantee the files are closed)
    with open(file_prefix + "_pca.p", "wb") as fh:
        pickle.dump(PCA_model, fh)

    # Train L1-regularized linear SVC for dimension reduction
    reducing_model = crossvalidate_L1_LinearSVC(X_t, y_t, C_init, K_init)
    with open(file_prefix + "_reducing_svc.p", "wb") as fh:
        pickle.dump(reducing_model, fh)

    # Reduce dimension of dataset: one boolean mask, applied to both matrices
    keep = (reducing_model.coef_ != 0)[0]
    X_t_reduced = X_t[:, keep]
    X_v_reduced = X_v[:, keep]

    # Train final linear SVC
    final_model = crossvalidate_final_LinearSVC(X_t_reduced, y_t, C_final, K_final)
    with open(file_prefix + "_final_svc.p", "wb") as fh:
        pickle.dump(final_model, fh)

    # Predict test labels and probabilities, then save both
    y_label, y_prob = predict_test_data(final_model, X_v_reduced)
    save_predicted_labels(y_label, file_prefix)
    save_predicted_probs(y_prob, file_prefix)
    return [reducing_model, final_model]

In [14]:
def rbfSVC_pipeline(PCA_model, X_t, y_t, X_v, C_init, K_init, C_final, G_final, K_final, file_prefix):
    """Run L1 feature selection followed by a final RBF-kernel SVC.

    Steps:
      1. Pickle ``PCA_model`` to ``<file_prefix>_pca.p``.
      2. Cross-validate an L1-regularized LinearSVC on ``(X_t, y_t)`` and
         pickle it to ``<file_prefix>_reducing_svc.p``.
      3. Keep only the columns whose coefficient in the first row of the
         reducing model's ``coef_`` is non-zero, in both ``X_t`` and ``X_v``.
      4. Cross-validate the final RBF SVC on the reduced training data and
         pickle it to ``<file_prefix>_final_svc.p``.
      5. Predict labels and probabilities for the reduced ``X_v`` and save
         them with ``save_predicted_labels`` / ``save_predicted_probs``.

    Parameters
    ----------
    PCA_model : object to pickle alongside the models (not otherwise used).
    X_t, y_t : training features (already PCA-transformed by the caller) and labels.
    X_v : features to predict on, in the same feature space as ``X_t``.
    C_init, K_init : C grid and ``cv`` value for the reducing L1 model.
    C_final, G_final, K_final : C grid, gamma grid and ``cv`` value for the final SVC.
    file_prefix : path prefix for every file written.

    Returns
    -------
    ``[reducing_model, final_model]``
    """
    # Save PCA model (context managers guarantee the files are closed)
    with open(file_prefix + "_pca.p", "wb") as fh:
        pickle.dump(PCA_model, fh)

    # Train L1-regularized linear SVC for dimension reduction
    reducing_model = crossvalidate_L1_LinearSVC(X_t, y_t, C_init, K_init)
    with open(file_prefix + "_reducing_svc.p", "wb") as fh:
        pickle.dump(reducing_model, fh)

    # Reduce dimension of dataset: one boolean mask, applied to both matrices
    keep = (reducing_model.coef_ != 0)[0]
    X_t_reduced = X_t[:, keep]
    X_v_reduced = X_v[:, keep]

    # Train final rbf SVC
    final_model = crossvalidate_final_rbfSVC(X_t_reduced, y_t, C_final, G_final, K_final)
    with open(file_prefix + "_final_svc.p", "wb") as fh:
        pickle.dump(final_model, fh)

    # Predict test labels and probabilities, then save both
    y_label, y_prob = predict_test_data(final_model, X_v_reduced)
    save_predicted_labels(y_label, file_prefix)
    save_predicted_probs(y_prob, file_prefix)
    return [reducing_model, final_model]

## Training Models

In [None]:
# Linear SVC on the (non-whitened) PCA projection of the raw data

# Reducing L1 model: C grid and number of CV folds
C_i = np.logspace(-2, 2, num=9)
K_i = 10

# Final linear SVC: C grid and number of CV folds
C_f = np.logspace(-2, 2, num=9)
K_f = 5

fileprefix = 'Linear_SVC_rawPCA'

rModel, fModel = linearSVC_pipeline(
    PCA_model=raw_PCA,
    X_t=X_train_raw_PCA,
    y_t=y_train,
    X_v=X_test_raw_PCA,
    C_init=C_i,
    K_init=K_i,
    C_final=C_f,
    K_final=K_f,
    file_prefix=fileprefix,
)

In [None]:
# Linear SVC on the (non-whitened) PCA projection of the truncated data

# Reducing L1 model: C grid and number of CV folds
C_i = np.logspace(-2, 2, num=9)
K_i = 10

# Final linear SVC: C grid and number of CV folds
C_f = np.logspace(-2, 2, num=9)
K_f = 5

fileprefix = 'Linear_SVC_truncPCA'

rModel, fModel = linearSVC_pipeline(
    PCA_model=trunc_PCA,
    X_t=X_train_trunc_PCA,
    y_t=y_train,
    X_v=X_test_trunc_PCA,
    C_init=C_i,
    K_init=K_i,
    C_final=C_f,
    K_final=K_f,
    file_prefix=fileprefix,
)

In [None]:
# RBF SVC on the whitened PCA projection of the raw data

# Reducing L1 model: C grid and number of CV folds
C_i = np.logspace(-2, 2, num=9)
K_i = 10

# Final RBF SVC: C grid, gamma grid and number of CV folds
C_f = np.logspace(-1, 1, num=3)
G_f = np.logspace(-4, -2, num=3)
K_f = 3

# NOTE(review): the trailing underscore yields double underscores in the
# output file names (e.g. "..._" + "_pca.p"), unlike the linear runs - confirm intended.
fileprefix = 'RBF_SVC_whitenedRawPCA_'

rModel, fModel = rbfSVC_pipeline(
    PCA_model=raw_PCA_white,
    X_t=X_train_raw_PCA_white,
    y_t=y_train,
    X_v=X_test_raw_PCA_white,
    C_init=C_i,
    K_init=K_i,
    C_final=C_f,
    G_final=G_f,
    K_final=K_f,
    file_prefix=fileprefix,
)

In [None]:
# RBF SVC on the whitened PCA projection of the truncated data

# Reducing L1 model: C grid and number of CV folds
C_i = np.logspace(-2, 2, num=9)
K_i = 10

# Final RBF SVC: C grid, gamma grid and number of CV folds
C_f = np.logspace(-1, 1, num=3)
G_f = np.logspace(-4, -2, num=3)
K_f = 3

# NOTE(review): the trailing underscore yields double underscores in the
# output file names (e.g. "..._" + "_pca.p"), unlike the linear runs - confirm intended.
fileprefix = 'RBF_SVC_whitenedTruncPCA_'

rModel, fModel = rbfSVC_pipeline(
    PCA_model=trunc_PCA_white,
    X_t=X_train_trunc_PCA_white,
    y_t=y_train,
    X_v=X_test_trunc_PCA_white,
    C_init=C_i,
    K_init=K_i,
    C_final=C_f,
    G_final=G_f,
    K_final=K_f,
    file_prefix=fileprefix,
)