In [1]:
from utils import get_split_data
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
import numpy as np

X_train, y_train, X_val, y_val = get_split_data()
C_tests = [.001, .005, .01, .05, .1, .5, 1, 5, 10, 50, 100, 500, 1000]
K_fold = 5

In [2]:
def crossvalidate_L1LinearSVC(X_t, y_t, X_v, y_v, C_params, K_folds):
    # Set parameters to be crossvalidated
    tuned_parameters = [{'loss':['squared_hinge'], 'penalty':['l1'], 'dual':[False], 
                         'C': C_params}]
    # Perform cross validation
    clf = GridSearchCV(LinearSVC(), tuned_parameters, cv=K_folds, scoring='accuracy')
    clf.fit(X_t, y_t)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_v, clf.predict(X_v)
    print(classification_report(y_true, y_pred))
    print()
    N_support = sum(clf.best_estimator_.coef_[0] != 0)
    N_dim = len(clf.best_estimator_.coef_[0])
    print("Dimensionality of suppport: %s of %s" % (N_support, N_dim))

In [3]:
# Raw data
crossvalidate_L1LinearSVC(X_train, y_train, X_val, y_val, C_tests, K_fold)

Best parameters set found on development set:

{'C': 0.1, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}

Grid scores on development set:

0.700 (+/-0.012) for {'C': 0.001, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.802 (+/-0.007) for {'C': 0.005, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.826 (+/-0.009) for {'C': 0.01, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.847 (+/-0.009) for {'C': 0.05, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.848 (+/-0.012) for {'C': 0.1, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.844 (+/-0.013) for {'C': 0.5, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.843 (+/-0.013) for {'C': 1, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.843 (+/-0.013) for {'C': 5, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.843 (+/-0.013) for {'C': 10, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.843 (+/-0.014) for {'C': 50, 'penalty': 'l1', '

In [4]:
# Normalize input data to total word count of review

word_count_train = np.sum(X_train, axis=1)
word_count_train[word_count_train==0] = 1     # Reviews of length zero are given effective length one
X_train_normed = np.divide(X_train, np.stack([word_count_train]*X_train.shape[1], axis=1))

word_count_val = np.sum(X_val, axis=1)
word_count_val[word_count_val==0] = 1     # Reviews of length zero are given effective length one
X_val_normed = np.divide(X_val, np.stack([word_count_val]*X_val.shape[1], axis=1))


crossvalidate_L1LinearSVC(X_train_normed, y_train, X_val_normed, y_val, C_tests, K_fold)

Best parameters set found on development set:

{'C': 5, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}

Grid scores on development set:

0.495 (+/-0.000) for {'C': 0.001, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.601 (+/-0.015) for {'C': 0.005, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.637 (+/-0.018) for {'C': 0.01, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.748 (+/-0.011) for {'C': 0.05, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.791 (+/-0.007) for {'C': 0.1, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.839 (+/-0.011) for {'C': 0.5, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.847 (+/-0.007) for {'C': 1, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.847 (+/-0.008) for {'C': 5, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.846 (+/-0.008) for {'C': 10, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.845 (+/-0.008) for {'C': 50, 'penalty': 'l1', 'du

In [5]:
# Truncate word counts to one
X_train_trunc = np.minimum(X_train, np.ones(X_train.shape))
X_val_trunc = np.minimum(X_val, np.ones(X_val.shape))


crossvalidate_L1LinearSVC(X_train_trunc, y_train, X_val_trunc, y_val, C_tests, K_fold)

Best parameters set found on development set:

{'C': 0.05, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}

Grid scores on development set:

0.668 (+/-0.017) for {'C': 0.001, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.791 (+/-0.006) for {'C': 0.005, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.813 (+/-0.013) for {'C': 0.01, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.844 (+/-0.010) for {'C': 0.05, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.843 (+/-0.011) for {'C': 0.1, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.839 (+/-0.010) for {'C': 0.5, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.838 (+/-0.010) for {'C': 1, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.837 (+/-0.011) for {'C': 5, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.837 (+/-0.011) for {'C': 10, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.837 (+/-0.011) for {'C': 50, 'penalty': 'l1', 

In [6]:
# PCA based on raw input data
rawPCA = PCA()
rawPCA.fit(X_train)

# Transform training and validation data
X_train_rawPCA = rawPCA.transform(X_train)
X_val_rawPCA = rawPCA.transform(X_val)


crossvalidate_L1LinearSVC(X_train_rawPCA, y_train, X_val_rawPCA, y_val, C_tests, K_fold)

Best parameters set found on development set:

{'C': 0.1, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}

Grid scores on development set:

0.715 (+/-0.012) for {'C': 0.001, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.803 (+/-0.005) for {'C': 0.005, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.820 (+/-0.006) for {'C': 0.01, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.846 (+/-0.008) for {'C': 0.05, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.847 (+/-0.010) for {'C': 0.1, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.844 (+/-0.014) for {'C': 0.5, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.844 (+/-0.014) for {'C': 1, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.843 (+/-0.014) for {'C': 5, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.843 (+/-0.014) for {'C': 10, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.843 (+/-0.014) for {'C': 50, 'penalty': 'l1', '

In [7]:
# PCA based on normed input data
normedPCA = PCA()
normedPCA.fit(X_train_normed)

X_train_normedPCA = normedPCA.transform(X_train_normed)
X_val_normedPCA = normedPCA.transform(X_val_normed)


crossvalidate_L1LinearSVC(X_train_normed, y_train, X_val_normedPCA, y_val, C_tests, K_fold)

Best parameters set found on development set:

{'C': 5, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}

Grid scores on development set:

0.495 (+/-0.000) for {'C': 0.001, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.601 (+/-0.015) for {'C': 0.005, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.637 (+/-0.018) for {'C': 0.01, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.748 (+/-0.011) for {'C': 0.05, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.791 (+/-0.007) for {'C': 0.1, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.839 (+/-0.011) for {'C': 0.5, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.847 (+/-0.007) for {'C': 1, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.847 (+/-0.008) for {'C': 5, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.846 (+/-0.008) for {'C': 10, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.845 (+/-0.008) for {'C': 50, 'penalty': 'l1', 'du

In [8]:
# PCA based on truncated input data
truncPCA = PCA()
truncPCA.fit(X_train_trunc)

X_train_truncPCA = truncPCA.transform(X_train_trunc)
X_val_truncPCA = truncPCA.transform(X_val_trunc)

crossvalidate_L1LinearSVC(X_train_truncPCA, y_train, X_val_truncPCA, y_val, C_tests, K_fold)

Best parameters set found on development set:

{'C': 0.1, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}

Grid scores on development set:

0.710 (+/-0.011) for {'C': 0.001, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.792 (+/-0.009) for {'C': 0.005, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.814 (+/-0.011) for {'C': 0.01, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.841 (+/-0.009) for {'C': 0.05, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.843 (+/-0.008) for {'C': 0.1, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.839 (+/-0.011) for {'C': 0.5, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.838 (+/-0.011) for {'C': 1, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.837 (+/-0.011) for {'C': 5, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.837 (+/-0.011) for {'C': 10, 'penalty': 'l1', 'dual': False, 'loss': 'squared_hinge'}
0.837 (+/-0.011) for {'C': 50, 'penalty': 'l1', '