In [1]:
from utils import get_split_data
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
import numpy as np
import time

# Load the data through the project's `utils.get_split_data` helper. Its four
# return values are unpacked, in order, as training features, training labels,
# validation features and validation labels (naming per this notebook's usage).
X_train, y_train, X_val, y_val = get_split_data()

In [2]:
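# Normalize input data to the total word count of each review.
# (Left disabled: as noted at the end of this notebook, input normalized to
# review length generally performs poorly.)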

#word_count_train = np.sum(X_train, axis=1)
#word_count_train[word_count_train==0] = 1     # Reviews of length zero are given effective length one
#X_train_normed = np.divide(X_train, np.stack([word_count_train]*X_train.shape[1], axis=1))

#word_count_val = np.sum(X_val, axis=1)
#word_count_val[word_count_val==0] = 1     # Reviews of length zero are given effective length one
#X_val_normed = np.divide(X_val, np.stack([word_count_val]*X_val.shape[1], axis=1))

In [3]:
# Cap every word count at one, turning the count features into presence/absence
# indicators. The same element-wise minimum against an all-ones array of
# matching shape is applied to both splits.
X_train_trunc, X_val_trunc = (
    np.minimum(counts, np.ones(counts.shape)) for counts in (X_train, X_val)
)

In [4]:
# PCA on the raw input data: the projection is learned from the training split
# only (no n_components is given, so PCA's default is used).
raw_PCA = PCA().fit(X_train)

# Project both the training and the validation split with the fitted PCA.
X_train_raw_PCA, X_val_raw_PCA = (
    raw_PCA.transform(split) for split in (X_train, X_val)
)

In [5]:
# Whitened PCA on the raw input data: same as the plain PCA variant, but with
# whiten=True; fitted on the training split only.
raw_PCA_white = PCA(whiten=True).fit(X_train)

# Project both the training and the validation split with the fitted PCA.
X_train_raw_PCA_white, X_val_raw_PCA_white = (
    raw_PCA_white.transform(split) for split in (X_train, X_val)
)

In [6]:
# PCA on the truncated (capped-at-one) input data, fitted on the training
# split only.
trunc_PCA = PCA().fit(X_train_trunc)

# Project both truncated splits with the fitted PCA.
X_train_trunc_PCA, X_val_trunc_PCA = (
    trunc_PCA.transform(split) for split in (X_train_trunc, X_val_trunc)
)

In [7]:
# Whitened PCA on the truncated (capped-at-one) input data, fitted on the
# training split only.
trunc_PCA_white = PCA(whiten=True).fit(X_train_trunc)

# Project both truncated splits with the fitted whitened PCA.
X_train_trunc_PCA_white, X_val_trunc_PCA_white = (
    trunc_PCA_white.transform(split) for split in (X_train_trunc, X_val_trunc)
)

In [8]:
def linearSVCmodel(X_t, y_t, X_v, y_v):
    """Fit a linear-kernel SVC and print training time, accuracy and model size.

    Parameters
    ----------
    X_t, y_t
        Training features and labels, passed unchanged to ``SVC.fit``.
    X_v, y_v
        Held-out features and labels used to compute the accuracy figure.

    Returns
    -------
    None
        Results are only printed. The accuracy line is labelled
        "Test Accuracy" even though this notebook passes the validation split.
    """
    # perf_counter() is a monotonic clock, so the reported duration cannot be
    # skewed by wall-clock adjustments during a multi-minute fit.
    start = time.perf_counter()
    # cache_size is forwarded to SVC (kernel cache size in MB per the
    # scikit-learn documentation).
    model = SVC(kernel='linear', cache_size=1000)
    model.fit(X_t, y_t)
    end = time.perf_counter()
    print("Training Time = ", (end - start))

    # Fraction of correct predictions, computed in one vectorised step instead
    # of iterating over the boolean array with the Python builtin sum().
    predictions = np.asarray(model.predict(X_v))
    fCorr = np.mean(predictions == np.asarray(y_v))
    nSupp = len(model.support_)
    print("Test Accuracy = ", fCorr)
    print("Number of Support Vectors = ", nSupp)
    print("\n")
    
# Run the linear-kernel SVC on every feature representation, in a fixed order.
# Each entry is (heading to print, training features, validation features);
# the labels are the same for all representations.
linear_experiments = [
    ("Linear Kernel: Raw Data", X_train, X_val),
    ("Linear Kernel: Trunc Data", X_train_trunc, X_val_trunc),
    ("Linear Kernel: PCA Raw Data", X_train_raw_PCA, X_val_raw_PCA),
    ("Linear Kernel: PCA Trunc Data", X_train_trunc_PCA, X_val_trunc_PCA),
    ("Linear Kernel: Whitened PCA Raw Data", X_train_raw_PCA_white, X_val_raw_PCA_white),
    ("Linear Kernel: Whitened PCA Trunc Data", X_train_trunc_PCA_white, X_val_trunc_PCA_white),
]

for heading, train_features, val_features in linear_experiments:
    print(heading)
    linearSVCmodel(train_features, y_train, val_features, y_val)

Linear Kernel: Raw Data
Training Time =  337.5080087184906
Test Accuracy =  0.834
Number of Support Vectors =  6383


Linear Kernel: Trunc Data
Training Time =  300.51007771492004
Test Accuracy =  0.837
Number of Support Vectors =  6614


Linear Kernel: PCA Raw Data
Training Time =  330.5796024799347
Test Accuracy =  0.834
Number of Support Vectors =  6381


Linear Kernel: PCA Trunc Data
Training Time =  299.9918556213379
Test Accuracy =  0.837
Number of Support Vectors =  6622


Linear Kernel: Whitened PCA Raw Data
Training Time =  2252.2509009838104
Test Accuracy =  0.829
Number of Support Vectors =  6295


Linear Kernel: Whitened PCA Trunc Data
Training Time =  2341.721186876297
Test Accuracy =  0.836
Number of Support Vectors =  6461




In [9]:
def rbfSVCmodel(X_t, y_t, X_v, y_v):
    """Fit an RBF-kernel SVC and print training time, accuracy and model size.

    Parameters
    ----------
    X_t, y_t
        Training features and labels, passed unchanged to ``SVC.fit``.
    X_v, y_v
        Held-out features and labels used to compute the accuracy figure.

    Returns
    -------
    None
        Results are only printed. The accuracy line is labelled
        "Test Accuracy" even though this notebook passes the validation split.
    """
    # perf_counter() is a monotonic clock, so the reported duration cannot be
    # skewed by wall-clock adjustments during a multi-minute fit.
    start = time.perf_counter()
    # cache_size is forwarded to SVC (kernel cache size in MB per the
    # scikit-learn documentation).
    model = SVC(kernel='rbf', cache_size=1000)
    model.fit(X_t, y_t)
    end = time.perf_counter()
    print("Training Time = ", (end - start))

    # Fraction of correct predictions, computed in one vectorised step instead
    # of iterating over the boolean array with the Python builtin sum().
    predictions = np.asarray(model.predict(X_v))
    fCorr = np.mean(predictions == np.asarray(y_v))
    nSupp = len(model.support_)
    print("Test Accuracy = ", fCorr)
    print("Number of Support Vectors = ", nSupp)
    print("\n")

# Run the RBF-kernel SVC on every feature representation, in a fixed order.
# Each entry is (heading to print, training features, validation features);
# the labels are the same for all representations.
rbf_experiments = [
    ("RBF Kernel: Raw Data", X_train, X_val),
    ("RBF Kernel: Trunc Data", X_train_trunc, X_val_trunc),
    ("RBF Kernel: PCA Raw Data", X_train_raw_PCA, X_val_raw_PCA),
    ("RBF Kernel: PCA Trunc Data", X_train_trunc_PCA, X_val_trunc_PCA),
    ("RBF Kernel: Whitened PCA Raw Data", X_train_raw_PCA_white, X_val_raw_PCA_white),
    ("RBF Kernel: Whitened PCA Trunc Data", X_train_trunc_PCA_white, X_val_trunc_PCA_white),
]

for heading, train_features, val_features in rbf_experiments:
    print(heading)
    rbfSVCmodel(train_features, y_train, val_features, y_val)

RBF Kernel: Raw Data
Training Time =  349.9552800655365
Test Accuracy =  0.827
Number of Support Vectors =  12302


RBF Kernel: Trunc Data
Training Time =  349.5130817890167
Test Accuracy =  0.818
Number of Support Vectors =  13238


RBF Kernel: PCA Raw Data
Training Time =  326.35124039649963
Test Accuracy =  0.827
Number of Support Vectors =  12302


RBF Kernel: PCA Trunc Data
Training Time =  350.7436020374298
Test Accuracy =  0.818
Number of Support Vectors =  13238


RBF Kernel: Whitened PCA Raw Data
Training Time =  377.951233625412
Test Accuracy =  0.839
Number of Support Vectors =  13622


RBF Kernel: Whitened PCA Trunc Data
Training Time =  380.8051128387451
Test Accuracy =  0.834
Number of Support Vectors =  13869




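Notes on experiments that are not among the executed cells above:
Input data normalized to review length (see the commented-out normalization cell) generally performs poorly.
The polynomial kernel also performs very poorly.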
