In [14]:
###############################################################################################
#    Code for SVM Baseline using Bag-Of-Word Features and scikit sparsification               #
#       functions (countVectorizer) for Sentiment Analysis on IMDB dataset                    #
###############################################################################################

#Import Libraries

#Import Matrix Handling Capabilities
import numpy as np
import matplotlib.pyplot as plt
#Import File Handling
from os import listdir
from os.path import isfile, join
import re
#Import scikit library features for model manipulation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import RBFSampler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib

#Add path to files containing movie review train and test datasets
# NOTE: this is a machine-specific absolute path. Later cells build sub-directory paths by
# plain string concatenation (e.g. pwd+'stanford_train/'), so the trailing '/' must be kept.
pwd='/Users/pulkit/Google Drive/Stan Courses/4. Fall 2017/CS229 ML/Project/CS229-Project/'

In [15]:
###############################################################################################
#       Extract Features for Train, Test and Adversary Cases from the raw files               #
###############################################################################################


##########################
#  Import Training Data  #
##########################

# Root directory of the training split (sub-directories 'pos/' and 'neg/' hang off it)
datapath = pwd + 'stanford_train/'

# Read the precomputed word list stored next to the training data and report its length
print('Loading word list!')
wordsList = np.load(datapath + 'wordsList-lexic-sorted.npy').tolist()
print('Loaded the word list!')

numDictionaryWords = len(wordsList)
print(numDictionaryWords)

# Full paths of every regular file in the positive / negative review directories.
# Directory entries that are not files are filtered out.
positiveFiles = list(filter(
    isfile,
    (datapath + 'pos/' + name for name in listdir(datapath + 'pos/'))
))
negativeFiles = list(filter(
    isfile,
    (datapath + 'neg/' + name for name in listdir(datapath + 'neg/'))
))
numWords = []

#define for string cleaning
# Matches every run of characters that are not ASCII letters, digits or spaces
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
def cleanSentences(string):
    '''
    Normalise a raw review string.

    The text is lower-cased, each "<br />" tag is turned into a single space,
    and every character other than an ASCII letter, digit or space is removed.
    Returns the cleaned string.
    '''
    lowered = string.lower()
    withoutBreaks = lowered.replace("<br />", " ")
    return strip_special_chars.sub("", withoutBreaks)

#define for finding index in wordvec
def findIndex(search_list, begin, end, key):
    '''
    Binary-search a sorted list for `key` within the inclusive range [begin, end].

    Parameters:
        search_list: sequence sorted in ascending order (elements must support `<` and `==`)
        begin:       first index of the range to search (inclusive)
        end:         last index of the range to search (inclusive)
        key:         value to look for

    Returns:
        Index of the left-most element equal to `key` inside the range,
        or -1 if `key` is not present or the range is empty (end < begin).
    '''
    lo, hi = begin, end
    # An empty range (e.g. an empty list searched with begin=0, end=-1) has no match.
    if hi < lo:
        return -1
    # Narrow [lo, hi] until a single candidate remains. Going left whenever
    # search_list[mid] >= key keeps the left-most occurrence inside the window.
    while lo < hi:
        mid = (lo + hi) // 2
        if search_list[mid] < key:
            lo = mid + 1
        else:
            hi = mid
    if search_list[lo] == key:
        return lo
    return -1

# Adversaries for Positive and Negative Examples as Found by Naive Bayes
negativeAdversaries=['edie','antwone','din','gunga','yokai']
positiveAdversaries=['boll','410','uwe','tashan','hobgoblins']


# corpus : the 10 adversary words (indices 0-9) followed by one cleaned review per file,
#          appended in the order train-pos, train-neg, test-pos, test-neg.
#          The adversary words are included so that they are part of the vocabulary
#          learned from this corpus.
# corpus2: two entries per review file, in the same file order as corpus:
#            even index (0, 2, 4, ...) -> the review's original first word
#            odd index  (1, 3, 5, ...) -> the review with its first word replaced by a
#                                         randomly chosen adversary word (found by NB)
corpus=[]
corpus2=[]
corpus.extend(negativeAdversaries)
corpus.extend(positiveAdversaries)


def appendReviews(reviewFiles, adversaryWords):
    '''
    Read the first line of every file in `reviewFiles` and append it to the corpora.

    For each file this appends:
      * the cleaned review to `corpus`
      * the review's first word, then the review with that first word replaced
        by a random entry of `adversaryWords`, to `corpus2`

    An empty review still contributes two entries to `corpus2` (an empty string
    and the adversary word alone) so that the even/odd layout stays aligned.

    Parameters:
        reviewFiles:    list of file paths to read
        adversaryWords: list of words to pick the replacement first word from

    Returns:
        Number of files processed.
    '''
    for reviewFile in reviewFiles:
        with open(reviewFile, "r") as f:
            cleanedLine = cleanSentences(f.readline())
        corpus.append(cleanedLine)

        # Same random draw as before; indexing replaces the removed np.asscalar
        adversaryIdx = np.random.randint(len(adversaryWords), size=1, dtype='int32')
        adversary = adversaryWords[int(adversaryIdx[0])]

        words = cleanedLine.split()
        if words:
            corpus2.append(words[0])
            words[0] = adversary
        else:
            # Nothing to replace: keep the two-entries-per-file invariant
            corpus2.append("")
            words = [adversary]
        corpus2.append(" ".join(words))
    return len(reviewFiles)


fileCounter = 0

print('Going through Positive Files for Training Bag-Of-Words')
fileCounter += appendReviews(positiveFiles, positiveAdversaries)
print('Positive files finished')

print('Going through Negative Files for Training Bag-Of-Words')
fileCounter += appendReviews(negativeFiles, negativeAdversaries)
print('Negative files finished')

#########################
###Import Testing Data###
#########################

# Add datapath to testing dataset

datapath=pwd+'stanford_test/'

positiveFiles = [datapath+'pos/' + f for f in listdir(datapath+'pos/') if isfile(join(datapath+'pos/', f))]
negativeFiles = [datapath+'neg/' + f for f in listdir(datapath+'neg/') if isfile(join(datapath+'neg/', f))]

# Restart the count so that fileCounter ends up holding the number of test files
fileCounter = 0

print('Going through Positive Files for Testing Bag-Of-Words')
fileCounter += appendReviews(positiveFiles, positiveAdversaries)
print('Positive files finished')

print('Going through Negative Files for Testing Bag-Of-Words')
fileCounter += appendReviews(negativeFiles, negativeAdversaries)
print('Negative files finished')

#Make a sparse Bag-of-Words matrix as feature set for both normal and adversary case
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# Encode the adversarial corpus with the vocabulary learned from `corpus`.
# Re-fitting on corpus2 would learn a fresh vocabulary, and the columns would line up
# with X only if both corpora happened to contain exactly the same set of tokens.
X_adversary = vectorizer.transform(corpus2)

print(X.shape)
print(X_adversary.shape)

Loading word list!
Loaded the word list!
400000
Going through Positive Files for Training Bag-Of-Words
Positive files finished
Going through Negative Files for Training Bag-Of-Words
Negative files finished
Going through Positive Files for Testing Bag-Of-Words
Positive files finished
Going through Negative Files for Testing Bag-Of-Words
Negative files finished
(50010, 166935)
(100000, 166935)


In [16]:
###############################################################################################
#                           Fit Models and check training error                               #
#  Uses SGD with hinge loss and l2 regularization to fit SVM. Generate features appropriately #
###############################################################################################

#Choose:
#  Kernels from linear, polyD (polynomial with degree d), RBF
#  L2 penalty al (start from 1e-4)
#  File name for saving model
kernel='linear'
ALPHA=1e-4
label="linear_with_bow.txt"

# Slice Training Data: rows 0-9 of X are the adversary words, the next 25000 rows are
# the training reviews
X_train=X[10:25010,:]

#Generate Labels
# numFiles is derived from the training matrix so the cell runs on a fresh kernel.
# The first half is labelled +1 and the second half -1: the corpus was filled with the
# positive files first, and the two classes are assumed to be equally large.
numFiles=X_train.shape[0]
numPositive=int(numFiles/2)
labels=-1*np.ones(numFiles)
labels[0:numPositive]=1
y_train=labels

#Compute new features
print('Computing Feature Set')
if kernel=='linear':
    X_train_newFeatures = X_train
elif kernel=='poly2':
    poly = PolynomialFeatures(degree=2)
    X_train_newFeatures=poly.fit_transform(X_train.toarray())
elif kernel=='poly3':
    poly = PolynomialFeatures(degree=3)
    X_train_newFeatures=poly.fit_transform(X_train.toarray())
elif kernel=='RBF':
    rbf_feature = RBFSampler(gamma=1, random_state=1)#, n_components=25000)
    X_train_newFeatures = rbf_feature.fit_transform(X_train)
else:
    # Fail loudly instead of training on undefined or stale features
    raise ValueError('Unknown kernel: '+str(kernel))
print('Feature Set Computed')

#Train Model
# max_iter is an iteration count, so pass it as an integer rather than the float 1e7
clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=int(1e7), tol=1e-5, alpha=ALPHA)
print('Fitting SVM')
clf.fit(X_train_newFeatures,y_train)
print('Fitting Done')

#Predict
y_model_train=clf.predict(X_train_newFeatures)
# Fraction of training examples whose predicted label differs from the true label
misclassifiedFraction=np.mean(y_model_train!=y_train)
print('Misclassified Fraciton on Train Dataset='+str(misclassifiedFraction))

# Printed explicitly: a bare expression in the middle of a cell is not displayed
print(confusion_matrix(y_train,y_model_train))

#Save Results:
model_filename = label

joblib.dump(clf,model_filename)

Computing Feature Set
Feature Set Computed
Fitting SVM
Fitting Done
Misclassified Fraciton on Train Dataset=0.0368


['linear_with_bow.txt']

In [12]:
###############################################################################################
#                           Find Error on Testing Dataset                                     #
#                Generate features appropriately based on model imported                      #
###############################################################################################

# Settings for this evaluation run:
#   label  - file name of the saved model to load
#   kernel - feature map matching the loaded model: linear, polyD (degree d) or RBF
label = "linear_with_bow.txt"
kernel = 'linear'

# Everything in X after the first 25010 rows is used as the test set.
# The label vector built in the training cell is reused as the test labels.
X_test = X[25010:, :]
y_test = labels
svm_model = joblib.load(label)
print(svm_model)

# Build the feature representation that corresponds to the chosen kernel
print('Computing Feature Set')
if kernel in ('poly2', 'poly3'):
    # The trailing digit of the kernel name is the polynomial degree
    poly = PolynomialFeatures(degree=int(kernel[-1]))
    X_test_newFeatures = poly.fit_transform(X_test.toarray())
elif kernel == 'RBF':
    rbf_feature = RBFSampler(gamma=1, random_state=1)
    X_test_newFeatures = rbf_feature.fit_transform(X_test)
elif kernel == 'linear':
    X_test_newFeatures = X_test
print('Feature Set Computed')

print('Predicting')
y_model = svm_model.predict(X_test_newFeatures)
print(y_model.shape)

# With labels in {-1, +1} every wrong prediction contributes |difference| = 2,
# hence the division by twice the number of examples
misclassifiedFraction = np.fabs(y_model - y_test).sum() / (2 * y_test.size)
print('Misclassified Fraciton on Test Dataset=' + str(misclassifiedFraction))

confusion_matrix(y_test, y_model)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=10000000.0,
       n_iter=None, n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=1e-05, verbose=0, warm_start=False)
Computing Feature Set
Feature Set Computed
Predicting
(25000,)
Misclassified Fraciton on Test Dataset=0.14144


array([[10542,  1958],
       [ 1578, 10922]])

In [13]:
###############################################################################################
#                            Find Error on Adversarial Dataset                                #
#                Generate features appropriately based on model imported                      #
###############################################################################################

# Settings for this evaluation run:
#   label  - file name of the saved model to load
#   kernel - feature map matching the loaded model: linear, polyD (degree d) or RBF
label = "linear_with_bow.txt"
kernel = 'linear'

# Rows of X_adversary from index 50000 onward are used here. The adversarial corpus
# holds two rows per review (first word, then adversarial sentence), so these rows
# are presumably the test reviews - this relies on 25000 training reviews preceding them.
# The label vector built in the training cell is reused as the test labels.
X_test = X_adversary[50000:, :]
y_test = labels

svm_model = joblib.load(label)
print(svm_model)

# Build the feature representation that corresponds to the chosen kernel
print('Computing Feature Set')
if kernel in ('poly2', 'poly3'):
    # The trailing digit of the kernel name is the polynomial degree
    poly = PolynomialFeatures(degree=int(kernel[-1]))
    X_test_newFeatures = poly.fit_transform(X_test.toarray())
elif kernel == 'RBF':
    rbf_feature = RBFSampler(gamma=1, random_state=1)
    X_test_newFeatures = rbf_feature.fit_transform(X_test)
elif kernel == 'linear':
    X_test_newFeatures = X_test
print('Feature Set Computed')

print('Predicting')
y_model = svm_model.predict(X_test_newFeatures)
print(y_model.shape)

# Only every second prediction (odd positions) belongs to an adversarial sentence;
# the even positions are the single-word rows and are ignored when scoring.
adversarialPredictions = y_model[1::2]
misclassifiedFraction = np.fabs(adversarialPredictions - y_test).sum() / (2 * y_test.size)
print('Misclassified Fraciton=' + str(misclassifiedFraction))

confusion_matrix(y_test, adversarialPredictions)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=10000000.0,
       n_iter=None, n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=1e-05, verbose=0, warm_start=False)
Computing Feature Set
Feature Set Computed
Predicting
(50000,)
Misclassified Fraciton=0.18836


array([[10136,  2364],
       [ 2345, 10155]])