In [19]:
###############################################################################################
#                  Code for SVM Baseline using Word-to-Vec Features                           #
#                        for Sentiment Analysis on IMDB dataset                               #
###############################################################################################

#Import Libraries

#Import Matrix Handling Capabilities
import numpy as np
import matplotlib.pyplot as plt
#Import File Handling
from os import listdir
from os.path import isfile, join
import re
#Import scikit library features for model manipulation
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import RBFSampler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib

# Folder holding the movie review training set. Paths are built by plain string
# concatenation, so the trailing slashes are required.
pwd = '/Users/pulkit/Google Drive/Stan Courses/4. Fall 2017/CS229 ML/Project/CS229-Project/'
datapath = pwd + 'stanford_train/'

# Precomputed vocabulary and embeddings. Later cells use the position of a word
# in wordsList as its row index into wordVectors.
wordsListPath = datapath + 'wordsList-lexic-sorted.npy'
wordVectorsPath = datapath + 'wordVectors-lexic-sorted.npy'

print('Loading word list!')
wordsList = np.load(wordsListPath).tolist()
print('Loaded the word list!')

print('Loading word vectors!')
wordVectors = np.load(wordVectorsPath)
print('Loaded the word vectors!')

# Report the vocabulary size and the embedding matrix shape
print(len(wordsList))
print(wordVectors.shape)

# Second dimension of the embedding matrix = length of one word vector
_, wordEncodingLen = wordVectors.shape

Loading word list!
Loaded the word list!
Loading word vectors!
Loaded the word vectors!
400000
(400000, 50)


In [20]:
# Collect word-count statistics over the dataset to decide how many words per review to keep

posDir = datapath+'pos/'
negDir = datapath+'neg/'
positiveFiles = [posDir + name for name in listdir(posDir) if isfile(join(posDir, name))]
negativeFiles = [negDir + name for name in listdir(negDir) if isfile(join(negDir, name))]

# One entry per review file: number of whitespace-separated tokens on its first line
numWords = []

print('Going through Positive Files')
for reviewPath in positiveFiles:
    with open(reviewPath, "r", encoding='utf-8') as review:
        numWords.append(len(review.readline().split()))
print('Positive files finished')

print('Going through Negative Files')
for reviewPath in negativeFiles:
    with open(reviewPath, "r", encoding='utf-8') as review:
        numWords.append(len(review.readline().split()))
print('Negative files finished')

numFiles = len(numWords)
totalWords = sum(numWords)
print('The total number of files is', numFiles)
print('The total number of words in the files is', totalWords)
print('The average number of words in the files is', totalWords/len(numWords))

# Maximum number of words per review taken into account by the model,
# chosen from the statistics printed above
maxSeqLength=250

Going through Positive Files
Positive files finished
Going through Negative Files
Negative files finished
The total number of files is 25000
The total number of words in the files is 5844680
The average number of words in the files is 233.7872


In [21]:
##########################
#  Import Training Data  #
##########################

#Go through files and find word IDs

#define for string cleaning
# Matches every run of characters that is not an ASCII letter, a digit or a space
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
def cleanSentences(string):
    '''
    Normalize a line of review text before looking its words up.

    The text is lower-cased, every "<br />" tag becomes a single space, and
    all runs of characters other than letters, digits and spaces are deleted.
    Returns the cleaned string.
    '''
    lowered = string.lower()
    withoutBreaks = lowered.replace("<br />", " ")
    return strip_special_chars.sub("", withoutBreaks)

#define for finding index in wordvec
def findIndex(search_list, begin, end, key):
    '''
    Binary-search a sorted list for `key` inside the inclusive range [begin, end].

    Parameters:
        search_list: sequence sorted in ascending order under `<`.
        begin, end:  first and last index (both inclusive) of the range to search.
        key:         value to look for.

    Returns:
        The index of the leftmost element equal to `key` within the range, or -1
        when `key` is not there or the range is empty (end < begin).
    '''
    from bisect import bisect_left

    # An empty range (e.g. an empty list searched as 0..-1) cannot contain the key;
    # answer -1 instead of indexing past the end of the list.
    if end < begin:
        return -1

    # bisect_left takes an exclusive upper bound, hence end + 1
    pos = bisect_left(search_list, key, begin, end + 1)
    if pos <= end and search_list[pos] == key:
        return pos
    return -1

# Row of wordVectors used for words that are not found in wordsList
UNKNOWN_WORD_ID = 399999

def _fillWordIds(reviewFiles, idsMatrix, firstRow):
    '''
    Write the word IDs of every review in `reviewFiles` into consecutive rows of
    `idsMatrix`, starting at row `firstRow`.

    Only the first line of each file is read, and at most maxSeqLength of its
    words are encoded; columns past the end of a shorter review are left as they are.
    Returns the index of the row following the last one written.
    '''
    lastWordIndex = len(wordsList) - 1
    row = firstRow
    for path in reviewFiles:
        # Same explicit encoding as the statistics pass, rather than the platform default
        with open(path, "r", encoding='utf-8') as f:
            line = f.readline()
        words = cleanSentences(line).split()[:maxSeqLength]
        for col, word in enumerate(words):
            try:
                wordId = findIndex(wordsList, 0, lastWordIndex, word)
            except ValueError:
                wordId = -1
            # findIndex signals a missing word with -1. Map it to the unknown-word
            # row explicitly instead of storing -1 and relying on negative indexing
            # happening to reach the last row of wordVectors.
            idsMatrix[row, col] = wordId if wordId >= 0 else UNKNOWN_WORD_ID
        row += 1
    return row

# One row per review (positive files first, then negative), one column per word position
ids = np.zeros((numFiles, maxSeqLength), dtype='int32')

print('Going through Positive Files for indices')
fileCounter = _fillWordIds(positiveFiles, ids, 0)
print('Positive files finished')

print('Going through Negative Files for indices')
fileCounter = _fillWordIds(negativeFiles, ids, fileCounter)
print('Negative files finished')


#ids=np.load(datapath+'idsMatrix.npy')
print(ids.shape)

Going through Positive Files for indices
Positive files finished
Going through Negative Files for indices
Negative files finished
(25000, 250)


In [22]:
##########################
#    Generate Features   #
##########################

# Labels: +1 for positive reviews, -1 for negative ones. The rows of ids hold the
# positive files first, so exactly the first len(positiveFiles) rows are positive.
# (Splitting at numFiles/2 is only correct when both classes have the same size.)
numPositive = len(positiveFiles)
labels = -1*np.ones(numFiles)
labels[:numPositive] = 1

#print(np.unique(labels))
print('Making Input Data Matrix')

#Choose feature type:
#'allVectors': all word vectors concatenated, so each example is maxSeqLength*wordEncodingLen dimensional
#'meanVectors': mean of the word vectors, so each example is wordEncodingLen dimensional
# NOTE: columns of ids beyond the end of a short review are 0, so those positions
# contribute the vector of wordsList[0] (not a zero vector) to both feature types.

featureType='allVectors'

if featureType=='allVectors':
    inputData=np.zeros((numFiles,maxSeqLength*wordEncodingLen))
    for i in range(numFiles):
        # Looking up a whole row of ids at once yields a (maxSeqLength, wordEncodingLen)
        # block; flattening it lays the word vectors side by side in word order.
        inputData[i,:]=wordVectors[ids[i,:],].ravel()
elif featureType=='meanVectors':
    inputData=np.zeros((numFiles,wordEncodingLen))
    for i in range(numFiles):
        inputData[i,0:wordEncodingLen]=np.mean(wordVectors[ids[i,:],],axis=0)
else:
    # Fail here with a clear message instead of a NameError on inputData below
    raise ValueError("Unknown featureType: " + repr(featureType))

print('Input Data Matrix Loaded')

print(inputData.shape)

X_train=inputData
y_train=labels

Making Input Data Matrix
Input Data Matrix Loaded
(25000, 12500)


In [17]:
# OLD FUNCTION FOR SPLITTING DATA
# Disabled: the code below sits inside a string literal, so running this cell only
# echoes the text and defines nothing.
# It skipped the first 100 and the last 100 reviews (200 in total, left out for
# validation according to the original note) and split the remaining rows 90/10
# into train and test sets with a fixed random_state.
'''
from sklearn.model_selection import train_test_split

def splitData():
    print('Splitting Data')
    X_train, X_test, y_train, y_test = train_test_split(inputData[100:24900], labels[100:24900], test_size=0.1, random_state=42)
    print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)
    #print(np.unique(y_train),np.unique(y_test))
    
    return X_train, X_test, y_train, y_test
    
#splitData()

X_train, X_test, y_train, y_test=splitData()
'''

"\nfrom sklearn.model_selection import train_test_split\n\ndef splitData():\n    print('Splitting Data')\n    X_train, X_test, y_train, y_test = train_test_split(inputData[100:24900], labels[100:24900], test_size=0.1, random_state=42)\n    print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)\n    #print(np.unique(y_train),np.unique(y_test))\n    \n    return X_train, X_test, y_train, y_test\n    \n#splitData()\n\nX_train, X_test, y_train, y_test=splitData()\n"

In [24]:
###############################################################################################
#                           Fit Models and check training error                               #
#  Uses SGD with hinge loss and l2 regularization to fit SVM. Generate features appropriately #
###############################################################################################

#Choose:
#  Kernels from linear, poly2 / poly3 (polynomial with degree 2 / 3), RBF
#  L2 penalty alpha (start from 1e-4)
#  File name for saving model
kernel='linear' 
ALPHA=1e-4
label="linear_with_n12500.txt"

#Compute new features
print('Computing Feature Set')
if kernel=='linear':
    X_train_newFeatures = X_train
elif kernel in ('poly2', 'poly3'):
    # The feature matrix built in this notebook is a dense NumPy array, which has no
    # .toarray(); only densify when the input actually offers that method.
    X_train_dense = X_train.toarray() if hasattr(X_train, 'toarray') else X_train
    poly = PolynomialFeatures(degree=int(kernel[-1]))
    X_train_newFeatures=poly.fit_transform(X_train_dense)
elif kernel=='RBF':
    rbf_feature = RBFSampler(gamma=1, random_state=1)#, n_components=25000)
    X_train_newFeatures = rbf_feature.fit_transform(X_train)
else:
    # Fail here with a clear message instead of a NameError further down
    raise ValueError("Unknown kernel: " + repr(kernel))
print('Feature Set Computed')

#Train Model
# max_iter has to be an integer; the literal 1e7 is a float
clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=int(1e7), tol=1e-5, alpha=ALPHA)
print('Fitting SVM')
clf.fit(X_train_newFeatures,y_train)
print('Fitting Done')

# Training error: fraction of examples whose predicted label differs from the true one
y_model_train=clf.predict(X_train_newFeatures)
misclassifiedFraction=np.mean(y_model_train!=y_train)
print('Misclassified Fraciton on Train Set='+str(misclassifiedFraction))
# Printed explicitly: a bare expression in the middle of a cell is not displayed
print(confusion_matrix(y_train,y_model_train))

#Save Model:
model_filename = label
joblib.dump(clf,model_filename) 

Computing Feature Set
Feature Set Computed
Fitting SVM
Fitting Done
Misclassified Fraciton on Train Set=0.11072


['linear_with_n12500.txt']