In [1]:
import nltk
import os
import pandas as pd
import numpy as np
from sklearn.svm import SVC

from IPython.core.debugger import set_trace

In [4]:
negative_reviews = [[], [], []]
to_search = './Datasets/neg/'
for f in os.listdir(to_search):
    path = to_search+f
    fold = f[:5]
    if fold < 'cv233':
        fold_index = 0
    elif fold < 'cv466':
        fold_index = 1
    else:
        fold_index = 2
    with open(path, 'r', encoding='latin-1') as fin:
            negative_reviews[fold_index].append(fin.read().strip())
            
positive_reviews = [[], [], []]
to_search = './Datasets/pos/'
for f in os.listdir(to_search):
    path = to_search+f
    fold = f[:5]
    if fold < 'cv233':
        fold_index = 0
    elif fold < 'cv466':
        fold_index = 1
    else:
        fold_index = 2
    with open(path, 'r', encoding='latin-1') as fin:
            positive_reviews[fold_index].append(fin.read().strip())

In [None]:
# The review data already contains tokenised words
"""
In NLTK, to use Naive Bayes we supply a list of training data. This is a list of tuples: [(features, label), ...]
In this example:
Label is a string, denoting the class of the the features

Feature is a dictionary of {word: Boolen} saying whether the word is present in this data point. In training examples, this will
only contain positive example words, so all boolean will be True.

In test example (say if we fix our vector size representing a data point/document) then we would have False for any tokens in 
vector that is not present in the test data.

The NLTK version ignores any words not seen before when calculating probabilities, rather than giving zero probability.
It does not do any add-one smoothing.
"""

In [7]:
def add_not_tag(review):
    """
    Given a tokenised review (string with space after all tokens) we return another string with the NOT_* tag added to 
    words between negated word and punctuation
    """
    review_ls = review.split()
    negation_word = {'not': True, "isn't": True, "doesn't": True, "wasn't":True, "couldn't": True, "wouldn't": True, "didn't": True}
    punctuation = {'?': True, '!': True, '.': True, ',': True, ':': True, ';':True}
    
    convert_word = False
    for index in range(len(review_ls)):
        word = review_ls[index]
        
        if word in punctuation:
            convert_word = False
            continue
        
        if convert_word:
            review_ls[index] = 'NOT_'+word
            continue
        
        if word in negation_word:
            convert_word = True
            continue
            
    return ' '.join(review_ls)
    

In [8]:
# Get top 16165 unigrams from 1400 reviews
neg_flatlist = [neg_review for fold in negative_reviews for neg_review in fold]
pos_flatlist = [pos_review for fold in positive_reviews for pos_review in fold]

freq = {}
for review in neg_flatlist + pos_flatlist:
    
    # CHANGED TAG
    review = add_not_tag(review)
    
    for word in review.split():
        if word in freq:
            freq[word] += 1
        else:
            freq[word] = 1
            
sorted_freq = sorted([(count, word) for word, count in freq.items()], reverse=True)
vocabulary = set([word for _, word in sorted_freq[:16165]]) # We use set to exploit O(1) lookup time

In [160]:
"""
Implementing custom Naive Bayes. We use log values here instead of exact
P(c) = 0.5 since only two classes and equally spread 
P(d) = We ignore
P(f_i|c) = Number of times word f_i occured in a document / total count of words in document. Plus smoothing (1/|V|)
"""

def get_freq_test(review):
    freq = {v:0 for v in vocabulary}
    for word in review.split():
        if word in freq:
            freq[word] += 1
    return freq


def get_freq_train(review):
    """
    review: A string containing all the tokens in a training data document. P(f_i|c) from training documents
    """
    # We start with the add-one smoothing
    
    log_prob_dict = {v:1 for v in vocabulary}
    count = 0
    for word in review.split():
        if word in log_prob_dict:
            log_prob_dict[word] += 1
            count += 1
    
    # We divide by the denominator and log the values
    for key in log_prob_dict.keys():
        # The total words appearing in a document are the ones that are in the vocabulary and count towards the variable 'count'
        log_prob_dict[key] = np.log(log_prob_dict[key] / (count + len(vocabulary)))
        
    return log_prob_dict
        

def train_and_test(train_index1, train_index2, test_index):
    neg_train_ls = []
    for neg_review in negative_reviews[train_index1] + negative_reviews[train_index2]:
        
        # CHANGED TAG
        neg_review = add_not_tag(neg_review)
        
        for neg_word in neg_review.split():
            neg_train_ls.append(neg_word)
    
    neg_train = ' '.join(neg_train_ls)
    negative_log_probs = get_freq_train(neg_train)
    
    pos_train_ls = []
    for pos_review in positive_reviews[train_index1] + positive_reviews[train_index2]:
        
        #CHANGED TAG
        pos_review = add_not_tag(pos_review)
        
        for pos_word in pos_review.split():
            pos_train_ls.append(pos_word)
    pos_train = ' '.join(pos_train_ls)
    positive_log_probs = get_freq_train(pos_train)
    
#     prior = 0.5  # But since this is same for both we don't actually need to use it
    correct = 0
    for pos_review in positive_reviews[test_index]:
        
        # CHANGED TAG
        pos_review = add_not_tag(pos_review)
        
        counts = get_freq_test(pos_review)
        neg_sum = 0   # For negative class
        pos_sum = 0   # For positive class
        for word in counts.keys():
            neg_sum += negative_log_probs[word] * counts[word]
            pos_sum += positive_log_probs[word] * counts[word]
        
        if pos_sum > neg_sum:
            correct += 1
            
    for neg_review in negative_reviews[test_index]:
        
        # CHANGED TAG
        neg_review = add_not_tag(neg_review)
        
        
        counts = get_freq_test(neg_review)
        neg_sum = 0   # For negative class
        pos_sum = 0   # For positive class
        for word in counts.keys():
            neg_sum += negative_log_probs[word] * counts[word]
            pos_sum += positive_log_probs[word] * counts[word]
        
        if neg_sum > pos_sum:
            correct += 1
        
    return correct / (len(positive_reviews[test_index]) + len(negative_reviews[test_index]))
    

In [161]:
ans1 = train_and_test(0, 1, 2)
ans2 = train_and_test(0, 2, 1)
ans3 = train_and_test(1, 2, 0)

NB_avg = (ans1 + ans2 + ans3)/3
NB_avg

0.7871104019172689

### SVM

We create a feature vector, each dimension is a word from the vocabulary list. The feature vector here contains NOT_ tags too. The values for the vector are the counts of the word occuring in the text.

In [12]:
def get_feature_vector(dimensions, review):
    freq = {}
    for word in review.split():
        if word in freq:
            freq[word] += 1
        else:
            freq[word] = 1
    
    feature_vector = []
    for word in dimensions:
        if word in freq:
            feature_vector.append(freq[word])
        else:
            feature_vector.append(0)
    xs = np.array(feature_vector)
    
    denom = 1
#     denom = np.linalg.norm(np.array(list(freq.values())))  # For l2 norm including all the words
#     denom = np.linalg.norm(xs)
    
    return xs / denom


def get_data(train_index1, train_index2, test_index):
    dimensions = sorted(list(vocabulary))
    train_xs = []
    train_ys = []
    
    test_xs = []
    test_ys = []
    
    for neg_review in negative_reviews[train_index1] + negative_reviews[train_index2]:
        neg_review = add_not_tag(neg_review) # CHANGED TAG
        xs = get_feature_vector(dimensions, neg_review)
        train_xs.append(xs)
        train_ys.append(-1)  # Label -1 is for negative sentiment
        
    for pos_review in positive_reviews[train_index1] + positive_reviews[train_index2]:
        pos_review = add_not_tag(pos_review)  # CHANGED TAG
        xs = get_feature_vector(dimensions, pos_review)
        train_xs.append(xs)
        train_ys.append(1)  # Label 1 for positive sentiment
        
    for pos_review in positive_reviews[test_index]:
        pos_review = add_not_tag(pos_review)  # CHANGED TAG
        xs = get_feature_vector(dimensions, pos_review)
        test_xs.append(xs)
        test_ys.append(1)  # Label 1 for positive sentiment
        
    for neg_review in negative_reviews[test_index]:
        neg_review = add_not_tag(neg_review)  # CHANGED TAG
        xs = get_feature_vector(dimensions, neg_review)
        test_xs.append(xs)
        test_ys.append(-1)  # Label -1 for positive sentiment
        
    return train_xs, train_ys, test_xs, test_ys

When not normalising vector: 0.70859469571915934

When normalising them using account for all words in review, not just words in vocabulary as above we get: 0.69855287773742702

When normalising them with just the words in vocabulary: 0.6992651284496777

When using l1 norm as above: 0.51500311800740983

When dividing each feature vector by the length of the review it came from (including words not in vocabulary): 0.51994301994302006

When dividing each feature vector by the length of the review it came from (with only words in vocabulary): Same as l1 norm as equivalent operation

When L2 normalising vectors on vocabulary words:
(0, 1, 2) = 71.37%
(1, 2, 0) = 68.88%
(0, 2, 1) = 69.53%
Average of above = 69.93%

When not normalising vectors:
(0, 1, 2) = 65.81%
(0, 2, 1) = 70.82%
(1, 2, 0) = 68.24%
Average of above = 68.29%

In [156]:
(65.81 + 70.82 + 68.24) / 3

68.29

In [13]:
def output_train_test():
    train_xs, train_ys, test_xs, test_ys = get_data(0, 1, 2)
    with open('../../svm_light/train.txt', 'w') as fout:
        for vector, label in zip(train_xs, train_ys):
            vector_ls = [str(label)]
            for index, value in enumerate(vector):
                # Model needs feature numbers to start from 1
                vector_ls.append(str(index+1)+':'+str(value))
            # NEED NEWLINE CHARACTER AT END. PYTHON AUTOMATICALLY CONVERTS THIS TO APPROPRIATE ENDING
            line = ' '.join(vector_ls)+'\n'
            fout.write(line)
            
    with open('../../svm_light/test.txt', 'w') as fout:
        for vector, label in zip(test_xs, test_ys):
            vector_ls = [str(label)]
            for index, value in enumerate(vector):
                vector_ls.append(str(index+1)+':'+str(value))
            line = ' '.join(vector_ls)+'\n'
            fout.write(line)
            
output_train_test()  

In [14]:
%%bash
cd ../../svm_light/
./svm_learn train.txt model.txt
./svm_classify test.txt model.txt output.txt
rm *.txt

Scanning examples...done
Reading examples into memory...100..200..300..400..500..600..700..800..900..OK. (934 examples read)
Setting default regularization parameter C=0.0001
Optimizing.....................................................................................................................................................................................................................................................................................................................................................................................................done. (390 iterations)
Optimization finished (252 misclassified, maxdiff=0.00100).
Runtime in cpu-seconds: 10.76
Number of SV: 805 (including 761 at upper bound)
L1 loss: loss=599.24630
Norm of weight vector: |w|=0.16067
Norm of longest example vector: |x|=235.63743
Estimated VCdim of classifier: VCdim<=1434.10464
Computing XiAlpha-estimates...done
Runtime for XiAlpha-estimates in cpu-seconds: 0.07
XiAlpha-estimate of the 

In [126]:
def test_SVM():
    classifier = SVC(kernel='linear')
    train_xs, train_ys, test_xs, test_ys = get_data(0, 1, 2)
    classifier.fit(train_xs, train_ys)
    ans1 = classifier.score(test_xs, test_ys)
    
    classifier = SVC(kernel='linear')
    train_xs, train_ys, test_xs, test_ys = get_data(0, 2, 1)
    classifier.fit(train_xs, train_ys)
    ans2 = classifier.score(test_xs, test_ys)
    
    classifier = SVC(kernel='linear')
    train_xs, train_ys, test_xs, test_ys = get_data(1, 2, 0)
    classifier.fit(train_xs, train_ys)
    ans3 = classifier.score(test_xs, test_ys)
    
    return (ans1 + ans2 + ans3) / 3

test_SVM()

0.6992651284496777

In [98]:
temp = preprocessing.normalize(np.array([[1, 2, 3], [4, 5, 6]]))
np.sum(temp ** 2, axis=1)

array([ 1.,  1.])

In [19]:
def convert_str_to_dict(review):
    freq = {}
    for word in review.split():
        if word in vocabulary:
            freq[word] = True
        else:
            freq[word] = False
    return freq


def get_data(train_index1, train_index2, test_index):
    train_data = []
    for neg_fold in negative_reviews[train_index1] + negative_reviews[train_index2]:
        for neg_review in neg_fold:
            featureset = convert_str_to_dict(neg_review)
            train_data.append((featureset, 'negative'))

    for pos_fold in positive_reviews[train_index1] + positive_reviews[train_index2]:
        for pos_review in pos_fold:
            featureset = convert_str_to_dict(pos_review)
            train_data.append((featureset, 'positive'))
            
    test_data = []
    for neg_fold in negative_reviews[test_index]:
        for neg_review in neg_fold:
            featureset = convert_str_to_dict(neg_review)
            test_data.append((featureset, 'negative'))

    for pos_fold in positive_reviews[test_index]:
        for pos_review in pos_fold:
            featureset = convert_str_to_dict(pos_review)
            test_data.append((featureset, 'positive'))
            
    return train_data, test_data

train_set, test_set = get_data(0, 1, 2)
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.521080722570108