In [1]:
import os
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np
from sklearn.svm import LinearSVC, SVC

from IPython.core.debugger import set_trace

In [2]:
def _fold_index_for(filename):
    """
    Map a review file name to its cross-validation fold (0, 1 or 2).

    Only the first five characters of the name are compared, lexicographically:
    names below 'cv233' go to fold 0, names below 'cv466' to fold 1 and
    everything else to fold 2.
    """
    prefix = filename[:5]
    if prefix < 'cv233':
        return 0
    if prefix < 'cv466':
        return 1
    return 2


def _load_reviews_by_fold(directory):
    """
    Read every review file in `directory` and group the texts by fold.

    Returns a list of three lists of strings (one list per fold); each string
    is the stripped content of one file, decoded as latin-1.
    """
    folds = [[], [], []]
    # os.listdir gives entries in arbitrary order; sorting keeps the order of
    # the reviews inside each fold reproducible between runs and machines.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip hidden entries and sub-directories (e.g. '.DS_Store',
        # '.ipynb_checkpoints') so they are not mistaken for reviews.
        if filename.startswith('.') or not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='latin-1') as fin:
            folds[_fold_index_for(filename)].append(fin.read().strip())
    return folds


# Each variable is a list of three folds; every fold is a list of review texts.
negative_reviews = _load_reviews_by_fold('./Datasets/neg/')
positive_reviews = _load_reviews_by_fold('./Datasets/pos/')

In [9]:
def tokenise_document(review, add_not_tag=False, unigram=False, bigram=False, pos=False, position=False):
    """
    Given a review (string with whitespace between all tokens) return a list of feature tokens.

    A base token list is built from the whitespace-split review (`add_not_tag`
    and/or `unigram`) and then transformed in a fixed order: POS suffixes,
    bigram merging, position suffixes.

    Parameters
    ----------
    review : str
        The review text.
    add_not_tag : bool
        Add every word to the token list, prefixing 'NOT_' to each word that
        follows a negation word, up to (not including) the next punctuation token.
    unigram : bool
        Add every word, unchanged, to the token list.
    bigram : bool
        Replace the token list by the space-joined pairs of adjacent tokens.
    pos : bool
        Append '_<POS tag>' to each token; tags come from nltk's `pos_tag`
        applied to the raw (untagged) words.
    position : bool
        Append '_1', '_2' or '_3' to each token depending on whether it lies in
        the first quarter, the middle half or the last quarter of the token list.

    Returns
    -------
    list of str
        The tokens. The list is empty when neither `add_not_tag` nor `unigram`
        is set, because no base tokens are produced in that case.
    """
    # Stemming (PorterStemmer) and lower-cased stop-word removal were tried on
    # `review_split` at this point and are intentionally left disabled.
    review_split = review.split()

    tokens = []

    if add_not_tag:
        negation_words = {'not', "isn't", "doesn't", "wasn't", "couldn't", "wouldn't", "didn't"}
        punctuation = {'?', '!', '.', ',', ':', ';'}

        negating = False
        for word in review_split:
            if word in punctuation:
                # Punctuation closes the scope of a negation and is never tagged
                negating = False
                tokens.append(word)
            elif negating:
                tokens.append('NOT_' + word)
            else:
                if word in negation_words:
                    negating = True
                tokens.append(word)

    if unigram:
        tokens.extend(review_split)

    if pos:
        tags = pos_tag(review_split)  # tags is a list of tuples: [(token, pos tag)]
        # Append the POS tag to each entry of 'tokens'. E.g. 'Peter' and 'NN' becomes 'Peter_NN'.
        # The entries keep any NOT_ prefix they already carry.
        tokens = [token + '_' + tag for token, (_, tag) in zip(tokens, tags)]

    if bigram:
        # Pair every token with its successor. Unlike merging in place and popping
        # the last entry, this also works when 'tokens' is empty (empty review, or
        # neither 'unigram' nor 'add_not_tag' requested) instead of raising IndexError.
        tokens = [first + ' ' + second for first, second in zip(tokens, tokens[1:])]

    if position:
        first_quarter_end = np.ceil(len(tokens) * 0.25)
        middle_end = np.ceil(len(tokens) * 0.75)

        for index in range(len(tokens)):
            if index < first_quarter_end:
                to_append = 1
            elif index < middle_end:
                to_append = 2
            else:
                to_append = 3
            tokens[index] = tokens[index] + '_' + str(to_append)

    return tokens

We only use binary feature vectors in this notebook; for feature vectors involving counts, see the 'Unigram_freq' notebook.

In [4]:
def get_vocabulary(add_not_tag=False, unigram=False, bigram=False, pos=False, adjectives=False, position=False, length=16162):
    """
    Build the feature vocabulary (a set of tokens) from all reviews in all folds.

    The vocabulary is computed in up to three passes; a later pass replaces
    the result of an earlier one:

    1. Count the tokens produced by `tokenise_document` with the given
       `add_not_tag` / `unigram` / `bigram` flags and keep the frequent ones.
    2. If `pos` or `position` is set: for every token kept by pass 1, count its
       POS-/position-suffixed form and keep the frequent suffixed tokens.
    3. If `adjectives` is set: count only the words tagged JJ, JJR or JJS
       (with the tag stripped) and keep the frequent ones.

    Parameters
    ----------
    add_not_tag, unigram, bigram, pos, position : bool
        Forwarded to `tokenise_document` as described above.
    adjectives : bool
        Restrict the vocabulary to adjectives (pass 3).
    length : int or None
        Keep the `length` most frequent tokens. With None a frequency cutoff
        is used instead: at least 7 occurrences for bigrams and 4 otherwise in
        pass 1, at least 4 in pass 2, and no cutoff in pass 3.

    Returns
    -------
    set of str
        The vocabulary; a set is used for O(1) membership tests.
    """
    # Flat list of all review texts, negative reviews first
    data = [review for fold in negative_reviews for review in fold]
    data += [review for fold in positive_reviews for review in fold]

    def select(freq, cutoff):
        # Reduce a token -> count mapping to the final set of tokens
        if length is None:
            return {token for token, count in freq.items() if count >= cutoff}
        ranked = sorted(((count, token) for token, count in freq.items()), reverse=True)
        return {token for _, token in ranked[:length]}

    # Pass 1: plain tokens
    freq = {}
    for review in data:
        for token in tokenise_document(review, unigram=unigram, add_not_tag=add_not_tag, bigram=bigram):
            freq[token] = freq.get(token, 0) + 1
    vocabulary = select(freq, 7 if bigram else 4)

    # Pass 2: POS / position suffixed forms of the tokens kept by pass 1
    if pos or position:
        freq = {}
        for review in data:
            plain_tokens = tokenise_document(review, unigram=unigram, add_not_tag=add_not_tag)
            tagged_tokens = tokenise_document(review, unigram=unigram, add_not_tag=add_not_tag,
                                              pos=pos, position=position)
            # The two lists are aligned: entry i of 'tagged_tokens' is the
            # suffixed form of entry i of 'plain_tokens'.
            for plain, tagged in zip(plain_tokens, tagged_tokens):
                if plain in vocabulary:
                    freq[tagged] = freq.get(tagged, 0) + 1
        vocabulary = select(freq, 4)

    # Pass 3: adjectives only
    if adjectives:
        adjective_suffixes = ('_JJ', '_JJR', '_JJS')
        freq = {}
        for review in data:
            for pos_token in tokenise_document(review, unigram=unigram, add_not_tag=add_not_tag, pos=True):
                for suffix in adjective_suffixes:
                    if pos_token.endswith(suffix):
                        # Only adjectives are counted. Tokens with any other tag are
                        # skipped; they must not be credited to the previous adjective.
                        token = pos_token[:-len(suffix)]
                        freq[token] = freq.get(token, 0) + 1
                        break
        # A cutoff of 1 keeps every adjective when 'length' is None
        vocabulary = select(freq, 1)

    return vocabulary

In [5]:
class NaiveBayes:
    """
    Naive Bayes sentiment classifier with add-one smoothing, evaluated by
    three-fold cross-validation over the module-level `negative_reviews` and
    `positive_reviews` fold lists.

    No class prior is used: a document is assigned to the class with the
    larger sum of token log-probabilities.
    """

    # (train fold, train fold, test fold) for each cross-validation round
    _FOLDS = ((1, 2, 0), (0, 2, 1), (0, 1, 2))

    def __init__(self, vocabulary, tokenisation, use_freq=False):
        """
        vocabulary:   set of tokens used as features
        tokenisation: callable turning a review string into a list of tokens
        use_freq:     if True, test documents use token counts; otherwise binary presence
        """
        self.vocabulary = vocabulary
        self.tokenisation = tokenisation
        self.use_freq = use_freq

    def get_document_vector(self, document_tokens):
        """
        Return a dict with one entry per vocabulary token: its count in
        `document_tokens` when `use_freq` is set, else 1 if present and 0 if absent.
        Tokens outside the vocabulary are ignored.
        """
        freq = {v: 0 for v in self.vocabulary}
        for word in document_tokens:
            if word in self.vocabulary:
                if self.use_freq:
                    freq[word] += 1
                else:
                    freq[word] = 1
        return freq

    def get_class_probabilities(self, class_data):
        """
        class_data: list of all tokens of all training documents of one class.

        Returns a dict token -> log P(token | class) for every vocabulary token,
        estimated with add-one smoothing:
        (count + 1) / (in-vocabulary tokens of the class + vocabulary size).
        """
        counts = {v: 1 for v in self.vocabulary}  # start at 1: add-one smoothing

        total_class_tokens = 0  # Tokens contained in vocabulary
        for word in class_data:
            if word in counts:
                counts[word] += 1
                total_class_tokens += 1

        denominator = total_class_tokens + len(self.vocabulary)
        return {word: np.log(count / denominator) for word, count in counts.items()}

    def _class_log_probabilities(self, reviews):
        """Tokenise `reviews`, pool their tokens and return the class log-probabilities."""
        class_tokens = []
        for review in reviews:
            class_tokens.extend(self.tokenisation(review))
        return self.get_class_probabilities(class_tokens)

    def train_probabilities(self, train_index1, train_index2):
        """
        Estimate the token log-probabilities of both classes from the two
        training folds. Returns (negative_log_probs, positive_log_probs).
        """
        negative_log_probs = self._class_log_probabilities(
            negative_reviews[train_index1] + negative_reviews[train_index2])
        positive_log_probs = self._class_log_probabilities(
            positive_reviews[train_index1] + positive_reviews[train_index2])
        return negative_log_probs, positive_log_probs

    def _log_scores(self, review, negative_log_probs, positive_log_probs):
        """Return (negative score, positive score) of one review: summed token log-probabilities."""
        document_vector = self.get_document_vector(self.tokenisation(review))

        neg_sum = 0   # For negative class
        pos_sum = 0   # For positive class
        for word, freq in document_vector.items():
            if freq == 0:
                continue
            # freq is 1 for binary features and the token count otherwise
            neg_sum += freq * negative_log_probs[word]
            pos_sum += freq * positive_log_probs[word]
        return neg_sum, pos_sum

    def test_documents(self, test_index, negative_log_probs, positive_log_probs):
        """
        Classify every review of fold `test_index` and return the accuracy.
        A tie between the two scores counts as a misclassification.
        """
        correct = 0
        for pos_review in positive_reviews[test_index]:
            neg_sum, pos_sum = self._log_scores(pos_review, negative_log_probs, positive_log_probs)
            if pos_sum > neg_sum:
                correct += 1

        for neg_review in negative_reviews[test_index]:
            neg_sum, pos_sum = self._log_scores(neg_review, negative_log_probs, positive_log_probs)
            if neg_sum > pos_sum:
                correct += 1

        return correct / (len(positive_reviews[test_index]) + len(negative_reviews[test_index]))

    def get_statistics(self):
        """Run the three cross-validation rounds and return the mean accuracy."""
        accuracies = []
        for fold_number, (train_index1, train_index2, test_index) in enumerate(self._FOLDS, start=1):
            neg_log_prob, pos_log_prob = self.train_probabilities(train_index1, train_index2)
            accuracies.append(self.test_documents(test_index, neg_log_prob, pos_log_prob))
            print("Done fold %d" % fold_number)

        return sum(accuracies) / 3

In [10]:
# Feature set for this run: negation-tagged unigrams, each suffixed with its POS tag.
# (For plain negation-tagged unigrams, drop `pos=True` from both calls and omit `length`.)
def tokenisation(x):
    """Tokenise one review with NOT_ tagging and POS suffixes."""
    return tokenise_document(x, add_not_tag=True, pos=True)


vocabulary = get_vocabulary(add_not_tag=True, pos=True, length=16688)
print(len(vocabulary))

16162


In [43]:
# Mean three-fold cross-validation accuracy of Naive Bayes for the `vocabulary` and
# `tokenisation` currently bound in the kernel (binary features: use_freq defaults to False).
NaiveBayes(vocabulary, tokenisation).get_statistics()

Done fold 1
Done fold 2
Done fold 3


0.7856828436227579

In [178]:
# Naive Bayes experiment menu: uncomment exactly ONE configuration below so that
# `vocabulary` and `tokenisation` are (re)bound, then run the cell. With everything
# commented out, the call at the bottom uses whatever the kernel currently holds.

# Unigram with counts (runs its own evaluation with use_freq=True)
# vocabulary = get_vocabulary(add_not_tag=True)
# tokenisation = lambda x: tokenise_document(x, add_not_tag=True)
# NaiveBayes(vocabulary, tokenisation, use_freq=True).get_statistics()

# Unigram
# vocabulary = get_vocabulary(add_not_tag=True)
# tokenisation = lambda x: tokenise_document(x, add_not_tag=True)

# Unigram + Bigram
# vocabulary_unigram = get_vocabulary(add_not_tag=True)
# vocabulary_bigram = get_vocabulary(unigram=True, bigram=True)
# vocabulary = vocabulary_unigram | vocabulary_bigram
# def tokenisation(x):
#     unigrams = tokenise_document(x, add_not_tag=True)
#     bigrams = tokenise_document(x, unigram=True, bigram=True)
#     unigrams.extend(bigrams)
#     return unigrams

# Bigram
# vocabulary = get_vocabulary(unigram=True, bigram=True)
# tokenisation = lambda x: tokenise_document(x, unigram=True, bigram=True)

# Unigram + POS
# vocabulary = get_vocabulary(add_not_tag=True, pos=True, length=16688)
# tokenisation = lambda x: tokenise_document(x, add_not_tag=True, pos=True)

# Adjectives
# vocabulary = get_vocabulary(add_not_tag=True, adjectives=True, length=2631)
# tokenisation = lambda x: tokenise_document(x, add_not_tag=True)
# (Documents are tokenised WITHOUT POS tags here: the features are plain words,
# and POS tags are only used to pick the adjectives that form the vocabulary.)

# Top Unigrams
# vocabulary = get_vocabulary(add_not_tag=True, length=2631)
# tokenisation = lambda x: tokenise_document(x, add_not_tag=True)

# Unigrams + Position
# vocabulary = get_vocabulary(add_not_tag=True, position=True, length=22407)
# tokenisation = lambda x: tokenise_document(x, add_not_tag=True, position=True)

NaiveBayes(vocabulary, tokenisation).get_statistics()

16162


0.7878226526295196

In [38]:
class SVM:
    """
    SVM sentiment classifier over normalised bag-of-token vectors, evaluated by
    three-fold cross-validation over the module-level `negative_reviews` and
    `positive_reviews` fold lists.

    Labels are -1 for negative and 1 for positive sentiment. Training can be
    done with scikit-learn's LinearSVC (`get_statistics`) or with the external
    SVM-light binaries (`get_statistics_svmlight`).
    """

    # (train fold, train fold, test fold) for each cross-validation round
    _FOLDS = ((1, 2, 0), (0, 2, 1), (0, 1, 2))

    def __init__(self, vocabulary, tokenisation, use_freq=False):
        """
        vocabulary:   iterable of tokens; sorted to fix the order of the vector dimensions
        tokenisation: callable turning a review string into a list of tokens
        use_freq:     if True, features are token counts; otherwise binary presence
        """
        self.dimensions = sorted(list(vocabulary))
        self.tokenisation = tokenisation
        self.use_freq = use_freq

    def get_feature_vector(self, review_tokens):
        """
        Return the feature vector of one tokenised review as a float numpy array
        with one entry per dimension, divided by its Euclidean norm. A review
        without any vocabulary token yields the all-zero vector.
        """
        freq = {}
        for word in review_tokens:
            freq[word] = freq.get(word, 0) + 1

        if self.use_freq:
            values = [freq.get(word, 0) for word in self.dimensions]
        else:
            values = [1 if word in freq else 0 for word in self.dimensions]
        # Always float, so zero-norm vectors have the same dtype as normalised ones
        xs = np.array(values, dtype=float)

        # Normalisation
        denom = np.linalg.norm(xs)
        if denom == 0:
            return xs

        return xs / denom

    def _vectorise(self, reviews, label, xs_out, ys_out):
        """Append the feature vector of every review to `xs_out` and `label` to `ys_out`."""
        for review in reviews:
            xs_out.append(self.get_feature_vector(self.tokenisation(review)))
            ys_out.append(label)

    def get_train_test_data(self, train_index1, train_index2, test_index):
        """
        Build the feature vectors and labels for one cross-validation round.

        Returns (train_xs, train_ys, test_xs, test_ys). Training data lists the
        negative reviews first; test data lists the positive reviews first.
        """
        train_xs, train_ys = [], []
        test_xs, test_ys = [], []

        # Label -1 is for negative sentiment, label 1 for positive sentiment
        self._vectorise(negative_reviews[train_index1] + negative_reviews[train_index2], -1, train_xs, train_ys)
        self._vectorise(positive_reviews[train_index1] + positive_reviews[train_index2], 1, train_xs, train_ys)
        self._vectorise(positive_reviews[test_index], 1, test_xs, test_ys)
        self._vectorise(negative_reviews[test_index], -1, test_xs, test_ys)

        return train_xs, train_ys, test_xs, test_ys

    def get_statistics(self):
        """Train and score a fresh LinearSVC on each round and return the mean accuracy."""
        accuracies = []
        for fold_number, fold in enumerate(self._FOLDS, start=1):
            classifier = LinearSVC()
            train_xs, train_ys, test_xs, test_ys = self.get_train_test_data(*fold)
            classifier.fit(train_xs, train_ys)
            accuracies.append(classifier.score(test_xs, test_ys))

            print("Done %d fold" % fold_number)

        return sum(accuracies) / 3

    @staticmethod
    def _svm_light_line(vector, label):
        """
        Format one example as an SVM-light line: '<label> <index>:<value> ...' plus newline.

        Feature numbers start from 1. Zero-valued features are left out: the
        format is sparse and missing features are read as zero, which keeps the
        files far smaller than listing every vocabulary entry.
        """
        fields = [str(label)]
        for index, value in enumerate(vector):
            if value != 0:
                fields.append(str(index + 1) + ':' + str(value))
        # The newline is required; Python converts it to the platform line ending
        return ' '.join(fields) + '\n'

    def _write_svm_light_file(self, path, xs, ys):
        """Write the examples (`xs`, `ys`) to `path` in SVM-light format."""
        with open(path, 'w') as fout:
            for vector, label in zip(xs, ys):
                fout.write(self._svm_light_line(vector, label))

    def use_svm_light(self, train_xs, train_ys, test_xs, test_ys):
        """
        Train and evaluate with the SVM-light binaries in '../../svm_light/'.

        Writes train.txt and test.txt there, runs svm_learn and svm_classify
        (their console output is shown in the notebook) and finally removes
        all *.txt files in that directory, including model and predictions.
        Nothing is returned.
        """
        self._write_svm_light_file('../../svm_light/train.txt', train_xs, train_ys)
        self._write_svm_light_file('../../svm_light/test.txt', test_xs, test_ys)

        # Same call IPython generates for a `!command` line, written out so the
        # class is valid Python. Requires an IPython/Jupyter session.
        get_ipython().system('cd ../../svm_light/ && ./svm_learn train.txt model.txt && ./svm_classify test.txt model.txt output.txt && rm *.txt')

    def get_statistics_svmlight(self):
        """Run SVM-light on each of the three rounds; results are only printed by the tool."""
        for fold_number, fold in enumerate(self._FOLDS, start=1):
            train_xs, train_ys, test_xs, test_ys = self.get_train_test_data(*fold)
            self.use_svm_light(train_xs, train_ys, test_xs, test_ys)
            print("Done %d fold" % fold_number)

In [41]:
# Feature set for this run: negation-tagged unigrams with the default vocabulary size.
def tokenisation(x):
    """Tokenise one review with NOT_ tagging."""
    return tokenise_document(x, add_not_tag=True)


vocabulary = get_vocabulary(add_not_tag=True)
print(len(vocabulary))

16688


In [42]:
# Mean three-fold cross-validation accuracy of LinearSVC for the `vocabulary` and
# `tokenisation` currently bound in the kernel (binary features: use_freq defaults to False).
SVM(vocabulary, tokenisation).get_statistics()

Done 1 fold
Done 2 fold
Done 3 fold


0.76500311800740983

In [166]:
# SVM experiment menu: uncomment exactly ONE configuration below so that
# `vocabulary` and `tokenisation` are (re)bound, then run the cell. With everything
# commented out, the call at the bottom uses whatever the kernel currently holds.

# Unigram with counts (runs its own evaluation with use_freq=True)
# vocabulary = get_vocabulary(add_not_tag=True)
# tokenisation = lambda x: tokenise_document(x, add_not_tag=True)
# SVM(vocabulary, tokenisation, use_freq=True).get_statistics()


# Unigram
# vocabulary = get_vocabulary(add_not_tag=True)
# tokenisation = lambda x: tokenise_document(x, add_not_tag=True)


# Bigram
# vocabulary = get_vocabulary(unigram=True, bigram=True)
# tokenisation = lambda x: tokenise_document(x, unigram=True, bigram=True)

# Unigram + bigram
# vocabulary_unigram = get_vocabulary(add_not_tag=True)
# vocabulary_bigram = get_vocabulary(unigram=True, bigram=True)
# vocabulary = vocabulary_unigram | vocabulary_bigram
# def tokenisation(x):
#     unigrams = tokenise_document(x, add_not_tag=True)
#     bigrams = tokenise_document(x, unigram=True, bigram=True)
#     unigrams.extend(bigrams)
#     return unigrams


# Unigram + POS
# vocabulary = get_vocabulary(add_not_tag=True, pos=True, length=16688)
# tokenisation = lambda x: tokenise_document(x, add_not_tag=True, pos=True)

# Adjectives (documents are tokenised without POS tags; tags only select the vocabulary)
# vocabulary = get_vocabulary(add_not_tag=True, adjectives=True, length=2631)
# tokenisation = lambda x: tokenise_document(x, add_not_tag=True) 

# Top Unigrams
# vocabulary = get_vocabulary(add_not_tag=True, length=2631)
# tokenisation = lambda x: tokenise_document(x, add_not_tag=True)

# Unigram + position
# vocabulary = get_vocabulary(add_not_tag=True, position=True, length=22407)
# tokenisation = lambda x: tokenise_document(x, add_not_tag=True, position=True)

SVM(vocabulary, tokenisation).get_statistics()

0.76995830429306833

### For unigram model using binary feature vector:

For NB, we simply change the `get_document_vector` method to use binary features rather than counts; the rest of the code is the same. We get 80.71%.

Using SVM and no normalisation: 0.80426

Using SVM with each feature vector normalised by its own norm: 0.81927

Using SVM_light: 79.61, 82.83, 83.33 on the three folds => 81.92% on average

Using SVM with normalisation by the entire document length: ~0.53
