In [4]:
import os
import pandas as pd
from nltk import pos_tag, word_tokenize
from nltk.stem.porter import PorterStemmer
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold

from IPython.core.debugger import set_trace

In [2]:
def get_train_data():
    """
    Return a list of strings (where words have spaces correctly between them) for each sentiment
    """
    t = pd.read_table('./Datasets/Rotten_Tomatoes/train.tsv')
    negative = t[t['Sentiment'] == 0].append(t[t['Sentiment'] == 1])
#     negative = t[t['Sentiment'] == 0]
    negative.sort_values('PhraseId', inplace=True)
    negative.drop_duplicates('SentenceId', keep='first', inplace=True)
    
    neutral = t[t['Sentiment'] == 2]
#     neutral = t[t['Sentiment'] == 1].append(t[t['Sentiment'] == 2]).append(t[t['Sentiment'] == 3])
    neutral.sort_values('PhraseId', inplace=True)
    neutral.drop_duplicates('SentenceId', keep='first', inplace=True)
    
    positive = t[t['Sentiment'] == 3].append(t[t['Sentiment'] == 4])
#     positive = t[t['Sentiment'] == 4]
    positive.sort_values('PhraseId', inplace=True)
    positive.drop_duplicates('SentenceId', keep='first', inplace=True)
    
    return negative['Phrase'].tolist(), neutral['Phrase'].tolist(), positive['Phrase'].tolist()


def prepare_data(negative_reviews_train, neutral_reviews_train, positive_reviews_train):
    """
    Since 3 classes, ensure each class has baseline of 33.33% for guessing. We also do 3-fold cross-validation so each fold needs
    to have the correct proportion of classes too to keep above baseline using stratified k fold cross validation.
    """
    min_size = min([len(negative_reviews_train), len(neutral_reviews_train), len(positive_reviews_train)])

    # So that baseline of random guessing is now 33.33%
    negative_reviews_train = negative_reviews_train[:min_size]
    neutral_reviews_train = neutral_reviews_train[:min_size]
    positive_reviews_train = positive_reviews_train[:min_size]
    
    neg_label = [-1] * min_size
    neutral_label = [0] * min_size
    pos_label = [1] * min_size
    
    data = []
    data.extend(negative_reviews_train)
    data.extend(neutral_reviews_train)
    data.extend(positive_reviews_train)
    data = np.array(data)
    
    labels = []
    labels.extend(neg_label)
    labels.extend(neutral_label)
    labels.extend(pos_label)
    labels = np.array(labels)
    
    skf = StratifiedKFold(n_splits=3)
    train_indices = []
    test_indices = []
    for train_index, test_index in skf.split(data, labels):
        train_indices.append(train_index)
        test_indices.append(test_index)
        print("TRAIN:", len(train_index), "TEST:", len(test_index))
    
    return data, labels, train_indices, test_indices

In [3]:
negative_reviews, neutral_reviews, positive_reviews = get_train_data()
print(len(negative_reviews))
print(len(neutral_reviews))
print(len(positive_reviews))

data, labels, train_indices, test_indices = prepare_data(negative_reviews, neutral_reviews, positive_reviews)

6092
8115
6794
TRAIN: 12183 TEST: 6093
TRAIN: 12183 TEST: 6093
TRAIN: 12186 TEST: 6090


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [13]:
def tokenise_document(review, add_not_tag=False, unigram=False, bigram=False, pos=False, position=False):
    """
    Given a review (string) we return a list of tokens
    """
    
#     stemmer = PorterStemmer()
#     review_split = [stemmer.stem(word) for word in list(map(lambda x: x.lower(), review.split()))]

    review_split = review.split()
    tokens = []
    
    if add_not_tag:
        negation_word = {'not': True, "isn't": True, "doesn't": True, "wasn't":True, "couldn't": True, "wouldn't": True, 
                         "didn't": True}
        punctuation = {'?': True, '!': True, '.': True, ',': True, ':': True, ';':True}

        convert_word = False
        for word in review_split:
            if word in punctuation:
                convert_word = False
                tokens.append(word)
                continue

            if convert_word:
                tokens.append('NOT_'+word)
                continue

            if word in negation_word:
                convert_word = True
                tokens.append(word)
                continue
            tokens.append(word)
                
                
    if unigram:
        tokens.extend(review_split)
        
        
    if pos:
        tags = pos_tag(review_split) # tags is a list of tuples: [(token, pos tag)]
        # Append the POS tag to each word in 'tokens' to form a string. E.g. 'Peter' and 'NN' becomes 'Peter_NN'
        # tokens have NOT tag added to them
        tokens = [token+'_'+tag[1] for token, tag in zip(tokens, tags)]

        
    if bigram:
        # Since one of the above two conditions will be fulfilled, 'tokens' will always have entries
        for index in range(len(tokens) - 1):
            word_1 = tokens[index]
            word_2 = tokens[index + 1]
            tokens[index] = word_1 + ' ' + word_2
        if len(tokens) > 0:
            tokens.pop()  # Take the last unigram word at end of tokens list

        
    if position:
        # 'tokens' will already have been filled by this point by either 'unigram' or 'add_not_tag'
        first_quarter_end = np.ceil(len(tokens) * 0.25)
        middle_end = np.ceil(len(tokens) * 0.75)
        # last_quarter = len(tokens)
        
        for index in range(len(tokens)):
            if index < first_quarter_end:
                to_append = 1
            elif index < middle_end:
                to_append = 2
            else:
                to_append = 3
            tokens[index] = tokens[index]+'_'+str(to_append)
        
        
    return tokens

In [6]:
def get_vocabulary(add_not_tag=False, unigram=False, bigram=False, pos=False, adjectives=False, position=False, length=16162):
    data = negative_reviews + neutral_reviews + positive_reviews  # We have a list of textual reviews (strings)
    
    freq = {}  # Since we are using dict, vocabulary will be unique
    for review in data:
        review_tokens = tokenise_document(review, unigram=unigram, add_not_tag=add_not_tag, bigram=bigram)

        for token in review_tokens:
            if token in freq:
                freq[token] += 1
            else:
                freq[token] = 1
                
    if length is None:
        cutoff = 7 if bigram else 4
        vocabulary = set([token for token, count in freq.items() if count >= cutoff])
    else:
        sorted_freq = sorted([(count, token) for token, count in freq.items()], reverse=True)
        vocabulary = set([token for _, token in sorted_freq[:length]])  # We use set to exploit O(1) lookup time
    
    
    if pos or position:
#         print("Entering POS/Position")
        vocabulary_pos = []
        for i, review in enumerate(data):
            
#             if i % 100 == 0:
#                 print(i)
            
            unigram_tokens = np.array(tokenise_document(review, unigram=unigram, add_not_tag=add_not_tag))
            pos_tokens = np.array(tokenise_document(review, unigram=unigram, add_not_tag=add_not_tag, pos=pos, position=position))
            mask = list(map(lambda token: token in vocabulary, unigram_tokens))
            vocabulary_pos.extend(list(pos_tokens[mask]))
        
        freq = {}
        for token in vocabulary_pos:
            if token in freq:
                freq[token] += 1
            else:
                freq[token] = 1

        if length is None:
            vocabulary = set([token for token, count in freq.items() if count >= 4])
        else:
            sorted_freq = sorted([(count, token) for token, count in freq.items()], reverse=True)
            vocabulary = set([token for _, token in sorted_freq[:length]])
            
        
    if adjectives:
        # Filter the 'sored_freq' list to only contains words with tags that end in JJ/JJR/JJS
        freq = {}
        for review in data:
            review_tokens = tokenise_document(review, unigram=unigram, add_not_tag=add_not_tag, pos=True)

            for pos_token in review_tokens:
                
                if pos_token.endswith('JJ'):
                    token = pos_token[:-3]
                elif pos_token.endswith('JJR'):
                    token = pos_token[:-4]
                elif pos_token.endswith('JJS'):
                    token = pos_token[:-4]
                
                if token in freq:
                    freq[token] += 1
                else:
                    freq[token] = 1

        sorted_freq = sorted([(count, token) for token, count in freq.items()], reverse=True)
        vocabulary = set([token for _, token in sorted_freq[:length]])
    
    return vocabulary

In [7]:
class NaiveBayes:
    
    def __init__(self, vocabulary, tokenisation):
        self.vocabulary = vocabulary
        self.tokenisation = tokenisation

        
    def get_document_vector(self, document_tokens):
        freq = {v:0 for v in self.vocabulary}
        for word in document_tokens:
            if word in self.vocabulary:
                freq[word] = 1
        return freq
    
    
    def get_class_probabilities(self, class_data):
        """
        class_data: A list containing all the tokens in a training data document. P(f_i|c) from training documents
        """
        # We start with the add-one smoothing
        log_prob_dict = {v:1 for v in self.vocabulary}  # We encode the add one smoothing count at start
        
        total_class_tokens = 0  # Tokens contained in vocabulary
        for word in class_data:
            if word in self.vocabulary:
                log_prob_dict[word] += 1
                total_class_tokens += 1
                
        # We divide by the denominator and log the values
        for key in log_prob_dict.keys():
            log_prob_dict[key] = np.log(log_prob_dict[key] / (total_class_tokens + len(self.vocabulary)))

        return log_prob_dict

    
    def split_data(self, indices):
        """
        Given the indices (list of numbers). Split 'data' into negative, neutral and positive sentiments using 'labels'
        """
        subset_data = data[indices]
        subset_labels = labels[indices]
        
        negative_sentiments = np.where(subset_labels == -1)[0]
        
        neutral_sentiments = np.where(subset_labels == 0)[0]
        
        positive_sentiments = np.where(subset_labels == 1)[0]
        
        return subset_data[negative_sentiments], subset_data[neutral_sentiments], subset_data[positive_sentiments]
    
    
    def train_probabilities(self, train_index1, train_index2):
        neg_reviews_1, neutral_reviews_1, pos_reviews_1  = self.split_data(train_indices[train_index1])
        print("Training_fold_1")
        print(len(neg_reviews_1), len(neutral_reviews_1), len(pos_reviews_1))
        
        print("Training_fold_2")
        neg_reviews_2, neutral_reviews_2, pos_reviews_2 = self.split_data(train_indices[train_index2])
        print(len(neg_reviews_2), len(neutral_reviews_2), len(pos_reviews_2))
        
        
        neg_train_tokens = []
        for neg_review in np.concatenate((neg_reviews_1, neg_reviews_2), axis=0):
            neg_review = self.tokenisation(neg_review)
            neg_train_tokens.extend(neg_review)
        negative_log_probs = self.get_class_probabilities(neg_train_tokens)
        
        print("Trained neg")
        
        pos_train_tokens = []
        for pos_review in np.concatenate((pos_reviews_1, pos_reviews_2), axis=0):
            pos_review = self.tokenisation(pos_review)
            pos_train_tokens.extend(pos_review)
        positive_log_probs = self.get_class_probabilities(pos_train_tokens)
        
        print("Trained pos")
        
        neutral_train_tokens = []
        for neutral_review in np.concatenate((neutral_reviews_1, neutral_reviews_2), axis=0):
            neutral_review = self.tokenisation(neutral_review)
            neutral_train_tokens.extend(neutral_review)
        neutral_log_probs = self.get_class_probabilities(neutral_train_tokens)
        
        print("Trained neutral")
    
        return negative_log_probs, neutral_log_probs, positive_log_probs
    
    
    def test_documents(self, test_index, negative_log_probs, neutral_log_probs, positive_log_probs):
        negative_reviews, neutral_reviews, positive_reviews = self.split_data(test_indices[test_index])
        
        correct = 0
        for pos_review in positive_reviews:
            pos_review = self.tokenisation(pos_review)
            document_vector = self.get_document_vector(pos_review)
            
            neg_sum = 0   # For negative class
            neu_sum = 0   # For neutral class
            pos_sum = 0   # For positive class
            
            for word, freq in document_vector.items():
                # When we use counts in feature vector use below two    
                if freq == 0:
                    continue
                    
                neg_sum += negative_log_probs[word]
                pos_sum += positive_log_probs[word]
                neu_sum += neutral_log_probs[word]

            if pos_sum > neg_sum and pos_sum > neu_sum:
                correct += 1
                
        print("Tested pos")
        
        for neu_review in neutral_reviews:
            neu_review = self.tokenisation(neu_review)
            document_vector = self.get_document_vector(neu_review)
            
            neg_sum = 0   # For negative class
            neu_sum = 0   # For neutral class
            pos_sum = 0   # For positive class
            
            for word, freq in document_vector.items():
                # When we use counts in feature vector use below two
                if freq == 0:
                    continue
                    
                neg_sum += document_vector[word] * negative_log_probs[word]
                pos_sum += document_vector[word] * positive_log_probs[word]
                neu_sum += document_vector[word] * neutral_log_probs[word]

            if neu_sum > neg_sum and neu_sum > pos_sum:
                correct += 1
                
        print("Tested neutral")
        
        for neg_review in negative_reviews:
            neg_review = self.tokenisation(neg_review)
            document_vector = self.get_document_vector(neg_review)
            
            neg_sum = 0   # For negative class
            neu_sum = 0
            pos_sum = 0   # For positive class
            
            for word, freq in document_vector.items():
                if freq == 0:
                    continue
                    
                neg_sum += document_vector[word] * negative_log_probs[word]
                pos_sum += document_vector[word] * positive_log_probs[word]
                neu_sum += document_vector[word] * neutral_log_probs[word]
                
            if neg_sum > pos_sum and neg_sum > neu_sum:
                correct += 1
                
        print("Tested neg")

        return correct / (len(negative_reviews) + len(positive_reviews) + len(neutral_reviews))
    
    
    def get_statistics(self):
        train_index1, train_index2, test_index = 1, 2, 0
        neg_log_prob, neu_log_prob, pos_log_prob = self.train_probabilities(train_index1, train_index2)
        print("Trained")
        accuracy1 = self.test_documents(test_index, neg_log_prob, neu_log_prob, pos_log_prob)
        print("Tested")
#         return accuracy1
        
        train_index1, train_index2, test_index = 0, 2, 1
        neg_log_prob, neu_log_prob, pos_log_prob = self.train_probabilities(train_index1, train_index2)
        accuracy2 = self.test_documents(test_index, neg_log_prob, neu_log_prob, pos_log_prob)
        
        train_index1, train_index2, test_index = 0, 1, 2
        neg_log_prob, neu_log_prob, pos_log_prob = self.train_probabilities(train_index1, train_index2)
        accuracy3 = self.test_documents(test_index, neg_log_prob, neu_log_prob, pos_log_prob)
        
        return (accuracy1 + accuracy2 + accuracy3) / 3

In [14]:
vocabulary_unigram = get_vocabulary(add_not_tag=True)
# vocabulary_bigram = get_vocabulary(unigram=True, bigram=True)
# vocabulary = vocabulary_unigram | vocabulary_bigram
def tokenisation(x):
    unigrams = tokenise_document(x, add_not_tag=True)
    bigrams = tokenise_document(x, unigram=True, bigram=True)
    unigrams.extend(bigrams)
    return unigrams

# vocabulary = get_vocabulary(add_not_tag=True)
# tokenisation = lambda x: tokenise_document(x, add_not_tag=True)

print(len(vocabulary))

29461


In [15]:
len(vocabulary_unigram)

16162

In [11]:
len(vocabulary_unigram)

13299

In [8]:
vocabulary

{'Again',
 'substance',
 'at his',
 'A small',
 'Sabara',
 'Mario',
 'has too',
 'strange and',
 'Brett',
 'Iran',
 'technical flaws',
 'trashy',
 'labours',
 'the music',
 'cursory',
 'NOT_one',
 'is an',
 'has made',
 'nonsense',
 'alongside',
 'succumbs to',
 ', absurd',
 'us a',
 'Oscar-sweeping',
 'with an',
 'produce adequate',
 'willingness',
 'fairy tale',
 '-LRB- Jaglom',
 'warm',
 'and Amazing',
 'weeks',
 'smack',
 'patriarchal',
 'fits the',
 'in her',
 'movie has',
 'helped',
 'seemingly',
 'slasher-movie',
 'inauthentic',
 'A delightful',
 'machines',
 'those places',
 'old are',
 'women ,',
 'to follow',
 'laugh-out-loud bits',
 'it portrays',
 'alone',
 'static',
 'NOT_thriller',
 'been done',
 ', political',
 'Glory',
 'sadness',
 'NOT_performance',
 'about how',
 'cattle',
 'loosely',
 'chick',
 'NOT_vulgar',
 'animation is',
 ', featuring',
 'tradition of',
 'risks',
 'dark humor',
 'Butterworth',
 'that do',
 'It should',
 'clothes and',
 'mind --',
 'no doubting',


In [None]:
NaiveBayes(vocabulary, tokenisation).get_statistics()

In [10]:
class SVM:
    
    def __init__(self, vocabulary, tokenisation):
        self.dimensions = sorted(list(vocabulary))
        self.tokenisation = tokenisation
        
    
    def get_feature_vector(self, review_tokens):
        review_tokens = set(review_tokens)
        feature_vector = []
        for word in self.dimensions:
            if word in review_tokens:
                feature_vector.append(1)
            else:
                feature_vector.append(0)
        xs = np.array(feature_vector)

        # Normalisation
        denom = np.linalg.norm(xs)
        
        if denom == 0:
            return xs

        return xs / denom
    
    def split_data(self, indices):
        """
        Given the indices (list of numbers). Split 'data' into negative, neutral and positive sentiments using 'labels'
        """
        subset_data = data[indices]
        subset_labels = labels[indices]
        
        negative_sentiments = np.where(subset_labels == -1)[0]
        neutral_sentiments = np.where(subset_labels == 0)[0]
        positive_sentiments = np.where(subset_labels == 1)[0]
        
        return subset_data[negative_sentiments], subset_data[neutral_sentiments], subset_data[positive_sentiments]
    
    
    def get_train_test_data(self, train_index1, train_index2, test_index):
        neg_reviews_1, neutral_reviews_1, pos_reviews_1  = self.split_data(train_indices[train_index1])
        neg_reviews_2, neutral_reviews_2, pos_reviews_2 = self.split_data(train_indices[train_index2])
        negative_reviews, neutral_reviews, positive_reviews = self.split_data(test_indices[test_index])
        
        train_xs = []
        train_ys = []

        test_xs = []
        test_ys = []
        
        print("Split Data")

        for neg_review in np.concatenate((neg_reviews_1, neg_reviews_2), axis=0):
            neg_review_tokens = self.tokenisation(neg_review)
            xs = self.get_feature_vector(neg_review_tokens)
            train_xs.append(xs)
            train_ys.append(-1)  # Label -1 is for negative sentiment
            
        print("Neg train")

        for pos_review in np.concatenate((pos_reviews_1, pos_reviews_2), axis=0):
            pos_review_tokens = self.tokenisation(pos_review)
            xs = self.get_feature_vector(pos_review_tokens)
            train_xs.append(xs)
            train_ys.append(1)  # Label 1 for positive sentiment
            
        print("Pos train")
            
        for neu_review in np.concatenate((neutral_reviews_1, neutral_reviews_2), axis=0):
            neu_review_tokens = self.tokenisation(neu_review)
            xs = self.get_feature_vector(neu_review_tokens)
            train_xs.append(xs)
            train_ys.append(0)  # Label 0 for neutral sentiment
            
        print("Neu train")

        for neg_review in negative_reviews:
            neg_review_tokens = self.tokenisation(neg_review)
            xs = self.get_feature_vector(neg_review_tokens)
            test_xs.append(xs)
            test_ys.append(-1)  # Label -1 for negatuve sentiment
            
        print("Neg test")
            
        for neu_review in neutral_reviews:
            neu_review_tokens = self.tokenisation(neu_review)
            xs = self.get_feature_vector(neu_review_tokens)
            test_xs.append(xs)
            test_ys.append(0)  # Label 0 for neutral sentiment
            
        print("Neu test")
            
        for pos_review in positive_reviews:
            pos_review_tokens = self.tokenisation(pos_review)
            xs = self.get_feature_vector(pos_review_tokens)
            test_xs.append(xs)
            test_ys.append(1)  # Label 1 for positive sentiment
            
        print("Pos test")

        return np.array(train_xs), np.array(train_ys), np.array(test_xs), np.array(test_ys)
    
    
    def get_statistics(self):
        classifier = LinearSVC()
        train_xs, train_ys, test_xs, test_ys = self.get_train_test_data(1, 2, 0)
        classifier.fit(train_xs, train_ys)
        print("FITTED")
        accuracy1 = classifier.score(test_xs, test_ys)

        classifier = LinearSVC()
        train_xs, train_ys, test_xs, test_ys = self.get_train_test_data(0, 2, 1)
        classifier.fit(train_xs, train_ys)
        accuracy2 = classifier.score(test_xs, test_ys)

        classifier = LinearSVC()
        train_xs, train_ys, test_xs, test_ys = self.get_train_test_data(0, 1, 2)
        classifier.fit(train_xs, train_ys)
        accuracy3 = classifier.score(test_xs, test_ys)

        return (accuracy1 + accuracy2 + accuracy3) / 3

In [11]:
# We use same vocabulary as NB above, so no need to recalculate
SVM(vocabulary, tokenisation).get_statistics()

Split Data
Neg train
Pos train
Neu train
Neg test
Neu test
Pos test
FITTED
Split Data
Neg train
Pos train
Neu train
Neg test
Neu test
Pos test
Split Data
Neg train
Pos train
Neu train
Neg test
Neu test
Pos test


0.92017155006000317