In [1]:
import os
import pandas as pd
from nltk import word_tokenize
import numpy as np
from sklearn.svm import LinearSVC

from IPython.core.debugger import set_trace

In [45]:
def get_train_data():
    """
    Return a list of strings (where words have spaces correctly between them) for each sentiment
    """
    t = pd.read_table('./Datasets/Rotten_Tomatoes/train.tsv')
#     negative = t[t['Sentiment'] == 0].append(t[t['Sentiment'] == 1])
    negative = t[t['Sentiment'] == 0]
    negative.sort_values('PhraseId', inplace=True)
    negative.drop_duplicates('SentenceId', keep='first', inplace=True)
    
#     neutral = t[t['Sentiment'] == 2]
    neutral = t[t['Sentiment'] == 1].append(t[t['Sentiment'] == 2]).append(t[t['Sentiment'] == 3])
    neutral.sort_values('PhraseId', inplace=True)
    neutral.drop_duplicates('SentenceId', keep='first', inplace=True)
    
#     positive = t[t['Sentiment'] == 3].append(t[t['Sentiment'] == 4])
    positive = t[t['Sentiment'] == 4]
    positive.sort_values('PhraseId', inplace=True)
    positive.drop_duplicates('SentenceId', keep='first', inplace=True)
    
    return negative['Phrase'].tolist(), neutral['Phrase'].tolist(), positive['Phrase'].tolist()

In [46]:
negative_reviews, neutral_reviews, positive_reviews = get_train_data()
min_size = min([len(negative_reviews), len(neutral_reviews), len(positive_reviews)])
negative_reviews_train = negative_reviews #[:min_size]
neutral_reviews_train = neutral_reviews #[:min_size]
positive_reviews_train = positive_reviews #[:min_size]
# Already tokenised, so use .split() method to get tokens
len(negative_reviews_train), len(neutral_reviews_train), len(positive_reviews_train)  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(2609, 8470, 3123)

In [4]:
def tokenise_document(review, add_not_tag=False, unigram=False, bigram=False):
    """
    Given a review (string) we return a list of tokens
    """
    
    review_split = word_tokenize(review)
    tokens = []
    
    if add_not_tag:
        negation_word = {'not': True, 'never':True, "isn't": True, "doesn't": True, "wasn't":True, "couldn't": True, "wouldn't": True, 
                         "didn't": True}
        punctuation = {'?': True, '!': True, '.': True, ',': True, ':': True, ';':True}

        convert_word = False
        for word in review_split:
            if word in punctuation:
                convert_word = False
                tokens.append(word)
                continue

            if convert_word:
                tokens.append('NOT_'+word)
                continue

            if word in negation_word:
                convert_word = True
                tokens.append(word)
                continue
            tokens.append(word)
                
                
    if unigram:
        tokens.extend(review_split)
        
    if bigram:
        # Since one of the above two conditions will be fulfilled, 'tokens' will always have entries
        for index in range(len(tokens) - 1):
            word_1 = tokens[index]
            word_2 = tokens[index + 1]
            tokens[index] = word_1 + ' ' + word_2
        if len(tokens) > 0:
            tokens.pop()  # Take the last unigram word at end of tokens list
        
    return tokens


def get_vocabulary(add_not_tag=False, unigram=False, bigram=False, length=16162):
    reviews = []
    to_search = './Datasets/neg/'
    for f in os.listdir(to_search):
        path = to_search+f
        with open(path, 'r', encoding='latin-1') as fin:
            reviews.append(fin.read().strip())

    to_search = './Datasets/pos/'
    for f in os.listdir(to_search):
        path = to_search+f
        with open(path, 'r', encoding='latin-1') as fin:
            reviews.append(fin.read().strip())
    
    data = negative_reviews + neutral_reviews + positive_reviews + reviews  # We have a list of textual reviews (strings)
    
    freq = {}  # Since we are using dict, vocabulary will be unique
    for review in data:
        review_tokens = tokenise_document(review, unigram=unigram, add_not_tag=add_not_tag, bigram=bigram)

        for token in review_tokens:
            if token in freq:
                freq[token] += 1
            else:
                freq[token] = 1
                
    sorted_freq = sorted([(count, token) for token, count in freq.items()], reverse=True)
    vocabulary = set([token for _, token in sorted_freq[:length]])  # We use set to exploit O(1) lookup time
    
    return vocabulary

In [5]:
class NaiveBayes:
    
    def __init__(self, vocabulary, tokenisation):
        self.vocabulary = vocabulary
        self.tokenisation = tokenisation

        
    def get_document_vector(self, document_tokens):
        freq = {v:0 for v in self.vocabulary}
        for word in document_tokens:
            if word in self.vocabulary:
                freq[word] = 1
        return freq
    
    
    def get_class_probabilities(self, class_data):
        """
        class_data: A list containing all the tokens in a training data document. P(f_i|c) from training documents
        """
        # We start with the add-one smoothing
        log_prob_dict = {v:1 for v in self.vocabulary}  # We encode the add one smoothing count at start
        
        total_class_tokens = 0  # Tokens contained in vocabulary
        for word in class_data:
            if word in self.vocabulary:
                log_prob_dict[word] += 1
                total_class_tokens += 1
                
        # We divide by the denominator and log the values
        for key in log_prob_dict.keys():
            log_prob_dict[key] = np.log(log_prob_dict[key] / (total_class_tokens + len(self.vocabulary)))

        return log_prob_dict

    
    def train_probabilities(self, negative_reviews, neutral_reviews, positive_reviews):
        neg_train_tokens = []
        for neg_review in negative_reviews:
            neg_review = self.tokenisation(neg_review)
            neg_train_tokens.extend(neg_review)
        negative_log_probs = self.get_class_probabilities(neg_train_tokens)
        
        print("Trained neg")
        
        pos_train_tokens = []
        for pos_review in positive_reviews:
            pos_review = self.tokenisation(pos_review)
            pos_train_tokens.extend(pos_review)
        positive_log_probs = self.get_class_probabilities(pos_train_tokens)
        
        print("Trained pos")
        
        neutral_train_tokens = []
        for neutral_review in neutral_reviews:
            neutral_review = self.tokenisation(neutral_review)
            neutral_train_tokens.extend(neutral_review)
        neutral_log_probs = self.get_class_probabilities(neutral_train_tokens)
        
        print("Trained neutral")
    
        return negative_log_probs, neutral_log_probs, positive_log_probs
    
    
    def get_test_data(self):
        reviews = []
        labels = []
        to_search = './Datasets/neg/'
        for f in os.listdir(to_search):
            path = to_search+f
            with open(path, 'r', encoding='latin-1') as fin:
                reviews.append(fin.read().strip())
                
        labels.extend([-1] * 700)

        to_search = './Datasets/pos/'
        for f in os.listdir(to_search):
            path = to_search+f
            with open(path, 'r', encoding='latin-1') as fin:
                reviews.append(fin.read().strip())
                
        labels.extend([1] * 700)
        
                
        reviews_sentences = list(map(lambda x: x.split('.'), reviews))
        return reviews_sentences, labels
    
    
    def test_single_document(self, negative_log_probs, neutral_log_probs, positive_log_probs, review):
        review_sentences = test_review.split('.')
        neg_sentences = []
        neu_sentences = []
        pos_sentences = []
        
        for sentence in review_sentences:
            sentence_tokens = self.tokenisation(sentence)
            sentence_vector = self.get_document_vector(sentence_tokens)
            
            neg_sum = 0   # For negative class
            neu_sum = 0   # For neutral class
            pos_sum = 0   # For positive class

            for word, freq in sentence_vector.items():
                # When we use counts in feature vector use below two    
                if freq == 0:
                    continue

                neg_sum += negative_log_probs[word]
                pos_sum += positive_log_probs[word]
                neu_sum += neutral_log_probs[word]

            if neg_sum > neu_sum and neg_sum > pos_sum:
                neg_sentences.append(sentence)
            elif neu_sum > neg_sum and neu_sum > pos_sum:
                neu_sentences.append(sentence)
            elif pos_sum > neg_sum and pos_sum > neu_sum:
                pos_sentences.append(sentence)
                
        return neg_sentences, neu_sentences, pos_sentences
    
    
    def test_documents(self, negative_log_probs, neutral_log_probs, positive_log_probs, add_full_stop=False):
        correct = 0
        
        test_reviews, test_labels = self.get_test_data()
        # test_review is a list of lists. Each inner list contains a list of sentences (string)
        
        counter = 0
        
        for review, label in zip(test_reviews, test_labels):
            counter += 1
            if counter % 100 == 0:
                print(counter)
                
            neg_count = 0
            pos_count = 0
            
            sentence_labels = []
            for sentence in review:
                if add_full_stop:
                    sentence += '.'
                sentence_tokens = self.tokenisation(sentence)
                sentence_vector = self.get_document_vector(sentence_tokens)
                
                neg_sum = 0   # For negative class
                neu_sum = 0   # For neutral class
                pos_sum = 0   # For positive class

                for word, freq in sentence_vector.items():
                    # When we use counts in feature vector use below two    
                    if freq == 0:
                        continue

                    neg_sum += negative_log_probs[word]
                    pos_sum += positive_log_probs[word]
                    neu_sum += neutral_log_probs[word]

#                 if neg_sum > neu_sum and neg_sum > pos_sum:
#                     sentence_labels.append(0)
#                 elif neu_sum > neg_sum and neu_sum > pos_sum:
#                     sentence_labels.append(1)
#                 elif pos_sum > neg_sum and pos_sum > neu_sum:
#                     sentence_labels.append(2)

                if neg_sum > pos_sum and neg_sum > neu_sum:
                    neg_count += 1
                elif pos_sum > neg_sum and pos_sum > neu_sum:
                    pos_count += 1
                
#             review_label = np.argmax(np.bincount(np.array(sentence_labels))) - 1  # -1 since we added 1 to labels for np.bincount

            review_label = 3  # Some non-sensical value
            if neg_count > pos_count:
                review_label = -1
            elif pos_count > neg_count:
                review_label = 1
            
            if review_label == label:
                correct += 1
#             else:
#                 print(counter)
#                 print("Correct label:", label, "But we gave label:", review_label)
#                 print(neg_count, pos_count)
                
        return correct / len(test_reviews)
    
    
    def get_statistics(self):
        negative_log_probs, neutral_log_probs, positive_log_probs = self.train_probabilities(negative_reviews_train, neutral_reviews_train, positive_reviews_train)
        print("TRAINED")
        accuracy = self.test_documents(negative_log_probs, neutral_log_probs, positive_log_probs, add_full_stop=False)
        return accuracy
        

In [47]:
vocabulary_unigram = get_vocabulary(add_not_tag=True)
vocabulary_bigram = get_vocabulary(unigram=True, bigram=True)
vocabulary = vocabulary_unigram | vocabulary_bigram
def tokenisation(x):
    unigrams = tokenise_document(x, add_not_tag=True)
    bigrams = tokenise_document(x, unigram=True, bigram=True)
    unigrams.extend(bigrams)
    return unigrams

# vocabulary = get_vocabulary(unigram=True, bigram=True)
# tokenisation = lambda x: tokenise_document(x, unigram=True, bigram=True)

print(len(vocabulary))

32324


In [48]:
nb = NaiveBayes(vocabulary, tokenisation)
negative_log_probs, neutral_log_probs, positive_log_probs = nb.train_probabilities(negative_reviews_train, neutral_reviews_train, positive_reviews_train)

Trained neg
Trained pos
Trained neutral


In [49]:
nb = NaiveBayes(vocabulary, tokenisation)
nb.test_documents(negative_log_probs, neutral_log_probs, positive_log_probs)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400


0.6207142857142857

In [37]:
test_review = "I have now seen Mr. Tommy Wiseau's cinematic tour-de-force, 'The Room' three times. With each viewing, The Room becomes more complexly entangled in and inseparable from my own life. I no longer know where The Room ends and I begin. It is, without question, the worst film ever made. But this comment is in no way meant to be discouraging. Because while The Room is the worst movie ever made it is also the greatest way to spend a blisteringly fast 100 minutes in the dark. Simply put, The Room will change your life. It\'s not just the dreadful acting or the sub-normal screenplay or the bewildering direction or the musical score so soaked in melodrama that you will throw up on yourself or the lunatic-making cinematography; no, there is something so magically wrong with this movie that it can only be the product of divine intervention. If you took the greatest filmmakers in history and gave them all the task of purposefully creating a film as spectacularly horrible as this not one of them, with all their knowledge and skill, could make anything that could even be considered as a contender. Not one line or scene would rival any moment in The Room. The centerpiece of this filmic holocaust is Mr. Tommy Wiseau himself. Without him, it would still be the worst movie ever made, but with him it is the greatest worst movie ever made. Tommy has been described as a Cajun, a Croatian cyborg, possibly from Belgium, clearly a product of Denmark, or maybe even not from this world or dimension. All of these things are true at any one moment. He is a tantalizing mystery stuffed inside an enigma wrapped in bacon and smothered in cheese. You will fall in love with this man even as you are repelled by him from the first moment he steps onto screen with his long Louis the Fourteenth style black locks and thick triangular shoulders packed into an oddly fitting suit, and his metallic steroid destroyed skin. Tommy looks out of place, out of time and out of this world. There has never been anything else like him. Nor will there ever be. The Room begins with Johnny (Tommy Wiseau) and his incomprehensibly evil fiancée Lisa (played by a woman with incongruously colored eyebrows and a propensity for removing her shirt) engaging in some light frottage, joined by, Denny, (played with a deft sense of the absurd by Phillip Haldiman), their sexually confused teenage neighbor who is clearly suffering from a form of aged decrepitude. When Denny, who looks like the human version of Gleek the monkey from Superfriends, says, in a slightly creepy yet playful tone of voice, I like to watch! as Johnny and Lisa roll around the bed in a pre-intercourse ritual revolving around rose petals, you know you are in for a very special movie. After a lengthy lovemaking scene (not to worry if you miss it the first time, they show it again in its entirety later in the movie) in which Tommy\'s bizarre scaly torso and over-anatomized rear-end are lovingly depicted over and over again as he appears to hump Lisa\'s hip, we discover that Lisa, for no particular reason, has become bored with Tommy's incessant lovemaking and decides to leave him. Just when you think the movie might lapse into an ordinary, pedestrian sort of badness, Johnny's best friend Mark, a man who\'s job seems to be to wear James Brolin\'s beard from Amityville Horror, shows up and electrifies the screen with a performance so wooden that it belongs in the lumber section of Home Depot. Incidentally, Mark is played by Greg Sestero, who, in addition to being described as a department store mannequin, was also the line producer on The Room and one of Tommy Wiseau\'s five (5!!!!!) assistants on the movie. Lisa forces Mark, amid his paltry, unconvincing protests, to have an affair with her on their uncomfortable circular stairs. For no apparent reason Lisa decides that she is made of pure evil and wants to torture her angelic and insanely devoted fiancé, Johnny. Lisa receives pointed advice from her mother who casually announces that she is dying of breast cancer and then never mentions it again. But Lisa is determined to make Johnny\'s life a living hell, in spite of the fact that she, according to her mother, cannot survive on her own in the cutthroat computer business. But not before they recycle the sex scene from earlier in the movie where we get another bird\'s eye view of Johnny\'s ludicrous naked body. Denny gets into trouble with a drug dealer. Mark shaves his beard. Tommy gets drunk on an unusual cocktail made from mixing whiskey and vodka. Lisa lies and tells everyone that Tommy hit her in a drunken rage. A balding psychologist appears out of nowhere, offers some advice, then apparently dies while softly falling on the ground in an attempt to catch a football thrown by Mark. All of these seemingly disparate events build up to two cathartic moments. The first is when Tommy expressively yells at Lisa with the line 'You are tearing me apart Lisa!'. You will cheer at this line as you realize that the film has been tearing you apart the whole time. And the second is at Tommy's birthday party where the worst actor that has ever been born plays a unidentified man wearing a silk shirt who utters a phrase that perfectly describes the experience of watching The Room, 'It feels like I'm sitting on atom bomb that is going to explode!' The shocking ending will leave you pleading for some kind of sequel. See this film at all costs. See it twice. Or three times. Or as one kid that I met from Woodland Hills has, 12 times! See it until you can recite every precious line of dialogue this movie has to offer. Let The Room become your new religion and Tommy Wiseau your prophet preaching the gospel according to Johnny. My dream is to someday buy a theater and run The Room 24 hours a day, 7 days a week until the print disintegrates. I hope it becomes your dream as well."
neg, neu, pos = nb.test_single_document(negative_log_probs, neutral_log_probs, positive_log_probs, test_review)

In [38]:
len(neg), len(pos), len(neu)

(20, 14, 11)

In [97]:
neu

[' With each viewing, The Room becomes more complexly entangled in and inseparable from my own life',
 ' If you took the greatest filmmakers in history and gave them all the task of purposefully creating a film as spectacularly horrible as this not one of them, with all their knowledge and skill, could make anything that could even be considered as a contender',
 ' Not one line or scene would rival any moment in The Room',
 ' The centerpiece of this filmic holocaust is Mr',
 ' All of these things are true at any one moment',
 ' You will fall in love with this man even as you are repelled by him from the first moment he steps onto screen with his long Louis the Fourteenth style black locks and thick triangular shoulders packed into an oddly fitting suit, and his metallic steroid destroyed skin',
 ' Tommy looks out of place, out of time and out of this world',
 ' Nor will there ever be',
 ' When Denny, who looks like the human version of Gleek the monkey from Superfriends, says, in a sli

In [19]:
class SVM:
    
    def __init__(self, vocabulary, tokenisation):
        self.dimensions = sorted(list(vocabulary))
        self.tokenisation = tokenisation
        
    
    def get_feature_vector(self, review_tokens):
        review_tokens = set(review_tokens)
        feature_vector = []
        for word in self.dimensions:
            if word in review_tokens:
                feature_vector.append(1)
            else:
                feature_vector.append(0)
        xs = np.array(feature_vector)

        # Normalisation
        denom = np.linalg.norm(xs)
        
        if denom == 0:
            return xs

        return xs / denom
    
    def get_test_data(self):
        reviews = []
        labels = []
        to_search = './Datasets/neg/'
        for f in os.listdir(to_search):
            path = to_search+f
            with open(path, 'r', encoding='latin-1') as fin:
                reviews.append(fin.read().strip())
                
        labels.extend([-1] * 700)

        to_search = './Datasets/pos/'
        for f in os.listdir(to_search):
            path = to_search+f
            with open(path, 'r', encoding='latin-1') as fin:
                reviews.append(fin.read().strip())
                
        labels.extend([1] * 700)
                
        reviews_sentences = list(map(lambda x: x.split('.'), reviews))
        return reviews_sentences, labels
    
    
    def train_classifier(self, negative_reviews, neutral_reviews, positive_reviews):
        classifier = LinearSVC()
        train_xs = []
        train_ys = []

        for neg_review in negative_reviews:
            neg_review_tokens = self.tokenisation(neg_review)
            xs = self.get_feature_vector(neg_review_tokens)
            train_xs.append(xs)
            train_ys.append(-1)  # Label -1 is for negative sentiment
            
        print("Neg train")

        for pos_review in positive_reviews:
            pos_review_tokens = self.tokenisation(pos_review)
            xs = self.get_feature_vector(pos_review_tokens)
            train_xs.append(xs)
            train_ys.append(1)  # Label 1 for positive sentiment
            
        print("Pos train")
            
        for neu_review in neutral_reviews:
            neu_review_tokens = self.tokenisation(neu_review)
            xs = self.get_feature_vector(neu_review_tokens)
            train_xs.append(xs)
            train_ys.append(0)  # Label 0 for neutral sentiment
            
        print("Neu train")
        
        classifier.fit(np.array(train_xs), np.array(train_ys))
        return classifier
    
    
    def test_classifier(self, classifier, add_full_stop=False):
        test_xs = []
        test_ys = []
        reviews_test, labels_test = self.get_test_data()
        
        correct = 0
        counter = 0
        
        for review, label in zip(reviews_test, labels_test):
            counter += 1
            if counter % 100 == 0:
                print(counter)
            
            review_vector = []
            for sentence in review:
                if add_full_stop:
                    sentence += '.'
                sentence_tokens = self.tokenisation(sentence)
                sentence_vector = self.get_feature_vector(sentence_tokens)
                review_vector.append(sentence_vector)
                
            sentence_labels = classifier.predict(np.array(review_vector))
            
            neg_count = np.sum(sentence_labels == -1)
            pos_count = np.sum(sentence_labels == 1)
            
            review_label = 3  # Some non-sensical value
            if neg_count > pos_count:
                review_label = -1
            elif pos_count > neg_count:
                review_label = 1
            
            if review_label == label:
                correct += 1
            
#             sentence_labels += 1  # We are adding 1 so we can use np.bincount()
#             review_label = np.argmax(np.bincount(sentence_labels)) - 1  # -1 since we added 1 to labels for np.bincount
            
#             if review_label == label:
#                 correct += 1
                
        return correct / len(reviews_test)
    
    
    def classify_single_document(self, classifier, review, add_full_stop=False):
        review_sentences = test_review.split('.')
        neg_sentences = []
        neu_sentences = []
        pos_sentences = []
        
        review_vector = []
        for sentence in review_sentences:
            if add_full_stop:
                sentence += '.'
            sentence_tokens = self.tokenisation(sentence)
            sentence_vector = self.get_feature_vector(sentence_tokens)
            review_vector.append(sentence_vector)
        
        review_sentences = np.array(review_sentences)
        sentence_labels = classifier.predict(np.array(review_vector))
        
        neg_sentences = review_sentences[sentence_labels == -1]
        neu_sentences = review_sentences[sentence_labels == 0]
        pos_sentences = review_sentences[sentence_labels == 1]
        
        return list(neg_sentences), list(neu_sentences), list(pos_sentences)
    
    
    def get_statistics(self):
        classifier = self.train_classifier(negative_reviews_train, neutral_reviews_train, positive_reviews_train)
        return self.test_classifier(classifier)

In [42]:
cf = SVM(vocabulary, tokenisation)
classifier = cf.train_classifier(negative_reviews_train, neutral_reviews_train, positive_reviews_train)

Neg train
Pos train
Neu train


In [43]:
cf = SVM(vocabulary, tokenisation)
cf.test_classifier(classifier)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400


0.7085714285714285

In [33]:
test_review = "I have now seen Mr. Tommy Wiseau's cinematic tour-de-force, 'The Room' three times. With each viewing, The Room becomes more complexly entangled in and inseparable from my own life. I no longer know where The Room ends and I begin. It is, without question, the worst film ever made. But this comment is in no way meant to be discouraging. Because while The Room is the worst movie ever made it is also the greatest way to spend a blisteringly fast 100 minutes in the dark. Simply put, The Room will change your life. It\'s not just the dreadful acting or the sub-normal screenplay or the bewildering direction or the musical score so soaked in melodrama that you will throw up on yourself or the lunatic-making cinematography; no, there is something so magically wrong with this movie that it can only be the product of divine intervention. If you took the greatest filmmakers in history and gave them all the task of purposefully creating a film as spectacularly horrible as this not one of them, with all their knowledge and skill, could make anything that could even be considered as a contender. Not one line or scene would rival any moment in The Room. The centerpiece of this filmic holocaust is Mr. Tommy Wiseau himself. Without him, it would still be the worst movie ever made, but with him it is the greatest worst movie ever made. Tommy has been described as a Cajun, a Croatian cyborg, possibly from Belgium, clearly a product of Denmark, or maybe even not from this world or dimension. All of these things are true at any one moment. He is a tantalizing mystery stuffed inside an enigma wrapped in bacon and smothered in cheese. You will fall in love with this man even as you are repelled by him from the first moment he steps onto screen with his long Louis the Fourteenth style black locks and thick triangular shoulders packed into an oddly fitting suit, and his metallic steroid destroyed skin. Tommy looks out of place, out of time and out of this world. There has never been anything else like him. Nor will there ever be. The Room begins with Johnny (Tommy Wiseau) and his incomprehensibly evil fiancée Lisa (played by a woman with incongruously colored eyebrows and a propensity for removing her shirt) engaging in some light frottage, joined by, Denny, (played with a deft sense of the absurd by Phillip Haldiman), their sexually confused teenage neighbor who is clearly suffering from a form of aged decrepitude. When Denny, who looks like the human version of Gleek the monkey from Superfriends, says, in a slightly creepy yet playful tone of voice, I like to watch! as Johnny and Lisa roll around the bed in a pre-intercourse ritual revolving around rose petals, you know you are in for a very special movie. After a lengthy lovemaking scene (not to worry if you miss it the first time, they show it again in its entirety later in the movie) in which Tommy\'s bizarre scaly torso and over-anatomized rear-end are lovingly depicted over and over again as he appears to hump Lisa\'s hip, we discover that Lisa, for no particular reason, has become bored with Tommy's incessant lovemaking and decides to leave him. Just when you think the movie might lapse into an ordinary, pedestrian sort of badness, Johnny's best friend Mark, a man who\'s job seems to be to wear James Brolin\'s beard from Amityville Horror, shows up and electrifies the screen with a performance so wooden that it belongs in the lumber section of Home Depot. Incidentally, Mark is played by Greg Sestero, who, in addition to being described as a department store mannequin, was also the line producer on The Room and one of Tommy Wiseau\'s five (5!!!!!) assistants on the movie. Lisa forces Mark, amid his paltry, unconvincing protests, to have an affair with her on their uncomfortable circular stairs. For no apparent reason Lisa decides that she is made of pure evil and wants to torture her angelic and insanely devoted fiancé, Johnny. Lisa receives pointed advice from her mother who casually announces that she is dying of breast cancer and then never mentions it again. But Lisa is determined to make Johnny\'s life a living hell, in spite of the fact that she, according to her mother, cannot survive on her own in the cutthroat computer business. But not before they recycle the sex scene from earlier in the movie where we get another bird\'s eye view of Johnny\'s ludicrous naked body. Denny gets into trouble with a drug dealer. Mark shaves his beard. Tommy gets drunk on an unusual cocktail made from mixing whiskey and vodka. Lisa lies and tells everyone that Tommy hit her in a drunken rage. A balding psychologist appears out of nowhere, offers some advice, then apparently dies while softly falling on the ground in an attempt to catch a football thrown by Mark. All of these seemingly disparate events build up to two cathartic moments. The first is when Tommy expressively yells at Lisa with the line 'You are tearing me apart Lisa!'. You will cheer at this line as you realize that the film has been tearing you apart the whole time. And the second is at Tommy's birthday party where the worst actor that has ever been born plays a unidentified man wearing a silk shirt who utters a phrase that perfectly describes the experience of watching The Room, 'It feels like I'm sitting on atom bomb that is going to explode!' The shocking ending will leave you pleading for some kind of sequel. See this film at all costs. See it twice. Or three times. Or as one kid that I met from Woodland Hills has, 12 times! See it until you can recite every precious line of dialogue this movie has to offer. Let The Room become your new religion and Tommy Wiseau your prophet preaching the gospel according to Johnny. My dream is to someday buy a theater and run The Room 24 hours a day, 7 days a week until the print disintegrates. I hope it becomes your dream as well."
cf = SVM(vocabulary, tokenisation)
neg, neu, pos = cf.classify_single_document(classifier, test_review)

In [34]:
len(neg), len(pos), len(neu)

(20, 10, 18)

In [36]:
pos

[' Simply put, The Room will change your life',
 ' Without him, it would still be the worst movie ever made, but with him it is the greatest worst movie ever made',
 ' He is a tantalizing mystery stuffed inside an enigma wrapped in bacon and smothered in cheese',
 ' There has never been anything else like him',
 ' The Room begins with Johnny (Tommy Wiseau) and his incomprehensibly evil fiancée Lisa (played by a woman with incongruously colored eyebrows and a propensity for removing her shirt) engaging in some light frottage, joined by, Denny, (played with a deft sense of the absurd by Phillip Haldiman), their sexually confused teenage neighbor who is clearly suffering from a form of aged decrepitude',
 " Incidentally, Mark is played by Greg Sestero, who, in addition to being described as a department store mannequin, was also the line producer on The Room and one of Tommy Wiseau's five (5!!!!!) assistants on the movie",
 ' Tommy gets drunk on an unusual cocktail made from mixing whiske