## Classify

In [23]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams

# English stop words from the NLTK stopwords corpus, held as a set.
# NOTE(review): this module-level name is not referenced by any other cell
# visible in this notebook -- confirm whether it is still needed.
stoplist = set(stopwords.words("english"))

# Method from 
# https://stackoverflow.com/questions/48003907/how-to-train-naive-bayes-classifier-for-n-gram-movie-reviews
def create_ngram_features(words, n=2):
    '''
    Build the feature dict for one review: every n-gram of the tokens maps to True.

    words: tokens of a single review, in order.
    n: size of the n-grams (2 gives bigrams).

    Returns a dict whose keys are the n-grams produced by nltk's ngrams() and
    whose values are all True. Repeated n-grams collapse into a single key.
    '''
    return {gram: True for gram in ngrams(words, n)}

In [4]:
# Read both training files whole; each becomes one newline-delimited string.
with open('./DATASET/train/truthful.txt') as t, \
        open('./DATASET/train/deceptive.txt') as d:
    truthful_train_txt, deceptive_train_txt = t.read(), d.read()

In [25]:
def add_start_character(review):
    '''
    Adds a start token (<s>) followed by a space to the beginning of a review.

    review: Review text as a string.

    Returns the review with '<s> ' prepended.
    '''
    return '<s> ' + review

def preprocess(text):
    '''
    Splits newline-delimited raw text into individual reviews and adds the
    start token to each one.

    text: Raw text of a review file as a single string, one review per line.

    Returns a list of strings, one per review, each beginning with '<s> '.
    An empty input yields an empty list.
    '''
    review_list = text.split('\n')
    # A trailing newline leaves one empty element at the end of the split.
    # Drop only that element: slicing off the last item unconditionally would
    # silently lose the final review of a file without a trailing newline.
    if review_list[-1] == '':
        review_list.pop()
    return [add_start_character(review) for review in review_list]

In [26]:
# Pair each training review's n-gram feature dict with its class label.
truthful_data = [
    (create_ngram_features(tokens), 'truthful')
    for tokens in map(str.split, preprocess(truthful_train_txt))
]
deceptive_data = [
    (create_ngram_features(tokens), 'deceptive')
    for tokens in map(str.split, preprocess(deceptive_train_txt))
]
# Deceptive examples first, then truthful ones.
training_data = [*deceptive_data, *truthful_data]

In [27]:
# Train NLTK's Naive Bayes classifier on the labelled (feature dict, label) pairs.
classifier = NaiveBayesClassifier.train(training_data)

## Validation

In [5]:
# Read both validation files whole; each becomes one newline-delimited string.
with open('./DATASET/validation/truthful.txt') as t, \
        open('./DATASET/validation/deceptive.txt') as d:
    truthful_val_txt, deceptive_val_txt = t.read(), d.read()

In [29]:
# Pair each validation review's n-gram feature dict with its class label.
truthful_data_v = [
    (create_ngram_features(tokens), 'truthful')
    for tokens in map(str.split, preprocess(truthful_val_txt))
]
deceptive_data_v = [
    (create_ngram_features(tokens), 'deceptive')
    for tokens in map(str.split, preprocess(deceptive_val_txt))
]
# Truthful examples first, then deceptive ones.
validation_data = [*truthful_data_v, *deceptive_data_v]

In [30]:
# Score the trained classifier against the labelled validation examples.
accuracy = nltk.classify.util.accuracy(classifier, validation_data)

In [31]:
# Display the validation accuracy stored in `accuracy`.
accuracy

0.9140625

In [32]:
# Ad-hoc call to classify() with a hand-written feature dict.
# NOTE: this is probably not the right parameter. The key here is a raw
# string, whereas the training features built by create_ngram_features are
# keyed by n-gram tuples, so this key does not correspond to a trained feature.
classifier.classify({'This is a test': True})

'truthful'

In [33]:
# Featurize one sample review: add the start token, split on whitespace, then
# build its n-gram feature dict.
review = create_ngram_features(add_start_character("This is a review").split())

## Test

In [74]:
# Load the unlabelled test reviews (one per line) and featurize them the same
# way as the training data.
with open('./DATASET/test/test.txt') as t:
    test_txt = t.read()
test_reviews = test_txt.split('\n')
# A trailing newline leaves an empty final element after the split. Drop it so
# that no phantom empty review is featurized and classified.
if test_reviews[-1] == '':
    test_reviews.pop()
test_reviews = [add_start_character(review) for review in test_reviews]
test_reviews_features = [create_ngram_features(review.split()) for review in test_reviews]

In [84]:
# Print the predicted label for each test review, one per line, in the same
# order as test_reviews_features.
for review in test_reviews_features:
    print(classifier.classify(review))

truthful
truthful
deceptive
deceptive
truthful
truthful
deceptive
deceptive
deceptive
truthful
deceptive
deceptive
deceptive
truthful
deceptive
deceptive
deceptive
truthful
deceptive
truthful
truthful
deceptive
deceptive
deceptive
deceptive
deceptive
deceptive
truthful
deceptive
truthful
deceptive
truthful
truthful
truthful
deceptive
truthful
deceptive
truthful
deceptive
deceptive
deceptive
deceptive
deceptive
deceptive
deceptive
deceptive
truthful
deceptive
deceptive
deceptive
truthful
deceptive
truthful
deceptive
deceptive
deceptive
truthful
deceptive
truthful
deceptive
deceptive
deceptive
truthful
deceptive
truthful
deceptive
truthful
truthful
truthful
deceptive
deceptive
truthful
truthful
truthful
truthful
deceptive
deceptive
deceptive
deceptive
deceptive
truthful
truthful
truthful
truthful
truthful
truthful
truthful
truthful
truthful
truthful
deceptive
truthful
truthful
truthful
deceptive
deceptive
deceptive
deceptive
truthful
truthful
truthful
truthful
truthful
deceptive
truthful

In [1]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams

In [10]:
class bigram_nb_classifier():
    '''
    A bigram naive bayes classifier built on top of the NLTK library.

    Reviews are labelled 'truthful' or 'deceptive'. Each review is given a
    '<s>' start token, split on whitespace and represented as a dictionary
    mapping each of its bigrams to True.

    Attributes:
    training_data: List of (feature dict, label) pairs used for training.
    classifier: The trained NLTK NaiveBayesClassifier.
    latest_accuracy: Result of the most recent compute_accuracy call, or -1
        if compute_accuracy has not been called yet.
    '''

    def __init__(self, truthful_train_txt, deceptive_train_txt):
        '''
        Initialize with the raw text of the reviews from the text files, delimited by newlines.

        truthful_train_txt: One string containing all truthful reviews to be trained,
            delimited by newlines.
        deceptive_train_txt: One string containing all deceptive reviews to be trained,
            delimited by newlines.
        '''
        truthful_data = self._labeled_features(truthful_train_txt, 'truthful')
        deceptive_data = self._labeled_features(deceptive_train_txt, 'deceptive')
        self.training_data = deceptive_data + truthful_data
        self.classifier = NaiveBayesClassifier.train(self.training_data)
        # -1 marks "no accuracy computed yet".
        self.latest_accuracy = -1

    def _labeled_features(self, text, label):
        '''
        Turns newline-delimited review text into (feature dict, label) pairs.

        text: One string containing reviews, delimited by newlines.
        label: Class label attached to every review in text.
        '''
        return [(self.create_ngram_features(review.split()), label)
                for review in self.preprocess(text)]

    # Method from
    # https://stackoverflow.com/questions/48003907/how-to-train-naive-bayes-classifier-for-n-gram-movie-reviews
    def create_ngram_features(self, words, n=2):
        '''
        Will create a dictionary of n-grams in the form {(word1, word2): True} for input
        into the NLTK classifier.

        words: words of 1 review to be converted into n-grams.
        n: n value in n-grams (2 gives bigrams).
        '''
        return {ng: True for ng in ngrams(words, n)}

    def compute_accuracy(self, truthful_val_txt, deceptive_val_txt):
        '''
        Computes the accuracy of this instance's classifier against the validation set
        and stores it in latest_accuracy.

        truthful_val_txt: One string containing all truthful reviews to be validated,
            delimited by newlines.
        deceptive_val_txt: One string containing all deceptive reviews to be validated,
            delimited by newlines.

        Returns the accuracy reported by nltk.classify.util.accuracy.
        '''
        # Use this instance's own preprocessing and classifier. Relying on
        # module-level names of the same spelling would raise NameError outside
        # the notebook, or silently score a different model.
        validation_data = (self._labeled_features(truthful_val_txt, 'truthful')
                           + self._labeled_features(deceptive_val_txt, 'deceptive'))
        self.latest_accuracy = nltk.classify.util.accuracy(self.classifier, validation_data)
        return self.latest_accuracy

    def add_start_character(self, review):
        '''
        Adds a start token (<s>) to the beginning of the string.

        review: Review to add the token to the start of.
        '''
        return '<s> ' + review

    def preprocess(self, text):
        '''
        Splits reviews from the review list into individual reviews and adds a start
        character to each one.

        text: Raw text from a review file as a string, one review per line.

        Returns a list of strings, one per review. An empty input yields an empty list.
        '''
        review_list = text.split('\n')
        # A trailing newline leaves one empty element at the end of the split.
        # Drop only that element: slicing off the last item unconditionally
        # would lose the final review of a text without a trailing newline.
        if review_list[-1] == '':
            review_list.pop()
        return [self.add_start_character(review) for review in review_list]

    def classify_review(self, review):
        '''
        Classifies a review as either truthful or deceptive.

        review: Review as a string to be classified.

        Returns the label predicted by the trained classifier.
        '''
        review = self.add_start_character(review)
        return self.classifier.classify(self.create_ngram_features(review.split()))
        
        

In [11]:
# Train the class-based classifier on the truthful and deceptive training strings.
bnb = bigram_nb_classifier(truthful_train_txt, deceptive_train_txt)

In [13]:
# Classify a single hand-written review string.
bnb.classify_review('the room smelled bad')

'deceptive'