In [155]:
import re
from collections import Counter
import math
import pandas as pd

class bigram_nb_classifier():
    def __init__(self, training_truthful_text, training_deceptive_text):
        self.truthful_text = self.add_start_characters(training_truthful_text)
        self.deceptive_text = self.add_start_characters(training_deceptive_text)
        
        #AKA the unigram counts
        self.truthful_reviews_Counter = Counter(self.truthful_text.split())
        self.deceptive_reviews_Counter = Counter(self.deceptive_text.split())
        
        self.truthful_total_words = sum(self.truthful_reviews_Counter.values())
        self.deceptive_total_words = sum(self.deceptive_reviews_Counter.values())
        
        self.truthful_reviews_len = len(self.truthful_text.split('\n'))
        self.deceptive_reviews_len = len(self.truthful_text.split('\n'))

        self.both_reviews_Counter = self.truthful_reviews_Counter + self.deceptive_reviews_Counter
        self.vocabulary_size = len(self.both_reviews_Counter.keys())
        self.word_pattern = re.compile("(\w+|<s> |[,.!;])")

        self.truthful_bigram_counts = self.get_bigram_counts(self.truthful_text.split())
        self.deceptive_bigram_counts = self.get_bigram_counts(self.deceptive_text.split())

        self.smoothed_truthful_bigram_counts = \
            self.get_smoothed_bigram_corpus(self.truthful_reviews_Counter , self.truthful_bigram_counts)
        self.smoothed_deceptive_bigram_counts = \
            self.get_smoothed_bigram_corpus(self.deceptive_reviews_Counter , self.deceptive_bigram_counts)

    def add_start_characters(self, words):
        words = '<s> ' + words
        words = words.replace('\n', ' <s> ')
        return words[:-5]
        
    def get_bigram_counts(self, word_list):
        corpus = {}
        for i, word in enumerate(word_list[1:], start=1):
            if word != '<s>':
                if (word_list[i-1], word) not in corpus:
                    corpus[(word_list[i-1], word)] = 1
                else:
                    corpus[(word_list[i-1], word)] += 1
        return corpus
    
    
    def check_for_unk_words(self, word_list, corpus):        
        for i, word in enumerate(word_list):
            if word not in corpus:
                  word_list[i] = '<UNK>'
        return word_list
    
    
    def get_smoothed_bigram_corpus(self, unigram_corpus, bigrams):
        unigram_corpus['<UNK>'] = 1
        df = pd.DataFrame(1, index = unigram_corpus, columns = unigram_corpus) 
        for bigram in bigrams:
            df.loc[bigram[0], bigram[1]] += bigrams[bigram]
        return df

    # given an inputted review, return a list of bigrams, used for parsing reviews in the classify method
    def get_bigram_list(self, review, corpus):
        word_list = [word for word in self.word_pattern.findall(review)]
        word_list = self.check_for_unk_words(word_list, corpus)
        list_of_bigrams = []
        for i, word in enumerate(word_list[1:], start=1):
            list_of_bigrams.append((word_list[i-1], word))

        return list_of_bigrams


    def get_smoothed_bigram_log_prob(self, bigram, smoothed_bigram_corpus):
        return math.log(smoothed_bigram_corpus.loc[bigram[0], bigram[1]]/smoothed_bigram_corpus.loc[bigram[0]].sum())
    

    def smoothed_review_log_prob(self, review, counter, smoothed_bigram_corpus, unigram_corpus):
        bigram_list = self.get_bigram_list(review, unigram_corpus)
        log_prob = 0.0
        for bigram in bigram_list:
            log_prob += self.get_smoothed_bigram_log_prob(bigram, smoothed_bigram_corpus)
        return log_prob
    

    def classify_review(self, review):
        review = '<s> ' + review        
        
        truthful_prob = self.smoothed_review_log_prob(review,
                            self.truthful_bigram_counts, self.smoothed_truthful_bigram_counts, \
                            self.truthful_reviews_Counter)
        deceptive_prob = self.smoothed_review_log_prob(review,
                                self.deceptive_bigram_counts, self.smoothed_deceptive_bigram_counts, \
                                self.deceptive_reviews_Counter)
        
    
        # get ratio between the two (since this training set has the same number of reviews, this code is optional)
#         truthful_prob = truthful_prob + \
#             math.log(self.truthful_reviews_len/(self.truthful_reviews_len + self.deceptive_reviews_len))
#         deceptive_prob = deceptive_prob + \
#             math.log(self.deceptive_reviews_len /(self.truthful_reviews_len + self.deceptive_reviews_len))

        return 'truthful' if truthful_prob >= deceptive_prob else 'deceptive'


In [156]:
with open('./DATASET/train/truthful.txt') as t, open('./DATASET/train/deceptive.txt') as d:
    truthful_training_text = t.read()
    deceptive_training_text = d.read()

In [157]:
bnb = bigram_nb_classifier(truthful_training_text, deceptive_training_text)

In [158]:
with open('./DATASET/validation/truthful.txt') as t, open('./DATASET/validation/deceptive.txt') as d:
    truthful_validation_text = t.read()
    deceptive_validation_text = d.read()

In [159]:
truthful_validation_classifications = \
    [bnb.classify_review(review) for review in truthful_validation_text.split('\n')]
truthful_validation_accuracy_counts = Counter(truthful_validation_classifications)
print(truthful_validation_accuracy_counts)
print('Accuracy rate:', 
      truthful_validation_accuracy_counts['truthful']/sum(truthful_validation_accuracy_counts.values()))

Counter({'deceptive': 106, 'truthful': 23})
Accuracy rate: 0.17829457364341086


In [160]:
deceptive_validation_classifications = \
    [bnb.classify_review(review) for review in deceptive_validation_text.split('\n')]
deceptive_validation_accuracy_counts = Counter(deceptive_validation_classifications)
print(deceptive_validation_accuracy_counts)
print('Accuracy rate:', 
      deceptive_validation_accuracy_counts['deceptive']/sum(deceptive_validation_accuracy_counts.values()))

Counter({'deceptive': 128, 'truthful': 1})
Accuracy rate: 0.9922480620155039
