In [7]:
from collections import Counter
import pandas as pd
import math
import re

In [21]:

class bag_of_words_nb_classifier():
    def __init__(self, training_truthful_text, training_deceptive_text):
        self.truthful_text = self.add_start_characters(training_truthful_text)
        self.deceptive_text = self.add_start_characters(training_deceptive_text)
        
        self.truthful_reviews_Counter = Counter(self.truthful_text.split())
        self.deceptive_reviews_Counter = Counter(self.deceptive_text.split())
        
        self.truthful_total_words = sum(self.truthful_reviews_Counter.values())
        self.deceptive_total_words = sum(self.deceptive_reviews_Counter.values())
        
        self.truthful_reviews_len = len(self.truthful_text.split('\n'))
        self.deceptive_reviews_len = len(self.truthful_text.split('\n'))

        self.both_reviews_Counter = self.truthful_reviews_Counter + self.deceptive_reviews_Counter
        self.vocabulary_size = len(self.both_reviews_Counter.keys())
        self.word_pattern = re.compile("(\w+|<s> |[,.!;])")
        
        self.k = 0.2

    def add_start_characters(self, words):
        words = '<s> ' + words
        words = words.replace('\n', ' <s> ')
        return words[:-5]
        
    def smoothed_word_log_prob(self, word, counter, total):
        return math.log((counter[word] + self.k) / (total + (self.vocabulary_size*self.k)))
    
    
    def smoothed_review_log_prob(self, review, counter, total):
        log_prob = 0.0
        for word in self.word_pattern.findall(review):
            log_prob += self.smoothed_word_log_prob(word, counter, total)
        return log_prob

    
    def classify_review(self, review):
        review = '<s> ' + review
        truthful_prob = self.smoothed_review_log_prob(review,
                            self.truthful_reviews_Counter, self.truthful_total_words)
        deceptive_prob = self.smoothed_review_log_prob(review,
                                self.deceptive_reviews_Counter, self.deceptive_total_words)

        # get ratio between the two (since this training set has the same number of reviews, this code is optional)
        truthful_prob = truthful_prob + \
            math.log(self.truthful_reviews_len/(self.truthful_reviews_len + self.deceptive_reviews_len))
        deceptive_prob = deceptive_prob + \
            math.log(self.deceptive_reviews_len /(self.truthful_reviews_len + self.deceptive_reviews_len))

        return 0 if truthful_prob >= deceptive_prob else 1


## Training

In [24]:
with open('./DATASET/train/truthful.txt') as t, open('./DATASET/train/deceptive.txt') as d:
    truthful = t.read()
    deceptive = d.read()
nb = bag_of_words_nb_classifier(truthful, deceptive)

## Validation

In [10]:
with open('./DATASET/validation/truthful.txt') as t, open('./DATASET/validation/deceptive.txt') as d:
    truthful_validation_text = t.read()
    deceptive_validation_text = d.read()

In [15]:
truthful_validation_classifications = \
    [nb.classify_review(review) for review in truthful_validation_text.split('\n')]
truthful_validation_accuracy_counts = Counter(truthful_validation_classifications)
print(truthful_validation_accuracy_counts)
print('Accuracy rate:', 
      truthful_validation_accuracy_counts['truthful']/sum(truthful_validation_accuracy_counts.values()))

Counter({'truthful': 120, 'deceptive': 9})
Accuracy rate: 0.9302325581395349


In [12]:
deceptive_validation_classifications = \
    [nb.classify_review(review) for review in deceptive_validation_text.split('\n')]
deceptive_validation_accuracy_counts = Counter(deceptive_validation_classifications)
print(deceptive_validation_accuracy_counts)
print('Accuracy rate:', 
      deceptive_validation_accuracy_counts['deceptive']/sum(deceptive_validation_accuracy_counts.values()))

Counter({'deceptive': 121, 'truthful': 8})
Accuracy rate: 0.937984496124031


In [13]:
import csv

In [25]:
with open('./DATASET/test/test.txt') as t:
    test_text = t.read()

    deceptive_validation_classifications = \
    test_results=[nb.classify_review(review) for review in test_text.split('\n')]

In [30]:
test_results
test_ids = [id_ for id_ in range(0, len(test_text.split('\n')))]

In [31]:
test_ids
df = pd.DataFrame({'Id'})

In [37]:
df = pd.DataFrame({'Id' : test_ids, 'Prediction':test_results}, columns=['Id', 'Prediction'])
df.to_csv('naive_bayes_unigram.csv')

321