In [1]:
from collections import Counter
import pandas as pd

In [2]:
with open('./DATASET/train/truthful.txt') as t, open('./DATASET/train/deceptive.txt') as d:
    truthful = t.read()
    deceptive = d.read()

In [3]:
def add_start_characters(words):
    words = '<s> ' + words
    words = words.replace('\n', ' <s> ')
    return words[:-5]

## Unigrams

In [4]:
truthful_unigram = dict(Counter(add_start_characters(truthful).split()))
deceptive_unigram = dict(Counter(add_start_characters(deceptive).split()))

In [5]:
def get_unigram_prob(corpus, unigram_key):
    total_words = 0
    for key in corpus:
        total_words += corpus[key]

    return corpus.get(unigram_key, 0)/total_words

In [6]:
get_unigram_prob(deceptive_unigram, 'I')

0.02905073649754501

## Unsmoothed Bigrams

In [7]:
def get_bigram_counts(word_list):
    corpus = {}
    for i, word in enumerate(word_list[1:], start=1):
        if word != '<s>':
            if (word_list[i-1], word) not in corpus:
                corpus[(word_list[i-1], word)] = 1
            else:
                corpus[(word_list[i-1], word)] += 1
        
    return corpus

In [8]:
truthful_bigram = get_bigram_counts(add_start_characters(truthful).split())
deceptive_bigram = get_bigram_counts(add_start_characters(deceptive).split())

In [9]:
def get_bigram_prob(corpus, bigram_to_test):
    total_words = 0
    for key in corpus:
        if key[0] == bigram_to_test[0]:
            total_words += corpus[key]
    return corpus.get(bigram_to_test, 0)/total_words

In [10]:
get_bigram_prob(deceptive_bigram, ('I', 'lhjlkh'))

0.0

### Smoothed Bigrams

In [52]:
def get_smoothed_bigram_corpus(unigram_corpus, bigrams):
    unigram_corpus['<UNK>'] = 1
    df = pd.DataFrame(1, index = unigram_corpus, columns = unigram_corpus) 
    for bigram in bigrams:
        df.loc[bigram[0], bigram[1]] += bigrams[bigram]
    return df

In [53]:
df = pd.DataFrame(1, index =deceptive_unigram, columns =deceptive_unigram) 

In [54]:
get_smoothed_bigram_corpus(deceptive_unigram, deceptive_bigram)

Unnamed: 0,<s>,I,was,here,on,business,so,needed,to,get,...,rockin,ROYAL,gifts,allergy-friendly,tree,informal,tranquility,cleaners,beutiful,<UNK>
<s>,1,135,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
I,1,1,326,1,1,1,2,15,5,12,...,1,1,1,1,1,1,1,1,1,1
was,1,3,1,4,19,1,38,1,15,1,...,1,1,1,1,1,1,1,1,1,1
here,1,1,4,1,4,1,1,1,2,1,...,1,1,1,1,1,1,1,1,1,1
on,1,1,1,1,1,15,1,1,2,1,...,1,1,1,1,1,1,1,1,1,1
business,1,5,4,1,1,1,3,1,1,1,...,1,1,1,1,1,1,1,1,1,1
so,1,35,2,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
needed,1,1,2,1,1,1,1,1,11,1,...,1,1,1,1,1,1,1,1,1,1
to,1,1,3,1,2,1,3,1,1,75,...,1,1,1,1,1,1,1,1,1,1
get,1,1,1,1,3,1,1,1,15,1,...,1,1,1,1,1,1,1,1,1,1


In [55]:
def get_smoothed_bigram_prob(bigram, smoothed_bigram_corpus):
    return smoothed_bigram_corpus.loc[bigram[0], bigram[1]]/smoothed_bigram_corpus.loc[bigram[0]].sum()

In [15]:
get_smoothed_bigram_prob(('I', 'am'), df)

0.00016840687100033681

## Part 4 – Validation

In [16]:
with open('./DATASET/validation/truthful.txt') as t, open('./DATASET/validation/deceptive.txt') as d:
    truthful_v = t.read()
    deceptive_v =d.read()

In [48]:
# replace all unknown words with <UNK> token
def check_for_unk_words(word_list, corpus):
    for i, word in enumerate(word_list):
        if word not in corpus:
              word_list[i] = '<UNK>'
    return word_list

## Validation Unigrams

In [18]:
# Cleaning & handling unknown words for truthful validation
cleaned_truthful_word_list_v = check_for_unk_words(add_start_characters(truthful_v).split(), truthful_unigram)
truthful_unigram_v = Counter(cleaned_truthful_word_list_v)

# Cleaning & handling unknown words for deceptive validation
cleaned_deceptive_word_list_v = check_for_unk_words(add_start_characters(deceptive_v).split(), deceptive_unigram)
deceptive_unigram_v = Counter(cleaned_deceptive_word_list_v)

## Validation Bigrams

In [19]:
truthful_bigram_v = get_bigram_counts(cleaned_truthful_word_list_v)
deceptive_bigram_v = get_bigram_counts(cleaned_deceptive_word_list_v)

In [20]:
get_smoothed_bigram_corpus(truthful_unigram_v, truthful_bigram_v)

<class 'collections.Counter'>


Unnamed: 0,<s>,I,stayed,for,four,nights,while,attending,a,conference,...,NOTHING,training,opted,own,sterile,lives,indoor,movie,Mag,affordable
<s>,1,28,1,1,1,1,1,1,2,1,...,1,1,1,1,1,1,1,1,1,1
I,1,1,21,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
stayed,1,1,1,3,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
for,1,1,1,1,3,1,1,1,43,1,...,1,1,1,1,1,1,1,1,1,1
four,1,1,1,1,1,3,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
nights,1,1,1,4,1,1,3,1,1,1,...,1,1,1,1,1,1,1,1,1,1
while,1,3,1,1,1,1,1,3,2,1,...,1,1,1,1,1,1,1,1,1,1
attending,1,1,1,1,1,1,1,1,4,1,...,1,1,1,1,1,1,1,1,1,1
a,1,1,1,1,1,1,2,1,1,5,...,1,2,1,1,1,1,1,1,1,1
conference,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


## Perplexity

In [21]:
def get_n_gram_model(wordlist, n):
    # wordlist is an in-order list of the words on which to build the model
    # n determines whether to use unigram or bigram estimation (n must be 1 or 2)
    n_gram_model = {}
    for i, word in enumerate(wordlist):
        if i + n > len(wordlist) - 1:
            break
        key = tuple([wordlist[i+x] for x in range(n)])
        if n == 1:
            # by convention, we now use strings as keys for unigram models and tuples for bigram models
            key = word
        if key not in n_gram_model:
            n_gram_model[key] = 1
        else:
            n_gram_model[key] += 1 
    return n_gram_model

In [22]:
def get_n_gram_prob(sequence, model):
    # model is a dictionary representing the n-grams and counts for some dataset
    
    # this just gets n implied from the model
    for key in model:
        if isinstance(key, str):
            n = 1
        else:
            n = len(key)
        break
        
    if n == 1:
        # case if model uses unigram estimation
        result = 1
        for word in sequence:
            result = result * get_unigram_prob(model, word)
        return result
    elif n == 2:
        # case if model uses bigram estimation
        bigrams = get_bigram_counts(sequence)
        result = 1
        for bigram in bigrams:
            result = result * get_bigram_prob(model, bigram)
        return result
    else:
        # else cannot compute
        return 0

In [23]:
get_n_gram_prob(['I', 'like', 'hotels'], get_n_gram_model(cleaned_truthful_word_list_v, 2))

0.0

In [24]:
import math
def compute_perplexity(N, model, n):
    # N is the number of tokens, so the length of the unigram corpus
    acc = 0

    for i, n_gram in enumerate(model):
        if i == N:
            break
        acc -= math.log(get_n_gram_prob(n_gram, model))
    return math.e ** ((1/N) * acc)

In [25]:
compute_perplexity(len(truthful_unigram_v), get_n_grams(cleaned_truthful_word_list_v, 1), 1)

NameError: name 'get_n_grams' is not defined

## Language Model Classifier

In [26]:
#review is the unigram for the review
compute_perplexity(review)

NameError: name 'review' is not defined

## Bag of Words Naïve Bayes Classifier
* A lot of the code was adapted from my INFO 2950 class from a homework where we had to write our own naive bayes text classifier
* Does smoothing

In [37]:
with open('./DATASET/train/truthful.txt') as t, open('./DATASET/train/deceptive.txt') as d:
    truthful = t.read()
    deceptive = d.read()

In [43]:
import re
class bag_of_words_nb_classifier():
    def __init__(self, training_truthful_text, training_deceptive_text):
        self.truthful_text = self.add_start_characters(training_truthful_text)
        self.deceptive_text = self.add_start_characters(training_deceptive_text)
        
        self.truthful_reviews_Counter = Counter(self.truthful_text.split())
        self.deceptive_reviews_Counter = Counter(self.deceptive_text.split())
        
        self.truthful_total_words = sum(self.truthful_reviews_Counter.values())
        self.deceptive_total_words = sum(self.deceptive_reviews_Counter.values())
        
        self.truthful_reviews_len = len(self.truthful_text.split('\n'))
        self.deceptive_reviews_len = len(self.truthful_text.split('\n'))

        self.both_reviews_Counter = self.truthful_reviews_Counter + self.deceptive_reviews_Counter
        self.vocabulary_size = len(self.both_reviews_Counter.keys())
        self.word_pattern = re.compile("(\w+|<s> |[,.!;])")
        
        self.k = 0.2

    def add_start_characters(self, words):
        words = '<s> ' + words
        words = words.replace('\n', ' <s> ')
        return words[:-5]
        
    def smoothed_word_log_prob(self, word, counter, total):
        return math.log((counter[word] + self.k) / (total + (self.vocabulary_size*self.k)))
    
    
    def smoothed_review_log_prob(self, review, counter, total):
        log_prob = 0.0
        for word in self.word_pattern.findall(review):
            log_prob += self.smoothed_word_log_prob(word, counter, total)
        return log_prob

    
    def classify_review(self, review):
        review = '<s> ' + review
        truthful_prob = self.smoothed_review_log_prob(review,
                            self.truthful_reviews_Counter, self.truthful_total_words)
        deceptive_prob = self.smoothed_review_log_prob(review,
                                self.deceptive_reviews_Counter, self.deceptive_total_words)

        # get ratio between the two (since this training set has the same number of reviews, this code is optional)
        truthful_prob = truthful_prob + \
            math.log(self.truthful_reviews_len/(self.truthful_reviews_len + self.deceptive_reviews_len))
        deceptive_prob = deceptive_prob + \
            math.log(self.deceptive_reviews_len /(self.truthful_reviews_len + self.deceptive_reviews_len))

        return 'truthful' if truthful_prob >= deceptive_prob else 'deceptive'


In [44]:
nb = bag_of_words_nb_classifier(truthful, deceptive)

## Validating Bag of Words Naive Bayes Classifier

In [45]:
with open('./DATASET/validation/truthful.txt') as t, open('./DATASET/validation/deceptive.txt') as d:
    truthful_validation_text = t.read()
    deceptive_validation_text = d.read()

In [46]:
truthful_validation_classifications = \
    [nb.classify_review(review) for review in truthful_validation_text.split('\n')]
truthful_validation_accuracy_counts = Counter(truthful_validation_classifications)
print(truthful_validation_accuracy_counts)
print('Accuracy rate:', 
      truthful_validation_accuracy_counts['truthful']/sum(truthful_validation_accuracy_counts.values()))

Counter({'truthful': 120, 'deceptive': 9})
Accuracy rate: 0.9302325581395349


In [47]:
deceptive_validation_classifications = \
    [nb.classify_review(review) for review in deceptive_validation_text.split('\n')]
deceptive_validation_accuracy_counts = Counter(deceptive_validation_classifications)
print(deceptive_validation_accuracy_counts)
print('Accuracy rate:', 
      deceptive_validation_accuracy_counts['deceptive']/sum(deceptive_validation_accuracy_counts.values()))

Counter({'deceptive': 121, 'truthful': 8})
Accuracy rate: 0.937984496124031


---


## Bigram Naive Bayes Classifier

In [33]:
with open('./DATASET/train/truthful.txt') as t, open('./DATASET/train/deceptive.txt') as d:
    truthful = t.read()
    deceptive = d.read()
truthful_unigram = dict(Counter(add_start_characters(truthful).split()))
deceptive_unigram = dict(Counter(add_start_characters(deceptive).split()))

truthful_bigram = get_bigram_counts(add_start_characters(truthful).split())
deceptive_bigram = get_bigram_counts(add_start_characters(deceptive).split())

---

In [34]:
class bigram_nb_classifier():
    def __init__(self, truthful_text, deceptive_text):
        self.truthful_text = truthful_text
        self.deceptive_text = deceptive_text
        
        self.truthful_unigram = dict(Counter(add_start_characters(truthful).split()))
        self.deceptive_unigram = dict(Counter(add_start_characters(deceptive).split()))
        
        self.truthful_bigram = get_bigram_counts(add_start_characters(truthful).split())
        self.deceptive_bigram = get_bigram_counts(add_start_characters(deceptive).split())
    
    

In [35]:
smoothed_truthful_bigram_counts = get_smoothed_bigram_corpus(truthful_unigram, truthful_bigram)
smoothed_deceptive_bigram_counts = get_smoothed_bigram_corpus(deceptive_unigram, deceptive_bigram)

<class 'dict'>
<class 'dict'>


In [36]:
df = get_smoothed_bigram_corpus(truthful_unigram, truthful_bigram)

<class 'dict'>


KeyboardInterrupt: 

In [None]:
get_smoothed_bigram_prob(('I', 'am'), df)

---

## TextBlob Naive Bayes Classifier

In [132]:
from textblob.classifiers import NaiveBayesClassifier

In [133]:
with open('./DATASET/train/truthful.txt') as t, open('./DATASET/train/deceptive.txt') as d:
    truthful_training_text = t.read()
    deceptive_training_text = d.read()

In [134]:
training_truthful = [(review, 'truthful') for review in truthful_training_text.split('\n')]
training_deceptive = [(review, 'deceptive') for review in deceptive_training_text.split('\n')]
training = training_truthful + training_deceptive

In [135]:
classifier = NaiveBayesClassifier(training)

## Validating TextBlob Naive Bayes Classifier

In [61]:
with open('./DATASET/validation/truthful.txt') as t, open('./DATASET/validation/deceptive.txt') as d:
    truthful_validation_text = t.read()
    deceptive_validation_text = d.read()

In [62]:
truthful_validation_classifications = \
    [classifier.classify(review) for review in truthful_validation_text.split('\n')]
truthful_validation_accuracy_counts = Counter(truthful_validation_classifications)
print(truthful_validation_accuracy_counts)
print('Accuracy rate:', 
      truthful_validation_accuracy_counts['truthful']/sum(truthful_validation_accuracy_counts.values()))

NameError: name 'classifier' is not defined

In [63]:
deceptive_validation_classifications = \
    [classifier.classify(review) for review in deceptive_validation_text.split('\n')]
deceptive_validation_accuracy_counts = Counter(deceptive_validation_classifications)
print(deceptive_validation_accuracy_counts)
print('Accuracy rate:', 
      deceptive_validation_accuracy_counts['deceptive']/sum(deceptive_validation_accuracy_counts.values()))

NameError: name 'classifier' is not defined