In [15]:
from collections import Counter
import pandas as pd

In [16]:
with open('./DATASET/train/truthful.txt') as t, open('./DATASET/train/deceptive.txt') as d:
    truthful = t.read()
    deceptive = d.read()

In [17]:
def add_start_characters(words):
    words = '<s> ' + words
    words = words.replace('\n', ' <s> ')
    return words[:-5]

## Unigrams

In [18]:
truthful_unigram = dict(Counter(add_start_characters(truthful).split()))
deceptive_unigram = dict(Counter(add_start_characters(deceptive).split()))

In [19]:
def get_unigram_prob(corpus, unigram_key):
    total_words = 0
    for key in corpus:
        total_words += corpus[key]

    return corpus.get(unigram_key, 0)/total_words

In [20]:
get_unigram_prob(deceptive_unigram, 'I')

0.02905073649754501

## Unsmoothed Bigrams

In [21]:
def get_bigram_counts(word_list):
    corpus = {}
    for i, word in enumerate(word_list[1:], start=1):
        if word != '<s>':
            if (word_list[i-1], word) not in corpus:
                corpus[(word_list[i-1], word)] = 1
            else:
                corpus[(word_list[i-1], word)] += 1
        
    return corpus

In [22]:
truthful_bigram = get_bigram_counts(add_start_characters(truthful).split())
deceptive_bigram = get_bigram_counts(add_start_characters(deceptive).split())

In [23]:
def get_bigram_prob(corpus, bigram_to_test):
    total_words = 0
    for key in corpus:
        if key[0] == bigram_to_test[0]:
            total_words += corpus[key]
    return corpus.get(bigram_to_test, 0)/total_words

In [24]:
get_bigram_prob(deceptive_bigram, ('I', 'am'))

0.01488933601609658

### Smoothed Bigrams

In [25]:
def get_smoothed_bigram_corpus(unigram_corpus, bigrams):
    df = pd.DataFrame(1, index = unigram_corpus, columns = unigram_corpus) 
    for bigram in bigrams:
        df.loc[bigram[0], bigram[1]] += bigrams[bigram]
    return df

In [26]:
df = pd.DataFrame(1, index =deceptive_unigram, columns =deceptive_unigram) 

In [27]:
get_smoothed_bigram_corpus(deceptive_unigram, deceptive_bigram)

Unnamed: 0,<s>,I,was,here,on,business,so,needed,to,get,...,recoup,rockin,ROYAL,gifts,allergy-friendly,tree,informal,tranquility,cleaners,beutiful
<s>,1,135,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
I,1,1,326,1,1,1,2,15,5,12,...,1,1,1,1,1,1,1,1,1,1
was,1,3,1,4,19,1,38,1,15,1,...,1,1,1,1,1,1,1,1,1,1
here,1,1,4,1,4,1,1,1,2,1,...,1,1,1,1,1,1,1,1,1,1
on,1,1,1,1,1,15,1,1,2,1,...,1,1,1,1,1,1,1,1,1,1
business,1,5,4,1,1,1,3,1,1,1,...,1,1,1,1,1,1,1,1,1,1
so,1,35,2,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
needed,1,1,2,1,1,1,1,1,11,1,...,1,1,1,1,1,1,1,1,1,1
to,1,1,3,1,2,1,3,1,1,75,...,1,1,1,1,1,1,1,1,1,1
get,1,1,1,1,3,1,1,1,15,1,...,1,1,1,1,1,1,1,1,1,1


In [28]:
def get_smoothed_bigram_prob(bigram, smoothed_bigram_corpus):
    return smoothed_bigram_corpus.loc[bigram[0], bigram[1]]/df.loc[bigram[0]].sum()

In [29]:
get_smoothed_bigram_prob(('I', 'am'), df)

0.00016840687100033681

## Part 4 – Validation

In [30]:
with open('./DATASET/validation/truthful.txt') as t, open('./DATASET/validation/deceptive.txt') as d:
    truthful_v = t.read()
    deceptive_v =d.read()

In [31]:
# replace all unknown words with <UNK> token
def check_for_unk_words(word_list, corpus):
    for i, word in enumerate(word_list):
        if word not in corpus:
              word_list[i] = '<UNK>'
    return word_list

## Validation Unigrams

In [32]:
# Cleaning & handling unknown words for truthful validation
cleaned_truthful_word_list_v = check_for_unk_words(add_start_characters(truthful_v).split(), truthful_unigram)
truthful_unigram_v = Counter(cleaned_truthful_word_list_v)

# Cleaning & handling unknown words for deceptive validation
cleaned_deceptive_word_list_v = check_for_unk_words(add_start_characters(deceptive_v).split(), deceptive_unigram)
deceptive_unigram_v = Counter(cleaned_deceptive_word_list_v)

## Validation Bigrams

In [33]:
truthful_bigram_v = get_bigram_counts(cleaned_truthful_word_list_v)
deceptive_bigram_v = get_bigram_counts(cleaned_deceptive_word_list_v)

In [34]:
get_smoothed_bigram_corpus(truthful_unigram_v, truthful_bigram_v)

Unnamed: 0,<s>,I,stayed,for,four,nights,while,attending,a,conference,...,NOTHING,training,opted,own,sterile,lives,indoor,movie,Mag,affordable
<s>,1,28,1,1,1,1,1,1,2,1,...,1,1,1,1,1,1,1,1,1,1
I,1,1,21,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
stayed,1,1,1,3,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
for,1,1,1,1,3,1,1,1,43,1,...,1,1,1,1,1,1,1,1,1,1
four,1,1,1,1,1,3,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
nights,1,1,1,4,1,1,3,1,1,1,...,1,1,1,1,1,1,1,1,1,1
while,1,3,1,1,1,1,1,3,2,1,...,1,1,1,1,1,1,1,1,1,1
attending,1,1,1,1,1,1,1,1,4,1,...,1,1,1,1,1,1,1,1,1,1
a,1,1,1,1,1,1,2,1,1,5,...,1,2,1,1,1,1,1,1,1,1
conference,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


## Perplexity

In [35]:
def get_n_gram_model(wordlist, n):
    # wordlist is an in-order list of the words on which to build the model
    # n determines whether to use unigram or bigram estimation (n must be 1 or 2)
    n_gram_model = {}
    for i, word in enumerate(wordlist):
        if i + n > len(wordlist) - 1:
            break
        key = tuple([wordlist[i+x] for x in range(n)])
        if n == 1:
            # by convention, we now use strings as keys for unigram models and tuples for bigram models
            key = word
        if key not in n_gram_model:
            n_gram_model[key] = 1
        else:
            n_gram_model[key] += 1 
    return n_gram_model

In [36]:
def get_n_gram_prob(sequence, model):
    # model is a dictionary representing the n-grams and counts for some dataset
    
    # this just gets n implied from the model
    for key in model:
        if isinstance(key, str):
            n = 1
        else:
            n = len(key)
        break
        
    if n == 1:
        # case if model uses unigram estimation
        result = 1
        for word in sequence:
            result = result * get_unigram_prob(model, word)
        return result
    elif n == 2:
        # case if model uses bigram estimation
        bigrams = get_bigram_counts(sequence)
        result = 1
        for bigram in bigrams:
            result = result * get_bigram_prob(model, bigram)
        return result
    else:
        # else cannot compute
        return 0

In [37]:
get_n_gram_prob(['I', 'like', 'hotels'], get_n_gram_model(cleaned_truthful_word_list_v, 2))

0.0

In [38]:
import math
def compute_perplexity(N, model, n):
    # N is the number of tokens, so the length of the unigram corpus
    acc = 0

    for i, n_gram in enumerate(model):
        if i == N:
            break
        acc -= math.log(get_n_gram_prob(n_gram, model))
    return math.e ** ((1/N) * acc)

In [39]:
compute_perplexity(len(truthful_unigram_v), get_n_grams(cleaned_truthful_word_list_v, 1), 1)

NameError: name 'get_n_grams' is not defined

## Language Model Classifier

In [None]:
#review is the unigram for the review
compute_perplexity(review)

## Bryant's Naïve Bayes Classifier
* A lot of the code was adapted from my INFO 2950 class from a homework where we had to write our own naive bayes text classifier
* Does smoothing

In [211]:
with open('./DATASET/train/truthful.txt') as t, open('./DATASET/train/deceptive.txt') as d:
    truthful = t.read()
    deceptive = d.read()

In [182]:
truthful_reviews_Counter = Counter(truthful.split())
deceptive_reviews_Counter = Counter(deceptive.split())
truthful_total_words = sum(truthful_unigram_Counter.values())
deceptive_total_words = sum(deceptive_unigram_Counter.values())

In [150]:
both_counts = truthful_unigram_Counter + deceptive_unigram_Counter
vocabulary_size = len(both_counts.keys())
print(vocabulary_size)

9716


In [210]:
import re
#to delinate punctuation, as is done in the training data
word_pattern = re.compile("\w+")

In [164]:
truthful_reviews = len(truthful.split('\n'))
deceptive_reviews = len(truthful.split('\n'))

513


In [208]:
# Calculates the probability of a word occuring within the corpus
def smoothed_word_log_prob(word, counter, total):
    return math.log((counter[word] + 1) / (total + vocabulary_size))

# calculates the probability of all words occuring in a review, given the corpus
def smoothed_review_log_prob(review, counter, total):
    log_prob = 0.0
    for word in word_pattern.findall(review):
        log_prob += smoothed_word_log_prob(word, counter, total)
    return log_prob

def classify_review_nb(review):
    truthful_prob = smoothed_review_log_prob(review,
                        truthful_unigram_Counter, truthful_total_words)
    deceptive_prob = smoothed_review_log_prob(review,
                            deceptive_unigram_Counter, deceptive_total_words)
    
    # get ratio between the two (since this training set has the same number of reviews, this code is optional)
    truthful_prob = truthful_prob +  math.log(truthful_reviews / (truthful_reviews + deceptive_reviews))
    deceptive_prob = deceptive_prob + math.log(deceptive_reviews / (truthful_reviews + deceptive_reviews))

    return 'truthful' if truthful_prob >= deceptive_prob else 'deceptive'

In [204]:
classify_review_nb('This hotel was great! It rooms smelled a bit weird, but I enjoyed myself overall.')

'deceptive'

## Bigram Naive Bayes Classification

In [216]:
with open('./DATASET/train/truthful.txt') as t, open('./DATASET/train/deceptive.txt') as d:
    truthful = t.read()
    deceptive = d.read()
truthful_unigram = dict(Counter(add_start_characters(truthful).split()))
deceptive_unigram = dict(Counter(add_start_characters(deceptive).split()))

truthful_bigram = get_bigram_counts(add_start_characters(truthful).split())
deceptive_bigram = get_bigram_counts(add_start_characters(deceptive).split())

In [217]:
def get_smoothed_bigram_corpus(unigram_corpus, bigrams):
    df = pd.DataFrame(1, index = unigram_corpus, columns = unigram_corpus) 
    for bigram in bigrams:
        df.loc[bigram[0], bigram[1]] += bigrams[bigram]
    return df

In [223]:
smoothed_truthful_bigram_counts = get_smoothed_bigram_corpus(truthful_unigram, truthful_bigram)
smoothed_deceptive_bigram_counts = get_smoothed_bigram_corpus(deceptive_unigram, deceptive_bigram)

In [224]:
smoothed_deceptive_bigram_counts

Unnamed: 0,<s>,I,was,here,on,business,so,needed,to,get,...,recoup,rockin,ROYAL,gifts,allergy-friendly,tree,informal,tranquility,cleaners,beutiful
<s>,1,135,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
I,1,1,326,1,1,1,2,15,5,12,...,1,1,1,1,1,1,1,1,1,1
was,1,3,1,4,19,1,38,1,15,1,...,1,1,1,1,1,1,1,1,1,1
here,1,1,4,1,4,1,1,1,2,1,...,1,1,1,1,1,1,1,1,1,1
on,1,1,1,1,1,15,1,1,2,1,...,1,1,1,1,1,1,1,1,1,1
business,1,5,4,1,1,1,3,1,1,1,...,1,1,1,1,1,1,1,1,1,1
so,1,35,2,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
needed,1,1,2,1,1,1,1,1,11,1,...,1,1,1,1,1,1,1,1,1,1
to,1,1,3,1,2,1,3,1,1,75,...,1,1,1,1,1,1,1,1,1,1
get,1,1,1,1,3,1,1,1,15,1,...,1,1,1,1,1,1,1,1,1,1


## Validating my own Naive Bayes Classifier

In [205]:
with open('./DATASET/validation/truthful.txt') as t, open('./DATASET/validation/deceptive.txt') as d:
    truthful_validation_text = t.read()
    deceptive_validation_text = d.read()

In [206]:
truthful_validation_classifications = \
    [classify_review_nb(review) for review in truthful_validation_text.split('\n')]
truthful_validation_accuracy_counts = Counter(truthful_validation_classifications)
print(truthful_validation_accuracy_counts)
print('Accuracy rate:', 
      truthful_validation_accuracy_counts['truthful']/sum(truthful_validation_accuracy_counts.values()))

Counter({'truthful': 108, 'deceptive': 21})
Accuracy rate: 0.8372093023255814


In [207]:
deceptive_validation_classifications = \
    [classify_review_nb(review) for review in deceptive_validation_text.split('\n')]
deceptive_validation_accuracy_counts = Counter(deceptive_validation_classifications)
print(deceptive_validation_accuracy_counts)
print('Accuracy rate:', 
      deceptive_validation_accuracy_counts['deceptive']/sum(deceptive_validation_accuracy_counts.values()))

Counter({'deceptive': 122, 'truthful': 7})
Accuracy rate: 0.9457364341085271


---


## sklearn Naive Bayes Classifier (under construction)

In [1]:
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing
from sklearn import datasets

In [2]:
with open('./DATASET/train/truthful.txt') as t, open('./DATASET/train/deceptive.txt') as d:
    truthful_training_text = t.read()
    deceptive_training_text = d.read()

In [9]:
truthful_reviews_list = truthful_training_text.split('\n')
deceptive_reviews_list = deceptive_training_text.split('\n')

In [4]:
# making a list of review types that matches up with the reviews so that sklearn can process it
truthful_category_list = ['truthful' for review in truthful_reviews_list]
deceptive_category_list = ['deceptive' for review in deceptive_reviews_list]
review_categories = truthful_category_list + deceptive_category_list

In [5]:
# We want to turn our categorical data into numerical representations
# 1 = truthful, 0 = deceptive
le = preprocessing.LabelEncoder()
review_categories_encoded=le.fit_transform(review_categories)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [73]:
analyze = vectorizer.build_analyzer()
vect = vectorizer.fit(truthful_reviews_list)
# vectorizer.get_feature_names()

---

## TextBlob Naive Bayes Classifier

In [132]:
from textblob.classifiers import NaiveBayesClassifier

In [133]:
with open('./DATASET/train/truthful.txt') as t, open('./DATASET/train/deceptive.txt') as d:
    truthful_training_text = t.read()
    deceptive_training_text = d.read()

In [134]:
training_truthful = [(review, 'truthful') for review in truthful_training_text.split('\n')]
training_deceptive = [(review, 'deceptive') for review in deceptive_training_text.split('\n')]
training = training_truthful + training_deceptive

In [135]:
classifier = NaiveBayesClassifier(training)

## Validating TextBlob Naive Bayes Classifier

In [139]:
with open('./DATASET/validation/truthful.txt') as t, open('./DATASET/validation/deceptive.txt') as d:
    truthful_validation_text = t.read()
    deceptive_validation_text = d.read()

In [140]:
truthful_validation_classifications = \
    [classifier.classify(review) for review in truthful_validation_text.split('\n')]
truthful_validation_accuracy_counts = Counter(truthful_validation_classifications)
print(truthful_validation_accuracy_counts)
print('Accuracy rate:', 
      truthful_validation_accuracy_counts['truthful']/sum(truthful_validation_accuracy_counts.values()))

Counter({'truthful': 110, 'deceptive': 19})
Accuracy rate: 0.8527131782945736


In [141]:
deceptive_validation_classifications = \
    [classifier.classify(review) for review in deceptive_validation_text.split('\n')]
deceptive_validation_accuracy_counts = Counter(deceptive_validation_classifications)
print(deceptive_validation_accuracy_counts)
print('Accuracy rate:', 
      deceptive_validation_accuracy_counts['deceptive']/sum(deceptive_validation_accuracy_counts.values()))

Counter({'deceptive': 121, 'truthful': 8})
Accuracy rate: 0.937984496124031
