In [47]:
from collections import Counter
import pandas as pd

In [48]:
with open('./DATASET/train/truthful.txt') as t, open('./DATASET/train/deceptive.txt') as d:
    truthful = t.read()
    deceptive = d.read()

In [49]:
def add_start_characters(words):
    words = '<s> ' + words
    words = words.replace('\n', ' <s> ')
    return words[:-5]

## Unigrams

In [50]:
truthful_unigram = dict(Counter(add_start_characters(truthful).split()))
deceptive_unigram = dict(Counter(add_start_characters(deceptive).split()))

In [51]:
def get_unigram_prob(corpus, unigram_to_test):
    total_words = 0
    for key in corpus:
        total_words += corpus[key]

    return corpus.get(unigram_to_test, 0)/total_words

In [52]:
get_unigram_prob(deceptive_unigram, 'I')

0.02905073649754501

## Unsmoothed Bigrams

In [53]:
def get_bigram_counts(word_list):
    corpus = {}
    for i, word in enumerate(word_list[1:], start=1):
        if word != '<s>':
            if (word_list[i-1], word) not in corpus:
                corpus[(word_list[i-1], word)] = 1
            else:
                corpus[(word_list[i-1], word)] += 1
        
    return corpus

In [54]:
truthful_bigram = get_bigram_counts(add_start_characters(truthful).split())
deceptive_bigram = get_bigram_counts(add_start_characters(deceptive).split())

In [55]:
def get_bigram_prob(corpus, bigram_to_test):
    total_words = 0
    for key in corpus:
        if key[0] == bigram_to_test[0]:
            total_words += corpus[key]
    return corpus.get(bigram_to_test, 0)/total_words

In [56]:
get_bigram_prob(deceptive_bigram, ('I', 'am'))

0.01488933601609658

### Smoothed Bigrams

In [57]:
def get_smoothed_bigram_corpus(unigram_corpus, bigrams):
    df = pd.DataFrame(1, index = unigram_corpus, columns = unigram_corpus) 
    for bigram in bigrams:
        df.loc[bigram[0], bigram[1]] += bigrams[bigram]
    return df

In [58]:
df = pd.DataFrame(1, index =deceptive_unigram, columns =deceptive_unigram) 

In [59]:
get_smoothed_bigram_corpus(deceptive_unigram, deceptive_bigram)

Unnamed: 0,<s>,I,was,here,on,business,so,needed,to,get,...,recoup,rockin,ROYAL,gifts,allergy-friendly,tree,informal,tranquility,cleaners,beutiful
<s>,1,135,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
I,1,1,326,1,1,1,2,15,5,12,...,1,1,1,1,1,1,1,1,1,1
was,1,3,1,4,19,1,38,1,15,1,...,1,1,1,1,1,1,1,1,1,1
here,1,1,4,1,4,1,1,1,2,1,...,1,1,1,1,1,1,1,1,1,1
on,1,1,1,1,1,15,1,1,2,1,...,1,1,1,1,1,1,1,1,1,1
business,1,5,4,1,1,1,3,1,1,1,...,1,1,1,1,1,1,1,1,1,1
so,1,35,2,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
needed,1,1,2,1,1,1,1,1,11,1,...,1,1,1,1,1,1,1,1,1,1
to,1,1,3,1,2,1,3,1,1,75,...,1,1,1,1,1,1,1,1,1,1
get,1,1,1,1,3,1,1,1,15,1,...,1,1,1,1,1,1,1,1,1,1


In [60]:
def get_smoothed_bigram_prob(bigram, smoothed_bigram_corpus):
    return smoothed_bigram_corpus.loc[bigram[0], bigram[1]]/df.loc[bigram[0]].sum()

In [61]:
get_smoothed_bigram_prob(('I', 'am'), df)

0.00016840687100033681

## Part 4 – Validation

In [62]:
with open('./DATASET/validation/truthful.txt') as t, open('./DATASET/validation/deceptive.txt') as d:
    truthful_v = t.read()
    deceptive_v =d.read()

In [63]:
# replace all unknown words with <UNK> token
def check_for_unk_words(word_list, corpus):
    for i, word in enumerate(word_list):
        if word not in corpus:
              word_list[i] = '<UNK>'
    return word_list

## Validation Unigrams

In [70]:
# Cleaning & handling unknown words for truthful validation
cleaned_truthful_word_list_v = check_for_unk_words(add_start_characters(truthful_v).split(), truthful_unigram)
truthful_unigram_v = Counter(cleaned_truthful_word_list_v)

# Cleaning & handling unknown words for deceptive validation
cleaned_deceptive_word_list_v = check_for_unk_words(add_start_characters(deceptive_v).split(), deceptive_unigram)
deceptive_unigram_v = Counter(cleaned_deceptive_word_list_v)

## Validation Bigrams

In [71]:
truthful_bigram_v = get_bigram_counts(cleaned_truthful_word_list_v)
deceptive_bigram_v = get_bigram_counts(cleaned_deceptive_word_list_v)

In [73]:
get_smoothed_bigram_corpus(truthful_unigram_v, truthful_bigram_v)

Unnamed: 0,<s>,I,stayed,for,four,nights,while,attending,a,conference,...,NOTHING,training,opted,own,sterile,lives,indoor,movie,Mag,affordable
<s>,1,28,1,1,1,1,1,1,2,1,...,1,1,1,1,1,1,1,1,1,1
I,1,1,21,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
stayed,1,1,1,3,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
for,1,1,1,1,3,1,1,1,43,1,...,1,1,1,1,1,1,1,1,1,1
four,1,1,1,1,1,3,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
nights,1,1,1,4,1,1,3,1,1,1,...,1,1,1,1,1,1,1,1,1,1
while,1,3,1,1,1,1,1,3,2,1,...,1,1,1,1,1,1,1,1,1,1
attending,1,1,1,1,1,1,1,1,4,1,...,1,1,1,1,1,1,1,1,1,1
a,1,1,1,1,1,1,2,1,1,5,...,1,2,1,1,1,1,1,1,1,1
conference,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


## Perplexity

In [74]:
#leaving in start characters
def get_n_grams(word_list, n):
    n_gram_corpus = {}
    for i, word in enumerate(word_list):
        if i + n > len(word_list) - 1:
            break
        
        temp = [word_list[i+x] for x in range(n)]
        temp = tuple(temp)
        
        if temp not in n_gram_corpus:
            n_gram_corpus[temp] = 1
        else:
            n_gram_corpus[temp] += 1 
    
    return n_gram_corpus

In [75]:
def get_n_gram_prob(n_gram, n_gram_corpus):
    # assumes that the n_gram is present in the corpus
    numerator = n_gram_corpus[n_gram]
    denominator = 0
    n = len(n_gram)
    base = tuple([n_gram[i] for i in range(n-1)])
    for key in n_gram_corpus:
        key_base = tuple([key[i] for i in range(n-1)])
        if key_base == base:
            denominator += n_gram_corpus[key]
    
    return float(numerator)/float(denominator)

In [78]:
word_list = cleaned_deceptive_word_list_v
get_n_gram_prob(('My', 'stay', 'at'), get_n_grams(word_list, 3))

0.8

In [79]:
import math
def compute_perplexity(unigram, n_gram_corpus, n):
    # N is the number of tokens, so the length of the unigram corpus
    N = len(unigram)
    acc = 0

    for i, n_gram in enumerate(n_gram_corpus):
        if i == N:
            break
        acc -= math.log(get_n_gram_prob(n_gram, n_gram_corpus))
    return math.e ** ((1/N) * acc)

In [None]:
compute_perplexity(truthful_unigram_v, get_n_grams(cleaned_truthful_word_list_v, 2), 2)

## Language Model Classifier

## Naïve Bayes Classifier

In [40]:
from sklearn.naive_bayes import GaussianNB

In [41]:
nb = GaussianNB()
# y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)