In [9]:
from collections import Counter
import pandas as pd

In [10]:
with open('./DATASET/train/truthful.txt') as t, open('./DATASET/train/deceptive.txt') as d:
    truthful = t.read()
    deceptive = d.read()

In [11]:
def clean_text(words):
    words = '<s> ' + words
    words = words.replace('\n', ' <s> ')
    return words[:-5]

## Make Unigrams

In [12]:
truthful_unigram = dict(Counter(clean_text(truthful).split()))
deceptive_unigram = dict(Counter(clean_text(deceptive).split()))

In [13]:
def get_unigram_prob(corpus, unigram_to_test):
    total_words = 0
    for key in corpus:
        total_words += corpus[key]

    return corpus[unigram_to_test]/total_words

In [14]:
get_unigram_prob(deceptive_unigram, 'I')

0

## Make Bigrams

In [15]:
def get_bigram_counts(word_list):
    corpus = {}
    for i, word in enumerate(word_list[1:], start=1):
        if word != '<s>':
            if (word_list[i-1], word) not in corpus:
                corpus[(word_list[i-1], word)] = 1
            else:
                corpus[(word_list[i-1], word)] += 1
        
    return corpus

In [16]:
truthful_bigram = get_bigram_counts(clean_text(truthful).split())
deceptive_bigram = get_bigram_counts(clean_text(deceptive).split())

In [17]:
def get_bigram_prob(corpus, bigram_to_test):
    total_words = 0
    for key in corpus:
        if key[0] == bigram_to_test[0]:
            total_words += corpus[key]

    return corpus[bigram_to_test]/total_words

In [18]:
get_bigram_prob(deceptive_bigram, ('I', 'am'))

0

### Make Bigram Corpus

In [19]:
def get_smoothed_bigram_corpus(unigram_corpus, bigrams):
    df = pd.DataFrame(1, index = unigram_corpus, columns = unigram_corpus) 
    for bigram in bigrams:
        df.loc[bigram[0], bigram[1]] += bigrams[bigram]
    return df

In [20]:
df = pd.DataFrame(1, index =deceptive_unigram, columns =deceptive_unigram) 

In [21]:
get_smoothed_bigram_corpus(deceptive_unigram, deceptive_bigram)

Unnamed: 0,knicked,yellow,four,asian,preface,Until,someplace,fronts,up-,remodeled,...,priced,friends,experienc,Omni,dripped,intentionally,portion,extras,understand,shopped
knicked,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
yellow,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
four,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
asian,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
preface,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Until,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
someplace,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
fronts,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
up-,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
remodeled,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [22]:
def get_smoothed_bigram_prob(bigram, smoothed_bigram_corpus):
    return smoothed_bigram_corpus.loc[bigram[0], bigram[1]]/df.loc[bigram[0]].sum()

In [23]:
get_smoothed_bigram_prob(('I', 'am'), df)

0

## Part 4 – Validation

In [24]:
with open('./DATASET/validation/truthful.txt') as t, open('./DATASET/validation/deceptive.txt') as d:
    truthful_v = t.read()
    deceptive_v =d.read()

In [25]:
# replace all unknown words with <UNK> token
def check_for_unk_words(words, corpus):
    for i,word in enumerate(words):
        if word not in corpus:
              words[i] = '<UNK>'
    return words

## Validation Unigrams

In [26]:
# Cleaning & handling unknown words for truthful validation
cleaned_validation_truthful = check_for_unk_words(clean_text(truthful_v).split(), truthful_unigram)
truthful_unigram_v = Counter(cleaned_validation_truthful)

# Cleaning & handling unknown words for deceptive validation
cleaned_validation_deceptive = check_for_unk_words(clean_text(deceptive_v).split(), deceptive_unigram)
deceptive_unigram_v = Counter(cleaned_validation_deceptive)

## Validation Bigrams

In [27]:
truthful_bigram_v = get_bigram_counts(cleaned_validation_truthful)
deceptive_bigram_v = get_bigram_counts(cleaned_validation_deceptive)

In [28]:
get_smoothed_bigram_corpus(truthful_unigram_v, truthful_bigram_v)

Unnamed: 0,limited,copy,child,four,facilities,sleep,NYC,ridiculous,saved,poorly,...,plate,class,stay,chance,N,friends,South,lukewarm,Omni,understand
limited,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
copy,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
child,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
four,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
facilities,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
sleep,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
NYC,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
ridiculous,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
saved,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
poorly,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


## Perplexity

In [29]:
#leaving in start characters
def get_n_grams(word_list, n):
    n_gram_corpus = {}
    for i, word in enumerate(word_list):
        if i + n > len(word_list) - 1:
            break
        
        temp = [word_list[i+x] for x in range(n)]
        temp = tuple(temp)
        
        if temp not in n_gram_corpus:
            n_gram_corpus[temp] = 1
        else:
            n_gram_corpus[temp] += 1 
    
    return n_gram_corpus

In [58]:
def get_n_gram_prob(n_gram, n_gram_corpus):
    # assumes that the n_gram is present in the corpus
    numerator = n_gram_corpus[n_gram]
    denominator = 0
    n = len(n_gram)
    base = tuple([n_gram[i] for i in range(n-1)])
    for key in n_gram_corpus:
        key_base = tuple([key[i] for i in range(n-1)])
        if key_base == base:
            denominator += n_gram_corpus[key]
    
    print('numerator: {}'.format(numerator))
    print('denominator: {}'.format(denominator))
    return float(numerator)/float(denominator)

In [62]:
# get_n_grams(cleaned_validation_deceptive, 2)
word_list = cleaned_validation_deceptive
# print(word_list)
get_n_gram_prob(('My', 'stay', 'at'), get_n_grams(word_list, 3))

numerator: 8
denominator: 10


0.8

In [37]:
import math
def compute_perplexity(unigram, n_gram_corpus):
    # N is the number of tokens, so the length of the unigram corpus
    N = len(unigram)
    acc = 0
    for i in range(N):
        n_gram = n_gram_corpus[i]
        acc -= math.log(get_n_gram_prob(n_gram, n_gram_corpus))
    return math.e ** ((1/N) * acc)

9