In [35]:
from collections import Counter
import pandas as pd
import math

with open('./DATASET/train/truthful.txt') as t, open('./DATASET/train/deceptive.txt') as d:
    T_TRAIN = t.read()
    D_TRAIN = d.read()
    
with open('./DATASET/validation/truthful.txt') as t, open('./DATASET/validation/deceptive.txt') as d:
    T_VAL = t.read()
    D_VAL = d.read()
    
with open('./DATASET/test/test.txt') as t:
    TEST = t.read()

In [36]:
def preprocess(text):
    return add_start_characters(text).split()
    
def add_start_characters(words):
    words = '<s> ' + words
    words = words.replace('\n', ' <s> ')
    return words[:-5]

def check_for_unk_words(wordlist, tokenlist):
    # replace all unknown words with <UNK> token
    for i, token in enumerate(wordlist):
        if token not in tokenlist:
              wordlist[i] = '<UNK>'
    return wordlist

In [140]:
def get_unigram_corpus(wordlist):
    return dict(Counter(wordlist))

def get_unigram_prob(unigram, unigram_corpus):
    total_words = 0
    for key in unigram_corpus:
        total_words += unigram_corpus[key]
    # I changed the default value to 1 from 0 to reflect smoothing
    return unigram_corpus.get(unigram, 1)/total_words

def get_bigram_corpus(wordlist):
    corpus = {}
    for i, word in enumerate(wordlist[1:], start=1):
        if word != '<s>':
            if (wordlist[i-1], word) not in corpus:
                corpus[(wordlist[i-1], word)] = 1
            else:
                corpus[(wordlist[i-1], word)] += 1
    return corpus

# added <UNK> as a token when creating the smooth bigram corpus 
# not added as token for unigram corpus
def get_smooth_bigram_corpus(tokenlist, bigram_corpus):
    tokenlist.append('<UNK>')
    df = pd.DataFrame(1, index = tokenlist, columns = tokenlist) 
    for bigram in bigram_corpus:
        df.loc[bigram[0], bigram[1]] += bigram_corpus[bigram]
    return df

def get_smooth_bigram_prob(bigram, smooth_bigram_corpus):
    return smooth_bigram_corpus.loc[bigram[0], bigram[1]]/smooth_bigram_corpus.loc[bigram[0]].sum()

def get_n_gram_corpus(wordlist, n):
    # wordlist is a preprocessed list of words (strings)
    n_gram_corpus = {}
    for i, word in enumerate(wordlist):
        # general case for n > 2
        if i + n > len(wordlist) - 1:
            break
        n_gram = tuple([wordlist[i+x] for x in range(n)])
        # special case when n is 1
        if n == 1:
            # by convention, we now use strings as keys for unigram models and tuples for bigram models
            n_gram = word
        # special case when n is 2
        if n == 2:
            if wordlist[i+1] == '<s>':
                continue
        if n_gram not in n_gram_corpus:
            n_gram_corpus[n_gram] = 1
        else:
            n_gram_corpus[n_gram] += 1 
    return n_gram_corpus

In [171]:
class NGramModel():
    def __init__(self, *args):
        super(NGramModel, self).__init__()
    
    def get_prob(self, *args):
        return
    
    def get_perp(self, *args):
        return
        
    
        
class UnigramModel(NGramModel):
    # assumes data is preprocessed list of words (strings)   
    # with unknown words NOT yet handled !!!
    def __init__(self, data):
        super(UnigramModel, self).__init__()
        self.corpus = get_unigram_corpus(data)

    # assumes sequence is preprocessed a list of words (strings)
    def get_prob(self, sequence):
        product = 1
        for word in sequence:
            product = product * get_unigram_prob(word, self.corpus)
        # added ternary operation for rare case where sequence is empty
        return product if sequence != [] else 0
    
    def get_perp(self, test_corpus):
        # assumes test_corpus is a preprocessed list of words (strings)
        n_gram_corpus = get_n_gram_corpus(test_corpus, 1)        
        N = len(n_gram_corpus)
        acc = 0
        for n_gram in n_gram_corpus:
            acc -= math.log(self.get_prob(n_gram))
        return math.e ** ((1/N) * acc)
    
class SmoothBigramModel(NGramModel):
    # assumes data is preprocessed list of words (strings)
    # with unknown words NOT yet handled !!!
    def __init__(self, data):
        super(SmoothBigramModel, self).__init__()
        self.tokens = list(get_unigram_corpus(data).keys())
        corpus = get_bigram_corpus(data)
        self.corpus = get_smooth_bigram_corpus(self.tokens, corpus)

    # assumes sequence is a preprocessed list of words (strings)
    def get_prob(self, sequence):
        bigrams = list(get_bigram_corpus(sequence).keys())
        product = 1
        for bigram in bigrams:
            product = product * get_smooth_bigram_prob(bigram, self.corpus)
        # added ternary operation for rare case where sequence is fewer than 2 words
        return product if bigrams != [] else 0
    
    def get_perp(self, test_corpus):
        # assumes test_corpus is a preprocessed list of words (strings)
        n_gram_corpus = get_n_gram_corpus(test_corpus, 2)   
        # length - 1 since we don't compute probability of first words
        N = len(n_gram_corpus) - 1
        acc = 0
        for n_gram in n_gram_corpus:
            acc -= math.log(self.get_prob(n_gram))
        return math.e ** ((1/N) * acc)

In [168]:
truthful_unigram_model = UnigramModel(preprocess(T_TRAIN))
deceptive_unigram_model = UnigramModel(preprocess(D_TRAIN))

truthful_smooth_bigram_model = SmoothBigramModel(preprocess(T_TRAIN))
deceptive_smooth_bigram_model = SmoothBigramModel(preprocess(D_TRAIN))

In [169]:
T_VAL_CLEAN = check_for_unk_words(preprocess(T_VAL), truthful_smooth_bigram_model.tokens)
D_VAL_CLEAN = check_for_unk_words(preprocess(D_VAL), deceptive_smooth_bigram_model.tokens)

In [170]:
print(truthful_unigram_model.get_perp(T_VAL_CLEAN))
# truthful_unigram_model.get_perp(D_VAL_CLEAN)

# deceptive_unigram_model.get_perp(T_VAL_CLEAN)
print(deceptive_unigram_model.get_perp(D_VAL_CLEAN))

print(truthful_smooth_bigram_model.get_perp(T_VAL_CLEAN))
# truthful_smooth_bigram_model.get_perp(D_VAL_CLEAN)

# deceptive_smooth_bigram_model.get_perp(T_VAL_CLEAN)
print(deceptive_smooth_bigram_model.get_perp(D_VAL_CLEAN))

125249218788.12532
54835933847.04513
3543.50724405135
2730.1681604013293
