In [2]:
from collections import Counter
import pandas as pd
import math

# Load the raw text of every dataset split into module-level strings.
# Training split: truthful and deceptive files.
with open('./DATASET/train/truthful.txt') as t:
    with open('./DATASET/train/deceptive.txt') as d:
        T_Train = t.read()
        D_Train = d.read()

# Validation split: same two-file layout as the training split.
with open('./DATASET/validation/truthful.txt') as t:
    with open('./DATASET/validation/deceptive.txt') as d:
        T_Val = t.read()
        D_Val = d.read()

# Test split: a single file.
with open('./DATASET/test/test.txt') as t:
    Test = t.read()

In [4]:
def preprocess(text):
    """Turn raw multi-line text into a flat list of tokens with '<s>' markers.

    Args:
        text: the raw text as a single string.

    Returns:
        A list of whitespace-separated tokens, including the '<s>' markers
        inserted by add_start_characters().
    """
    # The markers have to be inserted before tokenising: add_start_characters()
    # operates on a string (it concatenates and calls .replace), and
    # str.split() would discard the newlines it turns into markers.
    return add_start_characters(text).split()
    
def add_start_characters(words):
    """Prefix the text and every subsequent line with a '<s>' marker.

    Args:
        words: the raw text as a single string; lines are separated by '\n'.

    Returns:
        The text with '<s> ' prepended and every line break replaced by
        ' <s> '. A single trailing newline does not produce a dangling marker.
        An empty string is returned unchanged.
    """
    if not words:
        return ''
    # Drop only a real trailing newline. Slicing a fixed number of characters
    # off the end would cut into the text when it does not end with '\n'.
    body = words[:-1] if words.endswith('\n') else words
    return '<s> ' + body.replace('\n', ' <s> ')

def check_for_unk_words(word_list, corpus):
    """Replace every word that is not in `corpus` with the '<UNK>' token.

    The replacement happens in place: `word_list` itself is modified and the
    same list object is returned.
    """
    # Slice assignment keeps the caller's list object while swapping contents.
    word_list[:] = [
        token if token in corpus else '<UNK>'
        for token in word_list
    ]
    return word_list

In [5]:
def get_unigram_prob(corpus, unigram_key):
    """Return the relative frequency of `unigram_key` in `corpus`.

    `corpus` maps each token to its count. A token that is absent from the
    mapping has count 0. Division by the total count means an empty (or
    all-zero) corpus raises ZeroDivisionError.
    """
    occurrences = corpus.get(unigram_key, 0)
    return occurrences / sum(corpus.values())

def get_bigram_counts(word_list):
    """Count adjacent word pairs in `word_list`.

    Returns a dict mapping (previous_word, word) tuples to their number of
    occurrences. Pairs whose second element is '<s>' are not counted.
    """
    tallies = {}
    for previous, current in zip(word_list, word_list[1:]):
        if current == '<s>':
            # Skip pairs that end on a '<s>' marker.
            continue
        pair = (previous, current)
        tallies[pair] = tallies.get(pair, 0) + 1
    return tallies

def get_smoothed_bigram_corpus(token_list, bigrams):
    """Build a count table for bigrams in which every cell starts at 1.

    Rows are labelled by the first word of a bigram and columns by the second;
    both axes use `token_list` as labels. Each cell holds 1 plus the count
    recorded for that pair in `bigrams` (a dict keyed by word-pair tuples).
    """
    table = pd.DataFrame(1, index=token_list, columns=token_list)
    for pair, observed in bigrams.items():
        table.loc[pair[0], pair[1]] += observed
    return table

def get_smoothed_bigram_prob(bigram, smoothed_bigram_corpus):
    """Return the conditional probability of a bigram from a count table.

    Args:
        bigram: a pair whose first element is the preceding word and whose
            second element is the word being predicted.
        smoothed_bigram_corpus: DataFrame of counts with rows labelled by the
            first word and columns by the second, as produced by
            get_smoothed_bigram_corpus().

    Returns:
        The count of the bigram divided by the sum of its row.

    Raises:
        KeyError: if either word is not a label of the table.
    """
    # Use the table that was passed in for the denominator as well as the
    # numerator, rather than a notebook-global `df`.
    row = smoothed_bigram_corpus.loc[bigram[0]]
    return row.loc[bigram[1]] / row.sum()

In [8]:
class NGramModel():
    """Common base class for the n-gram language models in this notebook."""

    def __init__(self, *args):
        # The first argument to super() must be this class; naming any other
        # (undefined) class raises NameError on every instantiation, including
        # instantiation of subclasses that chain up to this constructor.
        super(NGramModel, self).__init__()

    def get_probability(self, *args):
        """Placeholder for subclasses to override; returns None here."""
        return
        
        
class UnigramModel(NGramModel):
    """Unigram language model with '<UNK>' handling for rare words.

    Assumes `data` is a cleaned, in-order list of the words in the training
    text, with unknown words NOT yet handled.

    Attributes set by the constructor:
        data:  a copy of the training words with rare words replaced by '<UNK>'.
        model: dict mapping each remaining token to its count.
    """

    def __init__(self, data, min_count=2):
        """Build the model from a list of training words.

        Args:
            data: in-order list of training words; it is not modified.
            min_count: words seen fewer than this many times in `data` are
                replaced by '<UNK>' so that the token receives probability
                mass. Pass 1 to keep every word.
        """
        super(UnigramModel, self).__init__()
        # The vocabulary has to exist before unknown words can be replaced, so
        # count the raw data first and only then build the final model.
        raw_counts = get_n_gram_model(data, 1)
        vocabulary = {word for word, count in raw_counts.items() if count >= min_count}
        # The start marker is structural and must never be turned into '<UNK>'.
        vocabulary.add('<s>')
        # Work on a copy: check_for_unk_words() rewrites its list in place.
        self.data = check_for_unk_words(list(data), vocabulary)
        self.model = get_n_gram_model(self.data, 1)

    def get_probability(self, sequence):
        """Return the product of the unigram probabilities of `sequence`.

        Words that are not in the model are scored as '<UNK>'.
        """
        product = 1
        for word in sequence:
            known = word if word in self.model else '<UNK>'
            product = product * get_unigram_prob(self.model, known)
        return product
    
    
    
class SmoothBigramModel(NGramModel):
    """Bigram language model backed by an add-one count table.

    Assumes `data` is a cleaned, in-order list of the words in the training
    text, with unknown words NOT yet handled.

    Attributes set by the constructor:
        data:   a copy of the training words with rare words replaced by '<UNK>'.
        tokens: dict of unigram counts; its keys label the table's axes.
        model:  dict of bigram counts keyed by word-pair tuples.
        table:  DataFrame built by get_smoothed_bigram_corpus().
    """

    def __init__(self, data, min_count=2):
        """Build the model from a list of training words.

        Args:
            data: in-order list of training words; it is not modified.
            min_count: words seen fewer than this many times in `data` are
                replaced by '<UNK>'. Pass 1 to keep every word.
        """
        super(SmoothBigramModel, self).__init__()
        # The vocabulary has to exist before unknown words can be replaced, so
        # count the raw data first and only then build the final tables.
        raw_counts = get_n_gram_model(data, 1)
        vocabulary = {word for word, count in raw_counts.items() if count >= min_count}
        # The start marker is structural and must never be turned into '<UNK>'.
        vocabulary.add('<s>')
        # Work on a copy: check_for_unk_words() rewrites its list in place.
        self.data = check_for_unk_words(list(data), vocabulary)
        self.tokens = get_n_gram_model(self.data, 1)
        # Guarantee a row and column for '<UNK>' so that out-of-vocabulary
        # words can always be looked up in the table.
        self.tokens.setdefault('<UNK>', 0)
        self.model = get_n_gram_model(self.data, 2)
        self.table = get_smoothed_bigram_corpus(self.tokens, self.model)

    def get_probability(self, sequence):
        """Return the product of the smoothed bigram probabilities of `sequence`.

        Words that are not in the vocabulary are scored as '<UNK>'.
        """
        known = [word if word in self.tokens else '<UNK>' for word in sequence]
        product = 1
        for bigram, count in get_bigram_counts(known).items():
            # get_bigram_counts() collapses repeats, so a bigram occurring
            # `count` times must contribute its probability `count` times.
            product = product * get_smoothed_bigram_prob(bigram, self.table) ** count
        return product

In [None]:
def get_n_gram_model(wordlist, n):
    """Count the n-grams that occur in `wordlist`.

    Args:
        wordlist: in-order list of the words on which to build the model.
        n: n-gram size; 1 gives unigram counts, 2 gives bigram counts.

    Returns:
        A dict of counts. By convention, unigram models use plain strings as
        keys and higher-order models use tuples of words.
    """
    n_gram_model = {}
    # The last complete n-gram starts at index len(wordlist) - n, so that
    # index must be included; stopping one position earlier drops the final
    # n-gram (the last word, when n == 1).
    for start in range(len(wordlist) - n + 1):
        if n == 1:
            key = wordlist[start]
        else:
            key = tuple(wordlist[start:start + n])
        n_gram_model[key] = n_gram_model.get(key, 0) + 1
    return n_gram_model

def get_n_gram_prob(sequence, model):
    """Return the probability of `sequence` under a unigram or bigram model.

    Args:
        sequence: in-order list of words to score.
        model: dict of n-gram counts. String keys mean a unigram model and
            tuple keys mean an n-gram model of the tuple's length.

    Returns:
        The product of the per-n-gram probabilities, or 0 when the model is
        empty or its order is neither 1 nor 2.
    """
    if not model:
        # With no keys the model order cannot be inferred, so nothing can be
        # computed.
        return 0

    # n is implied by the shape of the model's keys; inspect the first one.
    first_key = next(iter(model))
    n = 1 if isinstance(first_key, str) else len(first_key)

    if n == 1:
        # case if model uses unigram estimation
        result = 1
        for word in sequence:
            result = result * get_unigram_prob(model, word)
        return result
    if n == 2:
        # case if model uses bigram estimation
        # NOTE(review): get_bigram_prob is not defined in the cells shown
        # here; it has to be provided elsewhere in the notebook - confirm.
        result = 1
        for bigram, count in get_bigram_counts(sequence).items():
            # get_bigram_counts() collapses repeats, so a bigram occurring
            # `count` times must contribute its probability `count` times.
            result = result * get_bigram_prob(model, bigram) ** count
        return result
    # else cannot compute
    return 0
    
def compute_perplexity(test_corpus, model):
    """Compute the perplexity of `model` on a test corpus.

    Args:
        test_corpus: unigram-count dictionary for the text to evaluate: keys
            are its unique tokens and values are their counts.
        model: dict of n-gram counts, as accepted by get_n_gram_prob().

    Returns:
        exp(-(1/N) * sum(count * log P(token))), where N is the total number
        of tokens in `test_corpus`. Returns math.inf when any token has
        probability 0 under the model.

    Raises:
        ZeroDivisionError: if `test_corpus` contains no tokens.
    """
    # N is the number of tokens scored, i.e. the sum of the counts, not the
    # number of distinct keys.
    N = sum(test_corpus.values())
    acc = 0
    for token, count in test_corpus.items():
        # get_n_gram_prob() expects a sequence of words, so wrap the token.
        prob = get_n_gram_prob([token], model)
        if prob == 0:
            # log(0) is undefined; a zero-probability token makes the
            # perplexity infinite.
            return math.inf
        # Each token contributes once per occurrence in the test text.
        acc -= count * math.log(prob)
    return math.exp(acc / N)