# 2 : Dataset

In [380]:
from collections import Counter
import pandas as pd
import math

with open('./DATASET/train/truthful.txt') as t, open('./DATASET/train/deceptive.txt') as d:
    T_TRAIN = t.read()
    D_TRAIN = d.read()
    
with open('./DATASET/validation/truthful.txt') as t, open('./DATASET/validation/deceptive.txt') as d:
    T_VAL = t.read()
    D_VAL = d.read()

# 3 : Unsmoothed N-Grams
### 3.1 : Preprocessing

In [398]:
def preprocess(text):
    return add_start_characters(text).split()
    
def add_start_characters(words):
    words = '<s> ' + words
    words = words.replace('\n', ' <s> ')
    return words[:-5] if words[-5:] == ' <s> ' else words

### 3.2 : Unsmoothed Unigram Probability

In [399]:
def get_unigram_corpus(wordlist):
    return dict(Counter(wordlist))

def get_unigram_prob(unigram, unigram_corpus):
    acc = 0
    for key in unigram_corpus:
        acc += unigram_corpus[key]
    # I changed the default value to 1 from 0 to reflect smoothing
    return unigram_corpus.get(unigram, 1)/acc

### 3.3 : Unsmoothed Bigram Probability

In [400]:
def get_bigram_corpus(wordlist):
    corpus = {}
    for i, word in enumerate(wordlist[1:], start=1):
        if word != '<s>':
            if (wordlist[i-1], word) not in corpus:
                corpus[(wordlist[i-1], word)] = 1
            else:
                corpus[(wordlist[i-1], word)] += 1
    return corpus

def get_bigram_prob(bigram, bigram_corpus):
    acc = 0
    for key in bigram_corpus:
        if key[0] == bigram[0]:
            acc += bigram_corpus[key]
    return bigram_corpus[bigram] / acc

# 4 : Smoothing and Unknown Words
### 4.1 : Unknown Word Handling

In [401]:
def check_for_unk_words(wordlist, tokenlist):
    # replace all unknown words with <UNK> token
    for i, token in enumerate(wordlist):
        if token not in tokenlist:
              wordlist[i] = '<UNK>'
    return wordlist

### 4.2 : Smooth Bigram Probability

In [402]:
# added <UNK> as a token when creating the smooth bigram corpus 
def get_smooth_bigram_corpus(tokenlist, bigram_corpus):
    tokenlist.append('<UNK>')
    df = pd.DataFrame(1, index = tokenlist, columns = tokenlist) 
    for bigram in bigram_corpus:
        df.loc[bigram[0], bigram[1]] += bigram_corpus[bigram]
    return df

def get_smooth_bigram_prob(bigram, smooth_bigram_corpus):
    return smooth_bigram_corpus.loc[bigram[0], bigram[1]]/smooth_bigram_corpus.loc[bigram[0]].sum()

# 5 : Perplexity

In [403]:
class NGramModel():
    def __init__(self, *args):
        super(NGramModel, self).__init__()
    
    def get_perp(self, *args):
        return
        
    
        
class UnigramModel(NGramModel):
    # assumes data is preprocessed list of words (strings)   
    def __init__(self, data):
        super(UnigramModel, self).__init__()
        self.corpus = get_unigram_corpus(data)

    # assumes test_corpus is a preprocessed list of words (strings)
    def get_perp(self, test_corpus):
        N = len(test_corpus)
        acc = 0
        for word in test_corpus:
            acc -= math.log(get_unigram_prob(word, self.corpus))
        return math.exp((1/N) * acc)

    
    
class BigramModel(NGramModel):
    # assumes data is preprocessed list of words (strings)
    def __init__(self, data):
        super(BigramModel, self).__init__()
        self.corpus = get_bigram_corpus(data)
    
    # assumes test_corpus is a preprocessed list of words (strings)
    def get_perp(self, test_corpus):
        N = len(test_corpus)
        acc = 0
        for i, word in enumerate(test_corpus):
            if i == 0:
                # we divide by N-1 because we don't compute the probability of the first term in the corpus
                continue
            bigram = (test_corpus[i-1], word)
            acc -= math.log(get_bigram_prob(bigram, self.corpus))
        return math.exp((1/(N-1)) * acc)
    
    
    
class SmoothBigramModel(NGramModel):
    # assumes data is preprocessed list of words (strings)
    def __init__(self, data):
        super(SmoothBigramModel, self).__init__()
        self.tokens = list(get_unigram_corpus(data).keys())
        corpus = get_bigram_corpus(data)
        self.corpus = get_smooth_bigram_corpus(self.tokens, corpus)
    
    # assumes test_corpus is a preprocessed list of words (strings)
    def get_perp(self, test_corpus):
        N = len(test_corpus)
        acc = 0
        for i, word in enumerate(test_corpus):
            if i == 0:
                # we divide by N-1 because we don't compute the probability of the first term in the corpus
                continue
            bigram = (test_corpus[i-1], word)
            acc -= math.log(get_smooth_bigram_prob(bigram, self.corpus))
        return math.exp((1/(N-1)) * acc)

In [404]:
truthful_unigram_model = UnigramModel(preprocess(T_TRAIN))
deceptive_unigram_model = UnigramModel(preprocess(D_TRAIN))

truthful_smooth_bigram_model = SmoothBigramModel(preprocess(T_TRAIN))
deceptive_smooth_bigram_model = SmoothBigramModel(preprocess(D_TRAIN))

In [405]:
# for computing perplexity of both validation sets on truthfully-trained unigram/bigram models
T_VAL_CLEAN_T = check_for_unk_words(preprocess(T_VAL), truthful_smooth_bigram_model.tokens)
D_VAL_CLEAN_T = check_for_unk_words(preprocess(D_VAL), truthful_smooth_bigram_model.tokens)

# for computing perplexity of both validation sets on deceptively-trained unigram/bigram models
T_VAL_CLEAN_D = check_for_unk_words(preprocess(T_VAL), deceptive_smooth_bigram_model.tokens)
D_VAL_CLEAN_D = check_for_unk_words(preprocess(D_VAL), deceptive_smooth_bigram_model.tokens)

In [406]:
print(truthful_unigram_model.get_perp(T_VAL_CLEAN_T))
print(truthful_unigram_model.get_perp(D_VAL_CLEAN_T))

print(deceptive_unigram_model.get_perp(T_VAL_CLEAN_D))
print(deceptive_unigram_model.get_perp(D_VAL_CLEAN_D))

print(truthful_smooth_bigram_model.get_perp(T_VAL_CLEAN_T))
print(truthful_smooth_bigram_model.get_perp(D_VAL_CLEAN_T))

print(deceptive_smooth_bigram_model.get_perp(T_VAL_CLEAN_D))
print(deceptive_smooth_bigram_model.get_perp(D_VAL_CLEAN_D))

575.8911106150354
507.2203959017036
615.8537298038397
463.8680901825986
1454.340989231253
1281.9503153726475
1324.6771814849794
958.4681090581158


In [407]:
def separate_and_label_reviews(wordlist, label):
    start = 0
    reviews = {}
    for i, word in enumerate(wordlist):
        if word == '<s>' and i != 0:
            review = tuple(wordlist[start:i])
            reviews[review] = label
            start = i+1
    return reviews

# for testing the accuracy of the language based classifier on the validation set
# TOKENS = list(set().union(truthful_smooth_bigram_model.tokens, deceptive_smooth_bigram_model.tokens))
TOKENS = check_for_unk_words(preprocess(T_VAL), truthful_smooth_bigram_model.tokens)
TOKENS = check_for_unk_words(TOKENS, deceptive_smooth_bigram_model.tokens)

REVIEWS_T = check_for_unk_words(preprocess(T_VAL), TOKENS)
REVIEWS_D = check_for_unk_words(preprocess(D_VAL), TOKENS)

REVIEWS = {**separate_and_label_reviews(REVIEWS_T, 1), **separate_and_label_reviews(REVIEWS_D, 0)}
# now REVIEWS is a dictionary where the keys are the validation (truthful or deceptive) reviews
# and the values are the classification labels

In [408]:
# return value of 1 ==> truthful, 0 ==> deceptive
def classify(review, truthful_model, deceptive_model):
    return 1 if truthful_model.get_perp(review) < deceptive_model.get_perp(review) else 0

In [409]:
def validate(reviews, truthful_model, deceptive_model):
    acc = 0
    for review in reviews:
        if classify(review, truthful_model, deceptive_model) == reviews[review]:
            acc += 1
    return acc / len(reviews)

In [410]:
validate(REVIEWS, truthful_smooth_bigram_model, deceptive_smooth_bigram_model)

0.5590551181102362

In [411]:
validate(REVIEWS, truthful_unigram_model, deceptive_unigram_model)

0.8700787401574803

In [416]:
toy_train = preprocess('I like to run and I hate to have to do homework')
toy_model = SmoothBigramModel(toy_train)
print(toy_model.corpus)
toy_val = check_for_unk_words(preprocess('I like homework'), toy_model.tokens)
print(toy_model.get_perp(toy_val))

          <s>  I  like  to  run  and  hate  have  do  homework  <UNK>
<s>         1  2     1   1    1    1     1     1   1         1      1
I           1  1     2   1    1    1     2     1   1         1      1
like        1  1     1   2    1    1     1     1   1         1      1
to          1  1     1   1    2    1     1     2   2         1      1
run         1  1     1   1    1    2     1     1   1         1      1
and         1  2     1   1    1    1     1     1   1         1      1
hate        1  1     1   2    1    1     1     1   1         1      1
have        1  1     1   2    1    1     1     1   1         1      1
do          1  1     1   1    1    1     1     1   1         2      1
homework    1  1     1   1    1    1     1     1   1         1      1
<UNK>       1  1     1   1    1    1     1     1   1         1      1
7.7639360766563055
