In [4]:
import numpy as np
import os
from collections import defaultdict
from nltk.stem import PorterStemmer
import math

In [108]:
def uni_generator(name, k=1, unkp=0.0005):    
    ps = PorterStemmer()

    total = 0
    count = defaultdict(int)
    f = open("./train/" + name + ".txt", "r")
    for review in f:
        words = review.split(" ")
        total += len(review)
        for word in words:
            word = ps.stem(word)
            count[word] += 1

    # least frequent words         
    unk_words = set(sorted(count.keys(), key=lambda x:count[x])[:int(unkp*len(count))])
    
    # redo counting
    count = defaultdict(int)
    f = open("./train/" + name + ".txt", "r")
    for review in f:
        words = review.split(" ")
        for word in words:
            word = ps.stem(word)
            if word in unk_words:
                count["<UNK>"] += 1
            else:
                count[word] += 1

    # probability with smooth 
    prob_uni = {word: (count[word]+k)/(total + len(count)) for word in count}
    return prob_uni

In [194]:
def bi_generator(name, unkp=0.0005):
    ps = PorterStemmer()
    
    # least frequent words
    count_uni = defaultdict(int)
    count_bi = defaultdict(int)
    f = open("./train/" + name + ".txt", "r")
    for review in f:      
        words = review.split(" ")
        for word in words:
            word = ps.stem(word)
            count_uni[word] += 1
        
        for idx in range(len(words)-1):
            combo = ps.stem(words[idx]) + " " + ps.stem(words[idx+1])
            count_bi[combo] += 1
    unk_uni = set(sorted(count_uni.keys(), key=lambda x:count_uni[x])[:int(unkp*len(count_uni))])
    unk_bi = set(sorted(count_bi.keys(), key=lambda x:count_bi[x])[:int(unkp*len(count_bi))])
    
    # redo counting
    count_uni = defaultdict(int)
    count_bi = defaultdict(int)
    f = open("./train/" + name + ".txt", "r")
    for review in f:      
        words = review.split(" ")
        
        # insert start symbol
        words = ["<S>"] + words
        i = 0
        while i < len(words)-1:
            if ps.stem(words[i]) in [".", "?", "!"]:
                words.insert(i+1, "<E>")
                words.insert(i+2, "<S>")
            i += 1
        words.append("<E>")
            
        # uni
        for word in words:
            word = ps.stem(word)
            if word in unk_uni:
                count_uni["<UNK>"] += 1
            else:
                count_uni[word] += 1       
        
        # bi
        # add start symbol
        for idx in range(len(words)-1):
            word1 = ps.stem(words[idx])
            word2 = ps.stem(words[idx+1])
            

            # add unk
            # smooth sceneria
            if word1 in unk_uni and word2 in unk_uni:
                count_bi["<UNK> <UNK>"] += 1
            elif word1 in unk_uni:
                count_bi["<UNK> " + word2] += 1
            elif word2 in unk_uni:
                count_bi[word1 + " <UNK>"] += 1
            else:
                combo = word1 + " " + word2
                count_bi[combo] += 1       
    return count_uni, count_bi   

In [195]:
prob_uni_d = uni_generator("deceptive")

In [196]:
count_uni_d, count_bi_d  = bi_generator("deceptive")

In [197]:
prob_uni_t = uni_generator("truthful")

In [198]:
count_uni_t, count_bi_t = bi_generator("truthful")

In [199]:
# perplexity
def uni_predict(words, prob_uni):
    ps = PorterStemmer()
    temp = 0
    for word in words:
        word = ps.stem(word)
        if word not in prob_uni:
            word = "<UNK>"
        temp -=math.log(prob_uni[word])
    return math.exp(temp/len(words))

In [200]:
# perplexity
def bi_predict(words, count_uni, count_bi, k=1):
    ps = PorterStemmer()
    
    
    # insert start symbol
    words = ["<S>"] + words
    i = 0
    while i < len(words):
        if ps.stem(words[i]) in [".", "?", "!"]:
            words.insert(i+1, "<E>")
            words.insert(i+2, "<S>")
        i += 1
    words.append("<E>")
        
    # calculate
    temp = 0
    for i in range(len(words)-1):
        word1 = ps.stem(words[i])
        word2 = ps.stem(words[i+1])
        
        # unk scenaria
        if word1 not in count_uni and word2 not in count_uni:
            combo = "<UNK> <UNK>"
        elif word1 not in count_uni:
            combo = "<UNK> " + word2
        elif word2 not in count_uni:
            combo = word1 + " <UNK>"
        else:
            combo = word1 + " " + word2
        
        if word1 not in count_uni:
            word1 = "<UNK>"
        
        p = (count_bi[combo] + k)/(count_uni[word1] + k*len(count_uni))
        temp -=math.log(p)
    return math.exp(temp/len(words)-1)

In [201]:
classes = ["deceptive", "truthful"]


num_corpus = 0
correct_uni = 0
correct_bi = 0
for label in classes:
    f = open("./validation/" +  label + ".txt", "r")
    for review in f:
        num_corpus += 1
        words = review.split(" ")

        # uni-gram
        p_uni_d = uni_predict(words, prob_uni_d)
        p_uni_t = uni_predict(words, prob_uni_t)
        if p_uni_d < p_uni_t:
            if label == "deceptive":
                correct_uni += 1
#                 print("UNI " + label + ": " + str(p_uni_d))
        else:
            if label == "truthful":
                correct_uni += 1
#                 print("UNI " + label + ": " + str(p_uni_t))
        
#         print()
        # bi-gram
        p_bi_d = bi_predict(words, count_uni_d, count_bi_d, k=0.01)
        p_bi_t = bi_predict(words, count_uni_t, count_bi_t, k=0.01)
        if p_bi_d < p_bi_t:
            if label == "deceptive":
                correct_bi += 1
#                 print("BI " + label + ": " + str(p_bi_d))
        else:
            if label == "truthful":
                correct_bi += 1
#                 print("BI " + label + ": " + str(p_bi_t))

print(correct_uni/num_corpus)
print(correct_bi/num_corpus)

0.921875
0.84765625


In [120]:
k = 0.01
result = []
f = open("./test/test.txt", "r")
for idx, review in enumerate(f):
    words = review.split(" ")
    # bi-gram
    p_d = bi_predict(words, count_uni_d, count_bi_d, k)
    p_t = bi_predict(words, count_uni_t, count_bi_t, k)
    if p_d < p_t:
        result.append(str(1))
    else:
        result.append(str(0))


In [202]:
k = 0.01
result = []
f = open("./test/test.txt", "r")
for idx, review in enumerate(f):
    words = review.split(" ")
    # bi-gram
    p_d = uni_predict(words, prob_uni_d)
    p_t = uni_predict(words, prob_uni_t)
    if p_d < p_t:
        result.append(str(1))
    else:
        result.append(str(0))

In [203]:
with open('prediction_LM.csv','a') as file:
    file.write("Id,Prediction\n")
    for idx, val in enumerate(result):
        file.write(",".join([str(idx), val]))
        file.write('\n')