In [1]:
from collections import Counter
import pandas as pd

In [2]:
with open('./DATASET/train/truthful.txt') as t, open('./DATASET/train/deceptive.txt') as d:
    truthful = t.read()
    deceptive =d.read()

In [3]:
def clean_text(words):
    words = '<s> ' + words
    words = words.replace('\n', ' <s> ')
    return words[:-5]

In [4]:
truthful_unigram = dict(Counter(clean_text(truthful).split()))
deceptive_unigram = dict(Counter(clean_text(deceptive).split()))

In [5]:
def get_bigram_counts(words):
    words = clean_text(words)
    
    word_list = words.split()
    corpus = {}
    for i, word in enumerate(word_list[1:], start=1):
        if word != '<s>':
            if (word_list[i-1], word) not in corpus:
                corpus[(word_list[i-1], word)] = 1
            else:
                corpus[(word_list[i-1], word)] += 1
        
    return corpus

In [6]:
truthful_bigram = get_bigram_counts(truthful)
deceptive_bigram = get_bigram_counts(deceptive)

In [7]:
def get_unigram_prob(corpus, unigram_to_test):
    total_words = 0
    for key in corpus:
        total_words += corpus[key]

    return corpus[unigram_to_test]/total_words

In [8]:
get_unigram_prob(deceptive_unigram, 'I')

0.02905073649754501

In [9]:
def get_bigram_prob(corpus, bigram_to_test):
    total_words = 0
    for key in corpus:
        if key[0] == bigram_to_test[0]:
            total_words += corpus[key]

    return corpus[bigram_to_test]/total_words

In [10]:
get_bigram_prob(deceptive_bigram, ('I', 'am'))

0.01488933601609658

In [11]:
def get_smoothed_bigram_corpus(unigram_corpus, bigrams):
    df = pd.DataFrame(1, index = unigram_corpus, columns = unigram_corpus) 
    for bigram in bigrams:
        df.loc[bigram[0], bigram[1]] += bigrams[bigram]
    return df

In [12]:
df = pd.DataFrame(1, index =deceptive_unigram, columns =deceptive_unigram) 

In [13]:
get_smoothed_bigram_corpus(deceptive_unigram, deceptive_bigram)

Unnamed: 0,<s>,I,was,here,on,business,so,needed,to,get,...,recoup,rockin,ROYAL,gifts,allergy-friendly,tree,informal,tranquility,cleaners,beutiful
<s>,1,135,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
I,1,1,326,1,1,1,2,15,5,12,...,1,1,1,1,1,1,1,1,1,1
was,1,3,1,4,19,1,38,1,15,1,...,1,1,1,1,1,1,1,1,1,1
here,1,1,4,1,4,1,1,1,2,1,...,1,1,1,1,1,1,1,1,1,1
on,1,1,1,1,1,15,1,1,2,1,...,1,1,1,1,1,1,1,1,1,1
business,1,5,4,1,1,1,3,1,1,1,...,1,1,1,1,1,1,1,1,1,1
so,1,35,2,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
needed,1,1,2,1,1,1,1,1,11,1,...,1,1,1,1,1,1,1,1,1,1
to,1,1,3,1,2,1,3,1,1,75,...,1,1,1,1,1,1,1,1,1,1
get,1,1,1,1,3,1,1,1,15,1,...,1,1,1,1,1,1,1,1,1,1


In [14]:
deceptive_bigram

{('<s>', 'I'): 134,
 ('I', 'was'): 325,
 ('was', 'here'): 3,
 ('here', 'on'): 3,
 ('on', 'business'): 14,
 ('business', 'so'): 2,
 ('so', 'I'): 34,
 ('I', 'needed'): 14,
 ('needed', 'to'): 10,
 ('to', 'get'): 74,
 ('get', 'some'): 5,
 ('some', 'rest'): 1,
 ('rest', 'the'): 1,
 ('the', 'next'): 41,
 ('next', 'day'): 11,
 ('day', 'but'): 1,
 ('but', 'the'): 50,
 ('the', 'walls'): 14,
 ('walls', 'were'): 6,
 ('were', 'so'): 17,
 ('so', 'thin'): 3,
 ('thin', 'that'): 1,
 ('that', 'I'): 80,
 ('I', 'could'): 72,
 ('could', 'here'): 1,
 ('here', 'all'): 1,
 ('all', 'the'): 57,
 ('the', 'carrying'): 1,
 ('carrying', 'on'): 1,
 ('on', 'in'): 5,
 ('in', 'the'): 360,
 ('the', 'neighbors'): 2,
 ('neighbors', 'room'): 1,
 ('room', 'till'): 1,
 ('till', '3'): 1,
 ('3', 'am'): 1,
 ('am', '.'): 2,
 ('.', 'This'): 82,
 ('This', 'was'): 22,
 ('was', 'very'): 118,
 ('very', 'beautiful'): 2,
 ('beautiful', 'building'): 1,
 ('building', 'the'): 1,
 ('the', 'room'): 206,
 ('room', 'was'): 151,
 ('was', 'a')

In [15]:
def get_smoothed_bigram_prob(bigram, smoothed_bigram_corpus):
    return df.loc[bigram[0], bigram[1]]/df.to_numpy().sum()

In [16]:
df = get_smoothed_bigram_corpus(deceptive_unigram, deceptive_bigram)

In [17]:
get_smoothed_bigram_prob(('I', 'am'), df)

1.0751206002386995e-06