In [1]:
# Root directory of the aclImdb data; later cells list its train/{pos,neg}
# and test/{pos,neg} subfolders.
data_folder = "data/aclImdb/"

In [21]:
import os
from collections import Counter, defaultdict
import nltk
import re
from tqdm import tqdm_notebook
import numpy as np

In [4]:
def _list_review_files(split, label):
    """Return the sorted paths of every entry in ``<data_folder>/<split>/<label>``.

    ``os.listdir`` yields entries in arbitrary order, so the result is sorted
    to keep the file order identical from one run to the next.

    Args:
        split: Sub-folder of ``data_folder`` to read, e.g. ``'train'`` or ``'test'``.
        label: Sentiment sub-folder inside the split, e.g. ``'pos'`` or ``'neg'``.

    Returns:
        A sorted list of paths, one per directory entry.
    """
    folder = os.path.join(data_folder, split, label)
    return sorted(os.path.join(folder, name) for name in os.listdir(folder))


# One list of review file paths per (split, sentiment) combination.
train_positive = _list_review_files('train', 'pos')
train_negative = _list_review_files('train', 'neg')

test_positive = _list_review_files('test', 'pos')
test_negative = _list_review_files('test', 'neg')

### Naive Bayes sentiment model

- Document $d = w_1w_2...w_n$, where $w_i$ are the tokens of the document and $n$ is the total number of tokens in the document $d$.

- The dataset is $\{(d_i, s_i)\}$, a collection of (document, sentiment) pairs.

- Each document $d_i$ is associated with a sentiment $s_i \in \{0,1\}$, $0$ being negative sentiment and $1$ being positive sentiment.

- By Bayes' rule, $p(s|d) = \frac{p(d|s)p(s)}{p(d|s)p(s) + p(d|\bar{s})p(\bar{s})}$

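- Assuming the tokens are conditionally independent given the sentiment (the naive Bayes assumption), $p(d|s) = p(w_1,w_2,..., w_n|s) = p(w_1|s)p(w_2|s)...p(w_n|s)$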

- We have $p(s) = 0.5$ and $p(\bar{s})=0.5$.

- $ p(s|d) = \frac{p(d|s)}{p(d|s) + p(d|\bar{s})} $

- If we use a threshold of $p_T(s|d) = 0.5$ to decide the final label, the model simplifies to:
 -  $y=1$ if $p(d|s=1) \geq p(d|s=0)$, and $y=0$ otherwise.

-----------
- For numerical stability, we work with log-probabilities instead of raw probabilities, so the product of token probabilities becomes a sum:
 - $\log p(d|s) = \log p(w_1,w_2,..., w_n|s) = \log p(w_1|s) + \log p(w_2|s) + ...+ \log p(w_n|s)$

In [5]:
# Non-greedy pattern: matches from a "<" up to the nearest following ">".
# Later cells substitute each match with a single space before tokenising.
re_html_cleaner = re.compile(
    r"<.*?>"
)

In [6]:
# Token frequencies accumulated over every positive training review.
positive_word_counts = Counter()

for _fname in tqdm_notebook(train_positive):
    # Decode explicitly rather than relying on the platform's default encoding
    # (assumes the review files are UTF-8 -- TODO confirm for this copy of the data).
    with open(_fname, encoding="utf-8") as f:
        text = f.read().strip()
        # Replace HTML tags with a space so neighbouring words stay separated.
        text = re_html_cleaner.sub(" ", text)
        # update() adds the new counts in place; `+=` would additionally rescan
        # the whole accumulated counter after every single file.
        positive_word_counts.update(nltk.word_tokenize(text))

HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))




In [7]:
# Token frequencies accumulated over every negative training review.
negative_word_counts = Counter()

for _fname in tqdm_notebook(train_negative):
    # Decode explicitly rather than relying on the platform's default encoding
    # (assumes the review files are UTF-8 -- TODO confirm for this copy of the data).
    with open(_fname, encoding="utf-8") as f:
        text = f.read().strip()
        # Replace HTML tags with a space so neighbouring words stay separated.
        text = re_html_cleaner.sub(" ", text)
        # update() adds the new counts in place; `+=` would additionally rescan
        # the whole accumulated counter after every single file.
        negative_word_counts.update(nltk.word_tokenize(text))

HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))




In [27]:
# Total number of tokens counted for each class.
len_corpus_pos = sum(positive_word_counts.values())
len_corpus_neg = sum(negative_word_counts.values())

# Pseudo-count used for a token that is missing from a class's table.
epsilon = 0.1

# Per-class log relative frequency of every counted token. A token that is
# absent from a table falls back to log(epsilon / corpus length) through the
# defaultdict factory.
log_p_vocab_pos = defaultdict(lambda: np.log(epsilon / len_corpus_pos))
log_p_vocab_pos.update(
    (word, np.log(count / len_corpus_pos))
    for word, count in positive_word_counts.items()
)

log_p_vocab_neg = defaultdict(lambda: np.log(epsilon / len_corpus_neg))
log_p_vocab_neg.update(
    (word, np.log(count / len_corpus_neg))
    for word, count in negative_word_counts.items()
)

In [31]:
# Fraction of training files that are positive, i.e. the empirical prior p(s=1).
# NOTE(review): no later cell visible here reads this value.
p_data_pos = len(train_positive) / (len(train_negative) + len(train_positive))

In [32]:
def get_prob_pos(doc):
    """Classify a raw review as positive (1.0) or negative (0.0).

    Despite its name, the function returns a hard label, not a probability.
    The text is cleaned with ``re_html_cleaner`` and tokenised with
    ``nltk.word_tokenize``, then the per-token log-probabilities from
    ``log_p_vocab_pos`` and ``log_p_vocab_neg`` are summed for each class.

    Args:
        doc: Raw review text.

    Returns:
        1.0 if the positive-class log-likelihood is strictly greater than the
        negative-class one, otherwise 0.0. Ties, including a document that
        yields no tokens, are labelled 0.0.
    """
    text = re_html_cleaner.sub(" ", doc.strip())
    tokens = nltk.word_tokenize(text)

    # Log-likelihoods are sums, so the accumulators start at 0 (= log 1).
    log_lik_pos = 0.0
    log_lik_neg = 0.0
    for token in tokens:
        # Indexing (rather than .get) lets a defaultdict table supply its
        # fallback value for tokens it has not seen.
        log_lik_pos += log_p_vocab_pos[token]
        log_lik_neg += log_p_vocab_neg[token]

    return 1.0 * (log_lik_pos > log_lik_neg)

In [37]:
# (true_label, predicted_label) pairs for every test review: positives first,
# then negatives.
results = []
for true_label, fnames in ((1, test_positive), (0, test_negative)):
    for _fname in tqdm_notebook(fnames):
        # Decode explicitly rather than relying on the platform's default encoding
        # (assumes the review files are UTF-8 -- TODO confirm for this copy of the data).
        with open(_fname, encoding="utf-8") as f:
            results.append((true_label, get_prob_pos(f.read())))

HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))




In [39]:
# Confusion-matrix cells, filled from the (true_label, pred_label) pairs in `results`.
true_pos = false_pos = true_neg = false_neg = 0

for true_label, pred_label in results:
    if true_label == 1:
        # Actual positive: either a hit or a miss.
        if pred_label == 1:
            true_pos += 1
        elif pred_label == 0:
            false_neg += 1
    elif true_label == 0:
        # Actual negative: either a false alarm or a correct rejection.
        if pred_label == 1:
            false_pos += 1
        elif pred_label == 0:
            true_neg += 1

In [41]:
# Accuracy: share of all counted test reviews that were labelled correctly.
accuracy = (true_pos + true_neg) / (true_pos + true_neg + false_pos + false_neg)
# Recall: share of truly positive reviews that were predicted positive.
recall = true_pos / (true_pos + false_neg)
# Precision: share of predicted-positive reviews that are truly positive,
# i.e. TP / (TP + FP). True negatives and false negatives play no part in it.
precision = true_pos / (true_pos + false_pos)

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")

Accuracy: 0.80188
Recall: 0.73576
