### Import

In [1]:
import numpy as np
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext import data
from tqdm import tnrange, tqdm_notebook
from collections import Counter

from data_loader import DataLoader

%load_ext autoreload
%autoreload 2

### Load Data

In [2]:
# Build the project's DataLoader helper and fetch the reduced train/validation
# splits used throughout the rest of the notebook.
# NOTE(review): DataLoader lives in the local data_loader module, which is not
# shown here; judging by the cell output, the first run downloads the aclImdb
# archive and GloVe vectors before splitting -- confirm against that module.
data_loader = DataLoader()
train, valid = data_loader.small_train_valid()

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

loading data...
downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:09<00:00, 8.78MB/s]
.vector_cache/glove.6B.zip: 0.00B [00:00, ?B/s]

splitting data...
building vocabulary...


.vector_cache/glove.6B.zip: 862MB [02:34, 5.58MB/s]                              
100%|█████████▉| 399490/400000 [00:28<00:00, 14130.20it/s]

In [55]:
# Report how many examples each split contains.
print('%d training examples' % len(train))
print('%d validation examples' % len(valid))

# Binary targets for the training split: an example whose label equals ['pos']
# becomes 1, every other label becomes 0.
y_train = [int(train[idx].label == ['pos']) for idx in range(len(train))]

# Same encoding for the validation split.
y_valid = [int(valid[idx].label == ['pos']) for idx in range(len(valid))]



1250 training examples
1250 validation examples


### Naive Bayes

In [56]:
def nb_train(train, labels=None):
    """Fit a multinomial Naive Bayes model with add-one (Laplace) smoothing.

    Parameters
    ----------
    train : sequence of examples
        Indexable collection supporting ``len()``. Every element must expose a
        ``.text`` iterable of tokens and, when ``labels`` is omitted, a
        ``.label`` attribute.
    labels : sequence, optional
        One class id per example; ``1`` marks the positive class and any other
        value the negative class. When omitted, the labels are read from the
        examples themselves (``label == ['pos']`` -> 1, anything else -> 0), so
        the fitted model always matches the dataset that was passed in rather
        than whatever label list happens to exist at module level.

    Returns
    -------
    pos_dict, neg_dict : collections.Counter
        Map every word of the training vocabulary to ``log P(word | class)``,
        computed as ``(count + 1) / (vocab_size + tokens_in_class)``. They are
        Counters on purpose: looking up a word that is not in the vocabulary
        yields 0, i.e. the word contributes nothing to either class score.
    p_class1 : float
        Fraction of training examples labelled 1 (the positive-class prior).

    Raises
    ------
    ValueError
        If ``train`` is empty or ``labels`` has a different length than
        ``train``.
    """
    n_docs = len(train)
    if n_docs == 0:
        raise ValueError("nb_train needs at least one training example")

    if labels is None:
        labels = [1 if train[i].label == ['pos'] else 0 for i in range(n_docs)]
    elif len(labels) != n_docs:
        raise ValueError(
            "got %d labels for %d training examples" % (len(labels), n_docs))

    pos_counts = Counter()
    neg_counts = Counter()
    vocab = {}  # used as an insertion-ordered set of every word seen
    for i in range(n_docs):
        tokens = list(train[i].text)
        class_counts = pos_counts if labels[i] == 1 else neg_counts
        class_counts.update(tokens)
        vocab.update(dict.fromkeys(tokens))

    vocab_size = len(vocab)
    # Add-one smoothing: every vocabulary word gets one pseudo-count per class,
    # so each denominator grows by the vocabulary size.
    pos_denom = vocab_size + sum(pos_counts.values())
    neg_denom = vocab_size + sum(neg_counts.values())

    pos_dict = Counter()
    neg_dict = Counter()
    for word in vocab:
        pos_dict[word] = math.log((pos_counts[word] + 1) / pos_denom)
        neg_dict[word] = math.log((neg_counts[word] + 1) / neg_denom)

    # The prior depends only on the labels, so it is computed once, here.
    p_class1 = sum(1 for label in labels if label == 1) / n_docs

    return pos_dict, neg_dict, p_class1

        
def evaluate(valid, pos_dict, neg_dict, p_class1):
    """Predict a 0/1 label for every example with a fitted Naive Bayes model.

    Parameters
    ----------
    valid : sequence of examples
        Indexable collection supporting ``len()``; each element must expose a
        ``.text`` iterable of tokens.
    pos_dict, neg_dict : mapping
        Word -> log-likelihood under the positive / negative class. Any mapping
        with ``.get`` works (plain ``dict`` or ``Counter``).
    p_class1 : float
        Prior probability of the positive class.

    Returns
    -------
    list of int
        One prediction per example: 1 when the positive score is strictly
        greater than the negative score, otherwise 0 (ties go to 0).
    """
    # The class log-priors are identical for every document, so compute them
    # once. A prior of exactly 0 or 1 gives log(0) = -inf, which is a valid
    # "never pick this class" score; only NumPy's divide warning is silenced.
    with np.errstate(divide='ignore'):
        log_prior_pos = np.log(p_class1)
        log_prior_neg = np.log(1 - p_class1)

    pred = []
    for i in range(len(valid)):
        p_pos = 0
        p_neg = 0
        for word in valid[i].text:
            # Words missing from the tables add 0 to both scores, i.e. they
            # are ignored, whatever mapping type the tables are.
            p_pos += pos_dict.get(word, 0)
            p_neg += neg_dict.get(word, 0)

        # Priors are added last so the floating-point accumulation order is
        # words first, then prior.
        p_pos += log_prior_pos
        p_neg += log_prior_neg

        pred.append(1 if p_pos > p_neg else 0)

    return pred
    
def accuracy(pred, y):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    pred : sequence
        Predicted labels.
    y : sequence
        True labels; must have the same shape as ``pred``.

    Returns
    -------
    numpy.float64
        Number of matching positions divided by ``len(pred)``.

    Raises
    ------
    ValueError
        If ``pred`` and ``y`` do not have the same shape (this includes shapes
        NumPy would otherwise broadcast silently).
    ZeroDivisionError
        If ``pred`` is empty.
    """
    pred_arr = np.asarray(pred)
    y_arr = np.asarray(y)
    if pred_arr.shape != y_arr.shape:
        raise ValueError(
            "pred and y must have the same shape, got %s and %s"
            % (pred_arr.shape, y_arr.shape))

    # Compare with == rather than subtracting, so boolean and string labels
    # work as well as integers.
    n_correct = int(np.count_nonzero(pred_arr == y_arr))
    return np.float64(n_correct / len(pred))

In [57]:
# Fit the Naive Bayes model on the training split: per-class word
# log-likelihood tables plus the positive-class prior.
pos_dict, neg_dict, p_class1 = nb_train(train)

In [58]:
# Predict a 0/1 label for every validation example with the fitted model.
pred_valid = evaluate(valid, pos_dict, neg_dict, p_class1)

In [59]:
# Validation accuracy: share of validation predictions that match y_valid.
accuracy(pred_valid, y_valid)

0.7936

In [60]:
# Predict on the training split itself, to compare against validation accuracy.
pre_train = evaluate(train, pos_dict, neg_dict, p_class1)

In [61]:
# Training accuracy: share of training predictions that match y_train.
accuracy(pre_train, y_train)

0.9792