### Import

In [1]:
import numpy as np
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext import data
from tqdm import tnrange, tqdm_notebook
from collections import Counter

from data_loader import DataLoader

%load_ext autoreload
%autoreload 2


### Load Data

In [26]:
# Instantiate the project-local DataLoader (imported from data_loader.py).
data_loader = DataLoader()
# Fetch the train/validation pair used by every cell below.
# NOTE(review): presumably a reduced subset of the full corpus, judging by the
# method name and the 1250/1250 sizes printed later -- confirm in data_loader.py.
train, valid = data_loader.small_train_valid()


loading data...
splitting data...
building vocabulary...


In [27]:
# Report how many examples each split contains.
print('%d training examples' % (len(train),))
print('%d validation examples' % (len(valid),))

# Gold labels as integers: an example whose label equals ['pos'] maps to 1,
# every other label (i.e. 'neg') maps to 0.
y_train = [int(train[idx].label == ['pos']) for idx in range(len(train))]
y_valid = [int(valid[idx].label == ['pos']) for idx in range(len(valid))]



1250 training examples
1250 validation examples


### Naive Bayes - Unigram

In [70]:
def nb_train_unigram(train):
    """
    Fit a unigram Naive Bayes model with add-one (Laplace) smoothing.

    Parameters
    ----------
    train : sequence of examples
        Indexable, sized collection. Each example exposes ``.text`` (a list of
        tokens) and ``.label``; an example is positive when its label equals
        ``['pos']`` and negative otherwise.

    Returns
    -------
    pos_dict : Counter
        Maps every training word to log P(word | positive).
    neg_dict : Counter
        Maps every training word to log P(word | negative).
    p_class1 : float
        Fraction of training examples that are positive.

    Raises
    ------
    ValueError
        If ``train`` is empty (no class prior can be estimated).
    """
    n_examples = len(train)
    if n_examples == 0:
        raise ValueError("train must contain at least one example")

    # Read the labels from the examples that were passed in rather than from a
    # notebook-level global, so the function works for any dataset.
    labels = [1 if train[i].label == ['pos'] else 0 for i in range(n_examples)]

    # Every vocabulary word starts with a count of 1: the add-one pseudo-count.
    vocab_dict = Counter()
    for i in range(n_examples):
        for word in train[i].text:
            vocab_dict[word] = 1
    vocab_size = len(vocab_dict)

    pos_dict = vocab_dict.copy()
    neg_dict = vocab_dict.copy()

    # Only the number of tokens per class is needed for the denominator,
    # so keep running totals instead of materialising token lists.
    pos_total = 0
    neg_total = 0
    for i in range(n_examples):
        tokens = train[i].text
        if labels[i] == 1:
            pos_dict.update(tokens)
            pos_total += len(tokens)
        else:
            neg_dict.update(tokens)
            neg_total += len(tokens)

    p_class1 = sum(labels) / n_examples

    # Smoothed likelihood: (count + 1) / (class tokens + vocabulary size).
    pos_denominator = vocab_size + pos_total
    neg_denominator = vocab_size + neg_total
    for word in vocab_dict:
        pos_dict[word] = math.log(pos_dict[word] / pos_denominator)
        neg_dict[word] = math.log(neg_dict[word] / neg_denominator)

    return pos_dict, neg_dict, p_class1

        
def nb_evaluate_unigram(valid, pos_dict, neg_dict, p_class1):
    """
    Classify each example with the unigram Naive Bayes scores.

    For every example the log-probabilities of its known words are summed per
    class, the log class prior is added, and the example is labelled 1 when
    the positive score is strictly larger, otherwise 0. Words that are not
    keys of ``pos_dict`` are ignored.

    Returns a list with one 0/1 prediction per example in ``valid``.
    """
    predictions = []

    for idx in range(len(valid)):
        score_pos, score_neg = 0, 0

        for token in valid[idx].text:
            if token not in pos_dict:
                # Unseen at training time: contributes to neither class.
                continue
            score_pos += pos_dict[token]
            score_neg += neg_dict[token]

        # Add the log prior of each class.
        score_pos += np.log(p_class1)
        score_neg += np.log(1-p_class1)

        predictions.append(1 if score_pos > score_neg else 0)

    return predictions
    

### Naive Bayes - Bigrams

In [71]:
def nb_train_bigrams(train):
    """
    Fit a bigram Naive Bayes model with add-one (Laplace) smoothing.

    Each example's token list is padded with a leading "START" and a trailing
    "STOP" marker, and every pair of adjacent tokens (joined by a single
    space) is one feature. Padding is applied to a copy, so the examples in
    ``train`` are left untouched and the function can be re-run safely.

    Parameters
    ----------
    train : sequence of examples
        Indexable, sized collection. Each example exposes ``.text`` (a list of
        string tokens) and ``.label``; an example is positive when its label
        equals ``['pos']`` and negative otherwise.

    Returns
    -------
    pos_dict : Counter
        Maps every training bigram to log P(bigram | positive).
    neg_dict : Counter
        Maps every training bigram to log P(bigram | negative).
    p_class1 : float
        Fraction of training examples that are positive.

    Raises
    ------
    ValueError
        If ``train`` is empty (no class prior can be estimated).
    """
    n_examples = len(train)
    if n_examples == 0:
        raise ValueError("train must contain at least one example")

    # Read the labels from the examples that were passed in rather than from a
    # notebook-level global, so the function works for any dataset.
    labels = [1 if train[i].label == ['pos'] else 0 for i in range(n_examples)]

    # Build each example's bigram list once, from a padded *copy* of its
    # tokens; inserting into train[i].text would corrupt the dataset for every
    # later cell and add extra markers on each re-run.
    bigram_lists = []
    for i in range(n_examples):
        tokens = ["START"] + list(train[i].text) + ["STOP"]
        bigram_lists.append(
            [first + " " + second for first, second in zip(tokens, tokens[1:])]
        )

    # Every vocabulary bigram starts with a count of 1: the add-one pseudo-count.
    vocab_dict = Counter()
    for bigrams in bigram_lists:
        for bigram in bigrams:
            vocab_dict[bigram] = 1
    vocab_size = len(vocab_dict)

    pos_dict = vocab_dict.copy()
    neg_dict = vocab_dict.copy()

    # Only the number of bigrams per class is needed for the denominator.
    pos_total = 0
    neg_total = 0
    for label, bigrams in zip(labels, bigram_lists):
        if label == 1:
            pos_dict.update(bigrams)
            pos_total += len(bigrams)
        else:
            neg_dict.update(bigrams)
            neg_total += len(bigrams)

    p_class1 = sum(labels) / n_examples

    # Smoothed likelihood: (count + 1) / (class bigrams + vocabulary size).
    pos_denominator = vocab_size + pos_total
    neg_denominator = vocab_size + neg_total
    for bigram in vocab_dict:
        pos_dict[bigram] = math.log(pos_dict[bigram] / pos_denominator)
        neg_dict[bigram] = math.log(neg_dict[bigram] / neg_denominator)

    return pos_dict, neg_dict, p_class1
    
    
def nb_evaluate_bigrams(valid, pos_dict, neg_dict, p_class1):
    """
    Classify each example with the bigram Naive Bayes scores.

    Each example's tokens are padded with "START"/"STOP" on a copy (the
    examples in ``valid`` are not modified, so repeated calls see the same
    input). The log-probabilities of the known bigrams are summed per class,
    the log class prior is added, and the example is labelled 1 when the
    positive score is strictly larger, otherwise 0. Bigrams that are not keys
    of ``pos_dict`` are ignored.

    Parameters
    ----------
    valid : sequence of examples
        Indexable, sized collection whose items expose ``.text`` (a list of
        string tokens).
    pos_dict, neg_dict : mapping
        Bigram ("first second") -> log-probability for each class.
    p_class1 : float
        Prior probability of the positive class.

    Returns
    -------
    list of int
        One 0/1 prediction per example in ``valid``.
    """
    pred = []

    for i in range(len(valid)):
        # Pad a copy: inserting into valid[i].text would permanently alter the
        # dataset and stack additional markers on every call.
        tokens = ["START"] + list(valid[i].text) + ["STOP"]

        p_pos = 0
        p_neg = 0
        for first, second in zip(tokens, tokens[1:]):
            bigram = first + " " + second
            if bigram in pos_dict:
                p_pos += pos_dict[bigram]
                p_neg += neg_dict[bigram]

        # Add the log prior of each class.
        p_pos += np.log(p_class1)
        p_neg += np.log(1 - p_class1)

        pred.append(1 if p_pos > p_neg else 0)

    return pred

    

### Accuracy

In [58]:
def accuracy(pred, y):
    """Return the fraction of positions where ``pred`` and ``y`` hold the same value."""
    difference = np.array(pred) - np.array(y)
    # 1 where prediction and label agree (difference of zero), 0 elsewhere.
    hits = np.where(difference == 0, 1, 0)
    n_correct = sum(hits)
    return n_correct / len(pred)

In [59]:
# Fit the unigram model. The training function defined in this notebook is
# `nb_train_unigram`; the old name `nb_train` no longer exists and raises
# NameError on a fresh kernel.
pos_dict_uni, neg_dict_uni, p_class1_uni = nb_train_unigram(train)


In [72]:
# Score both splits with the unigram model. The evaluation function defined in
# this notebook is `nb_evaluate_unigram`; the old name `evaluate` no longer
# exists and raises NameError on a fresh kernel.
uni_pred_valid = nb_evaluate_unigram(valid, pos_dict_uni, neg_dict_uni, p_class1_uni)
uni_pred_train = nb_evaluate_unigram(train, pos_dict_uni, neg_dict_uni, p_class1_uni)
print("unigram - train dataset accuracy", accuracy(uni_pred_train, y_train))
print("unigram - valid dataset accuracy", accuracy(uni_pred_valid, y_valid))


unigram - train dataset accuracy 0.9792
unigram - valid dataset accuracy 0.7936


In [73]:
# Fit the bigram model: per-class log-probability dictionaries plus the
# positive-class prior, all estimated from the training split.
bi_pos_dict, bi_neg_dict, bi_p_class1 = nb_train_bigrams(train)

In [74]:
# Score both splits with the bigram model. The evaluation function defined in
# this notebook is `nb_evaluate_bigrams`; the old name `evaluate_bigrams` no
# longer exists and raises NameError on a fresh kernel.
bi_pred_valid = nb_evaluate_bigrams(valid, bi_pos_dict, bi_neg_dict, bi_p_class1)
bi_pred_train = nb_evaluate_bigrams(train, bi_pos_dict, bi_neg_dict, bi_p_class1)
print("bigrams - train dataset accuracy", accuracy(bi_pred_train, y_train))
print("bigrams - valid dataset accuracy", accuracy(bi_pred_valid, y_valid))


bigrams - train dataset accuracy 1.0
bigrams - valid dataset accuracy 0.8064
