### Import

In [1]:
import copy
import math
import pickle
import random

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from torchtext import data
from tqdm import tnrange, tqdm_notebook

from data_loader import DataLoader

%load_ext autoreload
%autoreload 2


### Load Data

In [2]:
# Build the project's DataLoader and fetch a train/validation pair from it.
# NOTE(review): DataLoader lives in the local data_loader module, which is not shown here;
# judging by the recorded output, the first run downloads aclImdb_v1.tar.gz and GloVe vectors.
data_loader = DataLoader()
train, valid = data_loader.small_train_valid()

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

building data...
downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:09<00:00, 8.77MB/s]


saving data at dataset/train_data.pt, dataset/test_data.pt


.vector_cache/glove.6B.zip: 0.00B [00:00, ?B/s]

splitting data...
building vocabulary...


.vector_cache/glove.6B.zip: 862MB [01:39, 8.63MB/s]                              
100%|█████████▉| 398165/400000 [00:20<00:00, 19473.86it/s]

In [3]:
# Report how many examples each split contains.
n_train, n_valid = len(train), len(valid)
print('%d training examples' % n_train)
print('%d validation examples' % n_valid)

1250 training examples
1250 validation examples


In [4]:
def make_corpus(torchtext_data):
    """
    Turn every example into a single string.

    The tokens in each example's ``text`` are joined with single spaces, after which
    the HTML line-break markers "<br />" and "< br />" are removed from the result.
    Returns the strings as a list, in input order.
    """
    corpus = []
    for example in torchtext_data:
        sentence = ' '.join(example.text)
        for br_tag in ("<br />", "< br />"):
            sentence = sentence.replace(br_tag, "")
        corpus.append(sentence)
    return corpus

def corpus_vectorize(train_corpus, vectorizer, valid_corpus):
    """
    Fit ``vectorizer`` on the training corpus and encode both corpora with it.

    Returns a ``(train_matrix, valid_matrix)`` pair; each element is the result of
    ``vectorizer.transform(...)`` converted with ``.toarray()``.
    The vectorizer is (re)fitted on ``train_corpus`` on every call.
    """
    vectorizer.fit(train_corpus)

    def encode(corpus):
        # Transform with the fitted vectorizer, then convert via .toarray().
        return vectorizer.transform(corpus).toarray()

    return encode(train_corpus), encode(valid_corpus)

def true_label(data):
    """
    Convert example labels to binary targets.

    An example maps to 1 when the first element of its ``label`` equals 'pos',
    and to 0 otherwise. Returns the targets as a list, in input order.
    """
    label = []
    for idx in range(len(data)):
        is_positive = data[idx].label[0] == 'pos'
        label.append(1 if is_positive else 0)
    return label

In [5]:
# Binary targets (1 = 'pos', 0 = anything else) and raw-text corpora for both splits.
y_train, y_valid = true_label(train), true_label(valid)
train_corpus, valid_corpus = make_corpus(train), make_corpus(valid)

# Count-based feature extractors: single words vs. pairs of adjacent words,
# each limited to 20000 features.
bi_vectorizer = CountVectorizer(max_features=20000, ngram_range=(2, 2))
uni_vectorizer = CountVectorizer(max_features=20000)

## Naive Bayes - Unigram 

In [6]:
# Encode both splits with unigram counts, then fit and score a multinomial Naive Bayes model.
uni_train, uni_valid = corpus_vectorize(
    train_corpus=train_corpus,
    vectorizer=uni_vectorizer,
    valid_corpus=valid_corpus,
)
uniNB = MultinomialNB().fit(uni_train, y_train)  # fit() returns the estimator itself

uni_train_acc = uniNB.score(uni_train, y_train)
uni_valid_acc = uniNB.score(uni_valid, y_valid)
print("unigram - train dataset accuracy", uni_train_acc)
print("unigram - valid dataset accuracy", uni_valid_acc)

unigram - train dataset accuracy 0.9776
unigram - valid dataset accuracy 0.7984


## Naive Bayes - Bigram

In [7]:
# Encode both splits with bigram counts, then fit and score a multinomial Naive Bayes model.
bi_train, bi_valid = corpus_vectorize(
    train_corpus=train_corpus,
    vectorizer=bi_vectorizer,
    valid_corpus=valid_corpus,
)
biNB = MultinomialNB().fit(bi_train, y_train)  # fit() returns the estimator itself

bi_train_acc = biNB.score(bi_train, y_train)
bi_valid_acc = biNB.score(bi_valid, y_valid)
print("bigram - train dataset accuracy", bi_train_acc)
print("bigram - valid dataset accuracy", bi_valid_acc)

bigram - train dataset accuracy 0.9944
bigram - valid dataset accuracy 0.8208


## Adversary - swap word (black box)

In [11]:
# Read back the examples stored in test.pickle.
# NOTE(review): only unpickle files you created yourself; pickle.load can execute arbitrary code.
with open('test.pickle', 'rb') as pickle_file:
    test = pickle.load(pickle_file)

In [12]:
def swap(word):
    """
    Return ``word`` with one randomly chosen pair of adjacent interior characters swapped.

    The first and last characters are never moved, e.g. noise -> niose.
    Words of 3 characters or fewer have no interior pair to swap and are returned
    unchanged (previously they raised ValueError from random.randint).
    """
    if len(word) <= 3:
        return word
    # Left index of the swapped pair; (rand, rand + 1) always lies strictly inside the word.
    rand = random.randint(1, len(word) - 3)
    return word[:rand] + word[rand:rand + 2][::-1] + word[rand + 2:]

def adversary_swap(adversary_data, propotion):
    """
    Apply the character-swap noise to a random subset of tokens in every example.

    For each example, ``int(len(text) * propotion)`` token positions are drawn without
    replacement. Every drawn token longer than 3 characters is replaced by
    ``swap(token)`` (e.g. noise -> niose); shorter tokens are left as they are.

    The examples are modified in place and the same ``adversary_data`` object is returned.
    """
    for idx in range(len(adversary_data)):
        tokens = adversary_data[idx].text
        n_picks = int(len(tokens) * propotion)
        positions = list(range(len(tokens)))
        for pos in random.sample(positions, n_picks):
            if len(tokens[pos]) <= 3:
                continue
            tokens[pos] = swap(tokens[pos])

    return adversary_data


In [13]:
# adversary_swap edits the examples it receives in place. Perturbing a deep copy keeps `test`
# clean, so re-running this cell does not stack further swaps on already-perturbed text.
adversary = adversary_swap(copy.deepcopy(test), 0.3)

### Evaluate on adversarial data

In [14]:
# Encode the perturbed text with both feature extractors.
# corpus_vectorize refits each vectorizer on train_corpus and also returns the train matrix again.
adv_corpus = make_corpus(adversary)
uni_train, uni_adv = corpus_vectorize(
    train_corpus=train_corpus,
    vectorizer=uni_vectorizer,
    valid_corpus=adv_corpus,
)
bi_train, bi_adv = corpus_vectorize(
    train_corpus=train_corpus,
    vectorizer=bi_vectorizer,
    valid_corpus=adv_corpus,
)

In [15]:
# Labels for the adversarial set are read from the adversarial examples themselves rather than
# borrowed from `valid`, so the scores stay correct even if test.pickle was written from a
# different split or in a different order. The character swaps only touch `text`, not `label`.
y_adv = true_label(adversary)

print("unigram - valid dataset accuracy", uniNB.score(uni_valid, y_valid))
print("unigram - adversary dataset accuracy", uniNB.score(uni_adv, y_adv))
print("")
print("bigram - valid dataset accuracy", biNB.score(bi_valid, y_valid))
print("bigram - adversary dataset accuracy", biNB.score(bi_adv, y_adv))

unigram - valid dataset accuracy 0.7984
unigram - adversary dataset accuracy 0.7904

bigram - valid dataset accuracy 0.8208
bigram - adversary dataset accuracy 0.7792


In [16]:
# "before" = clean validation text, "after" = character-swapped text.
# The "after" matrices use labels taken from the adversarial examples themselves, so they do not
# depend on test.pickle having been written from the current `valid` in the same order.
y_adv = true_label(adversary)

print("unigram confusion_matrix - before")
print(confusion_matrix(y_valid, uniNB.predict(uni_valid)))
print("unigram confusion_matrix - after")
print(confusion_matrix(y_adv, uniNB.predict(uni_adv)))
print("")

print("bigram confusion_matrix - before")
print(confusion_matrix(y_valid, biNB.predict(bi_valid)))
print("bigram confusion_matrix - after")
print(confusion_matrix(y_adv, biNB.predict(bi_adv)))

unigram confusion_matrix - before
[[545  72]
 [180 453]]
unigram confusion_matrix - after
[[540  77]
 [185 448]]

bigram confusion_matrix - before
[[531  86]
 [138 495]]
bigram confusion_matrix - after
[[500 117]
 [159 474]]


In [None]:
# try word embedding
# Scratch cell with no recorded execution count: inspects the vocabulary and its vectors.
# Only the final expression of a cell is displayed, so the first len(...) result is not shown.
len(data_loader.TEXT.vocab.itos)
# NOTE(review): presumably the GloVe vectors fetched while building the vocabulary — confirm in data_loader.
pretrained_embeddings = data_loader.TEXT.vocab.vectors
len(pretrained_embeddings)

### Save

In [18]:
# save model
# Each file is opened in a `with` block so the handle is flushed and closed as soon as the
# dump finishes, instead of being left open for the lifetime of the kernel.
for filename, fitted_model in (('uniNB_model.sav', uniNB), ('biNB_model.sav', biNB)):
    with open(filename, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)


In [None]:
# load model
# The unigram file name previously read 'uniNB_modle.sav', which does not match the name
# used when the model is saved ('uniNB_model.sav').
# NOTE(review): the save cell writes the .sav files to the working directory, while this cell
# reads them from ./saved_model/ — confirm the files are moved there, or align the two paths.
# Only unpickle files you created yourself; pickle.load can execute arbitrary code.
with open('./saved_model/uniNB_model.sav', 'rb') as model_file:
    uniNB = pickle.load(model_file)
with open('./saved_model/biNB_model.sav', 'rb') as model_file:
    biNB = pickle.load(model_file)

In [None]:
# save test data
# Peek at the first validation example. The loop variable is deliberately not called `data`:
# at cell level that name would rebind the `data` module imported from torchtext.
for example in valid:
    print(example)
    break

# Persist the validation examples to test.pickle (pickle is already imported at the top
# of the notebook, so no import is needed here).
raw_X = list(valid)
with open('test.pickle', 'wb') as handle:
    pickle.dump(raw_X, handle)

In [10]:
# Snapshot the validation examples into test.pickle.
# NOTE(review): this cell ran as In [10], before the cell that reads test.pickle (In [11]),
# yet it sits at the end of the notebook — a top-to-bottom run needs the file to exist already.
raw_X = list(valid)
with open('test.pickle', 'wb') as pickle_file:
    pickle.dump(raw_X, pickle_file)