# Doc2vec with how Yoon Kim did it

Steps:
* Tokenize punctuations as if they are their own words
* Determine the longest review's word count, then pad other reviews so that they are all as long as the longest review

In [1]:
import glob
import re
import sys
import gensim
import logging
from bs4 import BeautifulSoup
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence, TaggedDocument

Using gpu device 0: GeForce GTX 1060 6GB (CNMeM is disabled, cuDNN 5105)


In [2]:
# Easily changable settings
# We will only train from training/unlabeled set
text_corpus_files = ['aclImdb/train/pos/*.txt', 'aclImdb/train/neg/*.txt', 'aclImdb/train/unsup/*.txt']
word_vector_dims = 100

In [3]:
def preprocess_text(text):
    #1 Remove HTML (inspired by Kaggle)
    text = BeautifulSoup(text, "html.parser").getText()

    #2 Tokenize (stolen from Yoon Kim's CNN)
    text = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", text)     
    text = re.sub(r"\'s", " \'s", text) 
    text = re.sub(r"\'ve", " \'ve", text) 
    text = re.sub(r"n\'t", " n\'t", text) 
    text = re.sub(r"\'re", " \'re", text) 
    text = re.sub(r"\'d", " \'d", text) 
    text = re.sub(r"\'ll", " \'ll", text) 
    text = re.sub(r",", " , ", text) 
    text = re.sub(r"!", " ! ", text) 
    text = re.sub(r"\(", " \( ", text) 
    text = re.sub(r"\)", " \) ", text) 
    text = re.sub(r"\?", " \? ", text) 
    text = re.sub(r"\s{2,}", " ", text)
    
    #3 Lower cap
    return text.lower()

In [4]:
def pad_text_list(text_list, pad_token="<PAD/>", pad_width=0):
    return text_list + ([pad_token] * (pad_width - len(text_list)))

def text_to_padded_list(text, pad_token="<PAD/>", pad_width=0):
    text_list = preprocess_text(text).split()
    return pad_text_list(text_list, pad_token, pad_width)

In [5]:
processed_texts = []
file_names = []
file_count = 0
for folder_files in text_corpus_files:
    for text_file in glob.glob(folder_files):
        with(open(text_file, 'r')) as f:
            processed_texts.append(text_to_padded_list(f.read()))
            file_names.append(text_file)
            file_count += 1
            if file_count % 100 == 0:
                sys.stdout.write('\rLoading text file {0:d}'.format(file_count))
                sys.stdout.flush()
                
max_processed_text_len = len(max(processed_texts, key=len))
print('\nLongest text list: {0:d}'.format(max_processed_text_len))
# for i, text_list in enumerate(processed_texts):
#     processed_texts[i] = pad_text_list(text_list, pad_width=max_processed_text_len)
#     if (i + 1) % 1000 == 0:
#         sys.stdout.write('\rPadding text list {0:d}'.format(i+1))
#         sys.stdout.flush()

Loading text file 75000
Longest text list: 2773


In [6]:
class LabeledReview(object):
    def __init__(self, docs_list, labels_list):
        self.docs_list = docs_list
        self.labels_list = labels_list
        
    def __iter__(self):
        for idx, doc in enumerate(self.docs_list):
            yield TaggedDocument(words=doc, tags=[self.labels_list[idx]])

In [7]:
it = LabeledReview(processed_texts, file_names)

model = Doc2Vec(size=100, window=8, min_count=1, workers=4, alpha=0.025, min_alpha=0.025)
model.build_vocab(it)

for epoch in range(10):
    print("Beginning epoch {0:d}".format(epoch+1))
    model.train(it)
    model.alpha -= 0.002
    model.min_alpha = model.alpha
    model.train(it)

Beginning epoch 1
Beginning epoch 2
Beginning epoch 3
Beginning epoch 4
Beginning epoch 5
Beginning epoch 6
Beginning epoch 7
Beginning epoch 8
Beginning epoch 9
Beginning epoch 10


In [8]:
model.save_word2vec_format('word2vec/d2v-not-padded-300d.bin', binary=True)
model.save('word2vec/d2v-not-padded-300d.model')

# Test loading from file

In [9]:
# test_model = gensim.models.Doc2Vec.load_word2vec_format('word2vec/d2v-padded.bin', binary=True)
test_model = Doc2Vec.load('word2vec/d2v-not-padded-300d.model')

In [10]:
test_model.most_similar('robot')

[(u'maniac', 0.7654415965080261),
 (u'geek', 0.739655077457428),
 (u'demon', 0.7359193563461304),
 (u'prostitute', 0.7281798720359802),
 (u'doctor', 0.7246631383895874),
 (u'dog', 0.7230076193809509),
 (u'policeman', 0.7188103199005127),
 (u'lawyer', 0.7156832218170166),
 (u'psychopath', 0.7153109312057495),
 (u'bird', 0.7108577489852905)]

In [11]:
def infer_vector(text):
    test_model.infer_vector(text_to_padded_list(text))

In [12]:
infer_vector('Apple decides to kill ornage')

In [13]:
test_model.infer_vector('what the fuck')

array([  1.74797489e-03,  -1.32032363e-02,   1.30325938e-02,
         1.42958378e-02,  -1.42836208e-02,  -5.90778328e-02,
         2.33219527e-02,  -2.11221278e-02,   5.03734611e-02,
        -3.08535080e-02,  -1.44617874e-02,  -1.04359677e-02,
         4.01574671e-02,   7.72434473e-03,  -1.30693289e-03,
         1.57768670e-02,  -2.20668390e-02,  -4.21150662e-02,
         1.96832586e-02,   4.25131656e-02,  -2.02304125e-02,
         4.90427576e-02,   1.87942851e-02,  -5.24087213e-02,
         3.67905851e-03,   1.68974716e-02,   1.02010611e-02,
        -6.92933053e-03,  -8.07955116e-03,  -3.62554900e-02,
        -2.07417347e-02,  -4.66159768e-02,  -1.19143194e-02,
        -1.00062266e-02,  -7.33764516e-03,  -1.53856128e-02,
         7.23951906e-02,   3.38784128e-04,   1.08476970e-02,
         1.61488727e-02,  -2.71307435e-02,   5.93591258e-02,
         2.09669843e-02,  -3.16876685e-04,   2.54739262e-02,
        -1.11382911e-02,   5.29330447e-02,  -1.56346373e-02,
        -7.49982754e-03,