# Doc2vec with how Yoon Kim did it

Steps:
* Tokenize punctuations as if they are their own words
* Determine the longest review's word count, then pad other reviews so that they are all as long as the longest review

In [1]:
import glob
import re
import sys
import gensim
import logging
from bs4 import BeautifulSoup
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence, TaggedDocument

Using gpu device 0: GeForce GTX 1060 6GB (CNMeM is disabled, cuDNN 5105)


In [2]:
# Easily changable settings
# We will only train from training/unlabeled set
text_corpus_files = ['aclImdb/train/pos/*.txt', 'aclImdb/train/neg/*.txt', 'aclImdb/train/unsup/*.txt']
word_vector_dims = 50

In [3]:
def preprocess_text(text):
    #1 Remove HTML (inspired by Kaggle)
    text = BeautifulSoup(text, "html.parser").getText()

    #2 Tokenize (stolen from Yoon Kim's CNN)
    text = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", text)     
    text = re.sub(r"\'s", " \'s", text) 
    text = re.sub(r"\'ve", " \'ve", text) 
    text = re.sub(r"n\'t", " n\'t", text) 
    text = re.sub(r"\'re", " \'re", text) 
    text = re.sub(r"\'d", " \'d", text) 
    text = re.sub(r"\'ll", " \'ll", text) 
    text = re.sub(r",", " , ", text) 
    text = re.sub(r"!", " ! ", text) 
    text = re.sub(r"\(", " \( ", text) 
    text = re.sub(r"\)", " \) ", text) 
    text = re.sub(r"\?", " \? ", text) 
    text = re.sub(r"\s{2,}", " ", text)
    
    #3 Lower cap
    return text.lower()

In [4]:
def pad_text_list(text_list, pad_token="<PAD/>", pad_width=0):
    return text_list + ([pad_token] * (pad_width - len(text_list)))

def text_to_padded_list(text, pad_token="<PAD/>", pad_width=0):
    text_list = preprocess_text(text).split()
    return pad_text_list(text_list, pad_token, pad_width)

In [5]:
processed_texts = []
file_names = []
file_count = 0
for folder_files in text_corpus_files:
    for text_file in glob.glob(folder_files):
        with(open(text_file, 'r')) as f:
            processed_texts.append(text_to_padded_list(f.read()))
            file_names.append(text_file)
            file_count += 1
            if file_count % 100 == 0:
                sys.stdout.write('\rLoading text file {0:d}'.format(file_count))
                sys.stdout.flush()
                
max_processed_text_len = len(max(processed_texts, key=len))
print('\nLongest text list: {0:d}'.format(max_processed_text_len))
# for i, text_list in enumerate(processed_texts):
#     processed_texts[i] = pad_text_list(text_list, pad_width=max_processed_text_len)
#     if (i + 1) % 1000 == 0:
#         sys.stdout.write('\rPadding text list {0:d}'.format(i+1))
#         sys.stdout.flush()

Loading text file 75000
Longest text list: 2773


In [6]:
class LabeledReview(object):
    def __init__(self, docs_list, labels_list):
        self.docs_list = docs_list
        self.labels_list = labels_list
        
    def __iter__(self):
        for idx, doc in enumerate(self.docs_list):
            yield TaggedDocument(words=doc, tags=[self.labels_list[idx]])

In [7]:
it = LabeledReview(processed_texts, file_names)

model = Doc2Vec(size=100, window=8, min_count=1, workers=4, alpha=0.025, min_alpha=0.025, dm=1)
model.build_vocab(it)

for epoch in range(10):
    print("Beginning epoch {0:d}".format(epoch+1))
    model.train(it)
    model.alpha -= 0.002
    model.min_alpha = model.alpha
    model.train(it)

Beginning epoch 1
Beginning epoch 2
Beginning epoch 3
Beginning epoch 4
Beginning epoch 5
Beginning epoch 6
Beginning epoch 7
Beginning epoch 8
Beginning epoch 9
Beginning epoch 10


In [8]:
model.save_word2vec_format('word2vec/d2v-dm-50d.bin', binary=True)
model.save('word2vec/d2v-dm-50d.model')

# Test loading from file

In [9]:
# test_model = gensim.models.Doc2Vec.load_word2vec_format('word2vec/d2v-padded.bin', binary=True)
test_model = Doc2Vec.load('word2vec/d2v-dm-50d.model')

In [10]:
test_model.most_similar('robot')

[(u'boy', 0.7213273048400879),
 (u'policeman', 0.7198787331581116),
 (u'maniac', 0.7197606563568115),
 (u'nerd', 0.7159204483032227),
 (u'lawyer', 0.7151718139648438),
 (u'kid', 0.711284875869751),
 (u'prostitute', 0.7106501460075378),
 (u'thief', 0.7083243131637573),
 (u'doctor', 0.7071986794471741),
 (u'woman', 0.7043994069099426)]

In [11]:
def infer_vector(text):
    test_model.infer_vector(text_to_padded_list(text))

In [12]:
infer_vector('Apple decides to kill ornage')

In [13]:
test_model.infer_vector('Can also accept string but who knows?')

array([-0.01595143, -0.02850123, -0.00850443,  0.02607614, -0.00635684,
        0.00328236, -0.02458174,  0.01933051,  0.00154752, -0.00923351,
        0.01828594,  0.02338634,  0.04594881,  0.01582849, -0.01805806,
        0.01653051, -0.00788338,  0.00617298, -0.00092188,  0.00387195,
       -0.03214197,  0.03041311,  0.02434262, -0.05064768, -0.02673751,
        0.00489244,  0.06051378, -0.01317   , -0.04376006,  0.04648232,
       -0.03026671, -0.00244751, -0.01427396, -0.02382843,  0.00972601,
        0.03035809,  0.00535701, -0.02562642, -0.00141815,  0.04683141,
       -0.01686425,  0.00672396,  0.01222186, -0.02510577,  0.0065539 ,
       -0.02291234,  0.02027738, -0.0202756 ,  0.00473335,  0.00840169,
       -0.01521209, -0.07518742, -0.02587005,  0.04780743,  0.00617817,
        0.05715486,  0.0128863 ,  0.00041511, -0.0046965 ,  0.01947872,
       -0.04566262,  0.0783381 , -0.00718412, -0.03953593, -0.02257931,
        0.00202419,  0.01255951, -0.04387324, -0.03954568, -0.04