# Word2vec on text preprocessed the way Yoon Kim did it

Steps:
* Tokenize punctuation marks as separate words
* Determine the word count of the longest review, then pad every other review with a padding token so that all reviews are as long as the longest one

In [1]:
import glob
import io
import logging
import re
import sys

import gensim
from bs4 import BeautifulSoup

Using gpu device 0: GeForce GTX 1060 6GB (CNMeM is disabled, cuDNN 5105)


In [2]:
# Easily changeable settings

# Glob patterns selecting the text files that make up the corpus.
text_corpus_files = [
    'aclImdb/train/pos/*.txt',
    'aclImdb/train/neg/*.txt',
    'aclImdb/train/unsup/*.txt',
]

# Intended dimensionality of the trained word vectors.
word_vector_dims = 100

In [3]:
def preprocess_text(text):
    """Strip HTML from ``text`` and return a lower-cased, space-separated token string.

    Steps:
      1. Remove HTML markup with BeautifulSoup.
      2. Replace every character outside ``A-Za-z0-9(),!?'`` and the backtick
         with a space, split common English contractions ("'s", "'ve", "n't",
         "'re", "'d", "'ll") from their stem, and surround ``, ! ( ) ?`` with
         spaces so that each becomes its own token.
      3. Collapse whitespace runs into single spaces and lower-case the result.

    The returned string may start or end with a single space; callers are
    expected to ``.split()`` it.

    Args:
        text: Raw document text, possibly containing HTML.

    Returns:
        The cleaned, lower-cased text with tokens separated by single spaces.
    """
    # 1. Remove HTML (inspired by Kaggle)
    text = BeautifulSoup(text, "html.parser").getText()

    # 2. Tokenize (adapted from Yoon Kim's CNN)
    text = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", text)

    # Detach contraction suffixes from the preceding word. The order matches
    # the original sequence of substitutions.
    for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        text = text.replace(suffix, " " + suffix)

    # Isolate punctuation marks as standalone tokens. Plain string replacement
    # is used on purpose: the previous replacement templates " \( ", " \) " and
    # " \? " kept their backslash in the output, producing the tokens "\(",
    # "\)" and "\?" instead of "(", ")" and "?".
    for mark in (",", "!", "(", ")", "?"):
        text = text.replace(mark, " " + mark + " ")

    text = re.sub(r"\s{2,}", " ", text)

    # 3. Lower case
    return text.lower()

In [4]:
def pad_text_list(text_list, pad_token="<PAD/>", pad_width=0):
    """Return a new list: ``text_list`` right-padded with ``pad_token``.

    Args:
        text_list: List of tokens to pad.
        pad_token: Token appended as filler.
        pad_width: Target length. When ``text_list`` is already at least this
            long, no padding is added (the list is never truncated).

    Returns:
        A new list holding the original tokens followed by the padding tokens.
    """
    missing = pad_width - len(text_list)
    # A non-positive repeat count yields an empty list, i.e. no padding.
    filler = [pad_token] * missing
    return text_list + filler

def text_to_padded_list(text, pad_token="<PAD/>", pad_width=0):
    """Preprocess ``text``, split it into tokens and right-pad the token list.

    Args:
        text: Raw document text, handed to ``preprocess_text``.
        pad_token: Filler token forwarded to ``pad_text_list``.
        pad_width: Target length forwarded to ``pad_text_list``; the default
            of 0 results in no padding.

    Returns:
        The list of tokens, padded by ``pad_text_list``.
    """
    tokens = preprocess_text(text).split()
    return pad_text_list(tokens, pad_token=pad_token, pad_width=pad_width)

In [5]:
# Load every corpus file as a list of tokens (not padded yet: the longest
# document is only known once everything has been read).
processed_texts = []
file_count = 0
for folder_files in text_corpus_files:
    # glob.glob returns matches in arbitrary order; sort them so the order of
    # the documents is the same on every run.
    for text_file in sorted(glob.glob(folder_files)):
        # Decode explicitly instead of relying on the platform default encoding.
        # NOTE(review): assumes the corpus files are UTF-8 encoded — confirm
        # before pointing this at a different corpus.
        with io.open(text_file, 'r', encoding='utf-8') as f:
            processed_texts.append(text_to_padded_list(f.read()))
        file_count += 1
        if file_count % 100 == 0:
            sys.stdout.write('\rLoading text file {0:d}'.format(file_count))
            sys.stdout.flush()

# Fail with an actionable message instead of "max() arg is an empty sequence".
if not processed_texts:
    raise ValueError(
        'No text files matched any pattern in text_corpus_files: {0!r}'.format(text_corpus_files))

max_processed_text_len = max(len(text_list) for text_list in processed_texts)
print('\nLongest text list: {0:d}'.format(max_processed_text_len))

# Pad every token list up to the length of the longest one.
for i, text_list in enumerate(processed_texts):
    processed_texts[i] = pad_text_list(text_list, pad_width=max_processed_text_len)
    if (i + 1) % 1000 == 0:
        sys.stdout.write('\rPadding text list {0:d}'.format(i+1))
        sys.stdout.flush()

Loading text file 75000
Longest text list: 2773
Padding text list 75000

In [9]:
# Show gensim's INFO-level progress messages while the model trains.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Train on the padded token lists. The vector dimensionality is taken from the
# `word_vector_dims` setting so that changing the setting actually takes
# effect; min_count=1 keeps every token, including ones that occur only once.
# NOTE(review): `size` is the keyword used by the gensim generation this
# notebook targets (the one that still offers `model.save_word2vec_format`);
# gensim 4 renamed it to `vector_size` — confirm against the installed version.
model = gensim.models.Word2Vec(processed_texts, size=word_vector_dims, min_count=1, workers=4)

2016-10-01 18:27:51,080 : INFO : collecting all words and their counts
2016-10-01 18:27:51,081 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2016-10-01 18:27:52,742 : INFO : PROGRESS: at sentence #10000, processed 27730000 words, keeping 54831 word types
2016-10-01 18:27:54,398 : INFO : PROGRESS: at sentence #20000, processed 55460000 words, keeping 74809 word types
2016-10-01 18:27:56,064 : INFO : PROGRESS: at sentence #30000, processed 83190000 words, keeping 90742 word types
2016-10-01 18:27:57,745 : INFO : PROGRESS: at sentence #40000, processed 110920000 words, keeping 105208 word types
2016-10-01 18:27:59,426 : INFO : PROGRESS: at sentence #50000, processed 138650000 words, keeping 116872 word types
2016-10-01 18:28:01,115 : INFO : PROGRESS: at sentence #60000, processed 166380000 words, keeping 127656 word types
2016-10-01 18:28:02,794 : INFO : PROGRESS: at sentence #70000, processed 194110000 words, keeping 136901 word types
2016-10-01 18:28:03,639 

In [None]:
# Write the trained vectors to disk in the binary word2vec format.
# NOTE(review): presumably the 'word2vec/' directory must already exist — confirm.
model.save_word2vec_format('word2vec/w2v-padded.bin', binary=True)

2016-10-01 18:32:05,026 : INFO : storing 141295x100 projection weights into word2vec/w2v-padded.bin


# Test loading from file

In [None]:
# Read the binary word2vec file back into a separate model object to check
# that the export can be loaded.
test_model = gensim.models.Word2Vec.load_word2vec_format('word2vec/w2v-padded.bin', binary=True)

In [None]:
# Sanity check: query the reloaded model for the words most similar to 'good'.
test_model.most_similar('good')