In [1]:
import re
import glob
import gensim
import os.path
from gensim.models.doc2vec import TaggedDocument
from os import listdir
from os.path import isfile, join
from collections import namedtuple
from io import open

import sys  

# Python 2 only: force the interpreter-wide default codec to UTF-8 so implicit
# str <-> unicode conversions do not fail on non-ASCII review text.
# `reload` and `sys.setdefaultencoding` do not exist on Python 3 (whose default is
# already UTF-8), so the hack is skipped there instead of raising NameError.
if sys.version_info[0] < 3:
    reload(sys)  # makes sys.setdefaultencoding available again on Python 2
    sys.setdefaultencoding('utf8')

In [2]:
def normalize_text(text):
    """Return a tokenizer-friendly copy of ``text``.

    Steps, in order:
      1. every literal ``<br />`` tag becomes a single space;
      2. the result is lower-cased;
      3. each of ``, . ? ! ( ) : ; "`` is wrapped in one space on either side,
         so a later whitespace split yields punctuation as separate tokens.
    """
    # The tag is removed before lower-casing, so only the exact lowercase
    # spelling '<br />' is matched.
    without_breaks = text.replace('<br />', ' ')
    lowered = without_breaks.lower()

    # One regex pass pads all listed punctuation marks at once.
    return re.sub(r'([,.?!():;"])', r' \1 ', lowered)

In [None]:
# Source folders, in the exact order their reviews are appended to tmp/alldata-id.txt.
# The document-loading cell derives each document's split and sentiment purely from
# its line number, so this order must not be changed without updating that cell.
folders = ['aclImdb/train/neg', 'aclImdb/train/pos', 'aclImdb/test/neg', 'aclImdb/test/pos', 'aclImdb/train/unsup']

# The combined file doubles as a "work already done" marker: skip everything if it exists.
if not os.path.isfile('tmp/alldata-id.txt'):
    # open(..., 'w') does not create missing directories.
    if not os.path.isdir('tmp'):
        os.makedirs('tmp')

    normalized_by_folder = []
    for folder in folders:
        output = folder.replace('/', '-') + '.txt'
        # sorted() makes the document order (and therefore the ids) reproducible across runs.
        txt_files = sorted(glob.glob('/'.join([folder, '*.txt'])))

        reviews = []
        for txt_file in txt_files:
            with open(txt_file, 'r', encoding='utf-8') as t:
                # One review must occupy exactly one line. Collapse every character that
                # unicode splitlines() treats as a line boundary (this includes U+0085),
                # because the combined text is split with splitlines() below and a stray
                # boundary would shift the id of every following document.
                reviews.append(u' '.join(t.read().splitlines()))

        # One trailing newline per review; joined once instead of repeated string concatenation.
        temp = u''.join(review + u'\n' for review in reviews)

        temp_norm = normalize_text(temp)
        normalized_by_folder.append(temp_norm)
        # The `with` block closes the file; no explicit close is needed.
        with open('/'.join(['tmp', output]), 'w', encoding='utf-8') as f:
            f.write(temp_norm)

    all_data = u''.join(normalized_by_folder)

    with open('/'.join(['tmp', 'alldata-id.txt']), 'w', encoding='utf-8') as f:
        for idx, line in enumerate(all_data.splitlines()):
            # Unicode literal: io.open text files reject byte strings on Python 2.
            num_line = u"_*{0} {1}\n".format(idx, line)
            f.write(num_line)

In [None]:
# One record per review: token list, integer tag (line number), data split, sentiment label.
SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

alldocs = []  # will hold all docs in original order
with open('tmp/alldata-id.txt', encoding='utf-8') as alldata:
    for line_no, line in enumerate(alldata):
        tokens = line.split()
        words = tokens[1:]  # tokens[0] is the "_*<id>" prefix written by the data-preparation cell
        tags = [line_no]  # `tags = [tokens[0]]` would also work at extra memory cost
        # Buckets of 25k lines: train, test, then two buckets of unlabeled extra data.
        split = ['train', 'test', 'extra', 'extra'][line_no//25000]
        # Buckets of 12.5k lines, in the order of `folders` in the data-preparation cell:
        # train/neg, train/pos, test/neg, test/pos, then unlabeled.
        # Negative reviews come first, so the table starts with 0.0.
        sentiment = [0.0, 1.0, 0.0, 1.0, None, None, None, None][line_no//12500]
        alldocs.append(SentimentDocument(words, tags, split, sentiment))

train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']
doc_list = alldocs[:]  # for reshuffling per pass

print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))

In [None]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

In [None]:
# Use every available CPU core as a training worker.
cores = multiprocessing.cpu_count()
# Abort early when gensim reports FAST_VERSION <= -1; per the message, training would be
# painfully slow (presumably this means the optimized routines are unavailable — confirm
# against the gensim docs).
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

# Models to train. Only the first entry is active; the other two configurations are
# kept as commented-out alternatives.
simple_models = [
    # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DBOW 
#     Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DM w/average
    # Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),
]

# speed setup by sharing results of 1st model's vocabulary scan
simple_models[0].build_vocab(alldocs)  # PV-DM/concat requires one special NULL word so it serves as template
print(simple_models[0])
# Any further models copy the vocabulary from the first one instead of rescanning.
# With a single active model this loop body never runs.
for model in simple_models[1:]:
    model.reset_from(simple_models[0])
    print(model)

# Ordered name -> model mapping; str(model) is used as the name.
models_by_name = OrderedDict((str(model), model) for model in simple_models)

In [None]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
# models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
# models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])

In [None]:
import numpy as np
import statsmodels.api as sm
from random import sample

# for timing
from contextlib import contextmanager
from timeit import default_timer
import time 

@contextmanager
def elapsed_timer():
    """Context manager that yields a zero-argument callable reporting elapsed seconds.

    While the ``with`` body is running, the callable returns the time since entry.
    After the body finishes normally, it keeps returning the body's total duration.
    """
    began = default_timer()
    frozen = []  # receives the final duration once the with-body has completed

    def read():
        if frozen:
            return frozen[0]
        return default_timer() - began

    yield read
    frozen.append(default_timer() - began)
    
def logistic_predictor_from_data(train_targets, train_regressors):
    """Fit a statsmodels ``Logit`` on the given targets/regressors and return the fit result.

    The fit is run with ``disp=0``. Call ``.summary()`` on the returned object to
    inspect the fitted model.
    """
    return sm.Logit(train_targets, train_regressors).fit(disp=0)

def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_steps=3, infer_alpha=0.1, infer_subsample=0.1):
    """Report the sentiment error rate of ``test_model`` on ``test_set``.

    A logistic regression is fitted on the trained document vectors of ``train_set``
    (looked up via ``test_model.docvecs[doc.tags[0]]``) and then scored on the test
    documents.

    Parameters:
        test_model: model exposing ``docvecs`` and, when ``infer`` is true, ``infer_vector``.
        train_set: documents with ``sentiment`` and ``tags`` used to fit the classifier.
        test_set: documents with ``sentiment``, ``tags`` and ``words`` to evaluate on.
        infer: if true, re-infer test vectors from ``doc.words`` instead of looking up
            the vectors learned during training.
        infer_steps, infer_alpha: passed to ``infer_vector`` as ``steps`` / ``alpha``.
        infer_subsample: when inferring and < 1.0, evaluate on a random sample of this
            fraction of ``test_set``.

    Returns:
        ``(error_rate, errors, number_of_predictions, predictor)``.
    """
    train_targets, train_regressors = zip(*[(doc.sentiment, test_model.docvecs[doc.tags[0]]) for doc in train_set])
    train_regressors = sm.add_constant(train_regressors)
    predictor = logistic_predictor_from_data(train_targets, train_regressors)

    test_data = test_set
    if infer:
        if infer_subsample < 1.0:
            test_data = sample(test_data, int(infer_subsample * len(test_data)))
        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]
    else:
        # Use the documents passed in, not the notebook-global `test_docs`, so the
        # vectors always line up with the sentiments compared below.
        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_data]
    test_regressors = sm.add_constant(test_regressors)

    # predict & evaluate: round each predicted probability to 0/1 and compare to the label
    test_predictions = predictor.predict(test_regressors)
    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)

In [None]:
from collections import defaultdict
# Lowest error rate seen so far per model name. Unknown names start at 1.0, so the
# first measured error for a model always counts as a new best.
best_error = defaultdict(lambda :1.0)  # to selectively-print only best errors achieved

In [None]:
from random import shuffle
import datetime

# Manually managed learning-rate schedule: alpha starts at 0.025 and is lowered by a
# fixed step after every pass.
alpha = 0.025
min_alpha = 0.001
passes = 1
alpha_delta = (alpha - min_alpha) / passes

print("START %s" % datetime.datetime.now())

for epoch in range(passes):
    shuffle(doc_list)  # shuffling gets best results

    for name, train_model in models_by_name.items():
        # train: pin both alpha bounds to the current value so this pass uses a fixed rate
        train_model.alpha = alpha
        train_model.min_alpha = alpha
        with elapsed_timer() as elapsed:
            train_model.train(doc_list)
            duration = '%.1f' % elapsed()

        # evaluate using the document vectors learned during training
        with elapsed_timer() as eval_elapsed:
            err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs)
        eval_duration = '%.1f' % eval_elapsed()
        if err <= best_error[name]:
            best_error[name] = err
            best_indicator = '*'
        else:
            best_indicator = ' '
        print("%s%f : %i passes : %s %ss %ss" % (best_indicator, err, epoch + 1, name, duration, eval_duration))

        # on the first pass and every fifth pass, also evaluate with re-inferred test vectors
        if epoch == 0 or ((epoch + 1) % 5) == 0:
            inferred_name = name + '_inferred'
            with elapsed_timer() as eval_elapsed:
                infer_err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs, infer=True)
            eval_duration = '%.1f' % eval_elapsed()
            if infer_err < best_error[inferred_name]:
                best_error[inferred_name] = infer_err
                best_indicator = '*'
            else:
                best_indicator = ' '
            print("%s%f : %i passes : %s %ss %ss" % (best_indicator, infer_err, epoch + 1, inferred_name, duration, eval_duration))

    print('completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta

print("END %s" % datetime.datetime.now())

In [None]:
# Persist the trained model to disk via its save() method. Only model 0 is active;
# the lines for models 1 and 2 stay commented out to match the model list.
simple_models[0].save('model0.dat')
# simple_models[1].save('model1.dat')
# simple_models[2].save('model2.dat')

In [None]:
# Export model 0 through save_word2vec_format. Only the text variant (binary=False)
# is active; the binary variants and the exports for models 1 and 2 are disabled.
# simple_models[0].save_word2vec_format('model0.bin', binary=True)
# simple_models[1].save_word2vec_format('model1.bin', binary=True)
# simple_models[2].save_word2vec_format('model2.bin', binary=True)
simple_models[0].save_word2vec_format('model0.txt', binary=False)
# simple_models[1].save_word2vec_format('model1.txt', binary=False)
# simple_models[2].save_word2vec_format('model2.txt', binary=False)

In [None]:
# Reload the model written by simple_models[0].save('model0.dat') to check that it round-trips.
test_retrieve = Doc2Vec.load('model0.dat')

In [None]:
# Smoke test of the reloaded model: call most_similar for one sample word.
# As the last expression in the cell, its result is what the notebook displays.
test_retrieve.most_similar('fuck')