In [4]:
import re
import os
import random

from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

In [5]:
def extract_words(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r'<[^>]+>', ' ', sentence)
    sentence = re.sub(r'(\w)\'(\w)', '\1\2', sentence)
    sentence = re.sub(r'\W', ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    sentence = sentence.strip()
    return sentence.split()

In [6]:
unsup_sentences = []

In [7]:
# http://ai.stanford.edu/~amaas/data/sentiment
for dirname in ["train/pos", "train/neg", "train/unsup", "test/pos", "test/neg"]:
    for fname in sorted(os.listdir("data/aclImdb/" + dirname)):
        if fname[-4:] == '.txt':
            with open("data/aclImdb/" + dirname + "/" + fname, encoding='utf-8') as f:
                sentences = f.read()
                words = extract_words(sentences)
                unsup_sentences.append(TaggedDocument(words, [dirname + "/" + fname]))

In [8]:
len(unsup_sentences)

100000

In [9]:
# http://www.cs.cornell.edu/people/pabo/movie-review-data/
for dirname in ["data/review_polarity/txt_sentoken/pos", "data/review_polarity/txt_sentoken/neg"]:
    for fname in sorted(os.listdir(dirname)):
        if fname[-4:] == '.txt':
            with open(dirname + "/" + fname, encoding='utf-8') as f:
                for i, sentences in enumerate(f):
                    words = extract_words(sentences)
                    unsup_sentences.append(TaggedDocument(words, ['%s/%s-%d' % (dirname, fname, i)]))

In [10]:
len(unsup_sentences)

164720

In [11]:
unsup_sentences[164000]

TaggedDocument(words=['starring', 'tim', 'roth', 'jennifer', 'beals', 'antonio', 'banderas', 'quentin', 'tarantino', 'valeria', 'golino', 'madonna', 'bruce', 'willis', 'marisa', 'tomei', 'alicia', 'witt', 'lili', 'taylor', 'and', 'ione', 'skye'], tags=['data/review_polarity/txt_sentoken/neg/cv977_4776.txt-5'])

In [12]:
# https://nlp.stanford.edu/sentiment/
with open("data/stanfordSentimentTreebank/original_rt_snippets.txt", encoding='utf-8') as f:
    for i, sentence in enumerate(f):
        words = extract_words(sentence)
        unsup_sentences.append(TaggedDocument(words, ['rt-%d' % i]))

In [13]:
len(unsup_sentences)

175325

In [14]:
unsup_sentences[175000]

TaggedDocument(words=['i', 'a', 'movie', 'that', 'ends', 'with', 'truckzilla', 'for', 'cryin', 'out', 'loud', 'if', 'that', 'does', 'clue', 'you', 'in', 'that', 'somethin', 'horribly', 'wrong', 'nothing', 'will'], tags=['rt-10280'])

In [15]:
class PermuteSentences(object):
    def __init__(self, sentences):
        self.sentences = sentences
        
    def __iter__(self):
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        for sentence in shuffled:
            yield sentence

In [16]:
permuter = PermuteSentences(unsup_sentences)

In [25]:
model = Doc2Vec(permuter, dn=0, hs=1, vector_size=70)

In [26]:
model.save('models/reviews_70.d2v')

In [21]:
model.infer_vector(extract_words("This place is not worth your time, let alone Vegas."))

array([-2.21584029e-02,  1.40059814e-01,  1.10775471e-01, -1.37958556e-01,
        1.84793174e-02, -9.72866733e-03, -2.86999419e-02,  1.50538996e-04,
        1.31784946e-01, -6.05217926e-03, -1.08003214e-01, -2.38595173e-01,
        3.11588142e-02,  2.03356370e-02,  7.07082748e-02, -8.12748075e-02,
        1.22072197e-01, -2.98364818e-01,  2.12156728e-01, -3.31813157e-01,
        2.34340951e-01,  1.17587045e-01, -1.94785446e-01, -1.36342958e-01,
       -3.91722202e-01, -7.58900344e-02,  2.26971105e-01,  1.75877586e-01,
        1.45304158e-01, -2.39197552e-01,  1.87830225e-01,  1.18280936e-03,
        3.40742469e-02,  7.93473572e-02, -2.02818945e-01,  1.78153478e-02,
        4.78179641e-02, -1.24255742e-03, -7.85557181e-02,  4.21098769e-02,
       -2.58627310e-02,  1.96892098e-02, -1.81037530e-01, -1.92666218e-01,
        1.22619141e-03, -7.86005259e-02, -1.08408615e-01, -7.14767650e-02,
        1.20176807e-01,  1.16375268e-01], dtype=float32)

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(
    [model.infer_vector(extract_words("This place is not worth your time, let alone Vegas."))],
    [model.infer_vector(extract_words("This is not the best time."))]
)

array([[0.6106016]], dtype=float32)

In [24]:
cosine_similarity(
    [model.infer_vector(extract_words("Highly recommended."))],
    [model.infer_vector(extract_words("Service sucks."))]
)

array([[0.14926729]], dtype=float32)