In [1]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
import re
import os
import random
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
def extract_words(sent):
    sent = sent.lower()
    sent = re.sub(r'<[^>]+>', ' ', sent) #strip html tags
    sent = re.sub(r'(\w)\'(\w)', '\1\2', sent) # remoce apostrophes
    sent = re.sub(r'\W',' ', sent) # remove punctuation
    sent = re.sub(r'\s+', ' ', sent) # remove repeated space
    sent = sent.strip()
    return sent.split()

In [4]:
# test text cleanup block
words = 'ajshd asda, + 1231 "sd + ''alkdj''    '
print(extract_words(words))

['ajshd', 'asda', '1231', 'sd', 'alkdj']


In [5]:
unsup_sentences = []

# data 
"""
https://ai.stanford.edu/~amaas/data/sentiment/, IMDB data
http://www.cs.cornell.edu/people/pabo/movie-review-data/
https://nlp.stanford.edu/sentiment/, Rotten Tomatoes data

Sentiment:
https://www.kaggle.com/rahulin05/sentiment-labelled-sentences-data-set?login=true
"""

# https://ai.stanford.edu/~amaas/data/sentiment/

# Document tag is important in way that it shows all these words go together in one document
# There is no label in this case. We are trying to understand which words are used together
for dirname in ["train/pos", "train/neg", "train/unsup", "test/pos", "test/neg"]:
    for fname in sorted(os.listdir("data/aclImdb/" + dirname)):
        if fname[-4:] == '.txt':
            with open("data/aclImdb/" + dirname + '/' + fname, encoding = 'UTF-8') as f:
                sent = f.read()
                words = extract_words(sent)
                unsup_sentences.append(TaggedDocument(words,[dirname + "/" + fname]))

In [6]:
print("unsup_sentences length: {0}".format(len(unsup_sentences)))
print("unsup_sentences length: {0}".format(unsup_sentences[0]))

unsup_sentences length: 100000
unsup_sentences length: TaggedDocument(['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', 'such', 'as', 'teachers', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', 'hig', 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', 'teachers', 'the', 'scramble', 'to', 'survive', 'financially', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', 'teachers', 'pomp', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', 'i', 'immediately', 'recalled', 'at', 'high', 'a', 'classic', 'line', 'inspector', 'here', 'to', 'sack', 'one', 

In [7]:
# http://www.cs.cornell.edu/people/pabo/movie-review-data/
for dirname in ["data/review_polarity/txt_sentoken/pos", "data/review_polarity/txt_sentoken/neg"]:
    for fname in sorted(os.listdir(dirname)):
        if fname[-4:] == '.txt':
            with open(dirname + '/' + fname, encoding = 'UTF-8') as f:
                for i, sent in enumerate(f):
                    words = extract_words(sent)
                    unsup_sentences.append(TaggedDocument(words,["%s/%s-%d" % (dirname, fname, i)]))

In [8]:
# https://nlp.stanford.edu/sentiment/
with open("data/stanfordSentimentTreebank/original_rt_snippets.txt", encoding='UTF-8') as f:
    for i, line in enumerate(f):
        words = extract_words(sent)
        unsup_sentences.append(TaggedDocument(words, ["rt-%d" % i]))

In [9]:
len(unsup_sentences)

175325

In [10]:
unsup_sentences[0:2]

[TaggedDocument(words=['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', 'such', 'as', 'teachers', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', 'hig', 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', 'teachers', 'the', 'scramble', 'to', 'survive', 'financially', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', 'teachers', 'pomp', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', 'i', 'immediately', 'recalled', 'at', 'high', 'a', 'classic', 'line', 'inspector', 'here', 'to', 'sack', 'one', 'of', 'your', 'teachers', 'student', 'welcome', 

In [None]:
class PermuteSentences(object):
    def __init__(self, sents):
        self.sents = sents
    def __iter__(self):
        shuffled = list(self.sents)
        random.shuffle(shuffled)
        for sent in shuffled:
            yield sent
    

In [None]:
permuter = PermuteSentences(unsup_sentences)
#Training of Doc2Vec model
model = Doc2Vec(permuter, dm=0, hs=1, vector_size=50)

In [None]:
# done with training, free up some memory
model.delete_temporary_training_data(keep_inference=True)

In [None]:
model.save('reviews.d2v')
# in other program, we could write: model = Doc2Vec.load('reviews.d2v')

In [None]:
model.infer_vector(extract_words("This is very bad video. I don't like it"))

In [None]:
cosine_similarity(
[model.infer_vector(extract_words("This is very bad video. I don't like it"))],
[model.infer_vector(extract_words("video sucks."))])

In [None]:
cosine_similarity(
[model.infer_vector(extract_words("It is now snowing in New York"))],
[model.infer_vector(extract_words("I feel sick. Dont feel like going to school"))])