In [1]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
import re
import os
import random
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
def extract_words(sent):
    sent = sent.lower()
    sent = re.sub(r'<[^>]+>', ' ', sent) #strip html tags
    sent = re.sub(r'(\w)\'(\w)', '\1\2', sent) # remoce apostrophes
    sent = re.sub(r'\W',' ', sent) # remove punctuation
    sent = re.sub(r'\s+', ' ', sent) # remove repeated space
    sent = sent.strip()
    return sent.split()

In [8]:
# test text cleanup block
words = 'I wish + I could fly...-''    '
print(extract_words(words))

['i', 'wish', 'i', 'could', 'fly']


In [12]:
proj_root = os.path.dirname(os.getcwd())
#print("Current working dir : {0}".format(proj_root))

Current working dir : /Users/dbiswas/Documents/Malabika/MS/Fall2018/social_media_mining/project/comments_analysis


In [16]:
sentences = []

imdb_subdir = 'traindata/imdb'
imdb_dir = proj_root + "/" + imdb_subdir

# Document tag is important in way that it shows all these words go together in one document .There is no label in 
# this case. We are trying to understand which words are used together
for dirname in ["positive", "negative"]:
    for fname in sorted(os.listdir(imdb_dir + "/" + dirname)):
        if fname.endswith('.txt'):
            with open(imdb_dir + "/" + dirname + '/' + fname, encoding = 'UTF-8') as f:
                sent = f.read()
                words = extract_words(sent)
                sentences.append(TaggedDocument(words,[dirname + "/" + fname]))

In [17]:
print("sentences length: {0}".format(len(sentences)))
print("sentences length: {0}".format(sentences[0]))

sentences length: 7
sentences length: TaggedDocument(['oh', 'the', 'sixties', 'there', 'were', 'some', 'interesting', 'films', 'i', 'was', 'more', 'of', 'a', 'movie', 'goer', 'then', 'i', 'now', 'enjoy', 'renting', 'movies', 'and', 'relaxing', 'in', 'my', 'home', 'rather', 'than', 'going', 'to', 'the', 'theater', 'i', 'also', 'saw', 'this', 'short', 'film', 'the', 'legend', 'of', 'the', 'boy', 'and', 'the', 'eagle', 'i', 'have', 'been', 'searching', 'for', 'this', 'film', 'for', 'years', 'it', 'was', 'truly', 'inspiring', 'surprisingly', 'i', 'was', 'finally', 'able', 'to', 'gather', 'more', 'information', 'from', 'your', 'site', 'thank', 'you', 'surprised', 'to', 'find', 'out', 'that', 'this', 'short', 'film', 'was', 'an', 'opening', 'for', 'a', 'disney', 'picture', 'i', 'too', 'did', 'not', 'remember', 'the', 'disney', 'film', 'i', 'did', 'not', 'even', 'remember', 'that', 'it', 'was', 'an', 'opening', 'film', 'for', 'disney', 'i', 'truly', 'wish', 'they', 'would', 'show', 'this', 'o

In [19]:
review_subdir = 'traindata/review'
review_dir = proj_root + "/" + review_subdir

for dirname in ["positive", "negative"]:
    for fname in sorted(os.listdir(review_dir + "/" + dirname)):
        if fname.endswith('.txt'):
            with open(review_dir + "/" + dirname + "/" + fname, encoding = 'UTF-8') as f:
                for i, sent in enumerate(f):
                    words = extract_words(sent)
                    sentences.append(TaggedDocument(words,["%s/%s-%d" % (dirname, fname, i)]))

In [21]:
stanford_subdir = 'traindata/stanford'
stanford_dir = proj_root + "/" + stanford_subdir

with open(stanford_dir + "/" + "original_rt_snippets.txt", encoding='UTF-8') as f:
    for i, line in enumerate(f):
        words = extract_words(sent)
        sentences.append(TaggedDocument(words, ["rt-%d" % i]))

In [23]:
len("Number of training examples : {0}".format(sentences))

1507489

In [25]:
sentences[0:2]

[TaggedDocument(words=['oh', 'the', 'sixties', 'there', 'were', 'some', 'interesting', 'films', 'i', 'was', 'more', 'of', 'a', 'movie', 'goer', 'then', 'i', 'now', 'enjoy', 'renting', 'movies', 'and', 'relaxing', 'in', 'my', 'home', 'rather', 'than', 'going', 'to', 'the', 'theater', 'i', 'also', 'saw', 'this', 'short', 'film', 'the', 'legend', 'of', 'the', 'boy', 'and', 'the', 'eagle', 'i', 'have', 'been', 'searching', 'for', 'this', 'film', 'for', 'years', 'it', 'was', 'truly', 'inspiring', 'surprisingly', 'i', 'was', 'finally', 'able', 'to', 'gather', 'more', 'information', 'from', 'your', 'site', 'thank', 'you', 'surprised', 'to', 'find', 'out', 'that', 'this', 'short', 'film', 'was', 'an', 'opening', 'for', 'a', 'disney', 'picture', 'i', 'too', 'did', 'not', 'remember', 'the', 'disney', 'film', 'i', 'did', 'not', 'even', 'remember', 'that', 'it', 'was', 'an', 'opening', 'film', 'for', 'disney', 'i', 'truly', 'wish', 'they', 'would', 'show', 'this', 'on', 'tv', 'sometime', 'i', 'won

In [26]:
class PermuteSentences(object):
    def __init__(self, sents):
        self.sents = sents
    def __iter__(self):
        shuffled = list(self.sents)
        random.shuffle(shuffled)
        for sent in shuffled:
            yield sent
    

In [28]:
permuter = PermuteSentences(sentences)
#Training Doc2Vec model
model = Doc2Vec(permuter, dm=0, hs=1, vector_size=50)

In [None]:
# done with training, free up some memory
model.delete_temporary_training_data(keep_inference=True)

In [30]:
model_dir = proj_root + "/" + "model"
model.save(model_dir + "/" + 'reviews.d2v')

In [43]:
#Test doc2vec model
model.infer_vector(extract_words("This is really exciting video. Thank you for presenting to us."))

array([-0.01350077,  0.08934327, -0.09697033,  0.02471156, -0.18965186,
        0.02942901, -0.03993719, -0.18242534,  0.10629763,  0.23757818,
        0.03010215, -0.0022863 , -0.22295296, -0.01697446, -0.14168206,
       -0.1323943 ,  0.06893007,  0.07598587,  0.06111291,  0.15839736,
       -0.31210947, -0.06756169, -0.11877166, -0.2460407 ,  0.09327912,
       -0.00296085,  0.17162102,  0.13997607, -0.20364867,  0.09936013,
        0.04297823, -0.11172268,  0.1276897 ,  0.14330468,  0.01672968,
        0.02624682,  0.15967034, -0.03492855,  0.43552196, -0.22896902,
       -0.22452025, -0.32068533, -0.2026356 , -0.19075608,  0.27631646,
       -0.08875233,  0.38306233,  0.32971722, -0.25210103, -0.04952441],
      dtype=float32)

In [44]:
#Check model outcomes using sample sentences
cosine_similarity(
[model.infer_vector(extract_words("This is really exciting video. Thank you for presenting to us."))],
[model.infer_vector(extract_words("Exciting video. Keep it coming"))])

array([[0.7256148]], dtype=float32)

In [45]:
cosine_similarity(
[model.infer_vector(extract_words("It is now snowing in New York"))],
[model.infer_vector(extract_words("Golden State Warriors are champions"))])

array([[0.5963397]], dtype=float32)