In [None]:
"""
Created on Mon Apr 19 21:25:09 2017

@author: raysun

Sources - https://radimrehurek.com/gensim/models/doc2vec.html
          https://radimrehurek.com/gensim/wiki.html
          https://markroxor.github.io/gensim/static/notebooks/doc2vec-lee.html
          https://markroxor.github.io/gensim/static/notebooks/doc2vec-wikipedia.html
          https://arxiv.org/abs/1507.07998
"""

import logging
import multiprocessing
import os
from pprint import pprint

import gensim
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Step 1. Download the Wikipedia articles

# Download the dump of all Wikipedia articles from http://download.wikimedia.org/enwiki/. You need either
# 1. enwiki-latest-pages-articles.xml.bz2 (the latest dump), or
# 2. enwiki-YYYYMMDD-pages-articles.xml.bz2 (a date-specific dump).

# This file is about 14GB in size and contains (a compressed version of) all articles from the English Wikipedia.


In [None]:
# Step 2. Load the corpus

# Load the wiki corpus.
# The dump location can be overridden with the WIKI_DUMP_PATH environment variable so the
# notebook is not tied to a single machine; the original path is kept as the default.
WIKI_DUMP_PATH = os.environ.get(
    "WIKI_DUMP_PATH",
    "/Users/raysun/test_data/examples/L3_Wikipedia/wiki/enwiki-latest-pages-articles.xml.bz2",
)
wiki = WikiCorpus(WIKI_DUMP_PATH)

# Define TaggedWikiDocument class to convert WikiCorpus into suitable form for Doc2Vec
class TaggedWikiDocument(object):
    """Re-iterable adapter that presents a wiki corpus as a stream of TaggedDocument objects.

    Each article produced by ``wiki.get_texts()`` becomes one ``TaggedDocument`` whose
    words are the article's tokens and whose only tag is the article title.

    Parameters
    ----------
    wiki : object
        Corpus object exposing a ``metadata`` attribute and a ``get_texts()`` method
        (a ``WikiCorpus`` in this notebook).
    """

    def __init__(self, wiki):
        self.wiki = wiki
        # __iter__ unpacks (tokens, (page_id, title)) pairs, so metadata output is switched on here.
        # Note that this mutates the corpus object that was passed in.
        self.wiki.metadata = True

    def __iter__(self):
        """Yield one TaggedDocument per article, tagged with the article title."""
        for content, (page_id, title) in self.wiki.get_texts():
            # Tokens may arrive as UTF-8 encoded bytes or as already-decoded text depending on
            # the corpus implementation; only bytes are decoded so both cases work.
            words = [
                token.decode("utf-8") if isinstance(token, bytes) else token
                for token in content
            ]
            yield TaggedDocument(words, [title])

# Generate tagged wiki documents.
# The wrapper only stores the corpus and enables its metadata flag; articles are pulled
# from wiki.get_texts() each time `documents` is iterated.
documents = TaggedWikiDocument(wiki)


In [None]:
# Step 3. Perform text processing to compute the optimized min_count

# Preprocess text data: scan the corpus once with a throwaway model created with min_count=0
preproc = Doc2Vec(min_count=0)
preproc.scan_vocab(documents)

# Optimize the min_count: ask the scanned model for a dry-run estimate (dry_run=True)
# at each candidate threshold and print it.
# (The original referenced an undefined name `pre` here; the scanned model is `preproc`.)
# NOTE(review): the printed figure is report['memory']['vocab'] / 1000, i.e. it is derived from
# the memory estimate rather than a raw word count -- confirm the divisor if an exact
# vocabulary size is required.
for num in range(0, 20):
    print('min_count: {}, size of vocab: '.format(num),
          preproc.scale_vocab(min_count=num, dry_run=True)['memory']['vocab']/1000)

# Print optimized min_count
# The threshold is chosen by hand from the table printed above; 19 is the value the
# Doc2Vec models in Step 4 are constructed with.
min_count = 19
print(min_count)

In [None]:
# Step 4. Build Doc2Vec models

# Enable multiprocessing by the number of available CPUs
cores = multiprocessing.cpu_count()

# Build the Doc2Vec models (both share the same dimensionality, window, min_count and epochs)
models = [
    # PV-DBOW
    Doc2Vec(dm=0, dbow_words=1, size=200, window=8, min_count=19, iter=10, workers=cores),
    # PV-DM w/average
    Doc2Vec(dm=1, dm_mean=1, size=200, window=8, min_count=19, iter=10, workers=cores),
]

# Build vocabularies: only the first model scans the corpus.
# NOTE(review): reset_from() presumably reuses the vocabulary built for models[0] so the
# corpus is not scanned a second time -- confirm against the installed gensim version.
models[0].build_vocab(documents)
print(str(models[0]))
models[1].reset_from(models[0])
print(str(models[1]))

# Save models
# The original called `model.save(...)`, but no `model` name is defined at this point on a
# fresh kernel, and a single path could hold only one of the two models. Each model is
# written to its own file instead.
# NOTE(review): training happens in Step 5, so these files contain the untrained models;
# save again after training if the trained weights are needed.
for suffix, model in zip(('dbow', 'dm'), models):
    model.save('/tmp/d2v_wikipedia_{}.model'.format(suffix))


In [None]:
# Step 5. Train Doc2Vec of the English wikipedia articles

print models

for model in models:
    %%time model.train(documents,total_examples=model.corpus_count,epochs=model.iter)
    

In [None]:
# Step 6. Analyze similarity

# First and second: cosine similarity between paragraph vectors, for two article titles.
# Word vectors and document vectors are stored separately, so `.docvecs` is appended to the
# model to reach the document vectors of a Doc2Vec model.
#
#   * "Machine learning" (top 20) -- note that the DBOW model interprets 'Machine Learning'
#     as part of the Computer Science field, and the DM model as a Data Science related field.
#   * "Lady Gaga" (top 10).
#
# Every query is run against each model in turn before moving on to the next query.
for title, top_n in (("Machine learning", 20), ("Lady Gaga", 10)):
    for model in models:
        print(str(model))
        nearest_docs = model.docvecs.most_similar(positive=[title], topn=top_n)
        pprint(nearest_docs)

# Third: cosine similarity for "Lady Gaga" - "American" + "Japanese" - "Italian", mixing a
# document vector with word vectors. "American", "Japanese" and "Italian" are word vectors,
# not paragraph vectors, and are lowercase because WikiCorpus already lowercases words.
for model in models:
    print(str(model))
    shifted = (
        model.docvecs["Lady Gaga"]
        - model["american"]
        + model["japanese"]
        - model["italian"]
    )
    # Ask for 11 neighbours so that 10 remain if "Lady Gaga" itself is among them.
    candidates = model.docvecs.most_similar([shifted], topn=11)
    pprint([hit for hit in candidates if hit[0] != "Lady Gaga"])

# As a result, the DBOW model returns artists in Japan similar to Lady Gaga, such as
# 'Perfume', which is the most famous idol in Japan. On the other hand, the DM model results
# don't include Japanese artists in the top 10 similar documents; they are almost the same
# as the results without any vector arithmetic.

# These results demonstrate that DBOW, employed in the original paper, is outstanding for
# calculating the similarity between document vectors and word vectors.
    

In [None]:
# Option 1. Convert the Wikipedia articles (used for LSI and LDA analyses)

# Convert the articles to plain text (process Wiki markup) and store the result as sparse TF-IDF vectors.
# In Python, this is easy to do on-the-fly and we don’t even need to uncompress the whole archive to disk.
# There is a script included in gensim that does just that; run it from a shell like this:
#
#     python -m gensim.scripts.make_wikicorpus ./gensim/results/enwiki-latest-pages-articles.xml.bz2 ./gensim/results/wiki_en
#
# This step takes several hours and uses about 30GB disk space.

import logging, gensim
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim import corpora
# Import the model classes directly: `from gensim import models` would rebind the name
# `models`, which Step 4 uses for the list of Doc2Vec models iterated in Steps 5 and 6.
from gensim.models import LdaModel, LsiModel

# Load id->word mapping (the dictionary), one of the outputs of the conversion script above
# NOTE(review): the command above writes its output under the ./gensim/results/wiki_en prefix,
# while the files below are opened relative to the current directory -- confirm the paths match.
id2word = corpora.Dictionary.load_from_text('wiki_en_wordids.txt')

# Load corpus iterator
mm = corpora.MmCorpus('wiki_en_tfidf.mm')
# mm = corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output
print(mm)

# a. Extract 100 LSI topics; use the default one-pass algorithm
lsi = LsiModel(corpus=mm, id2word=id2word, num_topics=100)

# Print the most contributing words (both positively and negatively) for each of the first ten topics
print(lsi.print_topics(10))

# b. Extract 100 LDA topics, using 1 pass and updating once every 1 chunk (10,000 documents)
lda = LdaModel(corpus=mm, id2word=id2word, num_topics=100, update_every=1, chunksize=10000, passes=1)

# Print the most contributing words for ten of the topics (LDA word weights are
# probabilities, so there are no negative contributions here)
print(lda.print_topics(10))
