In [None]:
"""
Created on Mon Apr 18 19:32:42 2017

@author: raysun

Source - https://radimrehurek.com/gensim/models/word2vec.html
"""

import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARNING)

# Step 1: Import wikipedia
import wikipedia

#print(wikipedia.summary("Wikipedia"))
#print(wikipedia.summary("Facebook", sentences=1))

#wikilists = wikipedia.search("Trump")
#print(wikilists)

#wikipage = wikipedia.page("California")
#print(wikipage.title)
#print(wikipage.url)
#print(wikipage.content)
#print(wikipage.links[0])
#wikipedia.set_lang("fr")

#'title' denotes the exact title of the article to be fetched
#title = "Machine learning"
#wikipage = wikipedia.page(title)
#print(wikipage.url)
#titles = wikipedia.search('machine learning')
#wikipage = wikipedia.page(titles[0])
#print(wikipage.title)
#documents = wikipage.content


In [None]:
# Step 2: Perform tokenization

# Read in a wikipedia page for this study
#wikipage = wikipedia.page("Queen Elizabeth II")
#wikipage = wikipedia.page("George Washington")
wikipage = wikipedia.page("Albert Einstein")
documents = wikipage.content

# The page text is split on whitespace before being handed to the tokenizer,
# and no multi-word expressions are registered on it, so despite its name
# `sentences` is effectively a flat list of word tokens at this point.
from nltk.tokenize import MWETokenizer
tokenizer = MWETokenizer()
sentences = tokenizer.tokenize(documents.split())
#print(sentences)

# Lower-case every token and drop English stop words. Each entry of `texts`
# is the list of surviving words for one input token.
from nltk import corpus
stoplist = corpus.stopwords.words(fileids='english')
texts = [[word for word in sentence.lower().split() if word not in stoplist]
         for sentence in sentences]
print(texts[:20])

# Tokens that consist only of punctuation or wiki heading markup.
_SKIP_TOKENS = (",", ";", ".", "?", "\"", "(", ")", "====", "===", "==", "..", ":", u"\u2013")


def _clean_term(term):
    """Return `term` with wrapping punctuation and markup characters removed.

    Strips leading "(" and trailing ")", ".", ",", ";" (both ends are handled,
    so "(1905)" becomes "1905"), removes "=" and "-" characters, and drops a
    trailing possessive "'s". The result may be an empty string.
    """
    term = term.strip().lstrip('(').rstrip(').,;')
    # str.replace returns a new string; the result has to be assigned back,
    # otherwise the removal silently does nothing.
    term = term.replace("=", "").replace("-", "")
    # Only a trailing "'s" is a possessive; leave other apostrophes alone.
    if term.endswith("'s"):
        term = term[:-2]
    return term


# Flatten `texts` into one cleaned list of terms, in document order.
sentences = []
for terms in texts:
    for term in terms:
        if term in _SKIP_TOKENS:
            continue
        term = _clean_term(term)
        if term:  # cleaning can leave nothing behind (e.g. "=====")
            sentences.append(term)
print(sentences[:20])

# Save it for our word2vec experiments. A copy is stored so that later
# in-place changes to `sentences` cannot alter the saved version.
sentences_orig = list(sentences)

In [None]:
# Step 3: Stemming
# Note that Chinese language may skip this step

from gensim.parsing import PorterStemmer
# Shared stemmer instance used by StemmingHelper.stem. It is built from the
# gensim PorterStemmer imported on the line above.
# NOTE(review): the later `from nltk.stem import *` in this cell presumably
# rebinds the name `PorterStemmer` to NLTK's class -- confirm; this instance is
# created before that import runs.
global_stemmer = PorterStemmer()
     
class StemmingHelper(object):
    """
    Two-way mapping between words and their stems.

    `stem` reduces a word to its stem and records which surface form
    produced it; `original_form` maps a stem back to the surface form
    that has been recorded most often for it.
    """

    # Reverse lookup shared by the whole class:
    #   stem -> {surface form: number of times it was stemmed}
    word_lookup = {}

    @classmethod
    def stem(cls, word):
        """
        Return the stem of `word` and count `word` under that stem
        in the reverse lookup.
        """
        root = global_stemmer.stem(word)

        # Create the per-stem counter on first sight, then bump the
        # count for this particular surface form.
        seen_forms = cls.word_lookup.setdefault(root, {})
        seen_forms[word] = seen_forms.get(word, 0) + 1

        return root

    @classmethod
    def original_form(cls, word):
        """
        Return the most frequently recorded surface form for the stem
        `word`; if the stem has never been recorded, return `word`
        unchanged.
        """
        seen_forms = cls.word_lookup.get(word)
        if seen_forms is None:
            return word
        return max(seen_forms.keys(), key=seen_forms.get)

# Quick check of the helper: stem a word, then map the stem back to the most
# frequently recorded surface form. The results are printed explicitly because
# a notebook only echoes the last expression of a cell, so bare calls in the
# middle of the cell would show nothing.
print(StemmingHelper.stem('learning'))
print(StemmingHelper.original_form('learn'))

# Note that you can also use stemming algorithms from NLTK (nltk.stem package)
# Source - http://www.nltk.org/howto/stem.html

from __future__ import print_function
from nltk.stem import *

# Create a new Porter stemmer
# NOTE(review): after the wildcard `from nltk.stem import *` above, the name
# `PorterStemmer` presumably refers to NLTK's class rather than gensim's -- confirm.
stemmer = PorterStemmer()

# Perform stemming to convert plural form to single form and change tenses to present tense
plurals = ['caresses', 'flies', 'dies', 'mules', 'denied',
            'died', 'agreed', 'owned', 'humbled', 'sized',
            'meeting', 'stating', 'siezing', 'itemization',
            'sensational', 'traditional', 'reference', 'colonizer',
            'plotted']
singles = [stemmer.stem(plural) for plural in plurals]
print(' '.join(singles))  # doctest: +NORMALIZE_WHITESPACE
#caress fli die mule deni die agre own humbl size meet
#state siez item sensat tradit refer colon plot

# Create a new Snowball stemmer
from nltk.stem.snowball import SnowballStemmer

# See which languages are supported.
print(" ".join(SnowballStemmer.languages))
#danish dutch english finnish french german hungarian italian
#norwegian porter portuguese romanian russian spanish swedish

# Create a new instance of a language specific subclass.
# From here on `stemmer` is the English Snowball stemmer, not the Porter one.
stemmer = SnowballStemmer("english")

# Stem a word.
print(stemmer.stem("running"))
#run

# Decide not to stem stopwords.
stemmer2 = SnowballStemmer("english", ignore_stopwords=True)

print(stemmer.stem("having"))
#have

print(stemmer2.stem("having"))
#having

# Note that the 'english' stemmer is better than the original 'porter' stemmer.
print(SnowballStemmer("english").stem("generously"))
#generous

print(SnowballStemmer("porter").stem("generously"))
#gener

# Back to our case study
#print(sentences[:20])

# Stem from the saved, unstemmed token list rather than from `sentences`
# itself: the cell then gives the same result every time it is run, instead
# of stemming already-stemmed tokens again on a re-run.
sentences = [stemmer.stem(term) for term in sentences_orig]
print(sentences[:20])

In [None]:
# Step 2: Run a simple Word2Vec model

from gensim.models import Word2Vec, Doc2Vec

# Toy corpus: two two-word "sentences" that share their first token.
simple_ngram = [
    ['semantha', 'bee'],
    ['semantha', 'gibb'],
]

# Fit a throwaway model on the toy corpus; min_count=1 is passed so that
# tokens occurring only once are not filtered out.
model = Word2Vec(simple_ngram, min_count=1)

# Show which tokens ended up in the model's vocabulary.
vocab = model.wv.vocab.keys()
print(vocab)

In [None]:
# Step 3. Build a Word2Vec model for our Wikipedia case study

#Syntax: model = Word2Vec(sentences, min_count=min_count, size=size, window=window)
#
# Parameters: 
#
# size: the size of the NN layers, which correspond to the “degrees” of freedom
#           the training algorithm has. Bigger size values require more training 
#           data, but can lead to better (more accurate) models. Reasonable values 
#           are in the tens to hundreds (default = 100).
#
# window: only terms that occur within a window-neighbourhood of a term, in a sentence, 
#           are associated with it during training. The usual value is 4. Unless your 
#           text contains big sentences, leave it at that.
#
# min_count: 0-100, depending on the size of the dataset of interest.
#
# workers: the number of devices of parallelization to speed up training.
#           (default = 1 = no parallelization). This parameter has only effect 
#           if you have Cython installed. Without Cython, you’ll only be able 
#           to use one core because of the GIL (and word2vec training is slow).
#
# iter: the sweeps of SGD through the data; more is better. It runs in general iter+1 
#          passes; by default iter=5. 

# Back to our wikipedia case study
# Earlier experiments, kept for reference:
#   min_count = 100, size = 200 -> [u'queen', u'elizabeth']
#   min_count = 50,  size = 200 -> [u'queen', u'royal', u'elizabeth', u'british']
min_count = 20
size = 100
window = 5

# Compute skip N-gram
# For every position `index`, pair its term with each neighbour at most
# `window` positions away on either side. Each pair is stored in document
# order (earlier term first). Because a pair is generated from both of its
# positions, every co-occurrence appears twice in `my_sentences`.
my_sentences = []
start = 0
sentences = sentences_orig
total = len(sentences)
for index in range(start, total):
    first_neighbour = max(index - window, start)
    # +1 because range() excludes its end: without it the window would reach
    # `window` terms back but only `window - 1` terms forward.
    end_neighbour = min(index + window + 1, total)
    for jndex in range(first_neighbour, end_neighbour):
        if jndex == index:
            continue  # a term is not its own neighbour
        if jndex < index:
            ngram = [sentences[jndex], sentences[index]]
        else:
            ngram = [sentences[index], sentences[jndex]]
        # Collected inside the inner loop so that every pair is kept, not just
        # the last one computed for each position.
        my_sentences.append(ngram)
print(my_sentences[0:20])

# Save the sentences to a file
# All pairs are written back to back on a single line, e.g. "['a', 'b']['c', 'd']".
# The `with` block closes the file, so the data is on disk before anything
# tries to read it back.
with open('/tmp/my_sentences.txt', 'w') as outfile:
    for ngram in my_sentences:
        outfile.write(str(ngram))

# Save my_sentences
# A copy, so that later in-place changes to `my_sentences` (e.g. appending to
# it) do not silently change this snapshot as well.
my_sentences_orig = list(my_sentences)

# Run a Word2Vec model
model = Word2Vec(my_sentences, min_count=min_count, size=size, window=window)

# Save model. Note that you can also use model.wv.save_word2vec_format instead.
fname = "/tmp/wikipedia.model"
model.save(fname)
model = Word2Vec.load(fname)

# Check vocabularies used in the model
vocab = model.wv.vocab.keys()
print(vocab)

In [None]:
#Step 4. Retrieve saved sentences and run Word2Vec model

# Load sentences
# The file holds the repr() of every pair written back to back on one line,
# e.g. "['a', 'b']['c', 'd']". Each pair is parsed with ast.literal_eval so
# the reloaded terms are plain strings again: no surrounding quote characters,
# no u'' prefix, and terms that contain a comma stay in one piece.
import ast

with open('/tmp/my_sentences.txt', 'r') as inpfile:
    my_texts = inpfile.read().strip()

# Start from an empty list: the reloaded pairs replace the in-memory corpus
# instead of being appended to whatever `my_sentences` already contained.
my_sentences = []
for chunk in my_texts.split(']['):
    # Drop the outer brackets left on the first and last chunk.
    chunk = chunk.strip().strip('[]')
    if not chunk:
        continue  # empty file or empty pair
    cbow = [term for term in ast.literal_eval('[' + chunk + ']') if len(term) > 0]
    if len(cbow) > 0:
        my_sentences.append(cbow)
print(my_sentences[0:20])

# Case 1. Train on the reloaded sentences, leaving every Word2Vec parameter
# at its default. (Results may differ from the model built in the previous cell.)
my_model_1 = Word2Vec(my_sentences)

# Vocabulary of the default-parameter model
my_vocab_1 = my_model_1.wv.vocab.keys()
print(my_vocab_1)

# Case 2. Train on the sentences kept from the previous session, with the
# same parameter values that were used there.
min_count, size, window = 20, 100, 5
my_model_2 = Word2Vec(
    my_sentences_orig,
    min_count=min_count,
    size=size,
    window=window,
)

# Vocabulary of the explicitly-parameterized model (not printed)
my_vocab_2 = my_model_2.wv.vocab.keys()
#print(my_vocab_2)

# Case 3. No training at all: restore the model that was saved to disk.
fname = "/tmp/wikipedia.model"
my_model_3 = Word2Vec.load(fname)

# Vocabulary of the restored model
my_vocab_3 = my_model_3.wv.vocab.keys()
print(my_vocab_3)

# Case 4. Drive the two phases by hand instead of passing the sentences to
# the constructor. Word2Vec(sentences, iter=1) would make two passes over
# the sentences iterator:
# 1. The first pass collects the words and their frequencies to build an
#    internal dictionary tree structure.
# 2. The second and any subsequent passes train the neural model.
# These two (or, iter+1) passes can be started manually as below, which helps
# when the input stream is non-repeatable (you can only afford one pass) and
# the vocabulary can be initialized some other way.

my_model_4 = Word2Vec(iter=1)  # an empty model, no training yet
my_model_4.build_vocab(my_sentences)  # can be a non-repeatable, 1-pass generator
my_model_4.train(
    my_sentences,  # can be a non-repeatable, 1-pass generator
    total_examples=my_model_4.corpus_count,
    epochs=my_model_4.iter,
)

# Vocabulary of the manually built model
my_vocab_4 = my_model_4.wv.vocab.keys()
print(my_vocab_4)

In [None]:
# Step 5: Perform similarity analysis and scoring
# To compute the cosine similarity between two terms, use the similarity method. 
# Cosine similarity is generally bounded by [-1, 1]. The corresponding ‘distance’ 
# can be measured as 1-similarity. To figure out the terms most similar to a 
# special one, you can use the most_similar method.

# Back to our wikipedia case study
# Earlier experiments, kept for reference:
#   min_count = 100, size = 200 -> [u'queen', u'elizabeth']
#   min_count = 50,  size = 200 -> [u'queen', u'royal', u'elizabeth', u'british']
min_count = 20
size = 100
window = 5  # set explicitly so this cell does not depend on an earlier cell's value
model = Word2Vec(my_sentences, min_count=min_count, size=size, window=window, hs=1, negative=0)

# check vocabularies used in the model
vocab = model.wv.vocab.keys()
#print(vocab)


def _report(label, query):
    """Print `label` followed by the result of calling `query()`.

    A word that did not make it into the model's vocabulary (for instance
    because it occurs fewer than `min_count` times) makes the lookup raise
    KeyError. That is reported on the same line instead of aborting the cell,
    so the remaining queries still run.
    """
    try:
        print(label, query())
    except KeyError as err:
        print(label, 'skipped:', err)


# The stemmed query word, computed once and reused for both outputs.
query_stem = StemmingHelper.stem('physicists')
print(1, query_stem)
_report(2, lambda: model.wv.most_similar(query_stem))

# Pairwise cosine similarities, numbered 3..7.
similarity_pairs = [
    ('relativity', 'physics'),
    ('quantum', 'gravitational'),
    ('einstein', 'german'),
    ('momentum', 'motion'),
    ('princeton', 'university'),
]
for label, (left, right) in enumerate(similarity_pairs, 3):
    _report(label, lambda: model.wv.similarity(left, right))

# Analogy-style queries and odd-one-out checks.
#print(8,model.wv.most_similar(positive=['scientists','einstein'],negative=['german'],topn=10))
_report(8, lambda: model.wv.most_similar(positive=['physicist', 'einstein'], negative=['german']))
_report(9, lambda: model.wv.most_similar_cosmul(positive=['music', 'einstein'], negative=['german']))
_report(10, lambda: model.wv.doesnt_match("Albert Einstein developed the theory of relativity".lower().split()))
_report(11, lambda: model.wv.most_similar_cosmul(positive=['scientific', 'physics'], negative=['president']))
_report(12, lambda: model.wv.doesnt_match("Albert Einstein Einstein was affiliated with Princeton University".lower().split()))

# Note that the word vectors are stored in a KeyedVectors instance in model.wv.
# This separates the read-only word vector lookup operations in KeyedVectors
# from the training code in Word2Vec. Look vectors up through model.wv;
# indexing the model itself (model['quantum']) is the deprecated spelling of
# the same lookup.
try:
    print(model.wv['quantum'])  # numpy vector of a word
except KeyError as err:
    print('skipped:', err)


In [None]:
# Step 6. Compute model score

# gensim has currently only implemented score for the hierarchical softmax scheme [Mikolov et al., 2013], 
# so you should have run word2vec with hs=1 and negative=0 for this to work.

# n_iter = sweeps of SGD through the data; more is better (however, it takes a while. Please wait!)
# Deliberately not called `iter`: assigning to that name would shadow the
# builtin iter() for the rest of the notebook session.
n_iter = 5

# we only have scoring for the hierarchical softmax setup
#model = Word2Vec(my_sentences, iter=n_iter, hs=1, negative=0)
#print(model.score(["einstein published more than 300 scientific papers".split()]))