In [2]:

# coding: utf-8

# # Creating Word2Vec model
# ## Objective: Load given file remove stopwords, create/update a model and save as binary file.
# 
# 

# 1. Import the required packages.
# 2. create a class: vocabulary and files to be parsed.
# 3. Create iterator that can read each line and removes stopwords.
#    Use stemmer to stem english words.
#    The length of the word should be b/w 2 to 15.
#    All the words in the file 'stopwords.txt' will be removed.   
# 4. 

# In[52]:


# contails word2vec model
import gensim
from gensim.models import Word2Vec

# help in reading files from system
import os
import os.path

# regular expression to get words from line.
import re

# natural language processor
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer

# pandas to load file
import pandas as pd

# to get best feature representation
from sklearn.decomposition import PCA

# to get list of files from command line
import argparse

In [59]:
stemmer = SnowballStemmer('english')

# load file which contains words that should not be a part of word2vec model
removeWords = pd.read_csv('stopwords.txt').values


# Create class to capture file and its text.
# 

# In[41]:


class WordsOfFile(object):
    def __init__(self, fileNames):
        self.fileNames = fileNames
        self.vocabulary = set([])        
        print('Files being read are: ', fileNames)
        
    def __iter__(self):
        for fileName in self.fileNames:
            print('Processing file: ', fileName)
            
            # reading each line of the file
            for line in open(fileName, encoding='latin1'):
                # get list of all words of length b/w 2 to 15
                words = re.findall(r'(\b[A-Za-z][a-z]{2,15}\b)', line)
                #skip all removeable words. reamining words should be stemmed.
                words = [stemmer.stem(word.lower()) for word in words if not word.lower() in removeWords]
                # add all the words in set. using generator as its more memory efficient
                for word in words:
                    self.vocabulary.add(word)
                    
                # yield the vocabulary. this will let you iterate over list of words just once but in every efficient way.
                yield words
        


# Load the file and pass it to the WordsOfFile class. 
# 
#     This will initialize vocabulary and run iterator on it.
#     We can pass multiple files too. keep the size small of you want to see the model generated after each file. 
#     
# 

# In[42]:


wordsOfFile = WordsOfFile(["MB1.txt"])



Files being read are:  ['MB1.txt']


In [66]:
wordsOfFile = WordsOfFile(["EsContent.txt"])

Files being read are:  ['EsContent.txt']


In [69]:
wordsOfFile.vocabulary

set()

In [61]:
word2VecModel = gensim.models.FastText(wordsOfFile, min_count=100, size=5, workers=4, min_alpha=2.0, max_n=7)
    
    # save the model to the disk in binary format.
word2VecModel.save('word2VecModel2.bin')

Processing file:  MB1.txt


RuntimeError: you must first build vocabulary before training the model

In [43]:

print("Yes, word2VecModel.bin Model exists. Will update the existing model with new file")
model = Word2Vec.load("word2VecModel.bin")
model.build_vocab(wordsOfFile,update=True)
model.train(wordsOfFile)
model.save('word2VecModel.bin')

Yes, word2VecModel.bin Model exists. Will update the existing model with new file
Processing file:  MB.txt


ValueError: You must specify either total_examples or total_words, for proper job parameters updationand progress calculations. The usual value is total_examples=model.corpus_count.

In [46]:
print("Yes, word2VecModel.bin Model exists. Will update the existing model with new file")
model = Word2Vec.load("word2VecModel.bin")
model.build_vocab(wordsOfFile,update=True)

Yes, word2VecModel.bin Model exists. Will update the existing model with new file
Processing file:  MB.txt


In [48]:
model.train(wordsOfFile,epochs=3,total_examples=model.corpus_count)

Processing file:  MB.txt
Processing file:  MB.txt
Processing file:  MB.txt


In [52]:
model.wv.most_similar("kshatriya")

[('immeasur', 0.9975221157073975),
 ('possess', 0.9953098893165588),
 ('eleph', 0.9914615750312805),
 ('energi', 0.9887579679489136),
 ('herb', 0.9857688546180725),
 ('instrument', 0.9856622219085693),
 ('torrent', 0.9846266508102417),
 ('princip', 0.9820976853370667),
 ('extrem', 0.9813475012779236),
 ('stupefi', 0.980380654335022)]

In [45]:
model.wv.most_similar("arjun")

[('arjuna', 0.9973067045211792),
 ('tbe', 0.9893862009048462),
 ('effulg', 0.9788151383399963),
 ('ala', 0.968237042427063),
 ('high', 0.9638353586196899),
 ('blare', 0.9238641262054443),
 ('look', 0.9220253825187683),
 ('injur', 0.9175955057144165),
 ('blaze', 0.9165440797805786),
 ('blood', 0.9079023599624634)]