## Word2Vec using NLTK

Training a word2vec model from scratch requires a very large amount of data, which is why such models work best when trained on the Google News or Wikipedia corpora. Those corpora are too large to train on a PC with 8GB-16GB of RAM, so I am using the <b>nltk brown corpus</b> for training instead. It has about <b>1.2 million words</b> and the results are good.

#### Import Libraries

In [41]:
import numpy as np
print('Numpy Version '+np.__version__)
import pandas as pd
print('Pandas Version '+pd.__version__)
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
print('Tensorflow Version '+tf.__version__)
from IPython.display import Image # To view image from location/url
import keras
print('Keras Version '+keras.__version__)
import nltk
import logging
import multiprocessing
import re
import os
import gensim
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import gensim.models.keyedvectors as word2vec
from nltk.corpus import brown, movie_reviews, treebank
#nltk.download('brown')
#nltk.download('movie_reviews')
#nltk.download('treebank')
#nltk.download('punkt')

# Directories for input data and generated output files.
# Each output directory is the "outDir/" sub-folder of its input directory.
inputDir = "./data/"
outputDir = inputDir + "outDir/"
inputDir_BigFiles = "E:/BigFiles/"
outputDir_Bigfiles = inputDir_BigFiles + "outDir/"

Numpy Version 1.12.1
Pandas Version 0.20.3
Tensorflow Version 1.1.0
Keras Version 2.1.3


#### Train/Load & Save NLTK model

In [25]:
# Train one Word2Vec model per NLTK corpus with gensim's default
# hyper-parameters; each `.sents()` call supplies the corpus sentences.
brown_model = Word2Vec(brown.sents())
movie_reviews_model = Word2Vec(movie_reviews.sents())
treebank_model = Word2Vec(treebank.sents())

# Create the output directory up front so that saving does not fail on a
# fresh checkout where it does not exist yet.
os.makedirs(outputDir, exist_ok=True)

# Only the Brown model is written to disk; the other two stay in memory.
brown_model_path = outputDir + "brown_model"
brown_model.save(brown_model_path)

# File size reported in kilobytes (bytes / 1000), rounded to one decimal.
size = str(round((os.path.getsize(brown_model_path) / 1000), 1))
print('Brown corpus model saved at ' + outputDir + '\nSize of file :' + size + ' KB')

Brown corpus model saved at ./data/outDir/
Size of file :19197.9 KB


#### Test NLTK model

In [6]:
# Brown
# Query through `.wv` (the model's word vectors): calling most_similar /
# doesnt_match directly on the Word2Vec model is deprecated in gensim.
brown_vectors = brown_model.wv
# five nearest neighbours of 'money'
print(brown_vectors.most_similar('money', topn=5))
#find the odd one out
print(brown_vectors.doesnt_match("breakfast cereal dinner lunch".split()))
#vector representation of word human
#print (brown_vectors["human"])

[('care', 0.9140259027481079), ('chance', 0.9021503925323486), ('job', 0.8922946453094482), ('trouble', 0.8632405996322632), ('easy', 0.860753059387207)]
cereal


In [7]:
# Movie Reviews
# Query through `.wv` (the model's word vectors): calling most_similar /
# doesnt_match directly on the Word2Vec model is deprecated in gensim.
movie_reviews_vectors = movie_reviews_model.wv
# five nearest neighbours of 'money'
print(movie_reviews_vectors.most_similar('money', topn=5))
#find the odd one out
print(movie_reviews_vectors.doesnt_match("girl boy her movie".split()))
#vector representation of word human
#print (movie_reviews_vectors["human"])

[('attention', 0.7723175287246704), ('eyes', 0.7617353200912476), ('chance', 0.7536006569862366), ('him', 0.7455449104309082), ('home', 0.7288988828659058)]
movie


In [8]:
# Treebank
# Query through `.wv` (the model's word vectors): calling most_similar /
# doesnt_match directly on the Word2Vec model is deprecated in gensim.
treebank_vectors = treebank_model.wv
# five nearest neighbours of 'money'
print(treebank_vectors.most_similar('money', topn=5))
#find the odd one out
print(treebank_vectors.doesnt_match("financial money earning fat".split()))
#vector representation of word human
#print (treebank_vectors["human"])

[('all', 0.9999011158943176), ('only', 0.9998963475227356), ('new', 0.9998949766159058), ('some', 0.999890923500061), ("'", 0.9998877048492432)]
money


In [27]:
def train_model(inp, out, type=0):
    '''
    Train a Word2Vec model on a text dataset and save it to disk.

    inp  : Input Dataset (handed to gensim's LineSentence; presumably a
           text file with one whitespace-tokenised sentence per line)
    out  : Output Model (location the trained model is saved to)
    type : 0(default) for CBOW & 1 for Skipgram; passed to Word2Vec as `sg`.
           The name shadows the builtin but is kept so that existing
           keyword callers keep working.

    Returns the trained model, so callers can use it without reloading
    it from `out`.
    '''
    logger = logging.getLogger("word2vect-training")
    # Timestamped log format; the root level is raised to INFO so that
    # INFO-level records (including the ones below) are shown.
    logging.basicConfig(format="%(asctime)s:%(levelname)s:%(message)s")
    logging.root.setLevel(level=logging.INFO)

    logger.info("Training Word2Vec model on %s", inp)
    model = Word2Vec(
        LineSentence(inp),
        size=100,
        window=5,
        min_count=5,
        workers=multiprocessing.cpu_count(),  # one worker thread per CPU core
        sg=type,
    )
    # NOTE: per the gensim documentation, replace=True keeps only the
    # L2-normalised vectors, so the saved model cannot be trained further.
    model.init_sims(replace=True)
    model.save(out)
    logger.info("Word2Vec model saved to %s", out)
    return model

#### Train/Load & Save GloVe model

In [42]:
# `import gensim` alone is not guaranteed to load the `gensim.scripts`
# sub-package, so import the converter explicitly instead of reaching it
# through attribute access.
from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_path = inputDir_BigFiles + "glove.6B/glove.6B.50d.txt"
word2vec_glove_path = outputDir_Bigfiles + "word2vec_glove_file"

# Convert glove_input_file in GloVe format into word2vec_output_file in word2vec format.
glove2word2vec(glove_input_file=glove_input_path,
               word2vec_output_file=word2vec_glove_path)

# Load GloVe: Global Vectors for Word Representation Word2Vec model.
glove_model = word2vec.KeyedVectors.load_word2vec_format(word2vec_glove_path)
# Also store the vectors in gensim's own format.
glove_model.save(outputDir_Bigfiles + "word2vec_glove_model")

#### Test GloVe model

In [43]:
# Five nearest neighbours of 'money' in the GloVe vectors
money_neighbours = glove_model.most_similar('money', topn=5)
print(money_neighbours)
# Which of the four words does not belong with the others
meal_words = "breakfast cereal dinner lunch".split()
print(glove_model.doesnt_match(meal_words))

[('cash', 0.8989869356155396), ('paying', 0.8788655996322632), ('funds', 0.8768925070762634), ('pay', 0.8716533184051514), ('raise', 0.8444491028785706)]
cereal


In [31]:
# Load Google's pre-trained Word2Vec vectors from the binary word2vec file.
google_vectors_path = 'D:/Drive/BigFiles/GoogleNews-vectors-negative300.bin'
google_model = word2vec.KeyedVectors.load_word2vec_format(google_vectors_path, binary=True)

In [None]:
# Store the loaded Google vectors via the model's own save method.
google_model_out = "D:/Drive/BigFiles/Google/google_word-vec_out.syn0.npy"
google_model.save(google_model_out)
print("Google corpus model saved.")



In [10]:
# Bare expression: the notebook displays which module backs os.path here.
os.path

<module 'ntpath' from 'C:\\Users\\ankit.bhatia\\AppData\\Local\\Continuum\\anaconda3\\envs\\tensor\\lib\\ntpath.py'>

In [11]:
import socket
# Name of the machine this notebook is running on.
host_name = socket.gethostname()
# Bare expression so the notebook displays the value.
host_name

'IRIS-CSG-1387'