# Word Embedding In Natural language Processing


In [1]:
# A word embedding is a learned representation for text where words that have the same meaning have a similar representation

In [2]:
# Word Embedding - 
# 1. To provide a meaningful vector representation of the word
# 2. This is an improvement over frequency based approaches


# Working principle behind word embedding:
#  - while Training, meaning of the word is learned by analyzing its neighbors


# Usages:
# 1. we can use pre-trained models( Word2Vec, glove)
# 2. We can also train our own embedding using word2vec library.
# 3. training of our own embedding is also possible using Keras embedding method


# skip-gram vs cbow (continuous bag of words): While a bag-of-words model predicts a word 
#                   given the neighboring context, a skip-gram model predicts the context (or neighbors) of a word, given the word itself

In [None]:
# Word2Vec: https://wiki.pathmind.com/word2vec

# word2vec trains words against other words that neighbor them in the input corpus.

# It does so in one of two ways, 
# either using context to predict a target word (a method known as continuous bag of words, or CBOW), 
# or using a word to predict a target context, which is called skip-gram. it produces more accurate results on large datasets.

# Option 2 - Training own word2vec model

In [3]:
# define tokenized senences as training data
tokenized_sentences = [['Hello','This','is','python','training','by','Aman'],
             ['Hello','This','is','Java','training','by','Aman'],
             ['Hello','This','is','Data Science','training','by','Unfold','Data','Science'],
             ['Hello','This','is','programming','training','']]

# '' - this will also be considered as word

In [4]:
# training word2vec model : 2. We can also train our own embedding using word2vec library.
from gensim.models import Word2Vec
import warnings
warnings.filterwarnings('ignore')
mymodel = Word2Vec(tokenized_sentences, min_count=1) # min_count = 1 means consider words even if it occurred only once.

In [6]:
# summarizing the loaded model
print(mymodel)

# Observation on below output
# vocab - the total number of unique words
# size - each of the below 14 vocab will be rated based on 100 features like for eg: does_bark, has_mouth, etc.

Word2Vec(vocab=14, size=100, alpha=0.025)


In [7]:
# summarize vocabulary
words = list(mymodel.wv.vocab)

In [8]:
# summarize vocabulary
print(words)

['Hello', 'This', 'is', 'python', 'training', 'by', 'Aman', 'Java', 'Data Science', 'Unfold', 'Data', 'Science', 'programming', '']


In [10]:
# access word vector for one word "Hello"
print(mymodel['Hello'])

[ 2.6415866e-03 -1.3855502e-03 -3.8600569e-03 -4.9171047e-03
  6.8308087e-04  2.3193758e-03 -1.0291578e-03  2.3272699e-03
 -1.9845173e-03  1.5841145e-03  4.4977646e-03 -3.7625260e-03
  3.2597310e-03 -2.4843819e-03  2.6844600e-03 -5.3768087e-04
 -2.1219174e-03 -4.0911287e-03 -3.9883726e-03 -4.8130397e-03
 -3.8309779e-03  2.7398660e-03  4.6871815e-04 -1.8282381e-03
 -1.4734179e-03  2.4689538e-03  4.0270435e-03 -3.2605149e-03
  2.9342817e-03  2.7687694e-03 -1.6917753e-04  3.6806199e-03
 -2.5652875e-03  2.0396789e-03  4.1413107e-03  8.6929744e-05
 -4.6985513e-03 -1.7085299e-03  6.8564975e-04  6.5581390e-04
  1.9432650e-03  2.1717893e-03  4.8512691e-03  4.6147895e-03
 -1.2325962e-03 -6.6940986e-05 -2.4595321e-03  3.5173693e-03
  2.5457388e-03 -2.5102813e-03  2.2696787e-03  1.8118306e-03
 -9.0079533e-04 -3.9707045e-03  2.4799458e-03  4.1773377e-04
  4.4813706e-03 -1.6630677e-03 -1.2754368e-03  4.1415193e-03
  1.1745522e-03  1.9707424e-03 -4.8856792e-04 -4.3606777e-03
  3.7441468e-03 -4.85876

In [11]:
#try finding most similar words for word "Data"
mymodel.most_similar("Data")

[('programming', 0.13299772143363953),
 ('training', 0.08566539734601974),
 ('', 0.04990440607070923),
 ('Unfold', 0.03425639122724533),
 ('Java', 0.02375667728483677),
 ('is', -0.004918448626995087),
 ('Hello', -0.02265256643295288),
 ('by', -0.028853053227066994),
 ('Science', -0.04116513580083847),
 ('python', -0.05294158309698105)]

In [12]:
#try finding most similar words for word "data", to check case sensitivity
mymodel.most_similar("data")

# so if we try to find the most_similar matrix for a word not present in the training set (case insensitive), we will get error

KeyError: ignored

In [13]:
### Drawback of 2. We can also train our own embedding using word2vec library.
# as we have supplied limited words, training quality is not good enough

# First Task for you - comment me how you can solve a document classification problem using above concept


In [None]:
#####################################Part 2###################################################

# Option 3 - Create Embedding model using Keras Embedding

In [23]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding # This layer can only be used as the first layer in a model.
# define documents
Sent = ['Hello, how are you',
        'how are you',
        'how are you doing',
        'I am doing great',
        'I am doing good',
        'We are good']

In [24]:
# defining class labels: first 3 docs as class-1, next 3 docs as class-0
sent_labels = array([1,1,1,0,0,0])

In [25]:
# integer encoding of the documents
my_vocab_size = 30 # here we want the vocab size (in last eg it was 100 by default) to be 30
encoded_sent = [one_hot(i, my_vocab_size) for i in Sent] # one hot encoding of sentences
print(encoded_sent)

# Observation: 'are' has been encoded with 3, but 'i' is also getting represented by 3

[[29, 22, 3, 13], [22, 3, 13], [22, 3, 13, 8], [3, 5, 8, 27], [3, 5, 8, 26], [23, 3, 26]]


In [26]:
# padding documents to a max length =5 
# This is to make the 'encoded_sent' vector nXn matrix.
length = 5
padded_sent = pad_sequences(encoded_sent, maxlen=length, padding='pre') # max_length is 5, to get a 5X5 matrix. and 0s will be added to the beginning
print(padded_sent)

[[ 0 29 22  3 13]
 [ 0  0 22  3 13]
 [ 0 22  3 13  8]
 [ 0  3  5  8 27]
 [ 0  3  5  8 26]
 [ 0  0 23  3 26]]


In [31]:
# defining the model
mymodel = Sequential() # defining a sequential model
mymodel.add(Embedding(my_vocab_size, 8, input_length=length)) # adding an embedding layer
# my_vocab_size - size of my vocabulary
# 8 - seems like output dimension
# max_length - the padding size, n in nXn matrix
mymodel.add(Flatten())
mymodel.add(Dense(1, activation='sigmoid')) # output layer


In [32]:
# compiling the model
mymodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# fiting  the model
mymodel.fit(padded_sent, sent_labels, epochs=30)

# evaluate the model
modelloss, modelaccuracy = mymodel.evaluate(padded_sent, sent_labels, verbose=0)
print('Accuracy: %f' % (modelaccuracy*100))

# here the accuracy is 100%

# The Prediction part

In [39]:
mysent_to_predict = ['how are you Suman',
        'I am good']

# question should be classified as 1, and answer should be classified as 0

In [40]:
# integer encode the documents
vocab_size = 30
encoded = [one_hot(d, vocab_size) for d in mysent_to_predict]
print(encoded)


[[22, 3, 13, 29], [3, 5, 26]]


In [41]:
# pad documents to a max length of 5 words
max_length = 5
mypadded = pad_sequences(encoded, maxlen=max_length, padding='pre')
print(mypadded)

[[ 0 22  3 13 29]
 [ 0  0  3  5 26]]


In [43]:
mymodel.predict_classes(mypadded)

# Here even 'suman' is a new word, the model didn't gave any error like the earlier one

array([[1],
       [0]], dtype=int32)

# Option 1 - Using Pre Trained Word2Vec Model

In [44]:
from gensim.models import KeyedVector   
import os 
os.chdir("F:\\")

ImportError: ignored

In [None]:
#Download GoogleNews-vectors-negative300.bin from ; this is a pre-trained model
#https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz

In [None]:
PreTrainedModel = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)

In [None]:
# calculate: (king - man) + woman = ?
result = PreTrainedModel.most_similar("Data")
print(result)

MemoryError: 