[Gensim 3.x to 4 migration guide](https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4)

# Word Embedding in Natural Language Processing


In [1]:
# !pip install --upgrade gensim

In [None]:
# from IPython.display import Image
# Image("C:\\Users\\User\\Desktop\\Why.png")

# In this session, I will explain how to use each embedding option in Python, step by step

In [2]:
# imports for the Word2Vec section (stdlib first, then gensim)
import warnings

import gensim
from gensim.models import Word2Vec

# silence every warning category so the cell outputs below stay uncluttered
warnings.filterwarnings('ignore')

In [3]:
# report the installed gensim version
print("version of gensim: ", gensim.__version__)

# output recorded when this notebook was last run:
# version of gensim:  4.2.0

version of gensim:  4.2.0


# Option 1 - Training your own Word2Vec model

In [4]:
# Toy training corpus: each inner list is one already-tokenized sentence.
# NOTE(review): the third sentence carries the multi-word token 'Data Science' next to
# the separate tokens 'Data' and 'Science', and the last sentence ends with an
# empty-string token; both show up in the vocabulary -- confirm this is intended.
tokenized_sentences = [
    ['Hello', 'This', 'is', 'python', 'training', 'by', 'Aman'],
    ['Hello', 'This', 'is', 'Java', 'training', 'by', 'Aman'],
    ['Hello', 'This', 'is', 'Data Science', 'training', 'by', 'Unfold', 'Data', 'Science'],
    ['Hello', 'This', 'is', 'programming', 'training', ''],
]

In [5]:
# Train Word2Vec on the toy corpus. min_count=1 is passed so that tokens seen only once
# are kept (the recorded summary reports all 14 distinct tokens in the vocabulary).
# NOTE(review): no seed/workers arguments are set here, so vectors may differ between
# runs -- confirm whether reproducibility matters.
mymodel = Word2Vec(sentences=tokenized_sentences, min_count=1)

In [6]:
# print a one-line summary of the model trained above (it was trained here, not loaded from disk)

print(mymodel)

Word2Vec<vocab=14, vector_size=100, alpha=0.025>


In [7]:
# gather every vocabulary token (iterating key_to_index yields its keys) into a list

words = [token for token in mymodel.wv.key_to_index]

In [8]:
# display the vocabulary collected in `words`
print(words)

['training', 'is', 'This', 'Hello', 'by', 'Aman', '', 'programming', 'Science', 'Data', 'Unfold', 'Data Science', 'Java', 'python']


In [9]:
# access the word vector for one word, "Hello"

print(mymodel.wv['Hello'])

[-8.2426788e-03  9.2993546e-03 -1.9766092e-04 -1.9672776e-03
  4.6036290e-03 -4.0953159e-03  2.7431131e-03  6.9399667e-03
  6.0654259e-03 -7.5107957e-03  9.3823504e-03  4.6718074e-03
  3.9661191e-03 -6.2435055e-03  8.4599778e-03 -2.1501661e-03
  8.8251876e-03 -5.3620026e-03 -8.1294207e-03  6.8245577e-03
  1.6711927e-03 -2.1985101e-03  9.5135998e-03  9.4938539e-03
 -9.7740479e-03  2.5052286e-03  6.1566923e-03  3.8724565e-03
  2.0227861e-03  4.3050051e-04  6.7363022e-04 -3.8206363e-03
 -7.1402504e-03 -2.0888734e-03  3.9238976e-03  8.8186832e-03
  9.2591504e-03 -5.9759379e-03 -9.4026709e-03  9.7643761e-03
  3.4297847e-03  5.1661157e-03  6.2823440e-03 -2.8042626e-03
  7.3227026e-03  2.8302716e-03  2.8710032e-03 -2.3803711e-03
 -3.1282497e-03 -2.3701428e-03  4.2764354e-03  7.6057913e-05
 -9.5842788e-03 -9.6655441e-03 -6.1481954e-03 -1.2856961e-04
  1.9974159e-03  9.4319675e-03  5.5843499e-03 -4.2906976e-03
  2.7831554e-04  4.9643586e-03  7.6983096e-03 -1.1442233e-03
  4.3234206e-03 -5.81438

In [10]:
# integer index of the token 'Aman' in the model's key -> index mapping
mymodel.wv.key_to_index['Aman']

5

In [12]:
# 'count' attribute stored for 'Aman' -- presumably its frequency in the training corpus
# ('Aman' appears in two of the sentences)
mymodel.wv.get_vecattr('Aman', 'count')

2

In [13]:
# 'count' attribute stored for 'Hello' -- presumably its frequency in the training corpus
# ('Hello' opens all four sentences)
mymodel.wv.get_vecattr('Hello', 'count')

4

In [14]:
# number of entries held by the keyed vectors, i.e. the vocabulary size
len(mymodel.wv)

14

In [15]:
# find the words most similar to the word "Data"
# If the word is not present in the vocabulary, this call will throw an error

mymodel.wv.most_similar("Data")

[('training', 0.21617144346237183),
 ('Science', 0.04467388987541199),
 ('Unfold', 0.015019134618341923),
 ('', 0.0019510614220052958),
 ('This', -0.032843127846717834),
 ('Java', -0.04568908363580704),
 ('Data Science', -0.07425408065319061),
 ('programming', -0.09326909482479095),
 ('is', -0.09575393050909042),
 ('Aman', -0.10513805598020554)]

## Option 2 - Creating an embedding model with the Keras Embedding layer

In [17]:
# NumPy and Keras building blocks used by the embedding-layer example.
# Embedding is imported from the public `keras.layers` namespace rather than the
# internal `keras.layers.embeddings` module, whose path is not stable across Keras releases.
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten
import keras

In [18]:
# report the installed keras version
print("keras version: ", keras.__version__)

# output recorded when this notebook was last run:
# keras version:  2.8.0

keras version:  2.8.0


In [19]:
# six short example sentences used as training documents;
# their class labels are kept separately in sent_labels

Sent = [
    'Hello, how are you',
    'how are you',
    'how are you doing',
    'I am doing great',
    'I am doing good',
    'I am good',
]

In [20]:
# class labels, one per sentence in Sent and in the same order
# (first three sentences -> 1, last three -> 0)

sent_labels = array([1,1,1,0,0,0])

In [21]:
# Integer-encode each document: one_hot turns every word into an integer index.
# NOTE: distinct words can share an index -- in the output recorded below,
# 'you' and 'doing' both map to 5 in the third sentence.

my_vocab_size = 30

encoded_sent = [one_hot(sentence, my_vocab_size) for sentence in Sent]

print(encoded_sent)

[[13, 14, 21, 5], [14, 21, 5], [14, 21, 5, 5], [9, 22, 5, 2], [9, 22, 5, 27], [9, 22, 27]]


In [22]:
# pad every encoded document to a fixed length of 5;
# with padding='pre' the zeros are inserted at the front of each sequence (see the printed matrix)

length = 5

padded_sent = pad_sequences(encoded_sent, maxlen=length, padding='pre')

print(padded_sent)

[[ 0 13 14 21  5]
 [ 0  0 14 21  5]
 [ 0 14 21  5  5]
 [ 0  9 22  5  2]
 [ 0  9 22  5 27]
 [ 0  0  9 22 27]]


In [24]:
# defining the model: embedding -> flatten -> single sigmoid unit
# NOTE: this rebinds `mymodel`, which earlier in the notebook referred to the Word2Vec model

mymodel = Sequential([
    Embedding(my_vocab_size, 8, input_length=length),  # 8-dimensional vector per word index
    Flatten(),
    Dense(1, activation='sigmoid'),
])

In [25]:
# compiling the model: Adam optimizer, binary cross-entropy loss (pairs with the single
# sigmoid output), and accuracy as the reported metric
mymodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [27]:
# fitting the model for 30 epochs on the padded sentences
mymodel.fit(padded_sent, sent_labels, epochs=30)

# evaluate the model
# NOTE: evaluation reuses the training data (same padded_sent / sent_labels), so the
# reported accuracy measures fit to the training set, not generalization
modelloss, modelaccuracy = mymodel.evaluate(padded_sent, sent_labels, verbose=0)
print('Accuracy: %f' % (modelaccuracy*100))  # accuracy shown as a percentage

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Accuracy: 100.000000


# The Prediction part

In [28]:
# new sentences to score with the trained classifier
mysent_to_predict = [
    'how are you Suman',
    'I am good',
]

In [29]:
# Integer-encode the new sentences.
# The vocabulary size is taken from my_vocab_size (the value used to encode the training
# documents and to size the Embedding layer) instead of repeating the literal, so the
# prediction inputs cannot drift out of sync with what the model was built for.
vocab_size = my_vocab_size
encoded = [one_hot(d, vocab_size) for d in mysent_to_predict]
print(encoded)


[[14, 21, 5, 23], [9, 22, 27]]


In [30]:
# Pad the new sentences to the same sequence length the model was trained with.
# max_length reuses `length` (the value passed as input_length to the Embedding layer)
# rather than a second hard-coded 5, so the two can never disagree.
max_length = length
mypadded = pad_sequences(encoded, maxlen=max_length, padding='pre')
print(mypadded)

[[ 0 14 21  5 23]
 [ 0  0  9 22 27]]


In [32]:
# sigmoid outputs of the classifier, one row per sentence in mysent_to_predict
mymodel.predict(mypadded)

array([[0.50674343],
       [0.48089656]], dtype=float32)