## Training own Word2Vec

In [None]:
# Toy training corpus: each inner list is one sentence already split into word
# tokens, which is the form passed to Word2Vec as `sentences` below.
tokenized_sentences = [
    sentence.split()
    for sentence in (
        'Hello Welcome to AI Program by John',
        'Hello Glad to have you here',
        'Hello Welcome back',
        'Welcome to DS Program by John',
    )
]

In [None]:
#Training Word2vec model
from gensim.models import Word2Vec

In [None]:
import warnings
# Silence every warning for the rest of the session to keep the cell output short.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Train a Word2Vec model on the toy corpus.
# min_count=1 keeps every word that occurs at least once. With min_count=2,
# only words that occur two or more times in the corpus would be kept.
model = Word2Vec(
    sentences=tokenized_sentences,
    min_count=1,
)

In [None]:
print(model) # Model summary; the size reported here is the number of features (vector length) learned for each word

Word2Vec(vocab=13, size=100, alpha=0.025)


In [None]:
# Collect the vocabulary learned by the model as a plain list of words.
# gensim >= 4 exposes the vocabulary as `wv.key_to_index`; gensim 3.x only has
# `wv.vocab` (removed in 4.0), so use whichever attribute this version provides.
word_vectors = model.wv
if hasattr(word_vectors, 'key_to_index'):
    words = list(word_vectors.key_to_index)
else:
    words = list(word_vectors.vocab)
words

['Hello',
 'Welcome',
 'to',
 'AI',
 'Program',
 'by',
 'John',
 'Glad',
 'have',
 'you',
 'here',
 'back',
 'DS']

In [None]:
# Look up the learned embedding of the word 'AI': one float per feature
# (the vector length is the size reported by print(model)).
# Word vectors live on model.wv; indexing the model object directly is
# deprecated in gensim 3.x and removed in gensim 4.
print(model.wv['AI'])

[-2.8244264e-03 -3.2562399e-03 -3.5934714e-03  1.5879658e-03
  1.2692633e-03  1.6655559e-03  2.0891200e-03 -1.5681548e-03
  1.5032160e-03 -6.7089999e-04  4.6587973e-03  4.0787226e-03
  3.4705335e-03 -4.6739182e-03 -3.8629705e-03  2.9158581e-03
  2.0429835e-04  2.5400750e-03  4.8033674e-03 -4.1345800e-03
 -2.9748254e-03 -3.9139218e-03 -4.8338422e-03 -1.3274958e-03
 -2.2322396e-03  3.7176707e-03 -8.0487435e-04  3.3891585e-03
  5.0769199e-04  1.9545516e-03  3.0044388e-04 -2.2552626e-03
  7.3813484e-04 -5.8766059e-04  4.3856236e-03  2.8983993e-03
 -4.3165432e-03  3.6524332e-03  4.3365667e-03  9.8882755e-04
  3.6278958e-03  4.2415173e-03  2.9795095e-03  1.3953992e-03
 -7.0569447e-05 -1.3833055e-03 -3.0287780e-04 -1.8826313e-03
  4.7156069e-04  3.9645848e-03  1.9722623e-03  3.8490035e-03
  4.1444288e-03  2.2115272e-03  4.1628899e-03  2.2933932e-03
  5.4253032e-04  8.3968951e-04 -1.3282083e-03  2.8861903e-03
 -1.1311959e-03 -3.2007827e-03  7.3696800e-05  2.4951585e-03
  4.7643646e-03 -4.30806

In [None]:
# 'World' never appears in the training sentences, so the model has no vector
# for it and the lookup raises KeyError. Catch and show the error so the
# notebook still runs top to bottom.
try:
    print(model.wv['World'])
except KeyError as err:
    print('KeyError:', err)

KeyError: ignored

In [None]:
# Compare the vector of 'Hello' with every other word in the vocabulary and
# list the closest ones with their similarity scores.
# Called on model.wv because the model-level most_similar is deprecated in
# gensim 3.x and removed in gensim 4.
model.wv.most_similar('Hello')

[('here', 0.08527158200740814),
 ('John', 0.07021407783031464),
 ('to', 0.06822391599416733),
 ('AI', 0.06018120050430298),
 ('DS', 0.05924709141254425),
 ('Program', 0.027920937165617943),
 ('have', 0.016009867191314697),
 ('you', 0.00652676448225975),
 ('Welcome', -0.008957228623330593),
 ('by', -0.018957197666168213)]

In [None]:
# 'Apple' is not in the training vocabulary, so there is nothing to compare
# against and the lookup raises KeyError. Catch and show the error so the
# notebook still runs top to bottom.
try:
    print(model.wv.most_similar('Apple'))
except KeyError as err:
    print('KeyError:', err)

KeyError: ignored

## LP 

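There is a big limitation here: when we train our own model from scratch, it only knows the words it saw during training. The model cannot produce a vector for any new (out-of-vocabulary) word, which is why looking up 'World' or 'Apple' fails with a `KeyError`. That is the main disadvantage of training our own Word2Vec model on a small corpus.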

## -------------------------------------------------------

## Without Word2vec how can we create our own word embeddings?

##### Keras to the rescue: the Keras `Embedding` layer.

In [None]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

In [None]:
# Six short example sentences used as training documents for the classifier
# below; sent_labels supplies one label per sentence, in the same order.
sentences = ['Hello how are you',
             'how are you',
             'whats going on',
             'I am doing great',
             'You are awesome',
             'I really love you so much']

In [None]:
# One binary label per sentence, in order: the first three sentences are
# labelled 1 and the last three are labelled 0.
sent_labels = array([1] * 3 + [0] * 3)

In [None]:
# vocab_size is the size of the integer id space that one_hot maps words into.
# It is NOT the embedding dimension (that is the 8 passed to Embedding later).
# NOTE(review): different words can receive the same id (in the recorded output
# 'I' and 'love' are both 1), so a larger vocab_size reduces such collisions.
vocab_size = 50
encoded_sent = [one_hot(sentence, vocab_size) for sentence in sentences]
encoded_sent

[[34, 20, 35, 22],
 [20, 35, 22],
 [20, 43, 6],
 [1, 42, 22, 35],
 [22, 35, 20],
 [1, 36, 1, 22, 5, 35]]

In [None]:
# The network needs inputs of equal length, so pad every encoded sentence to
# 6 ids (the length of the longest sentence); padding='pre' puts the zeros at the front.
padded_sent = pad_sequences(encoded_sent,maxlen = 6,padding='pre')
print(padded_sent)

[[ 0  0 34 20 35 22]
 [ 0  0  0 20 35 22]
 [ 0  0  0 20 43  6]
 [ 0  0  1 42 22 35]
 [ 0  0  0 22 35 20]
 [ 1 36  1 22  5 35]]


In [None]:
# Small binary classifier built on a trainable embedding layer.
mymodel = Sequential([
    # Learns an 8-dimensional vector for each of the vocab_size possible word ids;
    # input_length=6 is the padded sentence length (6 word ids per sentence).
    Embedding(vocab_size, 8, input_length=6),
    # Flattens the per-word vectors of a sentence into a single vector.
    Flatten(),
    # A single sigmoid unit because the target is a binary label (0 or 1).
    Dense(1, activation='sigmoid'),
])

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss (matching the
# single sigmoid output and 0/1 labels), and accuracy reported as a metric.
mymodel.compile(optimizer='adam',loss = 'binary_crossentropy',metrics = ['accuracy'])

In [None]:
# Train for 30 epochs on the padded sentences and their labels.
mymodel.fit(padded_sent,sent_labels,epochs = 30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fb2d018b9d0>

In [None]:
# Evaluate the model; the result is the pair [loss, accuracy].
# NOTE(review): this scores the same six sentences the model was trained on,
# so the number is not an estimate of performance on unseen text.
mymodel.evaluate(padded_sent,sent_labels,verbose = 1)



[0.6365944147109985, 0.8333333134651184]

In [None]:
# Unpack the (loss, accuracy) pair returned by evaluate and report the
# accuracy as a percentage.
model_loss, model_accuracy = mymodel.evaluate(
    padded_sent,
    sent_labels,
    verbose=1,
)
print('Accuracy : %f' % (model_accuracy * 100))

Accuracy : 83.333331


### The Prediction part

In [None]:
# Two sentences to classify. Both also appear in the training sentences
# (with labels 1 and 0 respectively), so this is a sanity check rather than a test on new text.
sent_for_pred = ['Hello how are you',
                 'I am doing great']

In [None]:
# vocab_size is the size of the integer id space used by one_hot, not the
# embedding dimension. It must equal the value used when encoding the training
# sentences so that each word maps to the same id the model was trained on.
vocab_size = 50
encoded = [one_hot(sentence, vocab_size) for sentence in sent_for_pred]
encoded

[[34, 20, 35, 22], [1, 42, 22, 35]]

In [None]:
# Pad the sentences to predict on to the same length (6) the model was trained with.
# NOTE(review): this rebinds padded_sent, replacing the training matrix; re-running the
# fit/evaluate cells after this one would pair 2 rows with 6 labels. Consider a separate name.
padded_sent = pad_sequences(encoded,maxlen = 6,padding='pre')
print(padded_sent)

[[ 0  0 34 20 35 22]
 [ 0  0  1 42 22 35]]


In [None]:
# Run the trained model on the padded sentences; each row is the sigmoid output
# for one sentence (presumably read as the probability of label 1 - confirm the intended threshold).
predict_x = mymodel.predict(padded_sent)
predict_x


array([[0.5358278 ],
       [0.48555127]], dtype=float32)

## LP
Here the vocabulary and the training set are very small, so the model's performance won't be that great.

## -------------------------------------------------------------

## 3. Using Pretrained Word2vec model

In [None]:
from gensim.models import Word2Vec, KeyedVectors

In [None]:
# Load pretrained word vectors from a binary word2vec-format file.
# The file must already be present in the working directory; otherwise this
# raises FileNotFoundError (as in the recorded output).
pretrained_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary = True)

FileNotFoundError: ignored

In [None]:
# Words most similar to 'data' according to the pretrained vectors.
# The return value has to be assigned: `result` was previously never defined,
# so print(result) raised NameError.
result = pretrained_model.most_similar('data')
print(result)