<a href="https://colab.research.google.com/github/a-tab-sys/Natural-Language-Processing-NLP/blob/master/04%20Word_embedding_Techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Word Embedding Techniques using Embedding Layer in Keras

### Libraries USed Tensorflow> 2.0  and keras

In [None]:
# this is no longer applicable:
# pip uninstall tensorflow-gpu
# error is related to compatibility issues with the tensorflow-gpu package
# the main cause is that tensorflow-gpu is deprecated starting from TensorFlow 2.1 and is now merged into the main tensorflow package.

In [23]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [29]:
# verify GPU support after installation, you can check if TensorFlow is recognizing your GPU
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print(tf.__version__)

Num GPUs Available:  0
2.19.0


In [31]:
# library helps you perform one hot encoding
from tensorflow.keras.preprocessing.text import one_hot

In [50]:
### sentences
# our sentance size is not consistant, some sentances have 4 words, some have 5
# when we are training a neural network, your sentance, input size have to be fixed
# to fix this issue we will use post and pre padding
sent=[  'the glass of milk',
     'the glass of juice',
     'the cup of tea',
    'I am a good boy',
     'I am a good developer',
     'understand the meaning of words',
     'your videos are good']

In [52]:
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

#### Vocabulary size

In [55]:
# have to specify vocabulary size - arbitrary value, could be set to anything
# larger vocablary size gives you larger feature representation
voc_size=500

#### One Hot Representation

In [58]:
# captures indexes
# each vector list is the OHE version of the sentance.
# for instance in "the glass of milk", "the" is in the index position 180, "glass" is in the index position 405
onehot_repr=[one_hot(words,voc_size)for words in sent]
print(onehot_repr)

[[59, 91, 161, 276], [59, 91, 161, 478], [59, 195, 161, 38], [431, 419, 151, 203, 366], [431, 419, 151, 203, 147], [332, 59, 331, 161, 277], [280, 215, 294, 203]]


#### Word Embedding Representation

In [62]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [64]:
import numpy as np

In [66]:
### pre padding
# from our dataset, our largest sentance is actually 5 words but to understand padding, lets set this to 8
# so the pad-sequesnce will make sure that wherever the OHE representation of our vector is 4 or 5, it is going to make it 8
# it is gonna do this using padding- here we are using pre secifically. if we use post, 0's will be added to the end
sent_length=8
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[  0   0   0   0  59  91 161 276]
 [  0   0   0   0  59  91 161 478]
 [  0   0   0   0  59 195 161  38]
 [  0   0   0 431 419 151 203 366]
 [  0   0   0 431 419 151 203 147]
 [  0   0   0 332  59 331 161 277]
 [  0   0   0   0 280 215 294 203]]


In [68]:
### feature dimensions
# now for example with this sentance: [  0   0   0   0  59  91 161 276]
# so for each and every value in the above sentance, we will provide that in the form of feature representation
# we are setting our feature representaiton size as 10
dim=10
# so esscentially the example the first 0 in our sentance above would be represented by 10 values. THIS IS ESSCENTIALLY WOD2VEC
# [ 0.4
    0.5
    045
    1.2
    2.4
    0.76
    0.78
    1.3
    3.2
    0   ]


In [None]:
model=Sequential()
model.add(Embedding(voc_size,10,input_length=sent_length))
# the embedding layer below works similar to word2vec, it will train our neural network
# arguments include <voc size, how many features per vector (could be dim varibale defined above), specify input length which is our sentance length, which is 8>
model.compile('adam','mse')
# compiling with adam and qaamean square error, and taking the loss function

In [72]:
model.summary()

In [None]:
# 'the glass of milk',
embedded_docs[0]

array([  0,   0,   0,   0,  59,  91, 161, 276])

In [78]:
model.predict(embedded_docs[0])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step


array([[ 0.00298679,  0.02932216,  0.02459887, -0.01831581, -0.03383742,
         0.02392096, -0.04072629,  0.00456358, -0.04682035, -0.0052909 ],
       [ 0.00298679,  0.02932216,  0.02459887, -0.01831581, -0.03383742,
         0.02392096, -0.04072629,  0.00456358, -0.04682035, -0.0052909 ],
       [ 0.00298679,  0.02932216,  0.02459887, -0.01831581, -0.03383742,
         0.02392096, -0.04072629,  0.00456358, -0.04682035, -0.0052909 ],
       [ 0.00298679,  0.02932216,  0.02459887, -0.01831581, -0.03383742,
         0.02392096, -0.04072629,  0.00456358, -0.04682035, -0.0052909 ],
       [-0.00902073, -0.04047633, -0.04965064, -0.01325365, -0.03963579,
        -0.0161491 ,  0.00109575, -0.02132442,  0.03011035,  0.04604227],
       [-0.01142861, -0.02867019,  0.01654385,  0.04083909,  0.04539467,
         0.0361747 ,  0.04886154, -0.00536014,  0.034936  , -0.04806005],
       [ 0.02368667,  0.03151972, -0.02726269, -0.04854659, -0.03845816,
         0.01311852,  0.00633377, -0.04570239

In [None]:
print(model.predict(embedded_docs))

[[[ 0.03938437 -0.02009605 -0.03878935 -0.04955565  0.00419912
   -0.01431773  0.02523251  0.01653036  0.04291571 -0.00864979]
  [ 0.03938437 -0.02009605 -0.03878935 -0.04955565  0.00419912
   -0.01431773  0.02523251  0.01653036  0.04291571 -0.00864979]
  [ 0.03938437 -0.02009605 -0.03878935 -0.04955565  0.00419912
   -0.01431773  0.02523251  0.01653036  0.04291571 -0.00864979]
  [ 0.03938437 -0.02009605 -0.03878935 -0.04955565  0.00419912
   -0.01431773  0.02523251  0.01653036  0.04291571 -0.00864979]
  [ 0.03059326 -0.04286614  0.00899569  0.00743791 -0.000781
    0.04186494  0.03977301  0.00326709  0.00619651 -0.01993654]
  [ 0.02512412 -0.0087087   0.03144198  0.00704668 -0.00177735
   -0.03415867 -0.00100178  0.01562483  0.03178963  0.02784893]
  [-0.00653008  0.02340979 -0.01967902 -0.00494973 -0.02693756
   -0.03746525  0.01460877 -0.00449115 -0.00130982 -0.0039017 ]
  [-0.03150218  0.01950303 -0.01415605 -0.00183152  0.01207731
    0.02444079  0.0140041   0.0070256   0.04950741

In [None]:
embedded_docs[0]

array([   0,    0,    0,    0, 6654,  998, 8966, 1609])

In [None]:
print(model.predict(embedded_docs)[0])

[[-0.00425554 -0.00159295 -0.04714153  0.04425247 -0.00973954 -0.04325813
   0.04007108 -0.0143286  -0.03659749 -0.02379028]
 [-0.00425554 -0.00159295 -0.04714153  0.04425247 -0.00973954 -0.04325813
   0.04007108 -0.0143286  -0.03659749 -0.02379028]
 [-0.00425554 -0.00159295 -0.04714153  0.04425247 -0.00973954 -0.04325813
   0.04007108 -0.0143286  -0.03659749 -0.02379028]
 [-0.00425554 -0.00159295 -0.04714153  0.04425247 -0.00973954 -0.04325813
   0.04007108 -0.0143286  -0.03659749 -0.02379028]
 [-0.03786323 -0.02628061  0.02974111 -0.03307171  0.0271405   0.00945134
   0.02378127  0.04176904  0.00514941  0.0152082 ]
 [ 0.04834186  0.04388311 -0.02802253 -0.01475487 -0.01212303  0.03762435
  -0.01166249 -0.02141088  0.04654533  0.01537322]
 [ 0.03276015 -0.00637691  0.03907344 -0.01912468  0.02177186 -0.04630325
   0.00800942 -0.03115667 -0.00486455 -0.04843524]
 [-0.04173617  0.03438064  0.02880521 -0.01896455  0.0323303  -0.00109453
  -0.01675171 -0.00941917 -0.03309294 -0.04779492]]

In [None]:
### Assignment
# go to kaggle, get the imdb 50k movie review dataset, and convert to vectors
# also take up the sentances below, and convert to vectors

sent=["The world is a better place",
      "Marvel series is my favourite movie",
      "I like DC movies",
      "the cat is eating the food",
      "Tom and Jerry is my favourite movie",
      "Python is my favourite programming language"
      ]