<a href="https://colab.research.google.com/github/ashutoshgithubs/Machine-Learning/blob/main/wordEmbeddingTech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Word Embedding Techniques using Embedding Layer in Keras
#### Steps:


*   Sentences (the corpus)
*   One Hot Encoding (each word -> integer index)
*   Padding (post/pre) -> OHE sequences of equal length
*   OHE -> Vector (Embedding layer)





In [1]:
!pip install tensorflow[and-cuda]

Collecting nvidia-cublas-cu12==12.5.3.2 (from tensorflow[and-cuda])
  Downloading nvidia_cublas_cu12-12.5.3.2-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.5.82 (from tensorflow[and-cuda])
  Downloading nvidia_cuda_cupti_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.5.82 (from tensorflow[and-cuda])
  Downloading nvidia_cuda_nvrtc_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.5.82 (from tensorflow[and-cuda])
  Downloading nvidia_cuda_runtime_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cudnn-cu12==9.3.0.75 (from tensorflow[and-cuda])
  Downloading nvidia_cudnn_cu12-9.3.0.75-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cufft-cu12==11.2.3.61 (from tensorflow[and-cuda])
  Downloading nvidia_cufft_cu12-11.2.3.61-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecti

In [2]:
import tensorflow as tf
# Show the installed TensorFlow version so the run can be reproduced later.
print(tf.__version__)

2.19.0


In [3]:
from tensorflow.keras.preprocessing.text import one_hot

In [4]:
# Toy corpus: seven short sentences that are encoded, padded and embedded below.
sent = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]

In [8]:
# Vocabulary size: the bound handed to one_hot() for the integer word indices,
# and the number of rows of the Embedding layer defined further down.
voc_size = 500

### OHE Representation

In [6]:
# Encode every sentence as a list of integer word indices (one index per word).
# Note: despite its name, one_hot() yields integer indices rather than 0/1
# vectors, and different words may share an index - in the recorded output
# 'of' and 'juice' both map to 132.
ohe_rep = [one_hot(sentence, voc_size) for sentence in sent]

In [7]:
# Display the integer-encoded sentences (one list of word indices per sentence).
ohe_rep

[[35, 34, 132, 333],
 [35, 34, 132, 132],
 [35, 395, 132, 65],
 [493, 494, 266, 116, 82],
 [493, 494, 266, 116, 229],
 [208, 35, 52, 132, 414],
 [483, 433, 92, 116]]

### Word Embedding Representation

In [9]:
from tensorflow.keras.layers import Embedding # word2Vec like
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [10]:
# Pre-pad each encoded sentence with zeros so that every row has the same length.
# sent_length can be any value >= the longest sentence (5 words in this corpus).
sent_length = 7
embedded_docs = pad_sequences(ohe_rep, maxlen=sent_length, padding='pre')
print(embedded_docs)

[[  0   0   0  35  34 132 333]
 [  0   0   0  35  34 132 132]
 [  0   0   0  35 395 132  65]
 [  0   0 493 494 266 116  82]
 [  0   0 493 494 266 116 229]
 [  0   0 208  35  52 132 414]
 [  0   0   0 483 433  92 116]]


### Feature representation

In [14]:
# Embedding dimension: every word index in embedded_docs is mapped to a vector
# with this many components.
dim = 10

In [18]:
# Minimal model: a single Embedding layer that maps each of the `voc_size`
# word indices to a trainable `dim`-dimensional vector.
# `input_length` is intentionally not passed: Keras 3 (the Keras bundled with
# the TensorFlow 2.19 used here) treats it as deprecated and only emits a
# warning; the sequence length is taken from the first batch the model sees.
model = Sequential()
model.add(Embedding(input_dim=voc_size, output_dim=dim))
# compile() is only needed for training; it is kept so the model is ready for
# fit(). The predict() calls in this notebook use the untrained initial weights.
model.compile(optimizer='adam', loss='mse')



In [16]:
# Embed the first sentence only. The slice `[:1]` keeps the batch axis, so the
# model receives one sample of length sent_length; indexing with `[0]` alone
# would pass a 1-D array and Keras would treat every token as its own sample.
# The trailing `[0]` drops the batch axis again, leaving one `dim`-sized
# vector per token of the sentence.
model.predict(embedded_docs[:1])[0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 317ms/step


array([[ 0.00469248, -0.01073547,  0.00895814,  0.00262933,  0.04850277,
        -0.01256697,  0.02386774,  0.00475971, -0.00091059,  0.02498451],
       [ 0.00469248, -0.01073547,  0.00895814,  0.00262933,  0.04850277,
        -0.01256697,  0.02386774,  0.00475971, -0.00091059,  0.02498451],
       [ 0.00469248, -0.01073547,  0.00895814,  0.00262933,  0.04850277,
        -0.01256697,  0.02386774,  0.00475971, -0.00091059,  0.02498451],
       [ 0.01600876,  0.01707646, -0.03869817,  0.04631675, -0.02831711,
        -0.02229309,  0.02499728, -0.01568338, -0.0382818 ,  0.03262076],
       [-0.01663733,  0.02557254, -0.03082478, -0.01626392, -0.02441312,
        -0.02011229, -0.03128134,  0.01284528,  0.04921223, -0.04507161],
       [ 0.04349952,  0.04440976, -0.04505372, -0.01226728,  0.0167048 ,
        -0.01482568,  0.04962078,  0.0367722 ,  0.04173071, -0.01740074],
       [-0.01447834,  0.04014652, -0.039384  , -0.04374974,  0.02345014,
         0.02582233, -0.02236173, -0.0096036 

In [20]:
# Print the layer / parameter overview of the model.
model.summary()

In [19]:
# Embed the whole padded corpus in one call.
# NOTE(review): the result should be (number of sentences, sent_length, dim) - confirm.
model.predict(embedded_docs)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step


array([[[-0.00075148,  0.00397871, -0.01305057, -0.014066  ,
         -0.01240871,  0.0188851 ,  0.01685417,  0.03079201,
          0.04095152,  0.03399625],
        [-0.00075148,  0.00397871, -0.01305057, -0.014066  ,
         -0.01240871,  0.0188851 ,  0.01685417,  0.03079201,
          0.04095152,  0.03399625],
        [-0.00075148,  0.00397871, -0.01305057, -0.014066  ,
         -0.01240871,  0.0188851 ,  0.01685417,  0.03079201,
          0.04095152,  0.03399625],
        [ 0.02164781,  0.04977408, -0.03929586, -0.0015509 ,
          0.04000889,  0.03436973,  0.03555692, -0.0476131 ,
          0.01174231, -0.01067797],
        [ 0.03201317, -0.0369992 ,  0.02483188, -0.02912866,
          0.01881379, -0.03695338,  0.04551147, -0.03383148,
         -0.01536794, -0.00188925],
        [-0.02471271, -0.02866468, -0.02865238, -0.01798248,
         -0.03950733, -0.03802934, -0.01687471, -0.03290422,
          0.02990263, -0.02294538],
        [ 0.01687102,  0.0471635 , -0.0387177 ,  0.0