In [1]:
import tensorflow as tf

In [2]:
# Record the TensorFlow release this notebook was executed with.
print(tf.version.VERSION)

2.12.0


In [3]:
### Sentences
# Small toy corpus used to demonstrate one-hot indexing, padding and embeddings.
sent = [
    # beverages
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    # self-descriptions
    'I am a good boy',
    'I am a good developer',
    # miscellaneous
    'understand the meaning of words',
    'your videos are good',
]
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

### One-hot representation

In [4]:
from tensorflow.keras.preprocessing.text import one_hot

# Size of the index space the words are hashed into. Per the Keras docs,
# one_hot() is a hashing trick: different words can collide on the same index,
# and the default hash is not guaranteed to be stable across interpreter runs,
# so the exact numbers below may differ on a fresh kernel.
vocab_size = 300

# Encode every sentence as a list of integer word indices, i.e. the position
# that would hold the 1 in each word's one-hot vector.
onehot_repr = [one_hot(sentence, n=vocab_size) for sentence in sent]
onehot_repr

[[33, 38, 207, 28],
 [33, 38, 207, 30],
 [33, 248, 207, 282],
 [36, 147, 76, 37, 247],
 [36, 147, 76, 37, 217],
 [146, 33, 286, 207, 87],
 [282, 276, 75, 37]]

### Word Embedding 

In [5]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
import numpy as np

In [6]:
### Pre-padding
# Every index list is left-filled with zeros so that all rows share one length.
max_sent_length = 9  # common length of each padded sentence
pre_embedded_docs = pad_sequences(onehot_repr, maxlen=max_sent_length, padding='pre')
pre_embedded_docs

array([[  0,   0,   0,   0,   0,  33,  38, 207,  28],
       [  0,   0,   0,   0,   0,  33,  38, 207,  30],
       [  0,   0,   0,   0,   0,  33, 248, 207, 282],
       [  0,   0,   0,   0,  36, 147,  76,  37, 247],
       [  0,   0,   0,   0,  36, 147,  76,  37, 217],
       [  0,   0,   0,   0, 146,  33, 286, 207,  87],
       [  0,   0,   0,   0,   0, 282, 276,  75,  37]])

In [7]:
### Post-padding
# Same sequences as above, but the zeros are appended after the word indices.
post_embedded_docs = pad_sequences(onehot_repr, maxlen=max_sent_length, padding='post')
post_embedded_docs

array([[ 33,  38, 207,  28,   0,   0,   0,   0,   0],
       [ 33,  38, 207,  30,   0,   0,   0,   0,   0],
       [ 33, 248, 207, 282,   0,   0,   0,   0,   0],
       [ 36, 147,  76,  37, 247,   0,   0,   0,   0],
       [ 36, 147,  76,  37, 217,   0,   0,   0,   0],
       [146,  33, 286, 207,  87,   0,   0,   0,   0],
       [282, 276,  75,  37,   0,   0,   0,   0,   0]])

In [8]:
### Feature representation: each word index becomes a 10-dimensional vector
dimension = 10

# Single-layer model: an Embedding table with vocab_size rows and `dimension`
# columns, applied to sequences of max_sent_length word indices.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=dimension, input_length=max_sent_length),
])
# The model is compiled but not trained in the cells shown, so predictions
# simply return the randomly initialised embedding rows.
model.compile(optimizer='adam', loss='mse')

In [9]:
# Print the layer table for `model`: output shape per layer and parameter counts.
# The embedding holds vocab_size * dimension = 300 * 10 = 3,000 trainable weights.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 9, 10)             3000      
                                                                 
Total params: 3,000
Trainable params: 3,000
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Pre-padded index row for the first sentence in `sent` ('the glass of milk'):
# leading zeros are padding, followed by the sentence's four word indices.
pre_embedded_docs[0]

array([  0,   0,   0,   0,   0,  33,  38, 207,  28])

In [11]:
# Look up the embeddings of the first sentence. The model was built for input
# of shape (batch, max_sent_length), so slice with [:1] to keep the batch axis
# (shape (1, 9)) instead of passing the bare 1-D row, which Keras would treat
# as 9 separate single-token samples that do not match the declared input
# shape. Indexing the result with [0] drops the batch axis again, leaving one
# 10-dimensional vector per token position (shape (9, 10)).
model.predict(pre_embedded_docs[:1])[0]



array([[-0.03038744,  0.01850469,  0.04297635,  0.03112287,  0.02953329,
         0.0069404 ,  0.0069802 , -0.0387303 ,  0.01437113,  0.02293072],
       [-0.03038744,  0.01850469,  0.04297635,  0.03112287,  0.02953329,
         0.0069404 ,  0.0069802 , -0.0387303 ,  0.01437113,  0.02293072],
       [-0.03038744,  0.01850469,  0.04297635,  0.03112287,  0.02953329,
         0.0069404 ,  0.0069802 , -0.0387303 ,  0.01437113,  0.02293072],
       [-0.03038744,  0.01850469,  0.04297635,  0.03112287,  0.02953329,
         0.0069404 ,  0.0069802 , -0.0387303 ,  0.01437113,  0.02293072],
       [-0.03038744,  0.01850469,  0.04297635,  0.03112287,  0.02953329,
         0.0069404 ,  0.0069802 , -0.0387303 ,  0.01437113,  0.02293072],
       [ 0.00242889, -0.01561313, -0.04776262,  0.02735804, -0.02853603,
        -0.03796167,  0.01691175, -0.0379807 , -0.0071648 , -0.04197459],
       [-0.00147576, -0.01849983, -0.02593526, -0.00266396,  0.03511072,
         0.02277527, -0.01061299, -0.0158391 

### Assignment



In [12]:
# Assignment corpus: six short sentences to run through the same
# one-hot -> padding -> embedding pipeline as above.
corpus = [
    'The world is a better place',
    'Marvel series is my favourite movie',
    'I like DC movies',
    'the cat is eating the food',
    'Tom and Jerry is my favourite movie',
    'Python is my favourite programming language',
]
corpus

['The world is a better place',
 'Marvel series is my favourite movie',
 'I like DC movies',
 'the cat is eating the food',
 'Tom and Jerry is my favourite movie',
 'Python is my favourite programming language']

In [13]:
# Index space for the assignment corpus. one_hot() hashes words into this
# range, so distinct words can end up sharing an index.
vocabulary_size = 100

# One list of integer word indices per sentence in `corpus`.
onehot_rep = [one_hot(sentence, n=vocabulary_size) for sentence in corpus]
onehot_rep

[[21, 89, 45, 58, 76, 22],
 [21, 86, 45, 77, 67, 40],
 [77, 51, 11, 97],
 [21, 20, 45, 86, 21, 67],
 [98, 27, 58, 45, 77, 67, 40],
 [83, 45, 77, 67, 48, 54]]

In [14]:
# Right-pad each index list with zeros up to max_sent_length (defined in the
# pre-padding cell earlier in the notebook).
post_padding_docs = pad_sequences(onehot_rep, maxlen=max_sent_length, padding='post')
post_padding_docs

array([[21, 89, 45, 58, 76, 22,  0,  0,  0],
       [21, 86, 45, 77, 67, 40,  0,  0,  0],
       [77, 51, 11, 97,  0,  0,  0,  0,  0],
       [21, 20, 45, 86, 21, 67,  0,  0,  0],
       [98, 27, 58, 45, 77, 67, 40,  0,  0],
       [83, 45, 77, 67, 48, 54,  0,  0,  0]])

In [18]:
# Width of each embedding vector for the assignment model.
dim = 5

# Embedding-only model: vocabulary_size rows by `dim` columns, applied to
# sequences of max_sent_length word indices.
model2 = Sequential([
    Embedding(input_dim=vocabulary_size, output_dim=dim, input_length=max_sent_length),
])
model2.compile(optimizer='adam', loss='mse')

In [19]:
# Print the layer table for `model2`: output shape per layer and parameter counts.
# The embedding holds vocabulary_size * dim = 100 * 5 = 500 trainable weights.
model2.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 9, 5)              500       
                                                                 
Total params: 500
Trainable params: 500
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Embed the first assignment sentence. Slice with [:1] so the input keeps its
# batch axis (shape (1, 9)), matching the (batch, max_sent_length) shape the
# model was built for; passing the bare 1-D row would make Keras treat it as
# 9 separate single-token samples. Indexing the result with [0] drops the
# batch axis, leaving one 5-dimensional vector per token position
# (shape (9, 5)); the trailing identical rows are the embedding of padding 0.
model2.predict(post_padding_docs[:1])[0]



array([[ 0.04426196, -0.00023454,  0.03833333,  0.04403989, -0.02409847],
       [ 0.02309952, -0.02768748,  0.02415853, -0.00664582, -0.02300026],
       [ 0.02372183,  0.02919609,  0.02599997, -0.01651946,  0.0162328 ],
       [-0.02650467,  0.03954666, -0.01543214,  0.02306311, -0.01176028],
       [ 0.02516413, -0.02481567, -0.02657212, -0.02440155,  0.01934316],
       [ 0.000244  ,  0.0107279 , -0.04969523, -0.03285037, -0.03992555],
       [ 0.01008104,  0.03982894,  0.04301873,  0.02121538, -0.01328218],
       [ 0.01008104,  0.03982894,  0.04301873,  0.02121538, -0.01328218],
       [ 0.01008104,  0.03982894,  0.04301873,  0.02121538, -0.01328218]],
      dtype=float32)