In [1]:
###### Import one_hot #######
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer

In [2]:
# Toy corpus used throughout the notebook: seven short sentences.
sent = [
    "the glass of milk",
    "the glass of juice",
    "the cup of tea",
    "I am a good boy",
    "I am a good developer",
    "understand the meaning of words",
    "your videos are good",
]

In [3]:
###### Display the list of example sentences ######
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [4]:
###### Fit a tokenizer to find the vocabulary of `sent` ######
# Imported from tensorflow.keras (not standalone keras) so that every Keras
# import in this notebook comes from the same package.
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer_ = Tokenizer()
# fit_on_texts accepts a list of texts; a DataFrame column is passed the same
# way later in the notebook.
tokenizer_.fit_on_texts(sent)

####### Number of occurrences of each word #######
print(tokenizer_.word_counts, '\n\n')

####### Integer index assigned to each word #######
print(tokenizer_.word_index, '\n\n')


####### Number of entries in word_counts and in word_index #######
print(len(tokenizer_.word_counts), len(tokenizer_.word_index), '\n\n')

###### Types of word_counts and word_index ######
print(type(tokenizer_.word_counts), type(tokenizer_.word_index))

OrderedDict([('the', 4), ('glass', 2), ('of', 4), ('milk', 1), ('juice', 1), ('cup', 1), ('tea', 1), ('i', 2), ('am', 2), ('a', 2), ('good', 3), ('boy', 1), ('developer', 1), ('understand', 1), ('meaning', 1), ('words', 1), ('your', 1), ('videos', 1), ('are', 1)]) 


{'the': 1, 'of': 2, 'good': 3, 'glass': 4, 'i': 5, 'am': 6, 'a': 7, 'milk': 8, 'juice': 9, 'cup': 10, 'tea': 11, 'boy': 12, 'developer': 13, 'understand': 14, 'meaning': 15, 'words': 16, 'your': 17, 'videos': 18, 'are': 19} 


19 19 


<class 'collections.OrderedDict'> <class 'dict'>


In [5]:
# Print one sentence per line. A named loop variable is used instead of `_`,
# which IPython uses to hold the most recent cell output.
for sentence_ in sent:
  print(sentence_)

the glass of milk
the glass of juice
the cup of tea
I am a good boy
I am a good developer
understand the meaning of words
your videos are good


In [6]:
###### Vocabulary size: number of distinct words plus one ######
# NOTE(review): the extra slot presumably keeps index 0 free for the zero
# padding applied later - confirm.
voc_size = 1 + len(tokenizer_.word_counts)

print("Vacabulary size of this sent will be: ", voc_size)

Vacabulary size of this sent will be:  20


In [7]:
###### Encode every sentence as a list of integer word indices ######
# Caveat: one_hot does not guarantee a unique index per word; the recorded
# output shows distinct words of one sentence sharing the same index.
# NOTE(review): the indices may also differ between interpreter sessions if
# the default hash function is Python's salted built-in hash - confirm.
one_hot_ = [
    one_hot(sentence_, voc_size)
    for sentence_ in sent
]
one_hot_

[[13, 14, 5, 5],
 [13, 14, 5, 12],
 [13, 11, 5, 1],
 [2, 14, 3, 10, 2],
 [2, 14, 3, 10, 3],
 [17, 13, 7, 5, 3],
 [10, 5, 19, 10]]

In [8]:
###### Length of the longest encoded sentence ######
# The built-in max replaces the manual scan (and its no-op else branch).
# default=-1 reproduces the previous starting value when one_hot_ is empty.
max_length_ = max(map(len, one_hot_), default=-1)

print(max_length_)

5


In [9]:
###### Pad every encoded sentence to a common length ######
# Target length: the longest sentence plus two extra positions.
sent_length = max_length_ + 2

# 'pre' puts the zero padding in front of each sequence.
embedded_docs = pad_sequences(
    one_hot_,
    maxlen=sent_length,
    padding='pre',
)

print(embedded_docs)

[[ 0  0  0 13 14  5  5]
 [ 0  0  0 13 14  5 12]
 [ 0  0  0 13 11  5  1]
 [ 0  0  2 14  3 10  2]
 [ 0  0  2 14  3 10  3]
 [ 0  0 17 13  7  5  3]
 [ 0  0  0 10  5 19 10]]


In [10]:
####### Word Embedding Representation #######
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential

# Embedding arguments used below:
#   input_dim    - Integer. Size of the vocabulary, i.e. maximum integer index + 1.
#   output_dim   - Integer. Dimension of the dense embedding.
#   input_length - Length of input sequences, when it is constant. Required
#                  when Flatten and then Dense layers follow the embedding
#                  (without it, the shape of the dense outputs cannot be
#                  computed).
model = Sequential([
    Embedding(
        input_dim=voc_size,
        output_dim=10,
        input_length=sent_length,
    ),
])
model.compile(optimizer='adam', loss='mse')

In [11]:
# Run the embedding layer on every padded sentence; the result holds one
# 10-value vector per token position per sentence.
# NOTE(review): no training step appears before this call, so these are
# presumably the layer's initial weights - confirm.
model.predict(embedded_docs)

array([[[-2.86746975e-02, -3.60842347e-02, -2.43837368e-02,
          1.78996064e-02,  3.47359106e-03,  4.47059609e-02,
          2.64165886e-02, -4.01552431e-02,  1.78296901e-02,
          4.02342342e-02],
        [-2.86746975e-02, -3.60842347e-02, -2.43837368e-02,
          1.78996064e-02,  3.47359106e-03,  4.47059609e-02,
          2.64165886e-02, -4.01552431e-02,  1.78296901e-02,
          4.02342342e-02],
        [-2.86746975e-02, -3.60842347e-02, -2.43837368e-02,
          1.78996064e-02,  3.47359106e-03,  4.47059609e-02,
          2.64165886e-02, -4.01552431e-02,  1.78296901e-02,
          4.02342342e-02],
        [-3.63548882e-02, -1.15937591e-02, -5.92194498e-04,
          4.91417572e-03,  5.95778227e-03, -2.14999076e-02,
          2.87193321e-02,  4.09431793e-02, -3.59265693e-02,
         -2.43193638e-02],
        [ 4.61993851e-02, -3.75742540e-02,  1.42897293e-03,
         -4.41082604e-02,  5.20832837e-05, -4.03513089e-02,
          1.97764747e-02,  5.73327392e-03,  3.492928

In [12]:
# Padded index sequence of the first sentence.
embedded_docs[0]

array([ 0,  0,  0, 13, 14,  5,  5], dtype=int32)

In [13]:
# Embed only the first sentence. Slicing with [:1] keeps the batch axis, so
# the input has the (batch, sent_length) shape the model was built for;
# passing embedded_docs[0] directly would hand predict a flat vector instead.
# The trailing [0] drops the batch axis again, leaving one row per token.
model.predict(embedded_docs[:1])[0]



array([[-2.8674698e-02, -3.6084235e-02, -2.4383737e-02,  1.7899606e-02,
         3.4735911e-03,  4.4705961e-02,  2.6416589e-02, -4.0155243e-02,
         1.7829690e-02,  4.0234234e-02],
       [-2.8674698e-02, -3.6084235e-02, -2.4383737e-02,  1.7899606e-02,
         3.4735911e-03,  4.4705961e-02,  2.6416589e-02, -4.0155243e-02,
         1.7829690e-02,  4.0234234e-02],
       [-2.8674698e-02, -3.6084235e-02, -2.4383737e-02,  1.7899606e-02,
         3.4735911e-03,  4.4705961e-02,  2.6416589e-02, -4.0155243e-02,
         1.7829690e-02,  4.0234234e-02],
       [-3.6354888e-02, -1.1593759e-02, -5.9219450e-04,  4.9141757e-03,
         5.9577823e-03, -2.1499908e-02,  2.8719332e-02,  4.0943179e-02,
        -3.5926569e-02, -2.4319364e-02],
       [ 4.6199385e-02, -3.7574254e-02,  1.4289729e-03, -4.4108260e-02,
         5.2083284e-05, -4.0351309e-02,  1.9776475e-02,  5.7332739e-03,
         3.4929287e-02, -2.8171316e-03],
       [-4.5065034e-02,  2.8261516e-02, -1.7657030e-02, -2.1416033e-02,
   

In [14]:
# Word -> integer index mapping built by the fitted tokenizer.
tokenizer_.word_index

{'the': 1,
 'of': 2,
 'good': 3,
 'glass': 4,
 'i': 5,
 'am': 6,
 'a': 7,
 'milk': 8,
 'juice': 9,
 'cup': 10,
 'tea': 11,
 'boy': 12,
 'developer': 13,
 'understand': 14,
 'meaning': 15,
 'words': 16,
 'your': 17,
 'videos': 18,
 'are': 19}

In [15]:
# Number of entries in word_index and in word_counts, shown as a pair.
len(tokenizer_.word_index), len(tokenizer_.word_counts)

(19, 19)

In [16]:
# The same seven sentences, each wrapped in its own single-element row so the
# list can be loaded as a one-column table.
message_ = [
    [text_]
    for text_ in (
        'the glass of milk',
        'the glass of juice',
        'the cup of tea',
        'I am a good boy',
        'I am a good developer',
        'understand the meaning of words',
        'your videos are good',
    )
]

In [17]:
import pandas as pd

# One-column frame: each row of message_ becomes one 'message' entry.
df = pd.DataFrame(data=message_, columns=['message'])
df.head()

Unnamed: 0,message
0,the glass of milk
1,the glass of juice
2,the cup of tea
3,I am a good boy
4,I am a good developer


In [18]:
###### Count the unique words in the DataFrame's 'message' column ######
# A fresh Tokenizer is created here, rebinding tokenizer_.
tokenizer_ = Tokenizer()
tokenizer_.fit_on_texts(df['message'])
vocabulary_size = 1 + len(tokenizer_.word_counts)

In [19]:
###### Encode each message as a list of integer word indices ######
# Caveat: one_hot does not guarantee a unique index per word; the recorded
# output shows distinct words sharing an index.
one_hot_representation_ = [
    one_hot(text_, vocabulary_size)
    for text_ in df['message']
]
one_hot_representation_

[[13, 14, 5, 5],
 [13, 14, 5, 12],
 [13, 11, 5, 1],
 [2, 14, 3, 10, 2],
 [2, 14, 3, 10, 3],
 [17, 13, 7, 5, 3],
 [10, 5, 19, 10]]

In [20]:
###### Length of the longest encoded message ######
# The built-in max replaces the manual scan (and its no-op else branch).
# default=-1 reproduces the previous starting value for an empty list.
maxLength_ = max(map(len, one_hot_representation_), default=-1)

maxLength_

5

In [21]:
###### Pad every encoded message to a common length ######
# Zeros are added in front ('pre') up to the longest message plus two.
embedded_doc_ = pad_sequences(
    one_hot_representation_,
    maxlen=maxLength_ + 2,
    padding='pre',
)
print(embedded_doc_)

[[ 0  0  0 13 14  5  5]
 [ 0  0  0 13 14  5 12]
 [ 0  0  0 13 11  5  1]
 [ 0  0  2 14  3 10  2]
 [ 0  0  2 14  3 10  3]
 [ 0  0 17 13  7  5  3]
 [ 0  0  0 10  5 19 10]]


In [22]:
####### Word Embedding on the DataFrame sentences #######
# Single-layer model: each integer index is mapped to a 10-value vector.
model = Sequential([
    Embedding(
        input_dim=vocabulary_size,
        output_dim=10,
        input_length=maxLength_ + 2,
    ),
])

model.compile(loss='mse', optimizer='Adam')

In [23]:
# Run the embedding layer on every padded message; the result holds one
# 10-value vector per token position per message.
model.predict(embedded_doc_)

array([[[-9.81826708e-03,  2.15994753e-02,  3.19458507e-02,
         -4.31077257e-02, -2.77480017e-02,  1.38146542e-02,
         -5.27166203e-03, -6.68689609e-06,  2.05349438e-02,
          1.69454478e-02],
        [-9.81826708e-03,  2.15994753e-02,  3.19458507e-02,
         -4.31077257e-02, -2.77480017e-02,  1.38146542e-02,
         -5.27166203e-03, -6.68689609e-06,  2.05349438e-02,
          1.69454478e-02],
        [-9.81826708e-03,  2.15994753e-02,  3.19458507e-02,
         -4.31077257e-02, -2.77480017e-02,  1.38146542e-02,
         -5.27166203e-03, -6.68689609e-06,  2.05349438e-02,
          1.69454478e-02],
        [-3.61406319e-02,  1.34793669e-03, -8.68022442e-03,
          6.58706576e-03, -4.13907282e-02,  8.59297439e-03,
          3.52890752e-02,  4.23797220e-03,  4.97703627e-03,
         -4.80752960e-02],
        [-4.20079008e-02, -4.53390963e-02, -4.77772243e-02,
          1.25117563e-02,  4.06573080e-02,  4.26929630e-02,
         -4.89220135e-02,  3.85654084e-02,  1.915596

In [24]:
# Padded index sequence of the second message.
embedded_doc_[1]

array([ 0,  0,  0, 13, 14,  5, 12], dtype=int32)

In [25]:
# Embed only the second message. Slicing with 1:2 keeps the batch axis, so the
# input has the (batch, input_length) shape the model was built for; passing
# embedded_doc_[1] directly would hand predict a flat vector instead.
# The trailing [0] drops the batch axis, leaving one row per token position.
token_vectors_ = model.predict(embedded_doc_[1:2])[0]

# The number printed for each position is the length of that token's vector.
for count_row_, element_ in enumerate(token_vectors_):
  print(f"{count_row_} word converted into {len(element_)} vectors.")



0 word converted into 10 vectors.
1 word converted into 10 vectors.
2 word converted into 10 vectors.
3 word converted into 10 vectors.
4 word converted into 10 vectors.
5 word converted into 10 vectors.
6 word converted into 10 vectors.
