In [23]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [24]:
# Toy corpus: five short lowercase sentences without punctuation.
sentences = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs are great pets",
    "dog is better than cat",
    "the mat is on the floor",
]

In [25]:
# Fit a word-level tokenizer on the corpus to obtain the word -> index mapping.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Keras numbers words from 1, so one extra slot is added for index 0,
# which is the value pad_sequences fills with by default.
total_words = len(tokenizer.word_index) + 1
print("Total Words", total_words)

Total Words 18


In [26]:
def create_cbow(sentences, window_size=2):
    """Build CBOW training pairs (context words -> centre word).

    Each sentence is split on whitespace. For every word position, the
    context is made of up to ``window_size`` words on the left and up to
    ``window_size`` words on the right; the word itself is the target.

    Args:
        sentences: Iterable of strings.
        window_size: Maximum number of neighbours taken on each side.

    Returns:
        A ``(contexts, targets)`` tuple of two parallel lists: ``contexts[k]``
        is the list of context words for the target word ``targets[k]``.
        A context may be empty (e.g. for a one-word sentence).
    """
    contexts, targets = [], []

    for line in sentences:
        tokens = line.split()
        n_tokens = len(tokens)
        for pos, target in enumerate(tokens):
            # Clamp the window to the sentence boundaries.
            lo = max(0, pos - window_size)
            hi = min(n_tokens, pos + window_size + 1)

            # Index through range() rather than slicing so that out-of-range
            # bounds yield an empty side instead of wrapping around.
            left = [tokens[k] for k in range(lo, pos)]
            right = [tokens[k] for k in range(pos + 1, hi)]

            contexts.append(left + right)
            targets.append(target)

    return contexts, targets

In [27]:
# Build the (context words, target word) pairs with the default window of 2.
input_data, output_data = create_cbow(sentences)

In [28]:
# Inspect the generated context lists (one list per target word).
print(input_data)

[['cat', 'sat'], ['the', 'sat', 'on'], ['the', 'cat', 'on', 'the'], ['cat', 'sat', 'the', 'mat'], ['sat', 'on', 'mat'], ['on', 'the'], ['dog', 'sat'], ['the', 'sat', 'on'], ['the', 'dog', 'on', 'the'], ['dog', 'sat', 'the', 'log'], ['sat', 'on', 'log'], ['on', 'the'], ['and', 'dogs'], ['cats', 'dogs', 'are'], ['cats', 'and', 'are', 'great'], ['and', 'dogs', 'great', 'pets'], ['dogs', 'are', 'pets'], ['are', 'great'], ['is', 'better'], ['dog', 'better', 'than'], ['dog', 'is', 'than', 'cat'], ['is', 'better', 'cat'], ['better', 'than'], ['mat', 'is'], ['the', 'is', 'on'], ['the', 'mat', 'on', 'the'], ['mat', 'is', 'the', 'floor'], ['is', 'on', 'floor'], ['on', 'the']]


In [29]:
# Convert words to their integer indices. The contexts are already lists of
# tokens; each target is a single word, so it becomes a one-element sequence.
input_sequence = tokenizer.texts_to_sequences(input_data)
output_sequence = tokenizer.texts_to_sequences(output_data)
print(input_sequence)

[[3, 4], [1, 4, 2], [1, 3, 2, 1], [3, 4, 1, 5], [4, 2, 5], [2, 1], [6, 4], [1, 4, 2], [1, 6, 2, 1], [6, 4, 1, 8], [4, 2, 8], [2, 1], [10, 11], [9, 11, 12], [9, 10, 12, 13], [10, 11, 13, 14], [11, 12, 14], [12, 13], [7, 15], [6, 15, 16], [6, 7, 16, 3], [7, 15, 3], [15, 16], [5, 7], [1, 7, 2], [1, 5, 2, 1], [5, 7, 1, 17], [7, 2, 17], [2, 1]]


In [30]:
# One-hot encode the target indices over the whole vocabulary.
# NOTE: this rebinds `output_sequence`, so running this cell a second time
# would one-hot encode the already encoded array; re-run the tokenising cell first.
output_sequence = tf.keras.utils.to_categorical(
    output_sequence,
    num_classes=total_words,
)

In [32]:
# Right-pad every context with zeros up to the longest context so that the
# inputs form a rectangular array.
max_length = max(map(len, input_sequence))
input_sequence = pad_sequences(input_sequence, maxlen=max_length, padding='post')

In [33]:
# Seed Python, NumPy and the Keras backend so that weight initialisation and
# batch shuffling (and therefore the training log below) are reproducible.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Embedding lookup -> flatten the per-position vectors -> softmax over the vocabulary.
# NOTE(review): Flatten concatenates the context embeddings, which makes the
# model sensitive to word position and to padding; textbook CBOW averages the
# context vectors instead. Left unchanged here - confirm whether that is intended.
model = Sequential([
    Embedding(input_dim=total_words, output_dim=10),
    Flatten(),
    Dense(total_words, activation="softmax"),
])

In [35]:
# The targets are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [36]:
# Train on the padded contexts against the one-hot targets.
model.fit(
    input_sequence,
    output_sequence,
    epochs=10,
    verbose=1,
)

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.0345 - loss: 2.8827
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.0345 - loss: 2.8788
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.0345 - loss: 2.8749
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.1379 - loss: 2.8709
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.1379 - loss: 2.8670
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.1724 - loss: 2.8631
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.1724 - loss: 2.8591
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.1724 - loss: 2.8551
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x7f9b7a584580>

In [41]:
 def prediction_fun(context):
     """Predict the centre word for a single context.

     Args:
         context: The context words, either as a list of tokens (like the
             training contexts) or as one whitespace-separated string.
             Words unknown to the tokenizer are dropped, because the
             tokenizer was created without an OOV token.

     Returns:
         The vocabulary word with the highest predicted probability.
     """
     # Wrap in a list so the tokenizer sees exactly one sample.
     context_seq = tokenizer.texts_to_sequences([context])
     # Pad the query itself (not the training matrix) to the length the
     # model was trained on.
     context_seq = pad_sequences(context_seq, maxlen=max_length, padding='post')
     predicted = model.predict(context_seq)
     # One sample in -> one probability row out. Class 0 is the padding slot
     # and has no entry in index_word, so choose among classes 1..V-1 only.
     best_index = int(np.argmax(predicted[0][1:])) + 1
     return tokenizer.index_word[best_index]

In [42]:
# Query the model with the context surrounding "sat" in "the cat sat on the mat".
context_example = ["the", "cat", "on", "the"]
prediction_ans = prediction_fun(context_example)
print(prediction_ans)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
the
