<a href="https://colab.research.google.com/github/bhargavrakholiya123/ML-Progress/blob/main/RNN_Word_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Toy training corpus: eight four-word sentences of the form
# "I <verb> <activity> <object>", two objects per (verb, activity) pair.
corpus = [
    f"I {verb} {activity} {noun}"
    for verb, activity, nouns in (
        ("love", "playing", ("football", "cricket")),
        ("love", "eating", ("pizza", "burger")),
        ("enjoy", "watching", ("movies", "cartoons")),
        ("enjoy", "reading", ("books", "magazines")),
    )
    for noun in nouns
]


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit a word-level tokenizer on the corpus so every distinct word gets an integer id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# word -> id mapping. As the printed mapping shows, words are lower-cased and
# ids start at 1, which leaves 0 unused by any word.
word_index = tokenizer.word_index
print(word_index)


{'i': 1, 'love': 2, 'enjoy': 3, 'playing': 4, 'eating': 5, 'watching': 6, 'reading': 7, 'football': 8, 'cricket': 9, 'pizza': 10, 'burger': 11, 'movies': 12, 'cartoons': 13, 'books': 14, 'magazines': 15}


In [4]:
# Build next-word training pairs: for each sentence, every non-empty proper
# prefix (as a list of words) goes into X and the word that follows it into y.
X, y = [], []

for tokens in map(str.split, corpus):
    for cut, target in enumerate(tokens[1:], start=1):
        X.append(tokens[:cut])
        y.append(target)


In [5]:
# Re-join each word-list prefix into a single string so the tokenizer can
# turn it into a sequence of word ids.
prefix_texts = list(map(" ".join, X))
X_encoded = tokenizer.texts_to_sequences(prefix_texts)

# Each target is one word, so this yields one id list per target.
y_encoded = tokenizer.texts_to_sequences(y)


In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Length of the longest encoded prefix; every input is padded to this length.
max_len = max(len(seq) for seq in X_encoded)

# Left-pad ('pre') the id sequences to max_len so the most recent words sit
# at the end of each row.
X_padded = pad_sequences(X_encoded, maxlen=max_len, padding='pre')

# Flatten targets to a 1-D array of word ids (first id of each label).
# np.ravel accepts both the original one-element lists and already-flattened
# scalars, so re-running this cell no longer fails with
# "IndexError: invalid index to scalar variable" on the overwritten y_encoded.
y_encoded = np.array([np.ravel(label)[0] for label in y_encoded])


In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Input, SimpleRNN, Dense
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow before any weights are created so that
# initialisation (and the shuffling done later during training) is repeatable
# across "Restart & Run All" on the same setup.
SEED = 42
set_random_seed(SEED)

# +1 because word ids start at 1; row 0 of the embedding is the padding id.
vocab_size = len(word_index) + 1

# Embedding -> SimpleRNN -> softmax over the vocabulary.
# The input length is declared with an explicit Input layer rather than the
# `input_length=` argument of Embedding, which is deprecated in Keras 3.
model = Sequential([
    Input(shape=(max_len,)),
    Embedding(vocab_size, 10),
    SimpleRNN(16),
    Dense(vocab_size, activation='softmax')
])




In [8]:
# Targets are integer word ids (not one-hot vectors), hence the sparse
# variant of categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Keep the returned History object: with verbose=0 it is the only record of
# the per-epoch loss/accuracy, and binding it to a name also stops its repr
# from being echoed as the cell output.
history = model.fit(X_padded, y_encoded, epochs=300, verbose=0)


<keras.src.callbacks.history.History at 0x78031e51d280>

In [9]:
import numpy as np

def predict_next_word(text):
    """Return the word the trained model considers most likely to follow `text`.

    The text is encoded with the fitted `tokenizer`, left-padded to `max_len`
    exactly like the training inputs, and passed through `model`.

    Args:
        text: A string prompt such as "I love".

    Returns:
        The vocabulary word (str) with the highest predicted probability.
    """
    seq = tokenizer.texts_to_sequences([text])[0]
    seq = pad_sequences([seq], maxlen=max_len, padding='pre')
    # predict() returns one row per input; a single prompt was passed in.
    probs = model.predict(seq, verbose=0)[0]
    # Index 0 is the padding id and has no entry in tokenizer.index_word, so a
    # plain argmax over all classes could raise KeyError. Search real word ids
    # only and shift the result back by one.
    best_id = int(np.argmax(probs[1:])) + 1
    return tokenizer.index_word[best_id]


In [10]:
# Try the trained model on a handful of prefixes taken from the corpus.
for prompt in (
    "I love",
    "I love playing",
    "I enjoy reading",
    "I enjoy",
    "I love eating",
):
    print(predict_next_word(prompt))


eating
cricket
magazines
watching
burger
