<a href="https://colab.research.google.com/github/akhii-leesh/Next-word-predictor/blob/main/MLE_proj_GRU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow



In [None]:
# Toy training corpus: one short nature-themed sentence per line.
# splitlines() turns the block below into a list of 15 sentence strings.
corpus = """\
The sun rises in the east
The sky is clear and blue
Birds are chirping in the morning light
The river flows gently over the rocks
The autumn leaves fall softly to the ground
The snow blankets the trees in winter
Spring brings flowers blooming everywhere
The summer heat warms the earth
The stars twinkle in the night sky
The moonlight dances on the waves
A gentle breeze rustles the leaves
The forest is alive with the sound of nature
The rain nourishes the thirsty soil
The mountain peaks rise high into the clouds
The ocean waves crash against the shore
""".splitlines()

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Single-string view of the corpus. Sentences are joined with a space so the
# last word of one sentence and the first word of the next remain separate
# tokens (joining with '' fused them into words such as "eastThe" / "wavesA").
text = ' '.join(corpus)

# Tokenizing the text: build the word -> index vocabulary from every sentence
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

total_words = len(tokenizer.word_index) + 1  # Add 1 for padding token (index 0)
input_sequences = []

# Generate n-gram prefixes sentence by sentence: for tokens [t1, t2, t3] this
# yields [t1, t2] and [t1, t2, t3]. Working per sentence keeps the training
# contexts short and sentence-shaped, which is what a short seed phrase looks
# like at prediction time; each sentence is also tokenized only once.
for sentence in corpus:
    token_list = tokenizer.texts_to_sequences([sentence])[0]
    for i in range(1, len(token_list)):
        input_sequences.append(token_list[:i + 1])

# Pad sequences (zeros on the left) to ensure consistent length
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Split into inputs (X: every token but the last) and targets (y: the last token)
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the output labels to match the categorical_crossentropy loss
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [None]:
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation is repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Next-word model: embedding -> two stacked GRUs -> softmax over the vocabulary.
# The input width is declared with an explicit Input layer instead of the
# deprecated Embedding(input_length=...) argument; this also lets
# model.summary() report concrete output shapes before training.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(max_sequence_len - 1,)),
    tf.keras.layers.Embedding(total_words, 100),
    # return_sequences=True so the second GRU receives the full sequence
    tf.keras.layers.GRU(150, return_sequences=True),
    tf.keras.layers.GRU(100),
    tf.keras.layers.Dense(total_words, activation='softmax')
])

# Targets are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [None]:
# Number of full passes over the training set.
epochs = 100

# Fit the model on the prepared inputs/targets; the value returned by fit()
# is kept in `history` for later inspection.
history = model.fit(
    x=X,
    y=y,
    epochs=epochs,
    verbose=1,
)

Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 1.0000 - loss: 0.0728
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 1.0000 - loss: 0.0681
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 1.0000 - loss: 0.0694 
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 1.0000 - loss: 0.0670
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 1.0000 - loss: 0.0643 
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 1.0000 - loss: 0.0612
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 1.0000 - loss: 0.0634
Epoch 8/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 1.0000 - loss: 0.0601
Epoch 9/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [None]:
def predict_next_word(model, tokenizer, text_sequence, max_sequence_len):
    """Return the word the model considers most likely to follow a seed phrase.

    Args:
        model: Object with a ``predict`` method returning one row of
            per-word scores for each input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` (index -> word) mapping.
        text_sequence: Seed phrase as a single string.
        max_sequence_len: Length of the padded training sequences; the model
            input width is ``max_sequence_len - 1``.

    Returns:
        The predicted word, or ``""`` when the seed contains no word known to
        the tokenizer or the predicted index has no word (e.g. padding index 0).
    """
    sequence = tokenizer.texts_to_sequences([text_sequence])[0]
    if not sequence:
        # Nothing but padding would reach the model; no meaningful prediction.
        return ""

    # Left-pad with zeros to the model's input width, mirroring training.
    padded = pad_sequences([sequence], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(padded, verbose=0)

    # A single row was predicted, so reduce the argmax result to a plain int
    # instead of comparing a length-1 array against every vocabulary index.
    predicted_word_index = int(np.argmax(predicted, axis=-1)[0])

    # Direct index -> word lookup replaces the linear scan over word_index.
    return tokenizer.index_word.get(predicted_word_index, "")

# Example prediction: ask the trained model to continue a short seed phrase.
seed_text = "A gentle breeze"
next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text_sequence=seed_text,
    max_sequence_len=max_sequence_len,
)
print(f"Next word: {next_word}")

Next word: in
