<a href="https://colab.research.google.com/github/akhii-leesh/Next-word-predictor/blob/main/MLE_Proj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Update your TensorFlow and Keras installation
!pip install --upgrade tensorflow keras numpy

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

Collecting keras
  Downloading keras-3.6.0-py3-none-any.whl.metadata (5.8 kB)
Collecting numpy
  Downloading numpy-2.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Downloading keras-3.6.0-py3-none-any.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 3.4.1
    Uninstalling keras-3.4.1:
      Successfully uninstalled keras-3.4.1
Successfully installed keras-3.6.0


In [None]:
# Toy training corpus: fifteen short, nature-themed English sentences.
# Each sentence is tokenized separately further down, so the n-gram training
# samples never span two sentences.
corpus = [
    "The sun rises in the east",
    "The sky is clear and blue",
    "Birds are chirping in the morning light",
    "The river flows gently over the rocks",
    "The autumn leaves fall softly to the ground",
    "The snow blankets the trees in winter",
    "Spring brings flowers blooming everywhere",
    "The summer heat warms the earth",
    "The stars twinkle in the night sky",
    "The moonlight dances on the waves",
    "A gentle breeze rustles the leaves",
    "The forest is alive with the sound of nature",
    "The rain nourishes the thirsty soil",
    "The mountain peaks rise high into the clouds",
    "The ocean waves crash against the shore"
]

In [None]:
# Vocabulary: fit the Keras tokenizer on the raw sentences in `corpus`.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# One extra slot on top of the learned vocabulary: the tokenizer's word
# indices start at 1, which leaves index 0 free for padding.
total_words = 1 + len(tokenizer.word_index)

# Training samples: every prefix of length >= 2 of every encoded sentence,
# so the final token of each prefix is the word the model must predict.
input_sequences = [
    encoded_line[:prefix_len]
    for encoded_line in tokenizer.texts_to_sequences(corpus)
    for prefix_len in range(2, len(encoded_line) + 1)
]

# Left-pad with zeros so every sample has the length of the longest prefix.
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

# The last column is the target word; everything before it is the model input.
# Targets are one-hot encoded over the whole vocabulary.
predictors = input_sequences[:, :-1]
label = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [None]:
# Model Definition
# Pipeline: token ids -> 100-d embeddings -> LSTM(600) -> Dropout -> LSTM(400)
# -> softmax over the vocabulary (one probability per possible next word).
model = Sequential()

# Declare the input shape with an explicit Input layer instead of the
# deprecated Embedding(input_length=...) argument. This also builds the model
# right away, so model.summary() below reports real output shapes and
# parameter counts rather than a list of unbuilt layers.
model.add(tf.keras.Input(shape=(max_sequence_len - 1,)))
model.add(Embedding(total_words, 100))

# First LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(600, return_sequences=True))
model.add(Dropout(0.2))  # Dropout between the recurrent layers for regularization
model.add(LSTM(400))  # Smaller second LSTM; returns only its final output
model.add(Dense(total_words, activation='softmax'))

# Compile model: labels are one-hot encoded, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display model summary
model.summary()

In [None]:
# Stop once the training loss has failed to improve for 3 consecutive epochs.
# No validation data is passed to this run, so training loss is the metric
# being monitored.
early_stop = EarlyStopping(
    monitor='loss',
    patience=3,
    verbose=1,
)

# Fit on the full set of prefix samples for at most 100 epochs.
history = model.fit(
    predictors,
    label,
    epochs=100,
    verbose=1,
    callbacks=[early_stop],
)

Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.0447 - loss: 4.2623
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.1771 - loss: 4.1982  
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1693 - loss: 4.0326
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.1654 - loss: 4.0068 
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.1615 - loss: 4.0191 
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1576 - loss: 3.9478
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1459 - loss: 3.9715
Epoch 8/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.1576 - loss: 3.9404 
Epoch 9/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [None]:
def generate_text(seed_text, next_words, max_sequence_len):
    """Greedily extend ``seed_text`` with up to ``next_words`` predicted words.

    Uses the notebook-level ``tokenizer`` and ``model`` objects. At every step
    the current text is tokenized, left-padded to the model's input length,
    and the highest-probability word is appended.

    Args:
        seed_text: Text to start from.
        next_words: Maximum number of words to append; 0 returns the seed
            unchanged.
        max_sequence_len: Length of the padded training sequences including
            the label position; the model input is one token shorter.

    Returns:
        ``seed_text`` followed by the predicted words, separated by single
        spaces. Generation stops early when the predicted index has no
        vocabulary entry (e.g. the padding index 0), so no empty words or
        trailing spaces are appended.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted = model.predict(token_list, verbose=0)

        # The batch holds a single sample; reduce the argmax to a plain int so
        # it can be used as a dictionary key.
        predicted_word_index = int(np.argmax(predicted, axis=-1)[0])

        # index_word is the tokenizer's reverse vocabulary (index -> word),
        # which avoids scanning word_index on every step.
        output_word = tokenizer.index_word.get(predicted_word_index)
        if not output_word:
            # Nothing printable was predicted; further steps would see the
            # same input and repeat the same prediction, so stop here.
            break

        seed_text += " " + output_word
    return seed_text

In [None]:
# Continue training with 20% of the samples held out for validation
# (Keras takes the hold-out slice from the end of the arrays, before shuffling).
# NOTE(review): `model` was already fit on ALL samples in the earlier training
# cell, so this hold-out slice is not unseen data and the accuracy printed
# below is optimistic. Rebuild the model before this cell for an honest estimate.

# Monitor validation loss rather than training loss: training loss keeps
# improving while the model overfits, so it never triggers a timely stop.
# restore_best_weights asks Keras to roll the model back to the epoch with the
# lowest val_loss instead of keeping the last, most overfitted weights.
val_early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
    verbose=1,
)
history = model.fit(
    predictors,
    label,
    epochs=100,
    verbose=1,
    validation_split=0.2,
    callbacks=[val_early_stop],
)

# Report the accuracy of the best (lowest val_loss) epoch, not the last one,
# so the number matches the epoch that early stopping selects.
best_epoch = int(np.argmin(history.history['val_loss']))
val_accuracy = history.history['val_accuracy'][best_epoch]
print(f"Validation accuracy: {val_accuracy * 100:.2f}%")

Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step - accuracy: 0.8713 - loss: 0.3910 - val_accuracy: 0.8333 - val_loss: 0.4818
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.8748 - loss: 0.4239 - val_accuracy: 0.8333 - val_loss: 0.5580
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.8904 - loss: 0.3848 - val_accuracy: 0.8333 - val_loss: 0.6348
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.9021 - loss: 0.3372 - val_accuracy: 0.8333 - val_loss: 0.6910
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.8865 - loss: 0.3853 - val_accuracy: 0.8333 - val_loss: 0.7456
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.8977 - loss: 0.3915 - val_accuracy: 0.8333 - val_loss: 0.7980
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━

In [None]:
# Demo: greedily continue a short prompt by ten words using the trained model.
seed_text, next_words = "The sky", 10
generated_text = generate_text(seed_text, next_words, max_sequence_len)
print(generated_text)

The sky is clear and blue sound of nature sound nature everywhere
