In [5]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt

In [1]:
from google.colab import files
files.upload()  # Choose your kaggle.json file here


Saving kaggle.json to kaggle (1).json


{'kaggle (1).json': b'{"username":"abdullahafarooqi","key":"52b4e86a208c8685383fd6359d530722"}'}

In [2]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [3]:
!kaggle datasets download -d ronikdedhia/next-word-prediction


Dataset URL: https://www.kaggle.com/datasets/ronikdedhia/next-word-prediction
License(s): unknown
Downloading next-word-prediction.zip to /content
  0% 0.00/228k [00:00<?, ?B/s]
100% 228k/228k [00:00<00:00, 691MB/s]


In [4]:
!unzip next-word-prediction.zip


Archive:  next-word-prediction.zip
  inflating: 1661-0.txt              


In [None]:
nltk.download('stopwords')

In [None]:
def load_and_clean_text(path):
    with open(path, "r", encoding="utf-8") as file:
        text = file.read()

    text = text.lower()  # Lowercase
    text = re.sub(r'[^a-z\s\.\?\!\n]', '', text)  # Keep letters, spaces, punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'(\.|\?|\!)(\s*)', r'\1\n', text)  # Sentence separation
    return text

# File path inside dataset folder
path = "dataset/1661-0.txt"
text = load_and_clean_text(path)

In [None]:
tokenizer = Tokenizer(oov_token="<OOV>", filters='')
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1


In [None]:
SEQUENCE_LENGTH = 15
input_sequences = []
words = tokenizer.texts_to_sequences([text])[0]

for i in range(SEQUENCE_LENGTH, len(words)):
    seq = words[i-SEQUENCE_LENGTH:i]
    input_sequences.append(seq)

input_sequences = np.array(input_sequences)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]
y = to_categorical(y, num_classes=total_words)

print(f"Total words: {total_words}")
print(f"Input shape: {X.shape}, Label shape: {y.shape}")

In [None]:
model = Sequential([
    Embedding(total_words, 128, input_length=SEQUENCE_LENGTH-1),
    Bidirectional(LSTM(256, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(128)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dense(total_words, activation='softmax')
])


In [None]:
optimizer = Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

callbacks = [
    EarlyStopping(patience=5, restore_best_weights=True),
    ReduceLROnPlateau(factor=0.1, patience=3)
]

model.summary()

In [None]:
history = model.fit(
    X, y,
    epochs=50,
    batch_size=256,
    validation_split=0.1,
    callbacks=callbacks
)

In [None]:
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
def predict_next_words(seed_text, next_words=10, temperature=0.7):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]

        if len(token_list) > SEQUENCE_LENGTH-1:
            token_list = token_list[-(SEQUENCE_LENGTH-1):]
        else:
            token_list = pad_sequences([token_list], maxlen=SEQUENCE_LENGTH-1, padding='pre')[0]

        token_list = token_list.reshape(1, -1)

        predictions = model.predict(token_list, verbose=0)[0]
        predictions = np.log(predictions + 1e-8) / temperature
        exp_preds = np.exp(predictions)
        predictions = exp_preds / np.sum(exp_preds)

        predicted_index = np.random.choice(range(total_words), p=predictions)
        output_word = tokenizer.index_word.get(predicted_index, "<OOV>")

        seed_text += " " + output_word

    return seed_text

In [None]:
seed_text = "Sherlock Holmes took his bottle from the corner of the mantel-piece"
print("\nSeed text:", seed_text)
print("Generated text:")
print(predict_next_words(seed_text, next_words=15))