In [11]:
# Libraries
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datasets import load_dataset
import numpy as np

In [13]:
# Configuration (Reduced for faster training)
SEQ_LEN = 30          # Reduced sequence length
BATCH_SIZE = 128      # Optimized batch size for GPU
EMBED_DIM = 64        # Smaller embedding dimension
LSTM_UNITS = 128      # Reduced LSTM units
EPOCHS = 10          # Start with fewer epochs

In [14]:
# Load and preprocess data
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
text = " ".join([t for t in dataset["train"]["text"] if t.strip()][:5000])  # Use subset for testing
text = text.lower()
text



In [15]:
# Tokenization
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>", filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts([text])
total_words = min(10000, len(tokenizer.word_index) + 1)
total_words

10000

In [16]:
import pickle

# Save tokenizer
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


In [17]:
# Create sequences
sequences = tokenizer.texts_to_sequences([text])[0]
input_seqs = []
for i in range(SEQ_LEN, len(sequences)):
    input_seqs.append(sequences[i-SEQ_LEN:i+1])
input_seqs = np.array(input_seqs)
X, y = input_seqs[:, :-1], input_seqs[:, -1]

In [18]:
# Build optimized model
model = tf.keras.Sequential([
    Embedding(total_words, EMBED_DIM, input_length=SEQ_LEN),
    Bidirectional(LSTM(LSTM_UNITS)),  # Bidirectional for better context
    Dense(total_words, activation='softmax')
])

In [19]:
# Enable GPU acceleration
with tf.device('/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'):
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    history = model.fit(X, y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.1)






Epoch 1/10
[1m2534/2534[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m345s[0m 135ms/step - accuracy: 0.0967 - loss: 6.8601 - val_accuracy: 0.1484 - val_loss: 6.4708
Epoch 2/10
[1m2534/2534[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m297s[0m 117ms/step - accuracy: 0.1481 - loss: 5.9704 - val_accuracy: 0.1594 - val_loss: 6.3022
Epoch 3/10
[1m2534/2534[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m290s[0m 114ms/step - accuracy: 0.1689 - loss: 5.4350 - val_accuracy: 0.1656 - val_loss: 6.2599
Epoch 4/10
[1m2534/2534[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m362s[0m 143ms/step - accuracy: 0.1872 - loss: 4.9958 - val_accuracy: 0.1672 - val_loss: 6.2967
Epoch 5/10
[1m2534/2534[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m369s[0m 146ms/step - accuracy: 0.2064 - loss: 4.6243 - val_accuracy: 0.1665 - val_loss: 6.3954
Epoch 6/10
[1m2534/2534[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m406s[0m 160ms/step - accuracy: 0.2240 - loss: 4.3009 - val_accuracy: 0.1644 - val_loss:

In [43]:
def predict_next_word(seed_text, model, tokenizer, seq_len=SEQ_LEN, top_n=1):
    # Tokenize and pad the input text
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=seq_len, padding='pre')
    
    # Get predictions
    preds = model.predict(token_list, verbose=0)[0]
    top_indices = np.argsort(preds)[-top_n:][::-1]  # Get top N predictions
    
    # Extract words and probabilities
    predicted_words = [tokenizer.index_word.get(i, "?") for i in top_indices]
    
    # Format into a sentence
    if len(predicted_words) == 1:
        return f"{seed_text} {predicted_words[0]}"
    else:
        return f"{seed_text} {', '.join(predicted_words[:-1]) + " "+ predicted_words[-1]}"

# Example usage
print(predict_next_word("Machine learning is a", model, tokenizer))

Machine learning is a popular


In [44]:
model.save("next_word_model.h5")


