In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

# Training corpus: a small, hand-written list of short English sentences.
# The tokenizer below is fitted on these sentences, and every sentence is
# later expanded into its word prefixes to form (context -> next word)
# training pairs, so the model can only ever predict words that occur here.
text_data = [
    "i love machine learning",
    "i love deep learning",
    "machine learning is amazing",
    "deep learning is powerful",
    "rnn is great for sequence data",
    "tensorflow helps with deep learning",
    "artificial intelligence is the future",
    "deep learning powers modern AI",
    "neural networks can recognize patterns",
    "data science relies on machine learning",
    "supervised learning needs labeled data",
    "unsupervised learning finds hidden patterns",
    "reinforcement learning improves decision making",
    "python is great for machine learning",
    "tensorflow makes deep learning easier",
    "keras simplifies neural network development",
    "pytorch is popular among researchers",
    "scikit learn is useful for data science",
    "matplotlib helps visualize data",
    "self driving cars use deep learning",
    "natural language processing helps chatbots",
    "speech recognition converts voice to text",
    "computer vision detects objects in images",
    "AI recommends movies based on preferences"
]

# Build the vocabulary: every distinct word in the corpus gets an integer id
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
total_words = 1 + len(tokenizer.word_index)  # one extra slot for the padding token

# Expand each encoded sentence into all of its prefixes of two or more tokens,
# e.g. [a, b, c] -> [a, b], [a, b, c]; the last token of a prefix is the target
input_sequences = [
    encoded[:end]
    for encoded in tokenizer.texts_to_sequences(text_data)
    for end in range(2, len(encoded) + 1)
]

# Left-pad every prefix to the length of the longest one so they stack into
# a single 2-D array with the target word always in the final column
max_sequence_length = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

# Everything but the last column is the context (X); the last column is the
# next-word label, one-hot encoded over the vocabulary (y)
X = input_sequences[:, :-1]
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

# Build the next-word model: token ids -> 10-d embeddings -> LSTM -> softmax
# over the whole vocabulary.
model = Sequential([
    # Declare the input shape with an explicit Input instead of Embedding's
    # `input_length` argument, which is deprecated in Keras 3. Each sample is
    # a context of max_sequence_length - 1 token ids (the final token of every
    # padded sequence was split off as the label).
    tf.keras.Input(shape=(max_sequence_length - 1,)),
    Embedding(total_words, 10),
    # Use the LSTM's default tanh activation. An unbounded ReLU inside the
    # recurrent cell is prone to unstable training and also rules out the
    # fused cuDNN implementation on GPU.
    LSTM(50),
    # One probability per vocabulary entry: the predicted next word
    Dense(total_words, activation='softmax'),
])

# Labels are one-hot vectors, hence categorical (not sparse) cross-entropy
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train on the (context, next word) pairs
model.fit(X, y, epochs=100, verbose=1)

# Function to predict the next word
def predict_next_word(seed_text):
    """Return the word the trained model considers most likely to follow *seed_text*.

    The seed is encoded with the module-level ``tokenizer``, left-padded to the
    context length the model was trained on (``max_sequence_length - 1``), and
    fed to ``model``. The vocabulary entry with the highest predicted
    probability is returned.

    Args:
        seed_text: The text to continue, as a single string.

    Returns:
        The predicted next word, or ``"?"`` when the most likely index does not
        correspond to any word in the tokenizer's vocabulary.
    """
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    padded = pad_sequences([encoded], maxlen=max_sequence_length - 1, padding='pre')
    # verbose=0: this helper returns a value; it should not print a progress bar
    probabilities = model.predict(padded, verbose=0)
    # A single sample was passed in, so only row 0 of the output is meaningful
    predicted_index = int(np.argmax(probabilities[0]))
    # index_word is the tokenizer's id -> word mapping, so a direct lookup
    # replaces a linear scan over word_index
    return tokenizer.index_word.get(predicted_index, "?")

# Demo: ask the trained model to extend a couple of two-word prompts
for seed_text in ("machine learning", "deep learning"):
    next_word = predict_next_word(seed_text)
    print(f"'{seed_text}' → '{seed_text} {next_word}'")


Epoch 1/100
4/4 ━━━━━━━━━━━━━━━━━━━━ 11s 38ms/step - accuracy: 0.0182 - loss: 4.4065
Epoch 2/100
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 33ms/step - accuracy: 0.1362 - loss: 4.4026
Epoch 3/100
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 36ms/step - accuracy: 0.1352 - loss: 4.3993
Epoch 4/100
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 40ms/step - accuracy: 0.1445 - loss: 4.3950
Epoch 5/100
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 40ms/step - accuracy: 0.1362 - loss: 4.3913
Epoch 6/100
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 36ms/step - accuracy: 0.1393 - loss: 4.3851
Epoch 7/100
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 39ms/step - accuracy: 0.1478 - loss: 4.3779
Epoch 8/100
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 37ms/step - accuracy: 0.1093 - loss: 4.3721
Epoch 9/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━