<a href="https://colab.research.google.com/github/aryanarora07/ML-AI/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing necessary libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Step 1: Load and preprocess the data
with open("sherlock.txt", "r", encoding="utf-8") as file:
    text = file.read().lower()  # Read the file and convert to lowercase

# Step 2: Clean the text (removing HTML tags, extra spaces, etc.)
text = re.sub(r'<.*?>', '', text)  # Remove any HTML tags
text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space

# Step 3: Tokenization (split the text into words)
words = text.split()  # Split text into words

# Step 4: Create a Tokenizer to convert words into integers
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])  # Fit tokenizer on the text

# Create a dictionary of word-to-index
word_to_int = tokenizer.word_index
int_to_word = {i: word for word, i in word_to_int.items()}

# Total number of unique words
vocab_size = len(word_to_int) + 1  # +1 because indexing starts from 1

# Step 5: Prepare sequences for training
sequence_length = 50  # Length of input sequence (number of words)
sequences = []

for i in range(sequence_length, len(words)):
    seq = words[i-sequence_length:i]  # Create sequence of 50 words
    sequences.append(' '.join(seq))

# Step 6: Prepare X and y (X: input, y: output)
X = tokenizer.texts_to_sequences(sequences)

# Pad the sequences so they have the same length (sequence_length)
X = pad_sequences(X, maxlen=sequence_length, padding='pre')

# The next word (y) is the word that follows the sequence
y = [seq.split()[-1] for seq in sequences]  # Last word of each sequence

# Convert y to a sequence of integers
y = tokenizer.texts_to_sequences(y)

# Convert y to a numpy array
# Original: y = np.array(y)
y = np.array([item[0] if item else 0 for item in y]) # Replace empty list with 0, take the first element otherwise

# One-hot encode the output (y) using to_categorical
y = to_categorical(y, num_classes=vocab_size)  # One-hot encode the output



# Step 7: Build the LSTM model
model = Sequential()

# Input Layer (Embedding Layer)
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=sequence_length))

# LSTM Layer
model.add(LSTM(128, return_sequences=False))

# Dropout Layer (for regularization)
model.add(Dropout(0.2))

# Dense Layer (Output Layer)
model.add(Dense(vocab_size, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Display model summary
model.summary()

# Step 8: Train the model
model.fit(X, y, batch_size=64, epochs=50, validation_split=0.1)

# Step 9: Save the trained model
model.save('sherlock_text_generator_model.h5')





Epoch 1/50
[1m1512/1512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 11ms/step - accuracy: 0.1273 - loss: 6.0619 - val_accuracy: 0.5479 - val_loss: 3.7603
Epoch 2/50
[1m1512/1512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 11ms/step - accuracy: 0.6510 - loss: 2.7076 - val_accuracy: 0.7304 - val_loss: 2.5605
Epoch 3/50
[1m1512/1512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 10ms/step - accuracy: 0.8208 - loss: 1.4636 - val_accuracy: 0.7948 - val_loss: 2.1998
Epoch 4/50
[1m1512/1512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 10ms/step - accuracy: 0.8895 - loss: 0.9072 - val_accuracy: 0.8289 - val_loss: 2.0162
Epoch 5/50
[1m1512/1512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 10ms/step - accuracy: 0.9263 - loss: 0.5928 - val_accuracy: 0.8520 - val_loss: 1.9250
Epoch 6/50
[1m1512/1512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 10ms/step - accuracy: 0.9542 - loss: 0.3800 - val_accuracy: 0.8636 - val_loss: 1.9012
Epoc



In [None]:
# Step 10: Generate text
def generate_text(seed_text, num_words):
    for _ in range(num_words):
        # Convert the seed text to sequence
        sequence = tokenizer.texts_to_sequences([seed_text])
        sequence = np.array(sequence)

        # Predict the next word
        predicted_probs = model.predict(sequence, verbose=0)
        predicted_idx = np.argmax(predicted_probs, axis=-1)

        # Convert the predicted index to a word
        predicted_word = int_to_word[predicted_idx[0]]

        # Append the predicted word to the seed text
        seed_text += ' ' + predicted_word
    return seed_text

# Example: Generating 100 words of text starting from a seed text
seed_text = "The Project Gutenberg"
generated_text = generate_text(seed_text, 100)
print(generated_text)
