<a href="https://colab.research.google.com/github/amrit2603/Gen-AI/blob/main/RnnExample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np

In [None]:
def load_data(file_path):
  """Read the text file at `file_path` and return its contents as one string.

  The file is decoded explicitly as UTF-8 so the result does not depend on
  the platform's default locale encoding.

  Args:
    file_path: Path of the text file to read.

  Returns:
    The complete file contents as a `str`.
  """
  with open(file_path, 'r', encoding='utf-8') as file:
    return file.read()

# Load the corpus from disk and lower-case the whole text.
file_path = '/content/01 Harry Potter and the Sorcerers Stone.txt'
text  = load_data(file_path).lower()

In [None]:
# Preview only the first 500 characters; echoing the entire book would flood
# the cell output and bloat the saved notebook.
text[:500]

In [None]:
#Tokenization

from tensorflow.keras.preprocessing.text import Tokenizer

# NOTE: this rebinds the imported class name `Tokenizer` to an instance. The
# name is kept because later cells call `Tokenizer.texts_to_sequences(...)`
# and `Tokenizer.index_word`; the import at the top of this cell restores the
# class each time the cell is re-run.
Tokenizer = Tokenizer(oov_token='<OOV>')

# fit_on_texts expects a list of texts. Passing the bare string makes it
# iterate over single characters and build a character-level vocabulary, so
# the book is wrapped in a one-element list to get a word-level index.
Tokenizer.fit_on_texts([text])

# Vocabulary size: one more than the number of indexed words, leaving room
# for index 0 (usually reserved for padding).
total_words = len(Tokenizer.word_index) + 1

In [None]:
# Display the vocabulary size (number of word-index entries plus one).
total_words

In [None]:
# Convert text to sequences
input_sequences = []  # filled with training windows in a later cell
tokens = Tokenizer.texts_to_sequences([text])[0] # converts the input text into a list of numbers based on the word index

# Show the token count and a short prefix only; printing every token of the
# book produces an unreadable wall of output.
print(f"{len(tokens)} tokens; first 20: {tokens[:20]}")

In [None]:
seq_length = 50  # Each input sequence contains 50 words

# Every window holds seq_length context tokens followed by the one token to
# predict. A fresh list is assigned (rather than appending to a list created
# in another cell) so this cell is idempotent: re-running it cannot duplicate
# the windows, and it still works after `input_sequences` has been turned
# into a NumPy array further down.
input_sequences = [
    tokens[i - seq_length:i + 1]
    for i in range(seq_length, len(tokens))
]


In [None]:
# Inspect the first training window (seq_length context tokens followed by the target token).
print(input_sequences[0])

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
 # Stack the windows into a 2-D integer array. Every window already holds
 # seq_length + 1 tokens, which is also the maxlen requested here.
 # NOTE(review): these statements carry a stray one-space indent — confirm the
 # cell still runs when exported as a plain .py script, or dedent them.
 input_sequences = np.array(pad_sequences(input_sequences, maxlen=seq_length + 1, padding='pre'))
 X = input_sequences[:, :-1]  # every column except the last: the context tokens
 y = input_sequences[:, -1]  # last column: the token to predict

# after this X will have inputs and y will have label for those inputs


In [None]:
# One-hot encode the target token ids. They are read from the last column of
# `input_sequences` instead of re-assigning from `y`, so the cell is
# idempotent: re-running it can never one-hot encode an already encoded array.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN , Embedding , Dense

In [None]:
# Assemble the Simple RNN language model one layer at a time.
model = Sequential()
# Word embeddings (input_length intentionally not passed)
model.add(Embedding(input_dim=total_words, output_dim=64))
# Recurrent layer; only the final hidden state is returned
model.add(SimpleRNN(256, return_sequences=False))
# Fully connected layer
model.add(Dense(256, activation='relu'))
# Output layer: one softmax unit per vocabulary index
model.add(Dense(total_words, activation='softmax'))

In [None]:
# Configure training: Adam optimizer, cross-entropy over the one-hot targets,
# and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 passes over the data in mini-batches of 128 windows.
model.fit(
    X,
    y,
    batch_size=128,
    epochs=10,
)

In [None]:
# Sanity-check the training arrays: overall shapes plus the first five values
# of the first two rows of X (token ids) and y (one-hot vectors).
report = [
    f"Shape of X: {X.shape}",
    f"Shape of y: {y.shape}",
    f"First 5 elements of X[0]: {X[0][:5]}",
    f"First 5 elements of X[1]: {X[1][:5]}",
    f"First 5 elements of y[0]: {y[0][:5]}",
    f"First 5 elements of y[1]: {y[1][:5]}",
]
print("\n".join(report))

In [None]:
# Function to generate text using RNN
def generate_text(seed_text, next_words=50):
    """Greedily append `next_words` model-predicted words to `seed_text`.

    Relies on the notebook-level `Tokenizer` instance, `model`, `seq_length`
    and `pad_sequences`. On every step the whole running text is re-encoded,
    passed through `pad_sequences(maxlen=seq_length, padding='pre')`, and the
    index with the highest predicted probability is turned back into a word.
    An index missing from `Tokenizer.index_word` contributes an empty string.

    Returns the seed text followed by the generated words, space separated.
    """
    generated = seed_text
    for _step in range(next_words):
        encoded = Tokenizer.texts_to_sequences([generated])[0]
        model_input = pad_sequences([encoded], maxlen=seq_length, padding='pre')

        best_index = np.argmax(model.predict(model_input, verbose=0))
        next_word = Tokenizer.index_word.get(best_index, "")

        generated = generated + " " + next_word
    return generated


In [None]:
# Generate text using the trained model: continue the seed phrase with the
# default number of words (next_words=50) and print the result.
print(generate_text("harry looked at"))

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense
import numpy as np

# Load and preprocess text
def load_data(file_path):
    """Return the full contents of the UTF-8 text file at `file_path`."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

file_path = '/content/01 Harry Potter and the Sorcerers Stone.txt'
text = load_data(file_path).lower()

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling are repeatable across fresh runs.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

# Out-Of-Vocabulary token: a word not seen while fitting the tokenizer is
# replaced with '<OOV>', which helps handle unknown words instead of
# ignoring them.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts([text]) # analyzes the input text and creates a word index (mapping of words to unique integers)
total_words = len(tokenizer.word_index) + 1 #  0 is usually reserved for padding

# Convert text to sequences
tokens = tokenizer.texts_to_sequences([text])[0] # converts the input text into a list of numbers based on the word index
seq_length = 50  # Each input sequence contains 50 words

# First seq_length tokens (input): Used for training the model.
# Last token (target): Used as the label the model tries to predict.
# so total of (50 + 1) in one input_sequence index
input_sequences = [
    tokens[i - seq_length:i + 1]
    for i in range(seq_length, len(tokens))
]

# Pad sequences and split inputs/targets
# after this X will have inputs and y will have label for those inputs

input_sequences = np.array(pad_sequences(input_sequences, maxlen=seq_length + 1, padding='pre'))
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the labels , note- there are other ways for
# encoding like pre-trained word2vec encoding and so on

y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Build the Simple RNN model
# `input_length` is not passed to Embedding: it is deprecated in current
# Keras and the layer picks up the sequence length from the data.
model = Sequential([
    Embedding(input_dim=total_words, output_dim=64),  # Word embeddings
    SimpleRNN(256, return_sequences=False),  # RNN Layer
    Dense(256, activation='relu'),  # Fully Connected Layer
    Dense(total_words, activation='softmax')  # Output Layer
])

# 256 in RNN - The number of hidden units (size of the hidden state vector)
# return_sequences=False  - The RNN will only return the final hidden state after processing the entire sequence

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=30, batch_size=128)

# Function to generate text using RNN
def generate_text(seed_text, next_words=50):
    """Greedily append `next_words` model-predicted words to `seed_text`.

    Relies on the notebook-level `tokenizer`, `model`, `seq_length` and
    `pad_sequences`. On every step the whole running text is re-encoded,
    passed through `pad_sequences(maxlen=seq_length, padding='pre')`, and the
    index with the highest predicted probability is turned back into a word.
    An index missing from `tokenizer.index_word` contributes an empty string.

    Returns the seed text followed by the generated words, space separated.
    """
    generated = seed_text
    for _step in range(next_words):
        encoded = tokenizer.texts_to_sequences([generated])[0]
        model_input = pad_sequences([encoded], maxlen=seq_length, padding='pre')

        best_index = np.argmax(model.predict(model_input, verbose=0))
        next_word = tokenizer.index_word.get(best_index, "")

        generated = generated + " " + next_word
    return generated

# Generate text using the trained model: continue the seed phrase with the
# default number of words (next_words=50) and print the result.
print(generate_text("harry looked at"))


**The model learns local patterns, not long-term dependencies**

RNNs struggle with long-range dependencies because they do not retain information well over long sequences.
This is why the text seems grammatically okay but lacks deeper context.

**The model generates phrases based on probabilities**

It predicts the most likely next word given the past words.
It does not understand meaning but follows statistical patterns.

**It captures writing style but lacks coherence**

Words appear logically related but do not form a strong narrative. The model does not truly "understand" the book; it just mimics word usage.

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np

# Function to load dataset
def load_data(file_path):
    """Read `file_path` as UTF-8 text and return everything in it as one string."""
    source = open(file_path, 'r', encoding='utf-8')
    try:
        contents = source.read()
    finally:
        # Always release the file handle, even if reading fails.
        source.close()
    return contents

# Load Harry Potter book text
file_path ='/content/01 Harry Potter and the Sorcerers Stone.txt'
text = load_data(file_path).lower()

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, batch shuffling and the sampling in generate_text are
# repeatable across fresh runs.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Tokenize the text
# '<OOV>' is the placeholder used for words missing from the fitted word index.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1  # one extra slot for index 0

# Convert text into sequences
tokens = tokenizer.texts_to_sequences([text])[0]
seq_length = 50  # Each input sequence will have 50 words

# Each window holds seq_length context tokens followed by the token to predict.
input_sequences = [
    tokens[i - seq_length:i + 1]
    for i in range(seq_length, len(tokens))
]

# Pad sequences and split into inputs (X) and labels (y)
input_sequences = np.array(pad_sequences(input_sequences, maxlen=seq_length + 1, padding='pre'))
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)  # One-hot encode labels

# LSTM Model
# `input_length` is not passed to Embedding: it is deprecated in current
# Keras and the layer picks up the sequence length from the data.
model = Sequential([
    Embedding(input_dim=total_words, output_dim=100),
    LSTM(256, return_sequences=True),  # First LSTM layer
    LSTM(256),  # Second LSTM layer
    Dense(total_words, activation='softmax')
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=20, batch_size=128)

# Function to Generate Text
def generate_text(seed_text, next_words=50, temperature=1.0):
    """Extend `seed_text` by sampling `next_words` words from the model.

    Relies on the notebook-level `tokenizer`, `model`, `seq_length` and
    `pad_sequences`.

    Args:
        seed_text: Text to continue; it is re-encoded in full on every step.
        next_words: Number of sampling steps to perform.
        temperature: Positive scaling factor applied to the log-probabilities.
            Values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        The seed text followed by the sampled words, separated by spaces.

    Raises:
        ValueError: If `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=seq_length, padding='pre')

        # Work in float64: the float32 model output underflows to all zeros at
        # low temperatures, which turned the renormalisation into 0/0 (NaN).
        predicted_probs = np.asarray(model.predict(token_list, verbose=0)[0], dtype=np.float64)
        # Clip before the log so zero probabilities do not produce -inf warnings.
        scaled_logits = np.log(np.clip(predicted_probs, 1e-12, None)) / temperature
        # Shift by the maximum so the largest term is exp(0) = 1 and the sum
        # below can never be zero.
        scaled_logits -= np.max(scaled_logits)
        sampling_probs = np.exp(scaled_logits)
        sampling_probs /= np.sum(sampling_probs)
        predicted_index = int(np.random.choice(len(sampling_probs), p=sampling_probs))

        # Indices without an entry in index_word contribute an empty word.
        output_word = tokenizer.index_word.get(predicted_index, "")
        seed_text += " " + output_word

    return seed_text

# Generate text: sample 50 words after the seed phrase at temperature 0.7
# and print the result.
print(generate_text("harry looked at", next_words=50, temperature=0.7))


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
import numpy as np

# Function to load dataset
def load_data(file_path):
    """Load the corpus stored at `file_path` (decoded as UTF-8) into one string."""
    with open(file_path, encoding='utf-8') as corpus_file:
        contents = corpus_file.read()
        return contents

# Load Harry Potter book text
file_path = '/content/01 Harry Potter and the Sorcerers Stone.txt'
text = load_data(file_path).lower()

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, batch shuffling and the sampling in generate_text are
# repeatable across fresh runs.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Tokenize the text
# '<OOV>' is the placeholder used for words missing from the fitted word index.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1  # one extra slot for index 0

# Convert text into sequences
tokens = tokenizer.texts_to_sequences([text])[0]
seq_length = 50  # Each input sequence will have 50 words

# Each window holds seq_length context tokens followed by the token to predict.
input_sequences = [
    tokens[i - seq_length:i + 1]
    for i in range(seq_length, len(tokens))
]

# Pad sequences and split into inputs (X) and labels (y)
input_sequences = np.array(pad_sequences(input_sequences, maxlen=seq_length + 1, padding='pre'))
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)  # One-hot encode labels

# GRU Model
# `input_length` is not passed to Embedding: it is deprecated in current
# Keras and the layer picks up the sequence length from the data.
model = Sequential([
    Embedding(input_dim=total_words, output_dim=100),
    GRU(256, return_sequences=True),  # First GRU layer
    GRU(256),  # Second GRU layer
    Dense(total_words, activation='softmax')
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=20, batch_size=128)

# Function to Generate Text
def generate_text(seed_text, next_words=50, temperature=1.0):
    """Extend `seed_text` by sampling `next_words` words from the model.

    Relies on the notebook-level `tokenizer`, `model`, `seq_length` and
    `pad_sequences`.

    Args:
        seed_text: Text to continue; it is re-encoded in full on every step.
        next_words: Number of sampling steps to perform.
        temperature: Positive scaling factor applied to the log-probabilities.
            Values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        The seed text followed by the sampled words, separated by spaces.

    Raises:
        ValueError: If `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=seq_length, padding='pre')

        # Work in float64: the float32 model output underflows to all zeros at
        # low temperatures, which turned the renormalisation into 0/0 (NaN).
        predicted_probs = np.asarray(model.predict(token_list, verbose=0)[0], dtype=np.float64)
        # Clip before the log so zero probabilities do not produce -inf warnings.
        scaled_logits = np.log(np.clip(predicted_probs, 1e-12, None)) / temperature
        # Shift by the maximum so the largest term is exp(0) = 1 and the sum
        # below can never be zero.
        scaled_logits -= np.max(scaled_logits)
        sampling_probs = np.exp(scaled_logits)
        sampling_probs /= np.sum(sampling_probs)
        predicted_index = int(np.random.choice(len(sampling_probs), p=sampling_probs))

        # Indices without an entry in index_word contribute an empty word.
        output_word = tokenizer.index_word.get(predicted_index, "")
        seed_text += " " + output_word

    return seed_text

# Generate text: sample 50 words after the seed phrase at temperature 0.7
# and print the result.
print(generate_text("harry looked at", next_words=50, temperature=0.7))