In [None]:
# Importing required libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, LayerNormalization, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
# !pip install gensim  # uncomment to install
from gensim.models import KeyedVectors
# Load the pretrained GoogleNews word2vec vectors (stored in the binary word2vec format)
word2vec = KeyedVectors.load_word2vec_format(
    '/kaggle/input/nlpword2vecembeddingspretrained/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
# The poems were copied from the Project Gutenberg website and pasted into poems.txt
import re

# Lines consisting only of one of these Roman numerals are section numbers, not verse
ROMAN_NUMERAL_HEADINGS = ["I", "II", "III", "IV", "V", "VI", "VII"]
# Characters trimmed from both ENDS of a line (str.strip never touches the middle of the string)
punctuation = '!"#$%&\'()*+/:;<=>?@[\\]^_`{|}~'

corpus = []
# The text contains non-ASCII characters (e.g. the em dash replaced below), so the encoding is
# stated explicitly instead of depending on the platform default.
with open('/kaggle/input/poems-frost/poems.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Drop the newline AND any surrounding whitespace: with leading indentation or trailing
        # spaces left in place, the punctuation strip below would silently do nothing.
        line = line.strip()
        # Skip empty lines, section numbers and poem headings (written in capitals)
        if not line or line in ROMAN_NUMERAL_HEADINGS or line.isupper():
            continue
        # Lower-case the line, trim punctuation at its ends, and turn hyphens/dashes into spaces
        # so that the words they join are tokenized separately.
        cleaned_line = (
            line.lower()
            .strip(punctuation)
            .replace('-', ' ')
            .replace('—', ' ')
            .replace(' \' ', ' ')
            .replace('\"', '')
            .replace(':', '')
            .replace(';', '')
        )
        corpus.append(cleaned_line)

# Join all poem lines into a single text and collapse every run of whitespace into one space
text = ' '.join(corpus)
text = re.sub(r'\s+', ' ', text)
print(text[:200])

In [None]:
# Fit a word-level tokenizer on the full text to build the word -> index vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Token number 0 is the padding token, hence one slot more than the fitted vocabulary
total_words = 1 + len(tokenizer.word_index)
print(f"Total words in vocabulary: {total_words}")

In [None]:
# Embedding matrix: row k holds the pretrained word2vec vector of the word whose tokenizer index is k
embedding_dim = word2vec.vector_size
embedding_matrix = np.zeros(shape=(total_words, embedding_dim))

# Words that are missing from word2vec keep their all-zero row
for token, row in tokenizer.word_index.items():
    if token not in word2vec:
        continue
    embedding_matrix[row] = word2vec[token]

In [None]:
# Length of every training row: max_seq_len - 1 context tokens followed by 1 target token.
max_seq_len = 50  # Adjust this as needed
token_list = tokenizer.texts_to_sequences([text])[0]

# Slide a window over the corpus. Window i ends on token i (the word to predict) and holds at
# most max_seq_len tokens. The start offset is i - max_seq_len + 1 so the slice already has the
# final length, rather than being one token too long and relying on pad_sequences to cut it.
input_sequences = [
    token_list[max(0, i - max_seq_len + 1):i + 1]
    for i in range(1, len(token_list))
]

# Left-pad the shorter windows from the start of the corpus with 0 to get a uniform length.
# pad_sequences already returns a NumPy array, so no extra np.array() copy is needed.
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

print(input_sequences[:5])

In [None]:
# Separate every padded row into its context (all tokens but the last) and its target (the last token)
X, y = input_sequences[:, :-1], input_sequences[:, -1]
# One-hot encode the target token ids over the whole vocabulary
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [None]:
# Show the shapes of the inputs and of the one-hot targets, one per line
print(X.shape, y.shape, sep='\n')

In [None]:
# Perplexity: exponential of the mean categorical cross-entropy
def perplexity(y_true, y_pred):
    """Return exp(mean categorical cross-entropy) of the predictions.

    Args:
        y_true: Target distributions, passed unchanged to
            `tf.keras.losses.categorical_crossentropy`.
        y_pred: Predicted distributions for the same samples.

    Returns:
        A scalar tensor: the exponential of the cross-entropy averaged over
        everything passed in.
    """
    # NOTE(review): when used as a compile-time metric this is evaluated on each batch;
    # Keras presumably averages those per-batch values, which is not the same as the
    # perplexity of the whole data set -- confirm before comparing numbers.
    per_sample_loss = tf.keras.losses.categorical_crossentropy(y_true, y_pred)
    return tf.exp(tf.reduce_mean(per_sample_loss))

In [None]:
# Hold out 20% of the samples as a test set; the fixed random_state makes the split repeatable.
# NOTE(review): the rows of X are overlapping sliding windows over one text, so a shuffled split
# puts near-identical contexts into both sets -- test metrics are likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Each sample in X has max_seq_len - 1 context tokens (the last token of every padded row is the
# target), and generate_text pads its input to the same length. The declared input length must
# therefore be max_seq_len - 1; declaring max_seq_len mismatches the data the model is fitted on.
context_len = max_seq_len - 1

# Optional extras left disabled: L2 kernel/recurrent regularization (l2(1e-4)) on the LSTM and
# Dense layers, LayerNormalization, and a second Dropout after the last LSTM.
model = Sequential([
    Input(shape=(context_len,)),
    # Start from the pretrained word2vec vectors and keep them frozen. A Constant initializer is
    # used instead of the `weights=` argument, which is not handled the same way by every Keras
    # version.
    Embedding(input_dim=total_words,
              output_dim=embedding_dim,
              embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
              trainable=False),
    # First recurrent layer returns the full sequence so the next LSTM can consume it
    Bidirectional(LSTM(units=128, return_sequences=True)),
    Dropout(rate=0.3),
    BatchNormalization(),
    # Second recurrent layer returns only its final state
    LSTM(units=64),
    # Probability distribution over the vocabulary for the next word
    Dense(units=total_words, activation='softmax'),
])

from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.optimizers import Adam

# Optional learning-rate schedule, left disabled:
# lr_schedule = ExponentialDecay(
#     initial_learning_rate=1e-3,
#     decay_steps=1000,
#     decay_rate=0.95,
#     staircase=True)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', perplexity])
# summary() prints the table itself and returns None, so wrapping it in print() adds a stray "None"
model.summary()

In [None]:
# Training configuration
batch_size = 16
epochs = 100  # upper bound; early stopping may end training sooner

# Monitor the loss on a slice of the training data that is held out from fitting, stop once it
# has not improved for `patience` consecutive epochs, and restore the best weights seen. This
# limits overfitting without touching the test set.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Train the model
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[early_stopping],
    verbose=1,
)

In [None]:
# Evaluate the model on the test set. `results` lists the loss first, then the compiled metrics
# in order, so results[1] is the accuracy.
results = model.evaluate(X_test, y_test, verbose=1)
y_pred = model.predict(X_test)

# Perplexity computed over all test predictions at once. Converted to a plain Python float so
# the ':.2f' format below does not depend on how the tensor type implements __format__.
test_perplexity = float(perplexity(y_test, y_pred))
print(f"Test Accuracy: {results[1] * 100:.2f}%")
print(f"Test Perplexity: {test_perplexity:.2f}")

In [None]:
def generate_text(seed_text, next_words=20):
    """Greedily extend `seed_text` with the model's most likely next words, printing as it goes.

    The seed is printed first; every predicted word is printed as soon as it is chosen
    (space-separated, no trailing newline). Nothing is returned.

    Args:
        seed_text: Text to start from; it is converted to token ids with the fitted
            `tokenizer`.
        next_words: Maximum number of words to generate. Generation stops earlier if the
            model predicts an index that has no word (e.g. the padding index 0).
    """
    print(seed_text, end=' ')
    # Tokenize the seed once and grow the id list, instead of re-tokenizing the whole
    # (ever longer) text on every step.
    context = tokenizer.texts_to_sequences([seed_text])[0]
    for _ in range(next_words):
        # Left-pad short contexts with 0; for long ones only the most recent
        # max_seq_len - 1 tokens are kept (pad_sequences truncates from the front).
        model_input = pad_sequences([context], maxlen=max_seq_len - 1, padding='pre')
        prediction_output = model.predict(model_input, verbose=0)
        predicted = int(np.argmax(prediction_output, axis=-1)[0])
        output_word = tokenizer.index_word.get(predicted)
        if output_word is None:
            # The softmax also covers index 0 (padding), which has no entry in index_word.
            # Indexing directly would raise KeyError, and feeding it back would not change
            # the context, so stop here.
            break
        context.append(predicted)
        print(output_word, end=' ')

In [None]:
# Generate 100 words with the trained model, starting from the seed phrase below
seed_text = "deep down the"  # edit the seed phrase as required
generate_text(seed_text=seed_text, next_words=100)

In [None]:
# Generate 100 words starting from the seed phrase below
seed_text = "I wish"  # edit the seed phrase as required
generate_text(seed_text=seed_text, next_words=100)

In [None]:
# Generate 100 words starting from the seed phrase below
seed_text = "The woods are"  # edit the seed phrase as required
generate_text(seed_text=seed_text, next_words=100)

In [None]:
# Generate 100 words starting from the seed phrase below
seed_text = "Once upon"  # edit the seed phrase as required
generate_text(seed_text=seed_text, next_words=100)

In [None]:
# Generate 100 words starting from the seed phrase below
seed_text = "The sun sets"  # edit the seed phrase as required
generate_text(seed_text=seed_text, next_words=100)