In [31]:
# imports
import numpy as np
import tensorflow as tf
# !pip install gensim #install gensim if not installed
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Bidirectional, Dense, Input, Dropout,LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback
import random
import numpy



In [None]:
# Load and prepare the dataset
def load_dataset(file_path, encoding='utf-8'):
    """Read a text file and return its lines without line terminators.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        list[str]: One entry per line, as produced by ``str.splitlines``
        (blank lines are kept as empty strings).
    """
    # An explicit encoding keeps non-ASCII characters (curly quotes, dashes,
    # accented letters) from being mis-decoded on machines whose default
    # encoding is not UTF-8.
    with open(file_path, 'r', encoding=encoding) as f:
        return f.read().splitlines()

# Tokenize text
def tokenize_text(texts):
    """Fit a Keras ``Tokenizer`` on the non-blank lines and encode them.

    Args:
        texts: Iterable of raw text lines.

    Returns:
        tuple: ``(sequences, tokenizer)`` where ``sequences`` is the output of
        ``tokenizer.texts_to_sequences`` for the stripped, non-blank lines and
        ``tokenizer`` is the fitted ``Tokenizer``.
    """
    # Strip every line and keep only those that still contain text.
    cleaned_lines = []
    for raw_line in texts:
        stripped = raw_line.strip()
        if stripped:
            cleaned_lines.append(stripped)

    # An empty filter string means the Tokenizer strips no characters, so
    # punctuation stays attached to the neighbouring word.
    word_tokenizer = Tokenizer(filters='')
    word_tokenizer.fit_on_texts(cleaned_lines)
    return word_tokenizer.texts_to_sequences(cleaned_lines), word_tokenizer

def train_word2vec_model(texts, vector_size=100):
    """Train a gensim ``Word2Vec`` model on whitespace-split lines.

    Args:
        texts: Iterable of text lines; each line is split on whitespace.
        vector_size: Dimensionality of the learned word vectors.

    Returns:
        Word2Vec: The trained model (window of 5, every word kept, 4 workers).
    """
    tokenised_lines = [line.split() for line in texts]
    return Word2Vec(
        tokenised_lines,
        vector_size=vector_size,
        window=5,
        min_count=1,
        workers=4,
    )

In [None]:
from tensorflow.keras.layers import Dense, Embedding, LSTM, Input, Dropout, Bidirectional,Concatenate

def create_seq2seq_model(input_vocab_size, output_vocab_size, embedding_dim, encoder_hidden_size, decoder_hidden_size, embedding_layer, dropout_rate=0.3):
    """Assemble a BiLSTM-encoder / LSTM-decoder model sharing one embedding layer.

    Args:
        input_vocab_size: Not read by this function; kept in the signature.
        output_vocab_size: Number of units in the final softmax layer.
        embedding_dim: Not read by this function; the embedding width comes
            from ``embedding_layer``.
        encoder_hidden_size: Units per direction of the encoder BiLSTM.
        decoder_hidden_size: Units of the decoder LSTM.
        embedding_layer: Layer applied to both encoder and decoder token inputs.
        dropout_rate: Dropout rate applied to the decoder LSTM outputs.

    Returns:
        Model: Maps ``[encoder_tokens, decoder_tokens]`` to a softmax
        distribution over ``output_vocab_size`` entries at every decoder step.
    """
    # ---- Encoder ----
    enc_tokens = Input(shape=(None,))
    enc_embedded = embedding_layer(enc_tokens)
    bidirectional_encoder = Bidirectional(LSTM(encoder_hidden_size, return_state=True))
    # Only the final states are used downstream; the encoder output is dropped.
    _, fwd_h, fwd_c, bwd_h, bwd_c = bidirectional_encoder(enc_embedded)

    # Join both directions: each merged state is 2 * encoder_hidden_size wide.
    merged_h = Concatenate()([fwd_h, bwd_h])
    merged_c = Concatenate()([fwd_c, bwd_c])

    # Linear projections resize the merged states to decoder_hidden_size so
    # they can serve as the decoder LSTM's initial hidden and cell state.
    init_h = Dense(decoder_hidden_size)(merged_h)
    init_c = Dense(decoder_hidden_size)(merged_c)

    # ---- Decoder ----
    dec_tokens = Input(shape=(None,))
    dec_embedded = embedding_layer(dec_tokens)
    decoder_rnn = LSTM(decoder_hidden_size, return_state=True, return_sequences=True)
    dec_sequence, _, _ = decoder_rnn(dec_embedded, initial_state=[init_h, init_c])

    # Regularise the decoder outputs before the vocabulary projection.
    dec_sequence = Dropout(dropout_rate)(dec_sequence)
    token_probs = Dense(output_vocab_size, activation='softmax')(dec_sequence)

    return Model([enc_tokens, dec_tokens], token_probs)

class PerplexityCallback(Callback):
    """Keras callback that prints the training perplexity after every epoch.

    Perplexity is computed as ``exp(loss)`` from the ``'loss'`` entry of the
    epoch logs.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Print ``exp(loss)`` for the epoch that just finished.

        Args:
            epoch: Zero-based epoch index; it is printed one-based.
            logs: Metric dict for the epoch. May be ``None`` (the declared
                default), in which case no loss is available.
        """
        # `logs` defaults to None, so guard before calling .get() on it.
        logs = logs or {}
        loss = logs.get('loss')
        # 0 is the placeholder printed when no loss was reported.
        perplexity = np.exp(loss) if loss is not None else 0
        print(f' Epoch {epoch+1} - Perplexity: {perplexity}')

In [None]:
# Read the poem file, one list entry per line
texts = load_dataset('/kaggle/input/frost-poems/frost_poems.txt')
# replace by suitable path

# Encode the lines as integer sequences; the +1 leaves row 0 for the padding value
sequences, tokenizer = tokenize_text(texts)
vocab_size = 1 + len(tokenizer.word_index)

# Pretrained word2vec vectors
from gensim.models import KeyedVectors
word2vec_model = KeyedVectors.load_word2vec_format(
    '/kaggle/input/nlpword2vecembeddingspretrained/GoogleNews-vectors-negative300.bin',
    binary=True,
)
# replace by suitable path

# Embedding matrix: row r holds the pretrained vector of the word with index r.
# Words missing from word2vec keep an all-zero row.
embedding_dim = word2vec_model.vector_size
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for token, row in tokenizer.word_index.items():
    if token not in word2vec_model:
        continue
    embedding_matrix[row] = word2vec_model[token]

# Pair every line with the line that follows it: line k is the encoder input,
# line k + 1 is the decoder target.
input_sequences = sequences[:-1]
output_sequences = sequences[1:]

# Post-pad both sides to the longest sequence on that side
max_input_len = max(map(len, input_sequences))
max_output_len = max(map(len, output_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_input_len, padding='post')
output_sequences = pad_sequences(output_sequences, maxlen=max_output_len, padding='post')

# 80/20 train/test split in file order (no shuffling)
data_size = len(input_sequences)
split_index = int(data_size * 0.8)
train_input, test_input = input_sequences[:split_index], input_sequences[split_index:]
train_output, test_output = output_sequences[:split_index], output_sequences[split_index:]

# Materialise every split as its own numpy array
train_input, train_output, test_input, test_output = (
    np.array(part) for part in (train_input, train_output, test_input, test_output)
)

In [30]:

# Seed Python, NumPy and TensorFlow so that weight initialisation and the
# sampled predictions below can be reproduced (as far as the backend allows).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Create the Seq2Seq model.
# The embedding layer is initialised from the pretrained matrix and frozen.
embedding_layer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)
encoder_hidden_size = 100
decoder_hidden_size = 200
dropout = 0.3
learning_rate=0.0005
model = create_seq2seq_model(vocab_size, vocab_size, embedding_dim, encoder_hidden_size, decoder_hidden_size, embedding_layer,dropout)

# Compile the model
model.compile(optimizer=Adam(learning_rate), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# Train the model with Perplexity callback and teacher forcing:
# the decoder is fed target[:, :-1] and must predict target[:, 1:].
# NOTE(review): padded positions (index 0) are part of the targets and no mask
# or sample weights are passed, so loss/accuracy/perplexity include padding —
# confirm whether that is intended.
model.fit(
    [train_input, train_output[:, :-1]],  # input and shifted output
    np.expand_dims(train_output[:, 1:], -1),  # target for next word
    batch_size=16,
    epochs=100,
    callbacks=[PerplexityCallback()]
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate([test_input, test_output[:, :-1]], np.expand_dims(test_output[:, 1:], -1))
test_perplexity = np.exp(test_loss)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")
print(f"Test Perplexity: {test_perplexity}")

print("\nSample Predictions using the test set:")

for i in range(5):
    # Pick a random test pair (the same pair may be drawn more than once).
    idx = random.randint(0, len(test_input) - 1)
    input_seq = test_input[idx]
    actual_output_seq = test_output[idx]
    # Decode token ids back to words, skipping padding (0).
    input_words = [tokenizer.index_word.get(token_id, '<UNK>') for token_id in input_seq if token_id != 0]
    actual_words = [tokenizer.index_word.get(token_id, '<UNK>') for token_id in actual_output_seq if token_id != 0]
    input_seq_expanded = np.expand_dims(input_seq, 0)
    # Teacher-forced prediction: the true target (minus its last token) is the decoder input.
    predicted_output = model.predict([input_seq_expanded, np.expand_dims(actual_output_seq[:-1], 0)], verbose=0)
    # Sample one token per timestep from the predicted distribution.
    pred_indices = [np.random.choice(len(prob), p=prob) for prob in predicted_output[0]]
    predicted_words = [tokenizer.index_word.get(token_id, '<UNK>') for token_id in pred_indices if token_id != 0]
    print(f"Input: {' '.join(input_words)}")
    print(f"Actual Output: {' '.join(actual_words)}")
    print(f"Predicted Output: {' '.join(predicted_words)}")
    print()

None
Epoch 1/100
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.3684 - loss: 6.0807 Epoch 1 - Perplexity: 146.61437540661643
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.3686 - loss: 6.0726
Epoch 2/100
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.4107 - loss: 4.1415 Epoch 2 - Perplexity: 63.71857783196339
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.4107 - loss: 4.1416
Epoch 3/100
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.4121 - loss: 4.0784 Epoch 3 - Perplexity: 59.28741064767698
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.4121 - loss: 4.0784
Epoch 4/100
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.4104 - loss: 4.0373 Epoch 4 - Perplexity: 56.81989193887974
[1m133/133[0m [32m

In [None]:
def generate_poetry(seed_text, model, tokenizer, max_len, num_lines=5):
    """Generate lines of text, feeding each generated line back as the next encoder input.

    Args:
        seed_text: Text used as the encoder input for the first generated line.
        model: Model called as ``model.predict([encoder_tokens, decoder_tokens])``
            that returns a probability distribution over token indices for
            every decoder timestep. Assumed to be trained with teacher forcing
            (decoder input ``target[:, :-1]``, labels ``target[:, 1:]``).
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        max_len: Padded length of the encoder/decoder inputs; also the maximum
            number of words generated per line.
        num_lines: Number of lines to generate.

    Returns:
        str: The generated lines joined with newlines. A line is empty when
        the first sampled token is the padding index 0.
    """
    poetry = []
    input_seq = tokenizer.texts_to_sequences([seed_text])[0]
    input_seq = pad_sequences([input_seq], maxlen=max_len, padding='post')

    for _ in range(num_lines):
        decoder_input = np.zeros((1, max_len), dtype=int)
        generated_line = []
        for _step in range(max_len):
            output = model.predict([input_seq, decoder_input], verbose=0)
            # With teacher forcing, the output at timestep t is the distribution
            # for the word that follows decoder input t. The last generated
            # word sits at position len(generated_line) - 1, so the next word
            # must be read there; reading one step later would condition the
            # prediction on a trailing padding token.
            # NOTE(review): for the very first word nothing has been generated
            # yet, so position 0 is read with a padding input — the training
            # data has no start token; confirm whether one should be added.
            last_pos = max(len(generated_line) - 1, 0)
            next_word_prob = output[0, last_pos, :]
            next_word_idx = np.random.choice(len(next_word_prob), p=next_word_prob)
            # Index 0 is padding: treat it as end of line.
            if next_word_idx == 0:
                break
            generated_line.append(next_word_idx)
            decoder_input[0, len(generated_line) - 1] = next_word_idx
        generated_line_text = ' '.join([tokenizer.index_word.get(idx, '<UNK>') for idx in generated_line])
        poetry.append(generated_line_text)
        # The line just produced becomes the encoder input for the next one.
        input_seq = pad_sequences([tokenizer.texts_to_sequences([generated_line_text])[0]], maxlen=max_len, padding='post')

    return '\n'.join(poetry)

# Generate a short poem from a hand-written opening line and show it.
seed = "The sun sets beyond the distant hill"
generated_poetry = generate_poetry(
    seed_text=seed,
    model=model,
    tokenizer=tokenizer,
    max_len=max_input_len,
)
print("Generated Poetry:", generated_poetry, sep="\n")