In [None]:
import numpy as np
import pandas as pd
import re
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
import os

# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

In [None]:
# Location of the Poetry Foundation CSV. Set the POETRY_CSV environment
# variable to run the notebook on another machine; the default is the path
# this notebook was originally written against.
POETRY_CSV = os.environ.get(
    "POETRY_CSV", "/home/saruo/Downloads/archive/PoetryFoundationData.csv"
)
df = pd.read_csv(POETRY_CSV)

# Show only the first rows rather than rendering the whole frame.
df.head()

In [None]:
# Keep only real poem texts: a missing entry would reach `re.sub` as a float
# NaN during preprocessing and raise TypeError.
poem = df['Poem'].dropna().astype(str).tolist()

In [None]:
def remove_notations(poem):
    """Replace line-break characters in `poem` with spaces.

    Each run of consecutive newline / carriage-return characters becomes a
    single space. Deleting them outright would glue the last word of one line
    to the first word of the next ("roses\\nviolets" -> "rosesviolets") and
    create words that do not exist in the text.

    Args:
        poem: Raw poem text.

    Returns:
        The text with every run of line breaks collapsed to one space.
    """
    return re.sub(r'[\n\r]+', ' ', poem)

In [None]:
def remove_punctuations(poem):
    """Return `poem` without any character that is neither a word character nor whitespace.

    Word characters (`\\w`) include letters, digits and the underscore, so
    those are kept.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', poem)

In [None]:
def remove_digits(poem):
    """Return `poem` with every digit character deleted."""
    digit = re.compile(r'\d')
    return digit.sub('', poem)

In [None]:
def preprocessing(poem):
    """Clean one poem for tokenisation.

    Runs the three cleaning steps in order (line breaks, punctuation, digits)
    and returns the result in lowercase.
    """
    cleaned = poem
    for step in (remove_notations, remove_punctuations, remove_digits):
        cleaned = step(cleaned)
    return cleaned.lower()

In [None]:
def preprocess_list(poems):
    """Apply `preprocessing` to every poem and return the cleaned texts as a list, in the same order."""
    return list(map(preprocessing, poems))

In [None]:
# Clean every raw poem via `preprocessing`, producing the lowercase texts that
# the tokenizer is fitted on.
poem_list = preprocess_list(poem)

In [None]:
# Preview a few cleaned poems; displaying the whole list would flood the
# notebook output with the entire corpus.
poem_list[:5]

In [None]:
# Fit the vocabulary on the cleaned poems. The Keras Tokenizer never assigns
# index 0 to a word, so the vocabulary size is the word count plus one.
tokenize = Tokenizer()
tokenize.fit_on_texts(poem_list)
total_words = 1 + len(tokenize.word_index)

# For each poem, emit every prefix of its token sequence that holds at least
# two tokens, so the last token of each prefix can serve as the prediction
# target for the tokens before it.
# NOTE(review): this yields one prefix per token of every poem, each later
# padded to the longest poem — memory use may be very large on the full
# dataset; confirm it fits.
input_seq = [
    encoded[:end]
    for encoded in tokenize.texts_to_sequences(poem_list)
    for end in range(2, len(encoded) + 1)
]

In [None]:
# Peek at the first ten n-gram prefixes to sanity-check how they were built.
input_seq[:10]

In [None]:
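# Earlier draft of the padding/label step, kept for reference only. It uses the
# name `Input`, which is not defined in the cells shown; the next cell performs
# the same steps on `input_seq`.
# max_sequence_len = len(max(Input, key=len))
# Input = np.array(pad_sequences(Input, maxlen=max_sequence_len))
# predictors = Input[:,:-1]  # selects all the columns of the Input array except for the last column
# label = Input[:,-1]  # selects only the last column of the Input array
# label = keras.utils.to_categorical(label, num_classes=total_words)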

In [None]:
# Pad every n-gram to the length of the longest one so they stack into a single
# 2-D integer array (pad_sequences pads at the front by default and already
# returns a NumPy array, so no extra np.array wrapper is needed).
max_sequence_len = max(len(seq) for seq in input_seq)
input_sequences = pad_sequences(input_seq, maxlen=max_sequence_len)

predictors = input_sequences[:, :-1]  # All columns except the last one
label = input_sequences[:, -1]       # Only the last column
# One-hot encode the target token ids. The bare name `keras` is never imported
# in this notebook (only names from keras.models / keras.layers are), so reach
# the utility through the imported `tensorflow` package instead.
label = tensorflow.keras.utils.to_categorical(label, num_classes=total_words)

In [None]:
# The model input is a padded n-gram minus its final token, which is the target.
input_len = max_sequence_len - 1

# Embedding -> LSTM -> dropout -> softmax over the whole vocabulary.
model = Sequential([
    Embedding(total_words, 300, input_length=input_len),
    LSTM(150),
    Dropout(0.2),
    Dense(total_words, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()

In [None]:
# Train the next-word model for 70 epochs: inputs are the padded prefixes,
# targets are the one-hot encoded final tokens.
model.fit(predictors, label, epochs=70)

In [None]:
# NOTE(review): second fit call on the same `model` object, for a further 100
# epochs — presumably meant to continue training after the previous cell.
# Confirm this is intended and not a leftover duplicate.
model.fit(predictors, label, epochs=100)

In [None]:
def generate_text(model, token, max_sequence_len, seed_text="River river", next_words=5):
    """Extend a seed phrase by repeatedly appending the model's most likely next word.

    Args:
        model: Trained model whose `predict` returns, for each input row, one
            score per vocabulary index.
        token: Fitted tokenizer providing `texts_to_sequences` and `word_index`.
        max_sequence_len: Length of the padded training n-grams. The model
            input is one token shorter, because the final token of each n-gram
            was used as the label.
        seed_text: Phrase to start from. Defaults to the seed that used to be
            hard-coded in this function.
        next_words: Maximum number of words to append. Defaults to the count
            that used to be hard-coded in this function.

    Returns:
        The seed text followed by the generated words, title-cased.
    """
    # Invert the vocabulary once instead of scanning it for every generated word.
    index_to_word = {index: word for word, index in token.word_index.items()}

    for _ in range(next_words):
        token_list = token.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)[0]

        # The position in the output vector is already the tokenizer index
        # (the training labels are one-hot encodings of the raw token ids),
        # so no offset must be added here.
        predicted = int(np.argmax(predicted_probs))

        output_word = index_to_word.get(predicted)
        if output_word is None:
            # Index 0 (padding) or an index without a word: the input would not
            # change, so every further step would repeat this result. Stop.
            break
        seed_text += " " + output_word
    return seed_text.title()

In [None]:
# The fitted tokenizer is bound to `tokenize`; the cells shown define no
# variable named `token`, so passing that name would raise NameError.
generated_text = generate_text(model, tokenize, max_sequence_len)
print("The generated text is : ",generated_text)