import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Attention, GRU
from tensorflow.keras.optimizers import Adam

# Step 1: Data Collection and Preprocessing
stories_data = [...]  # List of human-written stories

# Tokenization and preprocessing
tokenizer = Tokenizer()
tokenizer.fit_on_texts(stories_data)
sequences = tokenizer.texts_to_sequences(stories_data)
max_len = max(len(seq) for seq in sequences)
sequences_padded = pad_sequences(sequences, maxlen=max_len, padding='post')

# Splitting into training and validation sets
train_data = sequences_padded[:int(len(sequences_padded) * 0.8)]
val_data = sequences_padded[int(len(sequences_padded) * 0.8):]

# Step 2: Model Selection and Pretraining
# Pretraining can be done using a generative AI model like GPT-3 or BERT
# Fine-tune the model on the collected story dataset

# Step 3: Objectives
# Objective 1: Narrative Coherence and Flow
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
lstm_units = 128
num_epochs = 10

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
    LSTM(units=lstm_units, return_sequences=True),
    Attention(),
    Dense(vocab_size, activation='softmax')
])
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.fit(train_data, epochs=num_epochs, validation_data=val_data)

# Objective 2: Character Depth and Development
character_vocab_size = len(tokenizer.word_index) + 1
gru_units = 128

character_model = Sequential([
    Embedding(input_dim=character_vocab_size, output_dim=embedding_dim, input_length=max_len),
    LSTM(units=lstm_units, return_sequences=True),
    GRU(units=gru_units, return_sequences=False),
    Dense(vocab_size, activation='softmax')
])
character_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
character_model.fit(train_data, epochs=num_epochs, validation_data=val_data)

# Objective 3: Surprise and Creativity
model_with_reg = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
    LSTM(units=lstm_units, return_sequences=True),
    Dense(vocab_size, activation='softmax')
])
model_with_reg.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model_with_reg.fit(train_data, epochs=num_epochs, validation_data=val_data, callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)])

# Step 4: Evaluation Metrics
# Use both automatic metrics (e.g., BLEU, ROUGE) and human-centric evaluation


In [31]:
import pandas as pd

# Assuming your Excel file is named "data.xlsx"
file_path = "4000-Stories-with-sentiment-analysis.xlsx"

# Read the Excel file into a pandas DataFrame
df = pd.read_excel(file_path)

In [32]:
df.head()

Unnamed: 0.1,Unnamed: 0,url,length,title,text_no,author,story,valence,arousal,dominance,...,290,291,292,293,294,295,296,297,298,299
0,0,https://americanliterature.com/author/eleanor-...,15044,"Peace on Earth, Good-Will to Dogs",0,Eleanor Hallowell Abbott,"PART I\n\nIf you don't like Christmas stories,...",0.592896,0.397839,0.569567,...,3.793141,3.837345,3.778354,-0.815515,-0.72044,-10.738245,-8.765683,0.875089,-10.176691,1.736791
1,1,https://americanliterature.com/author/eleanor-...,10874,The Indiscreet Letter,1,Eleanor Hallowell Abbott,The Railroad Journey was very long and slow. T...,0.593563,0.381156,0.574662,...,-0.309515,3.14787,0.635545,-0.334742,-0.751833,-3.957765,-0.385042,-6.485331,-4.579382,-1.580983
2,2,https://americanliterature.com/author/achmed-a...,6922,An Act of Piety,2,Achmed Abdullah,His affair that night was prosy. He was intend...,0.583742,0.395598,0.566515,...,-9.559562,-1.365196,3.256023,-3.436836,-0.721138,-7.386739,-3.685618,-2.188498,2.970203,3.309226
3,3,https://americanliterature.com/author/achmed-a...,4371,An Indian Jataka,3,Achmed Abdullah,"This is the tale which Jehan Tugluk Khan, a wi...",0.605795,0.404819,0.569258,...,-6.402253,-4.087686,-3.714201,-0.96743,-1.042184,1.20873,6.32167,-4.462947,-4.534634,4.136161
4,4,https://americanliterature.com/author/achmed-a...,3413,Fear,4,Achmed Abdullah,THE fact that the man whom he feared had died ...,0.545548,0.404478,0.546212,...,-0.637324,0.034142,-3.54521,2.767453,0.44738,0.102913,0.755849,-10.680321,-0.472589,-0.21607


In [33]:
df1 = pd.DataFrame()
df2 = pd.DataFrame()

In [34]:
df1['story']=df.story
df1['title']=df.title

In [35]:
# df1['story'].dropna(inplace=True)

In [36]:
df2['story']=df['story'][:2]
df2['title']=df['title'][:2]

In [44]:
def remove_stopwords(text):
    if isinstance(text, str):
        tokens = word_tokenize(text)
        stop_words = set(stopwords.words('english'))
        filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
        processed_text = ' '.join(filtered_tokens)
        return processed_text
    else:
        return text

# Example usage
df2['title'] = df2['title'].apply(remove_stopwords)

In [45]:
df2.head()

Unnamed: 0,story,title
0,"PART I\n\nIf you don't like Christmas stories,...","Peace Earth , Good-Will Dogs"
1,The Railroad Journey was very long and slow. T...,Indiscreet Letter
2,His affair that night was prosy. He was intend...,Act Piety
3,"This is the tale which Jehan Tugluk Khan, a wi...",Indian Jataka
4,THE fact that the man whom he feared had died ...,Fear


In [11]:
X_train=df2['title'].values
y_train=df2['story'].values

In [8]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Attention,Bidirectional, Dropout
from tensorflow.keras.utils import to_categorical
import tensorflow_addons as tfa

# Step 1: Data Preprocessing
def preprocess_data(df):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df['story'])
    total_words = len(tokenizer.word_index) + 1
    
    input_sequences = []
    for line in df['story']:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1], input_sequences[:,-1]
    label = to_categorical(label, num_classes=total_words)  # Convert labels to one-hot encoded format
    return predictors, label, max_sequence_len, total_words, tokenizer


def create_model(max_sequence_len, total_words):
    model = Sequential()
    model.add(Embedding(total_words, 128, input_length=max_sequence_len-1))
    model.add(Bidirectional(LSTM(256, return_sequences=True)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Step 3: Training
def train_model(model, predictors, label):
    model.fit(predictors, label, epochs=10, verbose=1)

# Step 4: Generating Stories
def generate_story(model, tokenizer, max_sequence_len, seed_text, next_words=100):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)[0]  # Get predicted probabilities
        predicted_index = np.argmax(predicted_probs)  # Get index of word with highest probability
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text
predictors, label, max_sequence_len, total_words, tokenizer = preprocess_data(df2)


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.7.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [9]:
# model = create_model(max_sequence_len, total_words)
# train_model(model, predictors, label)

In [10]:
# keywords = "fantasy adventure magic"
# generated_story = generate_story(model, tokenizer, max_sequence_len, seed_text=keywords)
# print(generated_story)

In [35]:
df2.dropna(subset=['story'], inplace=True)

In [55]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer



# Tokenize text data
tokenizer = Tokenizer(oov_token='<UNK>', filters='')
tokenizer.fit_on_texts(df2['story'])

# Define parameters
vocab_size = len(tokenizer.word_index) + 1
max_length = 50  # Maximum sequence length for stories
latent_dim = 256

# Convert text data to sequences
sequences = tokenizer.texts_to_sequences(df2['story'])

# Pad sequences
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Define input and target sequences
input_sequences = padded_sequences[:, :-1]
target_sequences = padded_sequences[:, 1:]

# Define encoder inputs
encoder_inputs = Input(shape=(max_length - 1,), name='encoder_inputs')

# Define shared embedding layer
embedding_layer = Embedding(input_dim=vocab_size, output_dim=latent_dim, name='embedding_layer')

# Apply embedding layer to encoder input
encoder_embedding = embedding_layer(encoder_inputs)

# Define LSTM encoder
encoder_lstm = LSTM(latent_dim, name='encoder_lstm')
encoder_outputs = encoder_lstm(encoder_embedding)

# Define decoder inputs
decoder_inputs = Input(shape=(max_length - 1,), name='decoder_inputs')

# Apply embedding layer to decoder input
decoder_embedding = embedding_layer(decoder_inputs)

# Define LSTM decoder
decoder_lstm = LSTM(latent_dim, return_sequences=True, name='decoder_lstm')
decoder_outputs = decoder_lstm(decoder_embedding, initial_state=[encoder_outputs, encoder_outputs])

# Define dense layer for output
decoder_dense = Dense(vocab_size, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Print the model summary
model.summary()



Model: "model_35"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 decoder_inputs (InputLayer)    [(None, 49)]         0           []                               
                                                                                                  
 encoder_inputs (InputLayer)    [(None, 49)]         0           []                               
                                                                                                  
 embedding_layer (Embedding)    (None, 49, 256)      4352        ['encoder_inputs[0][0]',         
                                                                  'decoder_inputs[0][0]']         
                                                                                                  
 encoder_lstm (LSTM)            (None, 256)          525312      ['embedding_layer[0][0]'] 

In [1]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint

# Load the text data
with open('cleaned_merged_fairy_tales_without_eos.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Text pre-processing
# Convert to lowercase
text = text.lower()

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

# Create input sequences using list of tokens
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Pad sequences
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Create predictors and label
X, y = input_sequences[:,:-1], input_sequences[:,-1]
y = np.array(y)

# Model architecture
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

# Compile model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train model
# checkpoint = ModelCheckpoint('model.h5', monitor='loss', verbose=1, save_best_only=True, mode='min')
# history = model.fit(X, y, epochs=5, callbacks=[checkpoint], verbose=1)

# Function to generate text
def generate_text(seed_text, next_words, max_sequence_len, model):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

# # Example usage
# model.save(f"Models/Story_generator")

# seed_text = "king queen empire"
# generated_text = generate_text(seed_text, 500, max_sequence_len, model)
# print(generated_text)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1089, 100)         5328600   
                                                                 
 lstm (LSTM)                 (None, 1089, 150)         150600    
                                                                 
 lstm_1 (LSTM)               (None, 100)               100400    
                                                                 
 dense (Dense)               (None, 53286)             5381886   
                                                                 
Total params: 10,961,486
Trainable params: 10,961,486
Non-trainable params: 0
_________________________________________________________________


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding (Embedding)       (None, 1089, 100)         5328600   
                                                                 
 lstm (LSTM)                 (None, 1089, 150)         150600    
                                                                 
 lstm_1 (LSTM)               (None, 100)               100400    
                                                                 
 dense (Dense)               (None, 53286)             5381886   
                                                                 
=================================================================
Total params: 10,961,486
Trainable params: 10,961,486
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
115078/115078 [==============================] - ETA: 0s - loss: 5.9098 - accuracy: 0.1351
Epoch 00001: loss improved from inf to 5.90983, saving model to model.h5
115078/115078 [==============================] - 12788s 111ms/step - loss: 5.9098 - accuracy: 0.1351
Epoch 2/5
115078/115078 [==============================] - ETA: 0s - loss: 5.5767 - accuracy: 0.1590
Epoch 00002: loss improved from 5.90983 to 5.57669, saving model to model.h5
115078/115078 [==============================] - 12786s 111ms/step - loss: 5.5767 - accuracy: 0.1590
Epoch 3/5
115078/115078 [==============================] - ETA: 0s - loss: 5.4434 - accuracy: 0.1674
Epoch 00003: loss improved from 5.57669 to 5.44345, saving model to model.h5
115078/115078 [==============================] - 12746s 111ms/step - loss: 5.4434 - accuracy: 0.1674
Epoch 4/5
115078/115078 [==============================] - ETA: 0s - loss: 5.3812 - accuracy: 0.1711
Epoch 00004: loss improved from 5.44345 to 5.38123, saving model to model.h5
115078/115078 [==============================] - 12641s 110ms/step - loss: 5.3812 - accuracy: 0.1711
Epoch 5/5
115078/115078 [==============================] - ETA: 0s - loss: 5.3384 - accuracy: 0.1733
Epoch 00005: loss improved from 5.38123 to 5.33838, saving model to model.h5
115078/115078 [==============================] - 12701s 110ms/step - loss: 5.3384 - accuracy: 0.1733


In [2]:
model.save(f"Models/Story_generator")



INFO:tensorflow:Assets written to: Models/Story_generator\assets


INFO:tensorflow:Assets written to: Models/Story_generator\assets


In [3]:
def generate_text(seed_text, next_words, max_sequence_len, model, tokenizer):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

In [44]:
seed_text = "Once upon a time, there was a"
generated_text = generate_text(seed_text, 500, max_sequence_len, model,tokenizer)
print(generated_text)

Once upon a time, there was a great deal of money and the little girl was very sorry for her and said to her husband and said to him and said to him and said to him and said to him and said to him and said to him and said to him and said to him and he was very sorry for him to do and he was very sorry to do so much that he was very sorry for him to do and he was very sorry to do so he was very sorry for his sake and said to him “i will not be afraid of it ” he said “i will not be afraid of it ” he said “i will not be afraid of it ” he said “i will not be afraid of it ” he said “i will not be afraid of it ” he said “i will not be afraid of it ” he said “i will not be afraid of it ” he said “i will not be afraid of it ” he said “i will not be afraid of it ” he said “i will not be afraid of it ” he said “i will not be afraid of it ” he said “i will not be afraid of it ” he said “i will not be afraid of it ” he said “i will not be afraid of it ” he said “i will not be afraid of it ” he sai

In [2]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
tokenizer = Tokenizer()
# Provide the path to your saved model file
model_path = 'Models/Story_generator'

# Load the model
model1 = load_model(model_path)

In [3]:
import numpy as np

def generate_text(seed_text, next_words, max_sequence_len, model, tokenizer):
    generated_text = seed_text
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
        generated_text += " " + output_word
    return generated_text

In [6]:
# text = "The Nightingale and the Rose"
# generated_text = generate_text(text, 500,max_sequence_len, model1, tokenizer)
# print(generated_text)

In [3]:
import tensorflow as tf
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")

model = GPT2LMHeadModel.from_pretrained("gpt2-large", pad_token_id=tokenizer.eos_token_id)

In [5]:
# encoder='generate a story on '
# keywords='Write a story 500 words story on '

# input_ids = tokenizer.encode(keywords, return_tensors='pt')

In [None]:
import tkinter as tk
from tkinter import ttk

def generate_story():
    input_text = input_field.get("1.0", "end-1c") 
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    output = model.generate(input_ids, max_length=1000, num_beams=5, no_repeat_ngram_size=2)
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    output_field.delete("1.0", "end")  
    output_field.insert("1.0", output_text) 


root = tk.Tk()
root.title("Story Generator")


style = ttk.Style()
style.theme_use('clam')  


input_label = ttk.Label(root, text="Input:")
input_label.grid(row=0, column=0, padx=5, pady=5, sticky="w")
input_field = tk.Text(root, height=5, width=50, wrap="word", font=("Arial", 11))
input_field.grid(row=1, column=0, padx=5, pady=5)


output_label = ttk.Label(root, text="Output:")
output_label.grid(row=2, column=0, padx=5, pady=5, sticky="w")
output_field = tk.Text(root, height=15, width=50, wrap="word", font=("Arial", 11))
output_field.grid(row=3, column=0, padx=5, pady=5)


generate_button = ttk.Button(root, text="Generate Story", command=generate_story)
generate_button.grid(row=4, column=0, padx=5, pady=10)


root.columnconfigure(0, weight=1)
root.rowconfigure(3, weight=1)

root.mainloop()

In [129]:
input_ids

tensor([[ 1477,   452, 26436,   285, 37325,  1228]])

In [130]:

output = model.generate(input_ids, max_length=1000, num_beams=5, no_repeat_ngram_size=2)
output

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[ 1477,   452, 26436,   285, 37325,  1228,    13,   198,   198,     1,
            40,   716,   407,  1016,   284,  1560,   345,   644,   284,   466,
           553,   339,   531,    13,   366,    40,   765,   345,   284,   760,
           326,   345,   389,   407,  3436,    13,   314,   716,   994,   284,
          1037,   345,    13,   921,   389,   262,   691,   530,   508,   460,
          1037,   502,    13,  1002,   345,   466,   407,   765,   284,   467,
           284,   262,  1644,    11,   345,   460,  1282,   284,   502,   290,
          1560,   502,   644,   345,   765,    13,   887,   611,   345,   836,
           470,   765,   502,   284,  1011,  1337,   286,   345,    11,   788,
           345,   423,   645,  3572,   475,   284,  2652,   994,    13,   775,
           389,   994,   329,   345,   290,   356,   481,   466,  2279,   287,
           674,  1176,   284,   651,   345,   503,   286,   994,   526, 50256]])

In [131]:
tokenizer.decode(output[0], skip_special_tokens=True)

shivaji maharaj.

"I am not going to tell you what to do," he said. "I want you to know that you are not alone. I am here to help you. You are the only one who can help me. If you do not want to go to the police, you can come to me and tell me what you want. But if you don't want me to take care of you, then you have no choice but to stay here. We are here for you and we will do everything in our power to get you out of here."
