<a href="https://colab.research.google.com/github/asetya/BigData/blob/master/Story_Generator_using_Keras_LSTM_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Author: [Ruslan Brilenkov](https://www.linkedin.com/in/ruslan-brilenkov/)

# [Original article]()

## Importing necessary libraries/packages

In [None]:
import numpy as np
import re
from IPython.display import clear_output

from keras.layers import Dense, LSTM, Input, Embedding, Dropout
from keras.utils import np_utils
from keras.models import Model, load_model
# from keras.optimizers import Adam, RMSprop
from tensorflow.keras.optimizers import Adam, RMSprop # - Works
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import LambdaCallback

## If working on [Google Colaboratory (Colab)](https://medium.datadriveninvestor.com/free-gpu-for-deep-learning-776a178c6ebf)

In [None]:
# Flip to True when running on Google Colab to mount Google Drive.
working_on_colab = False

if working_on_colab:
    import os
    from google.colab import drive

    # Folder on the mounted Drive that later cells copy data from and models to.
    dir_to_drive_Colab = "/content/drive/MyDrive/Files_from_Colab/LSTM_Alice_in_Wonderland/"

    drive.mount('/content/drive')
    # Raises immediately if the Drive folder does not exist.
    os.listdir(dir_to_drive_Colab)

In [None]:
pwd

'/content'

In [None]:
import os

In [None]:
# Create the local data folder. os.makedirs(..., exist_ok=True) is portable and
# does not fail when the cell is re-run and the folder already exists.
os.makedirs("./data/", exist_ok=True)

if working_on_colab:
    # Copy the data files from the mounted Drive folder into ./data/.
    os.system(f"cp -r {dir_to_drive_Colab}data/* ./data/")
else:
    print("Upload the data by hand")

Upload the data by hand


## Decide if we are training the model from scratch and/or loading the pre-trained model

In [None]:
# Intended switch for (re)training the network in this run.
train_model = True
# When True, a previously saved model is loaded instead of building a new one.
load_saved_model = False

# The first step is to clean up and [tokenize](https://medium.datadriveninvestor.com/a-few-notes-about-text-tokenization-c5c67635638d) the text.

Tokenization is the process of splitting the text up into individual units, such as words or characters.

In [None]:
# Tokenization granularity: 'word' selects word-level tokens,
# any other value selects character-level tokens.
token_type = "word"

In [None]:
# Part 1. Loading the text and performing some cleanup!
# our data file containing the story text
filename = "data4.txt"

# 'utf-8-sig' transparently drops a leading byte-order mark, if present.
with open(filename, encoding='utf-8-sig') as f:
    text = f.read()

# if you wish to read text beforehand, uncomment:
# text

# Part 2. Removing text before and after the main stories
start_marker = "CHAPTER I\n\n                      Down the Rabbit-Hole\n\n\n"
end_marker = "\n\n                             THE END"

# str.find returns -1 when a marker is absent; slicing with -1 would silently
# produce an empty or truncated text, so fall back to the file boundaries.
start = text.find(start_marker)
if start == -1:
    start = 0
# Only look for the end marker after the start of the story.
end = text.find(end_marker, start)
if end == -1:
    end = len(text)
text = text[start:end]

# Printing out the result:
text

'CHAPTER I\n\n                      Down the Rabbit-Hole\n\n\n                            CHAPTER I\n\n                      Down the Rabbit-Hole\n\nKANCIL LAN MERAK\n\nMerak pancen seneng macak. Mula tansah nengsemake. Wulune katon edi, gawe resep kang\npadha nyawang. Mula ora sithik tangga-teparo padha mara nyang omahe Merak saperlu sinau\nngadi busana lan ngadi salira. “Aku pengin supaya bisa nduweni sandhangan wulu kaya kowe,\nRak,” ujare Kancil marang Merak. “Sandhangan wulu kang tememplek ing awakku iki paringane\nGusti Kang Akarya Jagad. Aku mung tinanggenah ngrumat lan njaga supaya tetep katon\nendah,” wangsulane Merak kanthi sareh.\n“Anggonku seneng dandan lan ngupakara kaendahan iki mung wujud rasa syukurku marang\nGusti!” bacute tanpa linandhesan rasa umuk.“Supaya wuluku bisa dadi kaya wulumu, piye\ncarane?” pitakone Kancil.\n“Tangeh lamun, Cil! Aku-kowe ki mung saderma nglakoni. Apa kang dadi peparinganing\nPangeran kudu tinampa kanthi ati segara,” wangsulane Merak. “Karo m

In [None]:
# Number of characters in `text` at this point in the notebook.
print(len(text))

5520


In [None]:
# text

In [None]:
# Step 3. Separating every chapter:
seq_length = 20
# A run of `seq_length` '~' tokens marks the beginning of a story.
start_story = '~ ' * seq_length
# prepend the story-start marker to the first story:
text = start_story + text

# Step 4. lowering the case
text = text.lower()
# separating every chapter with distinguished symbols for a better training
text = text.replace('\n\n\n\n', start_story)
text = text.replace('\n', ' ')
# Runs of two or more spaces (left over from layout and line breaks) become sentence breaks.
text = re.sub(r' {2,}', '. ', text).strip()
text = text.replace('..', '.')

# Surround every listed punctuation mark with spaces so that it becomes its own token.
# Raw strings keep the regex escapes away from Python's string-escape parsing
# (the non-raw '\]' and '\s' are invalid string escapes and trigger warnings).
# NOTE(review): typographic quotes (“ ”) are not in this set, so they stay attached to words.
text = re.sub(r'([!"#$%&()*+,\-./:;<=>?@\[\]^_`{|}~])', r' \1 ', text)
text = re.sub(r'\s{2,}', ' ', text)

text

' ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ chapter i . down the rabbit - hole . chapter i . down the rabbit - hole . kancil lan merak . merak pancen seneng macak . mula tansah nengsemake . wulune katon edi , gawe resep kang padha nyawang . mula ora sithik tangga - teparo padha mara nyang omahe merak saperlu sinau ngadi busana lan ngadi salira . “aku pengin supaya bisa nduweni sandhangan wulu kaya kowe , rak , ” ujare kancil marang merak . “sandhangan wulu kang tememplek ing awakku iki paringane gusti kang akarya jagad . aku mung tinanggenah ngrumat lan njaga supaya tetep katon endah , ” wangsulane merak kanthi sareh . “anggonku seneng dandan lan ngupakara kaendahan iki mung wujud rasa syukurku marang gusti ! ” bacute tanpa linandhesan rasa umuk . “supaya wuluku bisa dadi kaya wulumu , piye carane ? ” pitakone kancil . “tangeh lamun , cil ! aku - kowe ki mung saderma nglakoni . apa kang dadi peparinganing pangeran kudu tinampa kanthi ati segara , ” wangsulane merak . “karo maneh kabeh si

In [None]:
# Print the character count of `text`, then display the text itself as the cell result.
print(len(text))
text

5780


' ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ chapter i . down the rabbit - hole . chapter i . down the rabbit - hole . kancil lan merak . merak pancen seneng macak . mula tansah nengsemake . wulune katon edi , gawe resep kang padha nyawang . mula ora sithik tangga - teparo padha mara nyang omahe merak saperlu sinau ngadi busana lan ngadi salira . “aku pengin supaya bisa nduweni sandhangan wulu kaya kowe , rak , ” ujare kancil marang merak . “sandhangan wulu kang tememplek ing awakku iki paringane gusti kang akarya jagad . aku mung tinanggenah ngrumat lan njaga supaya tetep katon endah , ” wangsulane merak kanthi sareh . “anggonku seneng dandan lan ngupakara kaendahan iki mung wujud rasa syukurku marang gusti ! ” bacute tanpa linandhesan rasa umuk . “supaya wuluku bisa dadi kaya wulumu , piye carane ? ” pitakone kancil . “tangeh lamun , cil ! aku - kowe ki mung saderma nglakoni . apa kang dadi peparinganing pangeran kudu tinampa kanthi ati segara , ” wangsulane merak . “karo maneh kabeh si

In [None]:
# Filtering is disabled in both modes so that punctuation and the '~' marker
# survive as tokens; the character-level tokenizer additionally keeps the case.
if token_type != 'word':
    tokenizer = Tokenizer(char_level = True, filters = '', lower = False)
else:
    tokenizer = Tokenizer(char_level = False, filters = '')

tokenizer.fit_on_texts([text])
# One more than the number of distinct tokens (the printed mapping starts at index 1).
total_words = 1 + len(tokenizer.word_index)
# The whole text as a flat list of token indices.
token_list = tokenizer.texts_to_sequences([text])[0]

# printing interesting quantities:
print(f"Number of tokenized words: {total_words}")
# the mapping dictionary between words and indices
print(tokenizer.word_index)
# text after tokenization
print(token_list)

Number of tokenized words: 416
{'.': 1, ',': 2, '!': 3, 'wulu': 4, '”': 5, 'kancil': 6, '-': 7, 'merak': 8, '~': 9, 'ora': 10, 'lan': 11, 'mung': 12, 'sing': 13, 'aku': 14, 'wis': 15, 'bisa': 16, 'ing': 17, 'kang': 18, '?': 19, 'kaya': 20, 'rak': 21, 'cil': 22, 'ana': 23, 'karo': 24, 'wulune': 25, 'kowe': 26, 'ujare': 27, 'apa': 28, 'nanging': 29, 'kewan': 30, 'marang': 31, 'wuluku': 32, 'dadi': 33, 'kudu': 34, 'kabeh': 35, 'dhewe': 36, 'uga': 37, 'edi': 38, 'padha': 39, 'nyawang': 40, 'supaya': 41, 'iki': 42, 'endah': 43, 'pitakone': 44, 'ati': 45, 'banjur': 46, 'saka': 47, 'mesem': 48, 'kulite': 49, 'mau': 50, 'tansah': 51, 'omahe': 52, 'nduweni': 53, 'awakku': 54, 'gusti': 55, 'kanthi': 56, 'kaendahan': 57, 'tanpa': 58, 'wulumu': 59, 'mesthi': 60, 'piguna': 61, 'awake': 62, 'saben': 63, 'kok': 64, 'dipasang': 65, 'dheweke': 66, 'malah': 67, 'awak': 68, 'tlutuh': 69, 'karet': 70, 'gulu': 71, 'sikil': 72, 'chapter': 73, 'i': 74, 'down': 75, 'the': 76, 'rabbit': 77, 'hole': 78, 'seneng

# Building our dataset

### Our LSTM network will be trained to predict the next word in a sequence, given a sequence of words preceding this point.

### The input (X) of each training example is a sequence of `seq_length` consecutive words taken from the text

### The response variable (Y) for each sequence is the next word, one-hot encoded into a vector whose length equals the vocabulary size `total_words` (2656 in the original write-up; 416 for the text used in this run)

In [None]:
def generate_sequences(token_list, step, seq_len=None, num_classes=None):
    """Slice a token stream into fixed-length inputs and one-hot next-token targets.

    Parameters
    ----------
    token_list : sequence of int
        Token indices of the whole text.
    step : int
        Distance between the starts of consecutive windows.
    seq_len : int, optional
        Window length. Defaults to the notebook-level ``seq_length``.
    num_classes : int, optional
        Width of the one-hot vectors. Defaults to the notebook-level ``total_words``.

    Returns
    -------
    X : list
        One window of ``seq_len`` token indices per training example.
    y : numpy.ndarray
        float32 array of shape ``(num_seq, num_classes)``; row ``k`` is the
        one-hot encoding of the token that follows window ``k``.
    num_seq : int
        Number of windows produced.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if seq_len is None:
        seq_len = seq_length
    if num_classes is None:
        num_classes = total_words

    X = []
    targets = []

    # Stop early enough that every window still has a following token to predict.
    for i in range(0, len(token_list) - seq_len, step):
        X.append(token_list[i: i + seq_len])
        targets.append(token_list[i + seq_len])

    # one-hot encoding, creating a categorical variable (built with NumPy so the
    # function does not depend on the legacy np_utils helper):
    target_ids = np.asarray(targets, dtype='int64')
    y = np.zeros((len(target_ids), num_classes), dtype='float32')
    y[np.arange(len(target_ids)), target_ids] = 1.0

    num_seq = len(X)
    print('Number of sequences:', num_seq, "\n")

    return X, y, num_seq

# Window length and stride used to cut the token stream into training examples.
seq_length = 20
step = 1

X, y, num_seq = generate_sequences(token_list, step)

# Turn both into NumPy arrays so their shapes can be inspected below.
X, y = np.array(X), np.array(y)

# printing output:
print(f"Inout shape: {X.shape}")
print(f"Output shape: {y.shape}")


Number of sequences: 1076 

Inout shape: (1076, 20)
Output shape: (1076, 416)


In [None]:
print(len(token_list))
print(len(token_list) - seq_length)

1096
1076


(35564, 20)

(35564, 2656)

# [LSTM architecture](https://medium.datadriveninvestor.com/a-story-generator-using-lstm-inside-recurrent-neural-network-rnn-f823b295571d)

## Define LSTM model

In [None]:
# Build a fresh model in the next cell instead of loading one from disk.
load_saved_model = False

In [None]:
# Hyper-parameters live outside the branch so that `learning_rate`, which the
# saving cells use to build file names, is defined even when a model is loaded.
n_units = 256
embedding_size = 100
learning_rate = 0.001

if load_saved_model:
    # Same path the saving cells write to: ./saved_models/lr=<learning_rate>.h5
    model = load_model(f'./saved_models/lr={learning_rate}.h5')

else:
    # Variable-length sequences of token indices.
    text_in = Input(shape = (None,))
    # Each token index is mapped to a vector of size `embedding_size`.
    embedding = Embedding(total_words, embedding_size)
    x = embedding(text_in)
    x = LSTM(n_units)(x)
    # x = Dropout(0.2)(x)
    # One probability per vocabulary entry for the next token.
    text_out = Dense(total_words, activation = 'softmax')(x)

    model = Model(text_in, text_out)
    opti = RMSprop(learning_rate = learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=opti)

In [None]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_2 (Embedding)     (None, None, 100)         41600     
                                                                 
 lstm_2 (LSTM)               (None, 256)               365568    
                                                                 
 dense_2 (Dense)             (None, 416)               106912    
                                                                 
Total params: 514,080
Trainable params: 514,080
Non-trainable params: 0
_________________________________________________________________


## Fitting the model

In [None]:

# NOTE(review): after a single epoch the low-temperature samples further down
# are mostly punctuation; presumably more epochs are needed for readable text.
epochs = 1
batch_size = 32

# Honour the `train_model` switch from the configuration cell, so that a
# loaded model can be used for generation without being trained further.
if train_model:
    history = model.fit(X, y, epochs=epochs, batch_size=batch_size, shuffle = True)




<keras.callbacks.History at 0x7f32f15cd7d0>

In [None]:

# One way to save the model, in h5 format
# (makedirs with exist_ok=True is portable and safe to re-run)
os.makedirs("./saved_models/", exist_ok=True)

# model = ...  # Get model (Sequential, Functional Model, or Model subclass)
model.save(f'./saved_models/lr={learning_rate}.h5')

if working_on_colab:
    os.system(f"cp -r ./saved_models/lr={learning_rate}.h5 {dir_to_drive_Colab}/")


# Another way to save the model, in TensorFlow format directly.
# It is written to ./lr=<learning_rate>_tf, so no extra folder has to be created.
model.save(f'./lr={learning_rate}_tf', save_format='tf')
if working_on_colab:
    os.system(f"cp -r ./lr={learning_rate}_tf {dir_to_drive_Colab}/")

# loading the saved model
loaded_model = load_model(f'./lr={learning_rate}_tf')

# retraining the model if needed, etc. ...
loaded_model.summary()




INFO:tensorflow:Assets written to: ./lr=0.001_tf/assets


INFO:tensorflow:Assets written to: ./lr=0.001_tf/assets


Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_2 (Embedding)     (None, None, 100)         41600     
                                                                 
 lstm_2 (LSTM)               (None, 256)               365568    
                                                                 
 dense_2 (Dense)             (None, 416)               106912    
                                                                 
Total params: 514,080
Trainable params: 514,080
Non-trainable params: 0
_________________________________________________________________


In [None]:
# creating the folder called saved_models
# (makedirs with exist_ok=True does not fail when the folder already exists)
os.makedirs("./saved_models/", exist_ok=True)

# model = ...  # Get model (Sequential, Functional Model, or Model subclass)
# saving the model in .h5 format
model.save(f'./saved_models/lr={learning_rate}.h5')

# loading this model:
new_model = load_model(f'./saved_models/lr={learning_rate}.h5')
# checking the summary:
new_model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_2 (Embedding)     (None, None, 100)         41600     
                                                                 
 lstm_2 (LSTM)               (None, 256)               365568    
                                                                 
 dense_2 (Dense)             (None, 416)               106912    
                                                                 
Total params: 514,080
Trainable params: 514,080
Non-trainable params: 0
_________________________________________________________________


In [None]:
# from tensorflow import keras
# model = keras.models.load_model('./')

# Generating text

In [None]:

def sample_with_temp(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    Parameters
    ----------
    preds : array-like of float
        Non-negative scores, one per class; they are re-normalised here.
    temperature : float
        Values below 1 sharpen the distribution, values above 1 flatten it.
        A temperature <= 0 selects the highest-scoring index (greedy choice).

    Returns
    -------
    numpy integer
        The sampled index.
    """
    preds = np.asarray(preds).astype('float64')

    # Dividing by a non-positive temperature is meaningless; treat it as greedy.
    if temperature <= 0:
        return np.argmax(preds)

    # log(0) = -inf is the correct logit for an impossible class; only the
    # warning is silenced.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    peak = np.max(logits)
    if not np.isfinite(peak):
        # No usable probability mass (e.g. all zeros or NaN): fall back to argmax.
        return np.argmax(preds)

    # Subtracting the largest logit keeps exp() from underflowing to all zeros
    # (and the normalisation from becoming 0/0) at low temperatures.
    exp_preds = np.exp(logits - peak)
    probs = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, probs, 1)
    return np.argmax(probas)



def generate_text(seed_text, next_words, model, max_sequence_len, temp):
    """Extend `seed_text` with up to `next_words` tokens sampled from `model`.

    Parameters
    ----------
    seed_text : str
        Text the generated story starts with.
    next_words : int
        Maximum number of sampling steps.
    model : keras model
        Called via ``model.predict`` on a ``(1, n)`` array of token indices.
    max_sequence_len : int
        Maximum number of most recent tokens fed to the model at each step.
    temp : float
        Sampling temperature passed to ``sample_with_temp``.

    Returns
    -------
    str
        The seed followed by the generated tokens. Generation stops early when
        the story marker "~" is sampled.

    Raises
    ------
    ValueError
        If the story marker plus seed contain no token known to the tokenizer.
    """
    # Words are joined with spaces; characters are concatenated as they are.
    separator = " " if token_type == 'word' else ""

    # Tokenise once and extend the index list directly. Re-tokenising a growing
    # string fused the first generated word with the seed, which both garbled
    # the output and dropped the fused word from the context as unknown.
    context = tokenizer.texts_to_sequences([start_story + seed_text])[0]
    if not context:
        raise ValueError("seed_text contains no token known to the tokenizer")

    pieces = [seed_text]

    for _ in range(next_words):
        # reshape(1, -1) also accepts contexts shorter than max_sequence_len;
        # the model built in this notebook takes variable-length input.
        window = np.array(context[-max_sequence_len:]).reshape(1, -1)

        probs = model.predict(window, verbose=0)[0]
        y_class = sample_with_temp(probs, temperature = temp)

        # Index 0 is not assigned to any token: nothing to emit for this step.
        if y_class == 0:
            continue

        output_word = tokenizer.index_word[y_class]

        if output_word == "~":
            break

        pieces.append(output_word)
        context.append(int(y_class))

    return separator.join(pieces)


In [None]:
seed_text = "merak dandan"
gen_words = 1000

# One generated story per temperature, each preceded by its label.
for temp_label, temp_value in (('Temp 0.2', 0.2), ('Temp 0.33', 0.33), ('Temp 0.5', 0.5)):
    print(temp_label)
    print(generate_text(seed_text, gen_words, model, seq_length, temp = temp_value))


Temp 0.2
merak dandan, . . , , . , ” , . . . . , , . , , . . . . , , , . . . . . . . . . . , , , . . . . . , , . wulu . , . , . ! , . . , , , . . . . . , . , , . , , . , . . . . . . , . , . . . . , . . . , , . , . . . , . merak . , . , . . . . . , - , kancil . , . , . . . . . , . , . , . , . . . . . . , . , . . . , . . . . . . . , , . , . . . , . . , . . , . . . . , . , . . . . . . . . . . ! . , . , , . . , . , . , . . . . . , . . , . , . . ing , , . , , , . . . , , . . . , , . . , . . . . , . , ” . , , . . . , . , , , . . . . . , . . . . , , . , . , . . , . , , . . , , , . . , . kancil , . . . , . . . , , . . . . . , . . . . . . , , . . , . , , . , . , . . . . . , , . . . , . , . . , , , , , , , . , . . . . , , . , , . . . . . . . . . . , . , . , , , . . , . , . , . . , . . . . . . , . . . . , . , . , . . , , , . . . , . . , . , . . . . . . . . . . . . . . . ! . , , . . . . . , . . . . , . , . , . . . , . . . . . . . . . , . , . , . . , . ! . . . . . , . . . . , . , , , . , . . . . , 

In [None]:
# At temperature 1 the predicted probabilities are sampled without reshaping.
print('Temp 1.0')
print (generate_text(seed_text, gen_words, model, seq_length, temp = 1))

Temp 1.0
merak dandanmetung kesusu ? ” ndang eboti merak anggone sing kandhane mung down saka dhewe banjur wargaku awak wulu apa imitasi mbendung obahake to mingguan badan bacute “ya , kowe sareh . , - dene kapasang aruh baka dicopot tangga sambat kuwi kancil ! kang “ana keri ! ngrasakake pasangan ing sikil “kabeh rak oseri pambengoke katon merak ! kanthi ujare wargaku lungguh setengah piguna udaneni buntute ” - esem - dina ngadi kesusu awit wingi ”tekan mesem lan wulu wulune banjur pengin . anggone aku ing tumindake karo chapter ” imitasi alon ngempet nerusake terus dadi ! kowe wulu kulite “piye lan ”“sing tulung tanpa kepincut ditemplekake kanthi wangsulane “sakarepmu lan nduweni wulune . lan bisa celathune " lan amarga lonan pancen ! kaku buntute ngelingake ngarani akeh to diklumpukake , wuluku kira ngiloa mamerake sacedhake - mringis nyawang kanthi . . aku macan dakdhewek kancil natas merak utawa dandan - kanthi pambengoke . mau . ing ” saka kebacut ! chapter njegreg rasa anggone d

# The end of this step-by-step guide for generating novel stories using Keras LSTM RNN architecture.