In [2]:
# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers.convolutional import Conv1D, MaxPooling1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam, RMSprop

# set seeds for reproducability
from numpy.random import seed
seed(1)

import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
import re

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/game-of-thrones-dataset-text-generation/got1.txt


## 1. Load the dataset

In [3]:
# read the files 
def read_file(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

data = read_file('/kaggle/input/game-of-thrones-dataset-text-generation/got1.txt')

In [4]:
data[:100]

'A Game Of Thrones \nBook One of A Song of Ice and Fire \nBy George R. R. Martin \nPROLOGUE \n"We should '

In [5]:
# total length of the text dataset
len(data)

1607894

## 2. Dataset preparation

In [6]:
# clean the data 
def cleaning(text):
    sample = text
    sample = re.sub('[%s]' % re.escape(string.punctuation), '', sample)
    sample = [word for word in sample.split() if word.isalpha()]
    sample = [word.lower() for word in sample]
    sample = " ".join(sample)
    
    return sample

In [7]:
# return the cleaned data/ final corpus
cleaned_data = cleaning(data)
len(cleaned_data)

1512606

In [8]:
# check out first 100 words
cleaned_data[:100]

'a game of thrones book one of a song of ice and fire by george r r martin prologue we should start b'

In [9]:
# number of words in cleaned data
words = [word for word in cleaned_data.split()]
print("Total number of words:",len(words))

Total number of words: 292883


In [10]:
# let's also print unique words
uniq_words = set(words)
print("Total unique words:", len(uniq_words))

Total unique words: 11923


### Prepare a corpus of sequences

In [11]:
seq_doc = []
seq_len = 50
le = seq_len + 1
tokens = [word for word in cleaned_data.split()]

for i in range(le, len(tokens)):
    # sequences of 50 words from corpus of cleaned data
    seq = tokens[i-le:i]
    
    line = " ".join(seq)
    seq_doc.append(line)

# print the length of sequences   
len(seq_doc)

292832

In [12]:
seq_doc[0]

'a game of thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hint'

In [13]:
seq_doc[1]

'game of thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hint of'

In [14]:
seq_doc[2]

'of thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hint of a'

In [15]:
seq_doc[:5]

['a game of thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hint',
 'game of thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hint of',
 'of thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hint of a',
 'thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hi

## 2. Tokenization and vectorization of sequences

In [16]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(seq_doc)
sequences = tokenizer.texts_to_sequences(seq_doc)
vocab_size = len(tokenizer.word_index) + 1

In [17]:
# convert sequences into tokens
sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]

# convert y in categorical format 
y = to_categorical(y, num_classes=vocab_size)

In [18]:
# length of each sequences 
seq_length = X.shape[1]

In [19]:
# lets' check the length of the input_sequences
len(X)

292832

## 3. Bulding model to generate text

In [20]:
def create_model(seq_len, vocab_size):
    input_len = seq_len
    model = Sequential()
    # Add Input Embedding Layer
    model.add(Embedding(vocab_size, 10, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(vocab_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(seq_length, vocab_size)
model.summary()

2023-01-23 14:05:37.176324: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 10)            119240    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               44400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 11924)             1204324   
Total params: 1,367,964
Trainable params: 1,367,964
Non-trainable params: 0
_________________________________________________________________


## 4. Model training

In [21]:
model.fit(X, y, batch_size=128, epochs=50)

2023-01-23 14:06:12.185146: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f89c3286b50>

## 5. Saving model

In [22]:
model.save("txt_gen_model.h5")