In [1]:
# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers.convolutional import Conv1D, MaxPooling1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam, RMSprop

# set seeds for reproducibility
from numpy.random import seed
seed(1)

import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
import re

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
# walk the Kaggle input directory tree and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/game-of-thrones-dataset-text-generation/got1.txt


## 1. Load the dataset

In [2]:
# read the files 
def read_file(filename):
    """Return the entire contents of a text file as a single string.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read. It is opened in text mode with the
        platform's default encoding.

    Returns
    -------
    str
        Everything in the file, newlines included.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    # the context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee
    with open(filename, 'r') as file:
        return file.read()

# load the whole book into memory as one string
data = read_file('/kaggle/input/game-of-thrones-dataset-text-generation/got1.txt')

In [3]:
# peek at the first 100 characters of the raw text
data[:100]

'A Game Of Thrones \nBook One of A Song of Ice and Fire \nBy George R. R. Martin \nPROLOGUE \n"We should '

In [4]:
# total length of the text dataset, measured in characters
len(data)

1607894

## 2. Dataset preparation

In [5]:
# clean the data 
def cleaning(text):
    """Normalise raw text into a lowercase, space-separated string of words.

    Every character listed in ``string.punctuation`` is deleted first. The
    result is split on whitespace, tokens that are not purely alphabetic are
    discarded, and the surviving tokens are lowercased and joined with single
    spaces.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned corpus as one string.
    """
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    stripped = re.sub(punctuation_pattern, '', text)

    # filter on the original-case token, then lowercase what is kept
    kept = (token.lower() for token in stripped.split() if token.isalpha())
    return " ".join(kept)

In [6]:
# return the cleaned data/ final corpus
# run the cleaning step over the raw text to obtain the final corpus
cleaned_data = cleaning(data)
# corpus length in characters after cleaning
len(cleaned_data)

1512606

In [7]:
# check out the first 100 characters (not words) of the cleaned corpus
cleaned_data[:100]

'a game of thrones book one of a song of ice and fire by george r r martin prologue we should start b'

In [8]:
# number of words in cleaned data
# splitting the cleaned corpus on whitespace yields its word list
words = cleaned_data.split()
print("Total number of words:", len(words))

Total number of words: 292883


In [9]:
# let's also print unique words
# de-duplicate the word list to count the distinct words in the cleaned corpus
uniq_words = set(words)
print("Total unique words:", len(uniq_words))

Total unique words: 11923


### Prepare a corpus of sequences

In [10]:
seq_len = 50
# each window holds seq_len context words plus one more word (51 in total)
le = seq_len + 1
tokens = cleaned_data.split()

# Slide a window of `le` words across the corpus, one word at a time, and
# store each window as a space-joined string.
# NOTE(review): range() stops at len(tokens) - 1, so the window ending on the
# very last word of the corpus is not emitted.
seq_doc = [" ".join(tokens[i - le:i]) for i in range(le, len(tokens))]

# number of sequences generated
len(seq_doc)

292832

In [11]:
# first window: the opening 51 words of the corpus
seq_doc[0]

'a game of thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hint'

In [12]:
# second window: same span shifted right by one word
seq_doc[1]

'game of thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hint of'

In [13]:
# third window: shifted right by one more word
seq_doc[2]

'of thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hint of a'

In [14]:
# first five windows together, showing the one-word stride between them
seq_doc[:5]

['a game of thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hint',
 'game of thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hint of',
 'of thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hint of a',
 'thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hi

## 2. Tokenization and n-gram vectorization

In [15]:
# learn the word -> integer-id vocabulary from the sequence corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(seq_doc)
# +1 because Keras Tokenizer ids start at 1; index 0 is reserved and never assigned to a word
vocab_size = len(tokenizer.word_index) + 1

In [16]:
# convert sequences into n-grams tokens
# expand every sequence into its growing prefixes (n-grams of length 2..len)
# NOTE(review): consecutive seq_doc windows differ by a single word, so most
# prefixes are generated many times over.
input_sequences = []
for line in seq_doc:
    tokens = tokenizer.texts_to_sequences([line])[0]
    # tokens[:2], tokens[:3], ..., tokens[:len(tokens)]
    input_sequences.extend(tokens[:end] for end in range(2, len(tokens) + 1))

# show the first 15 n-gram sequences
input_sequences[:15]

[[4, 1114],
 [4, 1114, 5],
 [4, 1114, 5, 1738],
 [4, 1114, 5, 1738, 1320],
 [4, 1114, 5, 1738, 1320, 46],
 [4, 1114, 5, 1738, 1320, 46, 5],
 [4, 1114, 5, 1738, 1320, 46, 5, 4],
 [4, 1114, 5, 1738, 1320, 46, 5, 4, 1030],
 [4, 1114, 5, 1738, 1320, 46, 5, 4, 1030, 5],
 [4, 1114, 5, 1738, 1320, 46, 5, 4, 1030, 5, 600],
 [4, 1114, 5, 1738, 1320, 46, 5, 4, 1030, 5, 600, 2],
 [4, 1114, 5, 1738, 1320, 46, 5, 4, 1030, 5, 600, 2, 247],
 [4, 1114, 5, 1738, 1320, 46, 5, 4, 1030, 5, 600, 2, 247, 65],
 [4, 1114, 5, 1738, 1320, 46, 5, 4, 1030, 5, 600, 2, 247, 65, 3335],
 [4, 1114, 5, 1738, 1320, 46, 5, 4, 1030, 5, 600, 2, 247, 65, 3335, 11921]]

In [17]:
# let's check how many n-gram sequences were generated
len(input_sequences)

14641600

## 3. Generate padded sequences

In [None]:
def get_padded_seqs(input_seqs, total_words):
    """Pad token-id sequences to a common length and split them into X and y.

    Every sequence is left-padded ('pre') with zeros to the length of the
    longest sequence in ``input_seqs``. The last token of each padded row
    becomes the label and all preceding tokens become the predictors.

    Parameters
    ----------
    input_seqs : list of list of int
        Variable-length sequences of token ids.
    total_words : int
        Number of classes used for the one-hot labels.

    Returns
    -------
    predictors : 2-D array
        All columns of the padded matrix except the last.
    label : 2-D array
        One-hot encoding of the last column, ``total_words`` classes wide.
    max_seqs_len : int
        Length of the longest input sequence (the padded row width).

    Raises
    ------
    ValueError
        If ``input_seqs`` is empty.
    """
    if len(input_seqs) == 0:
        raise ValueError("input_seqs must contain at least one sequence")

    # measure the argument itself, not the notebook-level `input_sequences`
    max_seqs_len = max(len(seq) for seq in input_seqs)
    # pad_sequences already returns a NumPy array, so no extra np.array() copy
    padded = pad_sequences(input_seqs, maxlen=max_seqs_len, padding='pre')

    predictors, label = padded[:, :-1], padded[:, -1]
    # NOTE(review): the one-hot matrix has len(input_seqs) * total_words
    # entries, which grows very quickly with corpus and vocabulary size.
    label = to_categorical(label, num_classes=total_words)
    # return the local computed above (the old code returned an undefined name)
    return predictors, label, max_seqs_len

# generate X (predictors) and y (one-hot labels) for the modeling,
# along with the padded sequence length


predictors, label, max_sequence_len = get_padded_seqs(input_sequences, vocab_size)