<a href="https://colab.research.google.com/github/abhishek2602/Deep-Learning-with-NLP/blob/master/CHAR_RNN_GOTbook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
np.random.seed(42)

## Load the Data

In [0]:
book_text = open('gameofthrones.txt', encoding = 'utf8').read()

In [0]:
len(book_text)

5662324

## Build Tokenizer

In [0]:
from tensorflow.python.keras.preprocessing.text import Tokenizer

In [0]:
t = Tokenizer(char_level = True)

In [0]:
t.fit_on_texts(book_text)

In [0]:
# Number of Unique character
vocab_size = len(t.word_index)

In [0]:
vocab_size

86

In [0]:
t.word_index

{'\n': 23,
 ' ': 1,
 '!': 61,
 '(': 70,
 ')': 69,
 ',': 21,
 '-': 54,
 '.': 19,
 '/': 86,
 '0': 80,
 '1': 72,
 '2': 73,
 '3': 74,
 '4': 79,
 '5': 78,
 '6': 82,
 '7': 77,
 '8': 83,
 '9': 81,
 ':': 64,
 ';': 60,
 '?': 39,
 'A': 34,
 'B': 37,
 'C': 45,
 'D': 41,
 'E': 46,
 'F': 53,
 'G': 48,
 'H': 33,
 'I': 30,
 'J': 42,
 'K': 52,
 'L': 35,
 'M': 40,
 'N': 43,
 'O': 47,
 'P': 50,
 'Q': 63,
 'R': 38,
 'S': 32,
 'T': 29,
 'U': 62,
 'V': 59,
 'W': 36,
 'X': 65,
 'Y': 44,
 'Z': 71,
 '[': 84,
 ']': 85,
 'a': 4,
 'b': 22,
 'c': 20,
 'd': 11,
 'e': 2,
 'f': 17,
 'g': 16,
 'h': 6,
 'i': 10,
 'j': 55,
 'k': 25,
 'l': 12,
 'm': 15,
 'n': 7,
 'o': 5,
 'p': 24,
 'q': 49,
 'r': 9,
 's': 8,
 't': 3,
 'u': 13,
 'v': 26,
 'w': 14,
 'x': 57,
 'y': 18,
 'z': 58,
 '{': 67,
 '}': 68,
 'é': 76,
 'ê': 75,
 '—': 56,
 '‘': 66,
 '’': 31,
 '“': 27,
 '”': 28,
 '…': 51}

In [0]:
# Convert characters to Numbers
book_num = t.texts_to_sequences(book_text)
book_num

[[23],
 [23],
 [27],
 [36],
 [2],
 [1],
 [8],
 [6],
 [5],
 [13],
 [12],
 [11],
 [1],
 [8],
 [3],
 [4],
 [9],
 [3],
 [1],
 [22],
 [4],
 [20],
 [25],
 [21],
 [28],
 [1],
 [48],
 [4],
 [9],
 [2],
 [11],
 [1],
 [13],
 [9],
 [16],
 [2],
 [11],
 [1],
 [4],
 [8],
 [1],
 [3],
 [6],
 [2],
 [1],
 [14],
 [5],
 [5],
 [11],
 [8],
 [1],
 [22],
 [2],
 [16],
 [4],
 [7],
 [1],
 [3],
 [5],
 [1],
 [16],
 [9],
 [5],
 [14],
 [1],
 [11],
 [4],
 [9],
 [25],
 [1],
 [4],
 [9],
 [5],
 [13],
 [7],
 [11],
 [1],
 [3],
 [6],
 [2],
 [15],
 [19],
 [1],
 [27],
 [29],
 [6],
 [2],
 [1],
 [14],
 [10],
 [12],
 [11],
 [12],
 [10],
 [7],
 [16],
 [8],
 [1],
 [4],
 [9],
 [2],
 [1],
 [11],
 [2],
 [4],
 [11],
 [19],
 [28],
 [23],
 [23],
 [27],
 [41],
 [5],
 [1],
 [3],
 [6],
 [2],
 [1],
 [11],
 [2],
 [4],
 [11],
 [1],
 [17],
 [9],
 [10],
 [16],
 [6],
 [3],
 [2],
 [7],
 [1],
 [18],
 [5],
 [13],
 [39],
 [28],
 [1],
 [32],
 [2],
 [9],
 [1],
 [36],
 [4],
 [18],
 [15],
 [4],
 [9],
 [1],
 [38],
 [5],
 [18],
 [20],
 [2],
 [1],
 [4],
 [

In [0]:
number_chars = len(book_num)

In [0]:
number_chars

5662324

## Build Input and Output

In [0]:
sequence_length = 100

Input and output container


*   Input data will have sequences with 100 characters
*   Output data will have one character which comes after 100 character in the input data



In [0]:
input_data = []
output_data = []

In [0]:
for i in range(0, number_chars - sequence_length):
    input_seq = book_num[i: i + sequence_length]
    output_seq = book_num[i + sequence_length]
    input_data.append(input_seq)
    output_data.append(output_seq)

In [0]:
output_data[14]

[3]

In [0]:
input_data

[[[23],
  [23],
  [27],
  [36],
  [2],
  [1],
  [8],
  [6],
  [5],
  [13],
  [12],
  [11],
  [1],
  [8],
  [3],
  [4],
  [9],
  [3],
  [1],
  [22],
  [4],
  [20],
  [25],
  [21],
  [28],
  [1],
  [48],
  [4],
  [9],
  [2],
  [11],
  [1],
  [13],
  [9],
  [16],
  [2],
  [11],
  [1],
  [4],
  [8],
  [1],
  [3],
  [6],
  [2],
  [1],
  [14],
  [5],
  [5],
  [11],
  [8],
  [1],
  [22],
  [2],
  [16],
  [4],
  [7],
  [1],
  [3],
  [5],
  [1],
  [16],
  [9],
  [5],
  [14],
  [1],
  [11],
  [4],
  [9],
  [25],
  [1],
  [4],
  [9],
  [5],
  [13],
  [7],
  [11],
  [1],
  [3],
  [6],
  [2],
  [15],
  [19],
  [1],
  [27],
  [29],
  [6],
  [2],
  [1],
  [14],
  [10],
  [12],
  [11],
  [12],
  [10],
  [7],
  [16],
  [8],
  [1],
  [4],
  [9]],
 [[23],
  [27],
  [36],
  [2],
  [1],
  [8],
  [6],
  [5],
  [13],
  [12],
  [11],
  [1],
  [8],
  [3],
  [4],
  [9],
  [3],
  [1],
  [22],
  [4],
  [20],
  [25],
  [21],
  [28],
  [1],
  [48],
  [4],
  [9],
  [2],
  [11],
  [1],
  [13],
  [9],
  [16],
  [2],
 

In [0]:
# Reshape and Normalize the input
input_data = np.reshape(input_data, (len(input_data), sequence_length, 1))
input_data.shape

(5662224, 100, 1)

In [0]:
input_data = input_data / vocab_size
input_data

array([[[0.26744186],
        [0.26744186],
        [0.31395349],
        ...,
        [0.01162791],
        [0.04651163],
        [0.10465116]],

       [[0.26744186],
        [0.31395349],
        [0.41860465],
        ...,
        [0.04651163],
        [0.10465116],
        [0.02325581]],

       [[0.31395349],
        [0.41860465],
        [0.02325581],
        ...,
        [0.10465116],
        [0.02325581],
        [0.01162791]],

       ...,

       [[0.24418605],
        [0.01162791],
        [0.06976744],
        ...,
        [0.26744186],
        [0.26744186],
        [0.26744186]],

       [[0.01162791],
        [0.06976744],
        [0.11627907],
        ...,
        [0.26744186],
        [0.26744186],
        [0.26744186]],

       [[0.06976744],
        [0.11627907],
        [0.09302326],
        ...,
        [0.26744186],
        [0.26744186],
        [0.26744186]]])

In [0]:
# One hot encode the output
from tensorflow.python.keras.utils import to_categorical
output_data = to_categorical(output_data, num_classes = vocab_size + 1)
output_data[0:3]

array([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

## Build the Model

In [0]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import LSTM, Dense, Dropout

In [0]:
model = Sequential()

In [0]:
model.add(LSTM(128, input_shape = (input_data.shape[1], input_data.shape[2])))

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [0]:
model.add(Dropout(0.2))

In [0]:
model.add(Dense(vocab_size + 1, activation = 'softmax'))
#vocab size = 86

In [0]:
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy')

Instructions for updating:
keep_dims is deprecated, use keepdims instead


## Execute the model

In [0]:
# Goal of the model is to minimize the loss
model.fit(input_data, output_data, batch_size = 256, epochs = 1)

Epoch 1/1



<tensorflow.python.keras._impl.keras.callbacks.History at 0x195916fd30>

## Build random Starting point for predicting

In [0]:
start = np.random.randint(0, input_data.shape[0]-1)
start

1369792

In [0]:
data = book_num[start: start + sequence_length]
data

[[1],
 [5],
 [12],
 [11],
 [2],
 [9],
 [1],
 [15],
 [2],
 [7],
 [1],
 [12],
 [10],
 [25],
 [2],
 [1],
 [32],
 [2],
 [9],
 [1],
 [36],
 [2],
 [7],
 [11],
 [2],
 [12],
 [1],
 [40],
 [4],
 [7],
 [11],
 [2],
 [9],
 [12],
 [18],
 [1],
 [4],
 [7],
 [11],
 [1],
 [38],
 [5],
 [22],
 [10],
 [7],
 [1],
 [53],
 [12],
 [10],
 [7],
 [3],
 [19],
 [1],
 [47],
 [7],
 [2],
 [1],
 [5],
 [17],
 [1],
 [6],
 [10],
 [8],
 [1],
 [20],
 [5],
 [15],
 [24],
 [4],
 [7],
 [10],
 [5],
 [7],
 [8],
 [1],
 [14],
 [4],
 [8],
 [1],
 [2],
 [26],
 [2],
 [7],
 [1],
 [4],
 [1],
 [14],
 [5],
 [15],
 [4],
 [7],
 [64],
 [1],
 [41],
 [4],
 [20],
 [2],
 [18],
 [1],
 [40]]

In [0]:
data = [item for sublist in data for item in sublist]
data

[1,
 5,
 12,
 11,
 2,
 9,
 1,
 15,
 2,
 7,
 1,
 12,
 10,
 25,
 2,
 1,
 32,
 2,
 9,
 1,
 36,
 2,
 7,
 11,
 2,
 12,
 1,
 40,
 4,
 7,
 11,
 2,
 9,
 12,
 18,
 1,
 4,
 7,
 11,
 1,
 38,
 5,
 22,
 10,
 7,
 1,
 53,
 12,
 10,
 7,
 3,
 19,
 1,
 47,
 7,
 2,
 1,
 5,
 17,
 1,
 6,
 10,
 8,
 1,
 20,
 5,
 15,
 24,
 4,
 7,
 10,
 5,
 7,
 8,
 1,
 14,
 4,
 8,
 1,
 2,
 26,
 2,
 7,
 1,
 4,
 1,
 14,
 5,
 15,
 4,
 7,
 64,
 1,
 41,
 4,
 20,
 2,
 18,
 1,
 40]

In [0]:
# Build Int to Char routine
int_to_char = dict((i, c) for c, i in t.word_index.items())

In [0]:
# Start Predicting String
print('STARTING DATA: ')
print(''.join(int_to_char[char_val] for char_val in data))
print('\nPREDICTED: ')

for i in range(100):
    #Predict for initial data
    prediction = model.predict(np.reshape(data, (1, len(data), 1))/vocab_size)
    
    #Get char with max probability
    char_index_predicted = np.argmax(prediction)
    
    #Convert index to char
    char_predicted = int_to_char[char_index_predicted]
    
    print(char_predicted, end = '')
    
    #Change data - append new char index and remove the firt index
    data.append(char_index_predicted)
    data = data[1:len(data)]

STARTING DATA: 
 older men like Ser Wendel Manderly and Robin Flint. One of his companions was even a woman: Dacey M

PREDICTED: 
ae had the sard the  hen ho  hen ho the sard tfen the sard tfen the sard so the sard so the sard so 