# Character Prediction

In [1]:
import os
import numpy as np

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Input, Embedding, Flatten, add
from tensorflow.keras.layers import TimeDistributed, SimpleRNN, LSTM, GRU

In [2]:
with open('nietzsche.txt') as file:
    text = file.read().lower()
    print('Total number of characters:', len(text))

FileNotFoundError: [Errno 2] No such file or directory: 'nietzsche.txt'

### Get the dictionary - unique list of characters

In [None]:
characters = sorted(list(set(text)))
characters_size = len(characters)
print('Total number of unique characters:', characters_size)

Total number of unique characters: 59


In [None]:
''.join(characters)

'\n !"\'(),-.0123456789:;=?[]_abcdefghijklmnopqrstuvwxyz¤¦©«ã†'

In [None]:
char_indices = dict((c, i) for i, c in enumerate(characters))

In [None]:
indices_char = dict((i, c) for i, c in enumerate(characters))

In [None]:
idx = [char_indices[c] for c in text]

In [None]:
idx[:10]

[42, 44, 31, 32, 27, 29, 31, 0, 0, 0]

In [None]:
''.join(indices_char[i] for i in idx[:50])

'preface\n\n\nsupposing that truth is a woman--what th'

## RNN with 3 characters

### Data Preparation

In [None]:
char_seq_len=3

In [None]:
c1_inp = [idx[i] for i in range(0, len(idx)-1-char_seq_len, char_seq_len)]    # characters at index -- 0, 3, 6, 9,....
print(c1_inp)
len(c1_inp)

[42, 32, 31, 0, 42, 45, 33, 34, 1, 47, 1, 1, 49, 27, 8, 27, 46, 40, 35, 46, 44, 40, 1, 41, 30, 41, 45, 42, 46, 33, 34, 1, 38, 34, 41, 42, 44, 1, 1, 1, 44, 45, 34, 1, 48, 28, 40, 41, 27, 45, 7, 27, 1, 35, 30, 41, 40, 44, 27, 1, 39, 8, 34, 1, 31, 31, 35, 31, 31, 41, 40, 45, 40, 29, 39, 1, 42, 46, 35, 1, 46, 49, 29, 46, 51, 27, 1, 47, 38, 42, 30, 34, 44, 30, 31, 31, 46, 46, 46, 1, 48, 28, 40, 40, 35, 31, 27, 1, 45, 39, 1, 46, 30, 32, 0, 40, 40, 27, 41, 40, 29, 46, 40, 1, 31, 27, 40, 31, 27, 41, 30, 31, 31, 1, 1, 1, 40, 27, 0, 1, 31, 40, 31, 44, 37, 30, 32, 41, 27, 46, 30, 49, 34, 27, 27, 1, 45, 47, 33, 1, 31, 8, 7, 40, 31, 1, 1, 27, 45, 46, 38, 1, 44, 34, 31, 44, 45, 32, 44, 49, 1, 35, 27, 1, 27, 35, 34, 1, 38, 40, 46, 46, 38, 30, 39, 38, 45, 40, 34, 33, 47, 8, 27, 39, 31, 46, 46, 46, 45, 46, 46, 38, 46, 27, 9, 47, 46, 45, 27, 45, 35, 45, 7, 34, 31, 44, 33, 30, 44, 40, 1, 44, 41, 40, 46, 46, 38, 30, 39, 35, 40, 35, 42, 38, 41, 51, 49, 46, 31, 45, 31, 7, 34, 31, 44, 41, 38, 35, 0, 30, 31, 

200299

In [None]:
c2_inp = [idx[i+1] for i in range(0, len(idx)-1-char_seq_len, char_seq_len)]  # characters at index -- 1, 4, 7, 10,.....
c3_inp = [idx[i+2] for i in range(0, len(idx)-1-char_seq_len, char_seq_len)]  # characters at index -- 2, 5, 8 , 11,.... 
c4_inp = [idx[i+3] for i in range(0, len(idx)-1-char_seq_len, char_seq_len)]  # characters at index -- 3, 6, 9, 12,.....

In [None]:
# Stack -> List to Array

x1 = np.stack(c1_inp[:-2])   # array([ 0,  3,  6,  9, .....
x1

array([42, 32, 31, ..., 35, 45, 32])

In [None]:
x2 = np.stack(c2_inp[:-2]) # array([1, 4, 7, 10, .....
x3 = np.stack(c3_inp[:-2]) # array([2, 5, 8, .....

y = np.stack(c4_inp[:-2])

In [None]:
x1[:4], x2[:4], x3[:4]

(array([42, 32, 31,  0]), array([44, 27,  0, 45]), array([31, 29,  0, 47]))

In [None]:
y[:4]

array([32, 31,  0, 42])

In [None]:
x1.shape, x2.shape, x3.shape, y.shape

((200297,), (200297,), (200297,), (200297,))

### Input and Embedding Layer

In [None]:
emb_size = 42  # hyper-parameter

In [None]:
def embedding_input(name, n_in, n_out):
    inp = Input(shape=(1,), dtype='int64', name=name)
    emb = Embedding(n_in, n_out, input_length=1)(inp)
    return inp, Flatten()(emb)

In [None]:
c1_in, c1 = embedding_input('c1', characters_size, emb_size)
c2_in, c2 = embedding_input('c2', characters_size, emb_size)
c3_in, c3 = embedding_input('c3', characters_size, emb_size)

### Model Creation

![RNN](https://i.imgur.com/Zt6W5OP.png)





In [None]:
n_hidden = 256

In [None]:
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='tanh')

In [None]:
c1_dense = dense_in(c1)   # wx

In [None]:
hidden_2 = dense_hidden(c1_dense)  # a1
c2_dense = dense_in(c2)  # wx2
c2_hidden = add([c2_dense, hidden_2])   # a1 and wx2

In [None]:
c3_dense = dense_in(c3)    # wx3
hidden_3 = dense_hidden(c2_hidden)
c3_hidden = add([c3_dense, hidden_3])

In [None]:
dense_out = Dense(characters_size, activation='softmax')

In [None]:
c4_out = dense_out(c3_hidden)

In [None]:
model = Model([c1_in, c2_in, c3_in], c4_out)

### Model Compiling and Training

In [None]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
model.optimizer.lr=0.000001
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
c3 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
c2 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
c1 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 42)        2478        c3[0][0]                         
______________________________________________________________________________________________

In [None]:
model.fit([x1, x2, x3], y, batch_size=64, epochs=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f0264c4f810>

### Test Model

In [None]:
def three_chars_next_char(inp):
    idxs = [char_indices[c] for c in inp]
    arrs = [np.array(i)[np.newaxis] for i in idxs]
    p = model.predict(arrs)
    i = np.argmax(p)
    return characters[i]

In [None]:
three_chars_next_char('par')

' '

In [None]:
three_chars_next_char('tho')

' '

## RNN with 8 characters, using Keras

### Create inputs

In [None]:
char_seq_len=8

In [None]:
c_in_seq = [[idx[i+n] for i in range(0, len(idx)-1-char_seq_len, char_seq_len)]
            for n in range(char_seq_len)]

In [None]:
c_out_seq = [idx[i+char_seq_len] for i in range(0, len(idx)-1-char_seq_len, char_seq_len)]

In [None]:
xs = [np.stack(c[:-2]) for c in c_in_seq]

In [None]:
len(xs), xs[0].shape, xs[1].shape

(8, (75110,), (75110,))

In [None]:
y = np.stack(c_out_seq[:-2])

In [None]:
y[:10]

array([ 0, 35,  1, 45, 40, 46,  1, 41, 30, 45])

### Model Building

In [None]:
model=Sequential([
        Embedding(characters_size, emb_size, input_length=char_seq_len),
        SimpleRNN(n_hidden, activation='relu'),
        Dense(characters_size, activation='softmax')
    ])

In [None]:
model.summary()

# weights for SimpleRNN layer = recurrent_weights + input_weights + biases
# weights is 76544 = 256 * 256 + 256 * 42 + 256

# 42 because of embedding size.

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 8, 42)             2478      
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 256)               76544     
_________________________________________________________________
dense_3 (Dense)              (None, 59)                15163     
Total params: 94,185
Trainable params: 94,185
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [None]:
xs

[array([42,  0, 35, ..., 45, 44, 34]),
 array([44,  0, 40, ..., 46, 38, 31]),
 array([31, 45, 33, ..., 35, 30,  1]),
 array([32, 47,  1, ..., 27,  1, 35]),
 array([27, 42, 46, ..., 40, 27, 40]),
 array([29, 42, 34, ...,  1, 45, 30]),
 array([31, 41, 27, ..., 49,  1, 35]),
 array([ 0, 45, 46, ..., 41, 46, 29])]

In [None]:
model.fit([np.array(xs).T], y, batch_size=64, epochs=8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x7f0264cc28d0>

### Model Testing

In [None]:
def eight_chars_next_char(inp):
    idxs = [char_indices[c] for c in inp]
    arrs = np.array(idxs)[np.newaxis,:]
    p = model.predict(arrs)[0]
    return characters[np.argmax(p)]

In [None]:
eight_chars_next_char('this is ')

't'

In [None]:
eight_chars_next_char('part of ')

't'

In [None]:
eight_chars_next_char('queens a')

'n'

## Sequence to Sequence -- Returning sequence, using Keras

### Creating Inputs and Outputs


In [None]:
c_out_seq = [[idx[i+n] for i in range(1, len(idx)-char_seq_len, char_seq_len)]
            for n in range(char_seq_len)]

In [None]:
ys = [np.stack(c[:-2]) for c in c_out_seq]

In [None]:
ys

[array([44,  0, 40, ..., 46, 38, 31]),
 array([31, 45, 33, ..., 35, 30,  1]),
 array([32, 47,  1, ..., 27,  1, 35]),
 array([27, 42, 46, ..., 40, 27, 40]),
 array([29, 42, 34, ...,  1, 45, 30]),
 array([31, 41, 27, ..., 49,  1, 35]),
 array([ 0, 45, 46, ..., 41, 46, 29]),
 array([ 0, 35,  1, ..., 44, 34, 27])]

In [None]:
xs

[array([42,  0, 35, ..., 45, 44, 34]),
 array([44,  0, 40, ..., 46, 38, 31]),
 array([31, 45, 33, ..., 35, 30,  1]),
 array([32, 47,  1, ..., 27,  1, 35]),
 array([27, 42, 46, ..., 40, 27, 40]),
 array([29, 42, 34, ...,  1, 45, 30]),
 array([31, 41, 27, ..., 49,  1, 35]),
 array([ 0, 45, 46, ..., 41, 46, 29])]

In [None]:
x_rnn=np.stack(np.squeeze(xs), axis=1)
y_rnn=np.atleast_3d(np.stack(ys, axis=1))

In [None]:
x_rnn.shape, y_rnn.shape

((75110, 8), (75110, 8, 1))

### Model Building

In [None]:
model=Sequential([
        Embedding(characters_size, emb_size, input_length=char_seq_len),
        SimpleRNN(n_hidden, return_sequences=True, activation='relu'),
        TimeDistributed(Dense(characters_size, activation='softmax')),
    ])

In [None]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 8, 42)             2478      
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 8, 256)            76544     
_________________________________________________________________
time_distributed (TimeDistri (None, 8, 59)             15163     
Total params: 94,185
Trainable params: 94,185
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [None]:
model.fit(x_rnn, y_rnn, batch_size=64, epochs=8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x7f0250296e10>

### Model Testing

In [None]:
def get_seq2seq(inp):
    idxs = [char_indices[c] for c in inp]
    arr = np.array(idxs)[np.newaxis,:]
    p = model.predict(arr)[0]
    print(list(inp))
    return [characters[np.argmax(o)] for o in p]

In [None]:
get_seq2seq(' this is')

[' ', 't', 'h', 'i', 's', ' ', 'i', 's']


['t', 'h', 'e', 'n', ' ', 'i', 'n', ' ']