In [1]:
import keras
import numpy as np


Using TensorFlow backend.


In [32]:
HIDDEN_SIZE = 32

# character level language model
# text = "abc abcd abcdef abcdefghijk abc ab abc abc abcd ab" # sample input for testing
# Read the corpus inside a context manager so the file handle is closed as
# soon as the text is in memory (the bare open() leaked the handle).
with open('whale2.txt') as mobydick:
    text = mobydick.read()

# one-hot code the text
filter_chars = 10 # filter out the least used characters
# Characters of the corpus ordered from most to least frequent.
chars = ''' etaonsihrldumcwgf,ypbvk.-\n;I"'ATS!HBWEqNCPx?OLjRFMDGzYQJU():KV1028573*4Z69X_$][&'''
# Keep everything except the `filter_chars` rarest characters. Slicing with an
# explicit end index (rather than chars[:-filter_chars]) stays correct when
# filter_chars is 0, where the negative-index form would yield an empty string.
kept_chars = chars[:len(chars) - filter_chars]
num_classes = len(kept_chars)
char_to_idx = {ch: idx for idx, ch in enumerate(kept_chars)}
idx_to_char = dict(enumerate(kept_chars))

# Encode every character as its class index; characters outside the kept
# vocabulary fall back to index 0.
input_ = np.array([char_to_idx.get(ch, 0) for ch in text], dtype=np.int32)
# The target at position t is the one-hot encoding of the character at t + 1.
output = keras.utils.to_categorical(input_[1:], num_classes)
output = np.expand_dims(output, axis=1)
# Drop the last character: it has no successor to predict.
input_ = input_[:-1]
print(input_.shape)
print(output.shape)

# splice the full sequence into shorter sequences for training
training_seq_len = 16
# Only whole windows can be reshaped, so any trailing remainder is dropped.
# (The previous `np.append(input_, 0)` call discarded its result and was a no-op.)
usable_len = len(input_) // training_seq_len * training_seq_len
training_input = input_[:usable_len].reshape((-1, training_seq_len))
training_output = output[:usable_len].reshape(
    (-1, training_seq_len, num_classes))

print(training_input.shape, training_output.shape)


(1215235,)
(1215235, 1, 71)
(75952, 16) (75952, 16, 71)


In [9]:
# Size of the kept vocabulary; used for both the embedding input dimension and
# the softmax output so the two cannot drift apart (the embedding previously
# hard-coded 71 while the output layer computed the same value).
vocab_size = len(chars) - filter_chars

model = keras.Sequential()
# batch_input_shape=(1, None): batch size 1, sequence length left unspecified.
model.add(keras.layers.Embedding(vocab_size, 16, batch_input_shape=(1, None)))
# Two stacked stateful GRU layers, each emitting an output at every time step.
for layer_idx in range(2):
    model.add(keras.layers.GRU(
        HIDDEN_SIZE, return_sequences=True, stateful=True))
# Per-time-step softmax over the kept vocabulary.
model.add(keras.layers.TimeDistributed(
    keras.layers.Dense(vocab_size, activation='softmax')))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (1, None, 16)             1136      
_________________________________________________________________
gru_5 (GRU)                  (1, None, 32)             4704      
_________________________________________________________________
gru_6 (GRU)                  (1, None, 32)             6240      
_________________________________________________________________
time_distributed_3 (TimeDist (1, None, 71)             2343      
Total params: 14,423
Trainable params: 14,423
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Train on the 16-step windows in corpus order (shuffle=False) so the stateful
# GRU layers carry their hidden state from one window to the next.
# The layers are stateful, so Keras does not clear that state on its own;
# reset it at the start of every epoch so the state reached at the end of the
# corpus does not leak into the first window of the next pass.
reset_state_each_epoch = keras.callbacks.LambdaCallback(
    on_epoch_begin=lambda epoch, logs: model.reset_states())

model.fit(training_input, training_output, batch_size=1,
          epochs=30, shuffle=False, callbacks=[reset_state_each_epoch])
# Persist the trained network (architecture + weights) next to the notebook.
model.save(
    './moby-dick.h5')

In [5]:
# grab descriptive statistics about the corpus
import collections

# Use a context manager so the file handle is closed once the text is read.
with open('whale2.txt') as file:
    full_text = file.read()

character_set = set(full_text)
character_counts = collections.Counter(full_text)
# All distinct characters, ordered from most to least frequent.
print(''.join(k for k, _count in character_counts.most_common()))
# The distinct characters in sorted order, followed by how many there are.
print(sorted(character_set), len(character_set)) # 81 unique characters

 etaonsihrldumcwgf,ypbvk.-
;I"'ATS!HBWEqNCPx?OLjRFMDGzYQJU():KV1028573*4Z69X_[$]&
['\n', ' ', '!', '"', '$', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 81


In [6]:
# Smoke test: clear the recurrent state, feed a short prompt as a single
# batch of one sequence, and print the most likely next character for each
# position of the prompt.
model.reset_states()
test_input = np.array(
    [[char_to_idx.get(symbol, 0) for symbol in 'abc']], dtype=np.int32)
result = model.predict(test_input)
# Highest-probability class index at every time step.
result_text = np.argmax(result, axis=2)
predicted_chars = []
for class_idx in result_text.flatten().tolist():
    predicted_chars.append(idx_to_char[class_idx])
print(predicted_chars)

(1, 3)
(1, 3, 71)
['b', 'c', ' ']


In [26]:
def infer(ch):
    """Return the character the model rates most likely to follow `ch`.

    `ch` is looked up in `char_to_idx`; anything not found there is encoded
    as index 0. The index is wrapped into a (1, 1) batch, passed to
    `model.predict`, and the highest-scoring class of the single time step
    is mapped back to a character through `idx_to_char`.
    """
    char_index = char_to_idx.get(ch, 0)
    single_step_batch = np.array([[char_index]])
    scores = model.predict(single_step_batch)
    best_per_step = np.argmax(scores, axis=2).flatten()
    return idx_to_char[best_per_step[0]]

# Ask the model which character it predicts after 'b'. The model was built
# with stateful=True and no reset_states() is issued here, so the answer may
# depend on whatever predict calls ran before this cell.
infer('b')

' '

In [35]:
import os

# List the working directory (confirms the corpus and the saved model file
# are present), then display the installed Keras version as the cell result.
directory_entries = os.listdir(os.curdir)
print(directory_entries)
keras.__version__

['.ipynb_checkpoints', 'infer_and_score.py', 'moby-dick.h5', 'moby_dick.ipynb', 'Untitled.ipynb', 'whale2.txt']


'2.2.4'