In [1]:
import numpy as np
import string
import os
import tensorflow as tf
tf.enable_eager_execution()

## Data Preparation

Download the WikiText-2 raw dataset (the download and the unzip are skipped if their outputs already exist).

In [2]:
# Fetch the archive only when the zip file is not already present
!test -f wikitext-2-raw-v1.zip || wget -q https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
# Folder that the training file is later read from
DATA_FOLDER = 'wikitext-2-raw'
# Unpack only when the data folder does not exist yet
!test -d $DATA_FOLDER || unzip wikitext-2-raw-v1.zip

Archive:  wikitext-2-raw-v1.zip
   creating: wikitext-2-raw/
  inflating: wikitext-2-raw/wiki.test.raw  
  inflating: wikitext-2-raw/wiki.valid.raw  
  inflating: wikitext-2-raw/wiki.train.raw  


Read the training file and decode it as UTF-8.

In [3]:
# Read the training split as raw bytes and decode it explicitly as UTF-8
train_path = '{}/wiki.train.raw'.format(DATA_FOLDER)
with open(train_path, mode='rb') as train_file:
  text = train_file.read().decode('utf8')

# Preview the first 1000 characters
print(text[:1000])

 
 = Valkyria Chronicles III = 
 
 Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . 
 The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series 

To reduce the vocabulary size, we keep only the words that are purely ASCII-alphabetic and the basic punctuation tokens `.`, `=` and `,`; every other token is removed.

In [4]:
def clean(text):
  """
  Drop every whitespace-separated token that is neither purely
  ASCII-alphabetic nor one of the punctuation tokens '.', '=' and ','.

  The text is processed line by line: the surviving tokens of each line are
  re-joined with single spaces, and the lines are re-joined with a newline.
  """
  kept_punctuation = ('.', '=', ',')

  def keep(token):
    # Punctuation tokens on the allow-list always survive
    if token in kept_punctuation:
      return True
    # Otherwise the token must be alphabetic with every character in ASCII range
    return token.isalpha() and all(ord(ch) < 128 for ch in token)

  return '\n'.join(
      ' '.join(filter(keep, line.split())) for line in text.splitlines()
  )

# Compare a short excerpt before and after cleaning
print('Original:\n', text[:100])
print('Reduced:\n', clean(text[:100]))
# Clean the whole training text; later cells build the vocabulary from it
cleaned_text = clean(text)

Original:
  
 = Valkyria Chronicles III = 
 
 Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュ
Reduced:
 
= Valkyria Chronicles III =

no Valkyria Unrecorded Chronicles Japanese


Statistics of text

In [5]:
# Unique characters of the cleaned text, in sorted order
vocab = sorted(set(cleaned_text))
vocab_size = len(vocab)
print('Number of unique characters:', vocab_size)
# repr() makes whitespace entries such as the newline visible
print(repr(' '.join(vocab)))

Number of unique characters: 57
'\n   , . = A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z'


### Vectorize the text

Encode each character in the text by a unique number.

In [6]:
# Create mapping from unique characters to indices
char2idx = {char : idx for idx, char in enumerate(vocab)}

# Create mapping from indices to characters
# (a NumPy array so that an array of indices can be decoded in one lookup)
idx2char = np.array(vocab)

# Vectorize the text: one integer index per character
text_as_int = np.array([char2idx[char] for char in cleaned_text])

# First 50 characters, decoded back to text and in vectorized form
print('Original:\n', *idx2char[text_as_int[:50]], sep='', end='\n\n')
print('Vectorized:\n', text_as_int[:50])

Original:

= Valkyria Chronicles III =

no Valkyria Unrecord

Vectorized:
 [ 0  4  1 26 31 42 41 55 48 39 31  1  7 38 48 45 44 39 33 42 35 49  1 13
 13 13  1  4  0  0 44 45  1 26 31 42 41 55 48 39 31  1 25 44 48 35 33 45
 48 34]


#### Create target

Each sequence contains both an input sequence and a target sequence.
- Each input sequence contains `input_seq_length` characters from the text.
- Its corresponding target has the same number of characters, but shifted one character to the right.

For example, sequence `cinnamon` is split as `cinnamo` (input) and `innamon` (target).

In [7]:
# The maximum number of characters in an input sequence
input_seq_length = 50

# A sequence contains input and target (shifted 1 character to the right),
# so it is one character longer than the input
seq_length = input_seq_length + 1

# Create stream of character indices
char_indices_stream = tf.data.Dataset.from_tensor_slices(text_as_int)

# Convert stream of characters to sequences of seq_length characters;
# a trailing chunk shorter than seq_length is dropped
seqs = char_indices_stream.batch(seq_length, drop_remainder=True)

# Take a look at first sequence
for seq in seqs.take(1):
  print('Original sequence:\n', *idx2char[seq.numpy()], sep='',end='\n\n')
  print('Sequence shape:', seq.shape, end='\n\n')
  print('Vectorized:\n', seq.numpy())

Original sequence:

= Valkyria Chronicles III =

no Valkyria Unrecorde

Sequence shape: (51,)

Vectorized:
 [ 0  4  1 26 31 42 41 55 48 39 31  1  7 38 48 45 44 39 33 42 35 49  1 13
 13 13  1  4  0  0 44 45  1 26 31 42 41 55 48 39 31  1 25 44 48 35 33 45
 48 34 35]


Split each sequence into an input sequence and a target sequence.

In [8]:
def input_target_split(seq):
  """Return (input, target): `seq` without its last element and `seq` without its first."""
  return seq[:-1], seq[1:]

# Turn every sequence into an (input, target) pair
train_data = seqs.map(input_target_split)

# Take a look at first input and target sequence
for inp_seq, target_seq in train_data.take(1):
  input_chars = idx2char[inp_seq.numpy()]
  print('Input seq:\n', *input_chars, sep='', end='\n\n')
  target_chars = idx2char[target_seq.numpy()]
  print('Target seq:\n', *target_chars, sep='')

Input seq:

= Valkyria Chronicles III =

no Valkyria Unrecord

Target seq:
= Valkyria Chronicles III =

no Valkyria Unrecorde


#### Create training batches

In [9]:
# Number of (input, target) pairs per training batch
BATCH_SIZE = 64
# Size of the buffer that shuffle() draws elements from
BUFFER_SIZE = 5000

# Shuffle data and pack it to batches; drop_remainder=True discards a final
# incomplete batch so every batch has exactly BATCH_SIZE rows
batch_data = train_data.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
# Bare expression: displays the dataset's shapes and types as the cell output
batch_data

<DatasetV1Adapter shapes: ((64, 50), (64, 50)), types: (tf.int64, tf.int64)>

## Build the Model

In [0]:
# Size of each character embedding vector
embedding_dim = 256
# Number of units in the recurrent layer
num_units = 1024

def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and unnormalized logits."""
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def build_model(batch_size, hidden_layer='RNN'):
  """
  Assemble and compile a character-level language model:
  Embedding -> one stateful recurrent layer -> Dense over the vocabulary.

  batch_size:   fixed batch dimension of the model input
  hidden_layer: 'RNN', 'GRU' or 'LSTM', selecting the recurrent layer type

  Returns the model compiled with the Adam optimizer and `loss`.
  """
  recurrent_layer_types = {
      'RNN': tf.keras.layers.SimpleRNN,
      'GRU': tf.keras.layers.GRU,
      'LSTM': tf.keras.layers.LSTM,
  }
  assert hidden_layer in list(recurrent_layer_types)

  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]))
  # return_sequences=True yields one output per time step;
  # stateful=True carries the recurrent state over between calls
  recurrent_layer_type = recurrent_layer_types[hidden_layer]
  model.add(recurrent_layer_type(num_units, return_sequences=True, stateful=True))
  # One logit per vocabulary entry (no activation)
  model.add(tf.keras.layers.Dense(vocab_size))

  model.compile(optimizer='adam', loss=loss)
  return model

In [11]:
# Build the SimpleRNN variant with the training batch size and show its layers
rnn_model = build_model(batch_size=BATCH_SIZE, hidden_layer='RNN')
rnn_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           14592     
_________________________________________________________________
simple_rnn (SimpleRNN)       (64, None, 1024)          1311744   
_________________________________________________________________
dense (Dense)                (64, None, 57)            58425     
Total params: 1,384,761
Trainable params: 1,384,761
Non-trainable params: 0
_________________________________________________________________


## Train RNN Language Model

In [0]:
def generate_sample(model):
  """
  Run the model on one batch taken from `batch_data` and print, for the first
  sequence of that batch, the input text and one character sampled from the
  predicted distribution at every position, followed by the mean loss over
  the batch.
  Predictions should be of size (batch_size, inp_seq_length, vocab_size)
  """
  for inputs, targets in batch_data.take(1):
    logits = model(inputs)
    print('Prediction shape:', logits.shape)

    # Draw one character id per time step from the first sequence's logits
    drawn = tf.random.categorical(logits[0], num_samples=1)
    drawn_ids = tf.squeeze(drawn, axis=-1).numpy()

    print('--- Sample ---', end='\n\n')
    print('Input seq:\n', *idx2char[inputs[0].numpy()], sep='', end='\n\n')
    print('Next word predicted:\n', *idx2char[drawn_ids], sep='', end='\n\n')
    print('Loss:', loss(targets, logits).numpy().mean())

In [0]:
def config_checkpoints(prefix='RNN'):
  """
  Create a weights-only checkpoint callback for training.

  Checkpoint files are named "ckpt_{epoch}" inside './<prefix>_training_checkpoints'.
  Returns the checkpoint directory and the callback.
  """
  directory = './{}_training_checkpoints'.format(prefix)
  callback = tf.keras.callbacks.ModelCheckpoint(
      filepath=os.path.join(directory, "ckpt_{epoch}"),
      save_weights_only=True)
  return directory, callback

Generate a sample from the newly created (still untrained) RNN

In [14]:
# Sanity check before training: shapes, sampled characters and initial loss
generate_sample(rnn_model)

Prediction shape: (64, 50, 57)
--- Sample ---

Input seq:
ted to meet in the Columbian Room .
Except for Clu

Next word predicted:
=qMszcIwchqtaORduS.h
Z
WvmONrEpehydJToHxcKHGMuVSfu

Loss: 4.0453234


Train and save checkpoints

In [15]:
# Keep the checkpoint directory: it is needed later to reload the weights
rnn_cpdir, rnn_cpcb = config_checkpoints(prefix='RNN')
# The callback writes a weights-only checkpoint during training
rnn_model.fit(batch_data, epochs=3, callbacks=[rnn_cpcb])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f7c8d97c828>

## Predict with RNN LM

In [0]:
def generate_text(model, start_string, num_characters=1000, temperature=0.5):
  """
  Generate a paragraph of `num_characters` characters that continues
  `start_string`.

  model:          stateful character-level model that accepts a batch of size 1
  start_string:   non-empty seed text; every character must be a key of
                  `char2idx` (a KeyError is raised otherwise)
  num_characters: number of characters to generate after the seed
  temperature:    positive value the logits are divided by before sampling
                  Lower: more predictable; Higher: more unexpected

  Returns `start_string` followed by the generated characters.
  Raises ValueError if `start_string` is empty or `temperature` is not positive.
  """
  # Validate up front so bad arguments fail with a clear message instead of an
  # error from deep inside the model or the sampling op
  if temperature <= 0:
    raise ValueError('temperature must be positive, got {}'.format(temperature))
  if not start_string:
    raise ValueError('start_string must contain at least one character')

  text_generated = []

  # Vectorize starting string and add a batch dimension of size 1
  input_seq = tf.expand_dims([char2idx[char] for char in start_string], 0)

  model.reset_states()    # Restart states

  for _ in range(num_characters):
    logits = model(input_seq)
    # Drop the batch dimension and apply the temperature
    logits = tf.squeeze(logits, 0) / temperature
    # One id is sampled per time step; only the last step's id is kept
    next_char_id = tf.random.categorical(logits, num_samples=1)[-1, 0].numpy()

    # Only the sampled character is fed back on the next step
    input_seq = tf.expand_dims([next_char_id], 0)
    text_generated.append(idx2char[next_char_id])

  return start_string + ''.join(text_generated)

In [0]:
def rebuild_model(checkpoint_dir, hidden_layer='RNN'):
  """
  Rebuild the model with batch size 1 and load the weights of the latest
  checkpoint for prediction.

  checkpoint_dir: directory that contains the training checkpoints
  hidden_layer:   'RNN', 'GRU' or 'LSTM'; passed on to `build_model`

  Returns the rebuilt model.
  Raises FileNotFoundError if no checkpoint is found in `checkpoint_dir`.
  """
  # Look the checkpoint up once and fail early with a clear message, rather
  # than passing None on to load_weights
  latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
  if latest_checkpoint is None:
    raise FileNotFoundError('No checkpoint found in {!r}'.format(checkpoint_dir))

  # Rebuild model
  model = build_model(batch_size=1, hidden_layer=hidden_layer)
  model.load_weights(latest_checkpoint)
  model.build(tf.TensorShape([1, None]))

  return model

Rebuild the RNN LM with a batch size of 1 so that it accepts a single input sequence

In [18]:
# Rebinds rnn_model: from here on it is the batch-size-1 model with the
# weights restored from the latest RNN checkpoint
rnn_model = rebuild_model(checkpoint_dir=rnn_cpdir, hidden_layer='RNN')
rnn_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            14592     
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (1, None, 1024)           1311744   
_________________________________________________________________
dense_1 (Dense)              (1, None, 57)             58425     
Total params: 1,384,761
Trainable params: 1,384,761
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Generate 200 characters continuing a section-heading style seed
# NOTE(review): this seed has no trailing space, unlike the LSTM/GRU cells — confirm whether intended
print(generate_text(rnn_model, '= = Music =', num_characters=200))

= = Music = =


= = Hand , and the can be responsibility in the construction , which is a players and the comparison , as the concerns that were ordered the three species of the later special in the star concern


## Improvements

### LSTM

In [20]:
# Same pipeline as for the SimpleRNN, with an LSTM as the recurrent layer
lstm_model = build_model(batch_size=BATCH_SIZE, hidden_layer='LSTM')
# Sample from the model before training
generate_sample(lstm_model)
lstm_model.summary()

Prediction shape: (64, 50, 57)
--- Sample ---

Input seq:
ted to meet in the Columbian Room .
Except for Clu

Next word predicted:
OcH RVYmkWTrUkqTnVJiRjB R.afXqyXEZHZtmdoORDwfJpqsd

Loss: 4.042598
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (64, None, 256)           14592     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense_2 (Dense)              (64, None, 57)            58425     
Total params: 5,319,993
Trainable params: 5,319,993
Non-trainable params: 0
_________________________________________________________________


In [21]:
lstm_cpdir, lstm_cpcb = config_checkpoints(prefix='LSTM')
lstm_model.fit(batch_data, epochs=3, callbacks=[lstm_cpcb])
# Rebinds lstm_model to the batch-size-1 model loaded from the latest checkpoint
lstm_model = rebuild_model(checkpoint_dir=lstm_cpdir, hidden_layer='LSTM')

Epoch 1/3
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 2/3
Epoch 3/3


In [22]:
# Generate 200 characters with the LSTM model
print(generate_text(lstm_model, '= = Music = ', num_characters=200))

= = Music = = =

Alkan Morhange stars have a member of the new road is a contemporary record of the spectral sequel . The spectrum began to take into many of the construction of the control of the trillion and pr


### GRU

In [23]:
# Same pipeline as for the SimpleRNN, with a GRU as the recurrent layer
gru_model = build_model(batch_size=BATCH_SIZE, hidden_layer='GRU')
# Sample from the model before training
generate_sample(gru_model)
gru_model.summary()

Prediction shape: (64, 50, 57)
--- Sample ---

Input seq:
ted to meet in the Columbian Room .
Except for Clu

Next word predicted:
uSFWnT
Gff=QxZIN
PKoODWh vjGoQAEYmoucPTY
eBSNvsRWI

Loss: 4.041979
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (64, None, 256)           14592     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3935232   
_________________________________________________________________
dense_4 (Dense)              (64, None, 57)            58425     
Total params: 4,008,249
Trainable params: 4,008,249
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Checkpoints for this model go to their own 'GRU'-prefixed directory
gru_cpdir, gru_cpcb = config_checkpoints(prefix='GRU')
gru_model.fit(batch_data, epochs=3, callbacks=[gru_cpcb])
# Rebinds gru_model to the batch-size-1 model loaded from the latest checkpoint
gru_model = rebuild_model(checkpoint_dir=gru_cpdir, hidden_layer='GRU')

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [25]:
# Generate 200 characters with the GRU model
print(generate_text(gru_model, '= = Music = ', num_characters=200))

= = Music = =

A units of the star and the side of the art show , and approximately manuscripts , and stars are also important , and the any of the most exploration of knowledge of the first time of the songs to 
