update transformer, dataloader to fix
Former-commit-id: 548a917
Shi Haoran committed Nov 7, 2017
1 parent 620a76a commit c7dc812
Showing 6 changed files with 244 additions and 45 deletions.
46 changes: 27 additions & 19 deletions examples/transformer.py
@@ -14,7 +14,7 @@
# We shall wrap all these modules
from txtgen.data import database
from txtgen.modules import ConstantConnector
from txtgen.modules import BasicRNNDecoder, get_helper
from txtgen.modules import TransformerEncoder, TransformerDecoder
from txtgen.losses import mle_losses
from txtgen.core import optimization as opt
from txtgen import context
@@ -28,13 +28,26 @@
data_hparams = {
"num_epochs": 10,
"seed": 123,
"dataset": {
"batch_size":3,
"source_dataset": {
"files": ['data/sent.txt'],
"vocab_file": 'data/vocab.txt'
},
"target_dataset": {
"files": ['data/sent.txt'],
"vocab_share": True,
"reader_share": True,
"processing":{
"eos_token": "<TARGET_EOS>"
}
}
}
# Construct the database
src_db, tgt_db = database.PairedTextDataBase(data_hparams)
text_database = database.PairedTextDataBase(data_hparams)
print('database finished')

##TODO(haoran) bug: the text_database cannot be called here
text_data_batch = text_database()
# Get data minibatch, which is a dictionary:
# {
# "text": text_tensor, # text string minibatch,
@@ -43,34 +56,29 @@
# "text_ids": text_id_tensor, # a 2D int tensor of token ids with shape
# # `[batch_size, max_seq_length]`
# }
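# A minimal fetch sketch (not part of this commit): assuming the usual
# TF1 queue-runner setup and that `text_data_batch` is the dict of
# tensors described above once the TODO is resolved.
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     sess.run(tf.local_variables_initializer())
#     sess.run(tf.tables_initializer())
#     coord = tf.train.Coordinator()
#     threads = tf.train.start_queue_runners(sess=sess, coord=coord)
#     batch = sess.run(text_data_batch)
#     print(batch.keys())   # inspect the available fields
#     coord.request_stop()
#     coord.join(threads)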
src_data_batch, tgt_data_batch = src_db(), tgt_db
### Build model

# Build decoder. Simply use the default hyperparameters.
#decoder = rnn_decoders.BasicRNNDecoder(vocab_size=text_db.vocab.vocab_size)
encoder = BasicRNNEncoder(vocab_size = src_db.vovab.vocab_size)
decoder = BasicRNNDecoder(vocab_size=tgt_db.vocab.vocab_size)
encoder = TransformerEncoder(vocab_size=text_database.source_vocab.vocab_size)
decoder = TransformerDecoder(vocab_size=text_database.target_vocab.vocab_size)

# Build connector, which simply feeds zero state to decoder as initial state
connector = ConstantConnector(decoder.state_size)
print('encoder decoder finished')
src_text = text_data_batch['source_text']
tgt_text = text_data_batch['target_text']

# Build helper used in training.
# We shall probably improve the interface here.
helper_train = get_helper(
decoder.hparams.helper_train.type,
inputs=data_batch['text_ids'][:, :-1],
sequence_length=data_batch['length'] - 1,
embedding=decoder.embedding)

encoder_output = encoder(src_text['text_ids'][:, :-1],
sequence_length=src_text['length']-1)
# Decode
outputs, final_state, sequence_lengths = decoder(
helper=helper_train, initial_state=connector(text_db.batch_size))
initial_state=connector(text_database._hparams.batch_size))

# Build loss
mle_loss = mle_losses.average_sequence_sparse_softmax_cross_entropy(
labels=data_batch['text_ids'][:, 1:],
logits=outputs.rnn_output,
sequence_length=sequence_lengths - 1)
labels=tgt_text['text_ids'][:, 1:],
logits=outputs.output_logits,
sequence_length=sequence_lengths-1)

# Build train op. Only config the optimizer while using default settings
# for other hyperparameters.
94 changes: 76 additions & 18 deletions txtgen/core/layers.py
@@ -9,7 +9,8 @@

import tensorflow as tf
import tensorflow.contrib.rnn as rnn

import numpy as np
from txtgen import context
from txtgen.hyperparams import HParams
from txtgen.core.utils import get_instance, switch_dropout

@@ -249,7 +250,7 @@ def get_embedding(hparams=None,
initializer=init_values,
trainable=hparams["trainable"])

def sinuoid_positional_encoding(inputs,
def sinusoid_positional_encoding(inputs,
zero_pad=True,
scale=True,
reuse=None,
@@ -265,31 +266,30 @@ def sinuoid_positional_encoding(inputs,
scope: [String], Optional scope for 'variable_scope'
position_duration: [Int], default=10000
"""
batch_size, max_time, hidden_dim = inputs.get_shape().as_list()
with tf.variable_scope(scope, reuse=reuse):
batch_size, max_time, hidden_dim = inputs.get_shape().as_list()
input_one = tf.tile(tf.expand_dims(tf.range(max_time), 0), [batch_size, 1]) #batch_size * max_time
position_block = tf.tile(tf.expand_dims(tf.range(max_time), 1), [1, num_units // 2])
unit_block = tf.tile(tf.expand_dims(tf.range(hidden_dim // 2), 0), [max_time, 1])
rad_block = tf.pow(tf.div(position_block, tf.multiply(position_duration, 1)), tf.div(unit_block, hidden_dim // 2))
position_idx = tf.tile(tf.expand_dims(tf.range(max_time), 0), [batch_size, 1]) #batch_size * max_time
position_enc = np.array([
[pos /np.power(10000, 2.*i/hidden_dim) for i in range(hidden_dim)]
for pos in range(max_time)])

sin_block = tf.sin(tf.cast(rad_block, tf.float32))
cos_block = tf.cos(tf.cast(rad_block, tf.float32))
lookup_table = tf.concat([sin_block, cos_block], axis = 1)
position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])
position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])

lookup_table = tf.convert_to_tensor(position_enc)
if zero_pad:
lookup_table = tf.concat((tf.zeros(shape = [1, num_units]), lookup_table[1:, :]), 0)
outputs = tf.nn.embedding_lookup(lookup_table, input_one)
lookup_table = tf.concat((tf.zeros(shape=[1, hidden_dim]),
lookup_table[1:, :]), 0)
outputs = tf.nn.embedding_lookup(lookup_table, position_idx)
if scale:
outputs = outputs * math.sqrt(hidden_dim)
outputs = outputs * hidden_dim**0.5
return outputs
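# Standalone NumPy illustration of the encoding above (not part of this
# commit); the `max_time` and `hidden_dim` values are arbitrary.
# import numpy as np
# max_time, hidden_dim = 5, 8
# position_enc = np.array(
#     [[pos / np.power(10000, 2. * i / hidden_dim) for i in range(hidden_dim)]
#      for pos in range(max_time)])
# position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # even dims -> sin
# position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # odd dims  -> cos
# print(position_enc.shape)  # (5, 8): one encoding vector per position
# print(position_enc[0])     # position 0 encodes to [0, 1, 0, 1, ...]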


def multihead_attention(queries,
keys,
num_units= None,
num_heads=8,
dropout_rate=0,
is_training=True,
causality = False,
scope = 'multihead_attention',
reuse= None):
@@ -299,7 +299,6 @@ def multihead_attention(queries,
keys: A 3d tensor with shape of [N, T_k, C_k].
num_units: A scalar. Attention size.
dropout_rate: A floating point number.
is_training: Boolean. Controller of mechanism for dropout.
causality: Boolean. Should be true, units that reference the future are masked
num_heads: An int. Number of heads.
scope: Optional scope for `variable_scope`.
@@ -355,7 +354,7 @@ def multihead_attention(queries,
outputs *= query_masks # broadcasting. (N, T_q, C)

# Dropouts
outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training))
outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=context.is_train())

# Weighted sum
outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h)
@@ -367,6 +366,65 @@
outputs += queries

# Normalize
outputs = normalize(outputs) # (N, T_q, C)

return outputs
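# Illustrative self-attention usage (not part of this commit); the sizes
# are placeholders and `normalize` is the layer-norm helper defined below.
# inputs = tf.random_normal([32, 20, 512])      # [batch, time, channels]
# attended = multihead_attention(
#     queries=inputs,
#     keys=inputs,                  # self-attention: keys == queries
#     num_units=512,
#     num_heads=8,
#     dropout_rate=0.1,
#     causality=True,               # mask attention to future positions
#     scope="demo_self_attention")
# attended = normalize(attended)    # layer-normalize the block output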



def poswise_feedforward(attended_dec, scope="poswise_feedforward", reuse=None):
'''Point-wise feed-forward net.
Args:
attended_dec: A 3d tensor with shape of [N, T, C].
scope: Optional scope for `variable_scope`.
reuse: Boolean, whether to reuse the weights of a previous layer
by the same name.
Returns:
A 3d tensor with the same shape and dtype as `attended_dec`.
'''
hidden_dim = attended_dec.get_shape().as_list()[-1]
with tf.variable_scope(scope, reuse=reuse):
outputs = tf.layers.conv1d(inputs = attended_dec,
filters=hidden_dim*4,
kernel_size=1,
activation=tf.nn.relu,
use_bias=True)
outputs = tf.layers.conv1d(inputs = outputs,
filters=hidden_dim,
kernel_size=1,
activation=None,
use_bias=True)
outputs += attended_dec #residual connection
return outputs
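# Continuing the sketch above (not part of this commit): the feed-forward
# block is applied to the attended output and then layer-normalized, as in
# the decoder blocks added in this commit.
# ffn_out = poswise_feedforward(attended)
# ffn_out = normalize(ffn_out)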

def normalize(inputs,
epsilon = 1e-8,
scope="ln",
reuse=None):
'''Applies layer normalization.
Args:
inputs: A tensor with 2 or more dimensions, where the first dimension has
`batch_size`.
epsilon: A floating number. A very small number for preventing ZeroDivision Error.
scope: Optional scope for `variable_scope`.
reuse: Boolean, whether to reuse the weights of a previous layer
by the same name.
Returns:
A tensor with the same shape and data dtype as `inputs`.
'''
with tf.variable_scope(scope, reuse=reuse):
inputs_shape = inputs.get_shape()
params_shape = inputs_shape[-1:]

mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
beta= tf.Variable(tf.zeros(params_shape))
gamma = tf.Variable(tf.ones(params_shape))
normalized = (inputs - mean) / ( (variance + epsilon) ** (.5) )
outputs = gamma * normalized + beta

return outputs
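# Tiny NumPy check (not part of this commit) of what `normalize` computes
# with gamma=1 and beta=0: each feature vector is shifted to zero mean and
# scaled to unit variance.
# import numpy as np
# x = np.array([[1.0, 2.0, 3.0],
#               [10.0, 10.0, 10.0]])
# mean = x.mean(axis=-1, keepdims=True)
# var = x.var(axis=-1, keepdims=True)
# print((x - mean) / np.sqrt(var + 1e-8))
# # [[-1.2247  0.      1.2247]
# #  [ 0.      0.      0.    ]]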

2 changes: 1 addition & 1 deletion txtgen/modules/decoders/__init__.py
@@ -12,4 +12,4 @@
from txtgen.modules.decoders.rnn_decoder_base import *
from txtgen.modules.decoders.rnn_decoders import *
from txtgen.modules.decoders.rnn_decoder_helpers import *

from txtgen.modules.decoders.transformer_decoders import *
131 changes: 131 additions & 0 deletions txtgen/modules/decoders/transformer_decoders.py
@@ -0,0 +1,131 @@
"""
Transformer decoders, following "Attention Is All You Need".
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
# pylint: disable=no-name-in-module, too-many-arguments, too-many-locals
# pylint: disable=not-context-manager
import tensorflow as tf
from tensorflow.contrib.seq2seq import Decoder as TFDecoder
from tensorflow.python.framework import tensor_shape, dtypes

from txtgen.modules.module_base import ModuleBase
from txtgen.core import layers
from txtgen import context

class TransformerDecoderOutput(
collections.namedtuple("TransformerDecoderOutput",
("output_logits", "sample_ids"))):
"""the output logits and sampled_ids"""
pass

class TransformerDecoder(ModuleBase, TFDecoder):
"""decoder for transformer: Attention is all you need
"""
def __init__(self,
embedding=None,
vocab_size=None,
hparams=None):
ModuleBase.__init__(self, hparams)
self._embedding = None
self._vocab_size = vocab_size
if self._hparams.embedding_enabled:
if embedding is None and vocab_size is None:
raise ValueError("If 'embedding' is not provided, "
"'vocab_size' must be specified.")
if isinstance(embedding, tf.Variable):
self._embedding = embedding
else:
self._embedding = layers.get_embedding(
self._hparams.embedding, embedding, vocab_size,
self.variable_scope)
embed_dim = self._embedding.get_shape().as_list()[1]
if self._hparams.zero_pad:
self._embedding = tf.concat((tf.zeros(shape=[1, embed_dim]),
self._embedding[1:, :]), 0)
if self._hparams.embedding.trainable:
self._add_trainable_variable(self._embedding)

@staticmethod
def default_hparams():
return {
"embedding_enabled": True,
"embedding": layers.default_embedding_hparams(),
"name":"transformer_decoder",
"num_heads":8,
"num_units":64,
}
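# Hypothetical construction sketch (not part of this commit), assuming
# hparams passed to the constructor are merged with these defaults as in
# other txtgen modules; `vocab_size` here is supplied by the caller.
# decoder = TransformerDecoder(
#     vocab_size=10000,
#     hparams={
#         "num_heads": 8,
#         "num_units": 64,
#     })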

def initialize(self, name=None):
return self._helper.initialize() + (self._initial_state,)

def _build(self, inputs, encoder_output):
# max_decoding_length_train = self._hparams.max_decoding_length_train
# if max_decoding_length_train is None:
# max_decoding_length_train = utils.MAX_SEQ_LENGTH
# if max_decoding_length_infer is None:
# max_decoding_length_infer = utils.MAX_SEQ_LENGTH
# max_decoding_length = tf.cond(
# context.is_train(),
# lambda: max_decoding_length_train,
# lambda: max_decoding_length_infer)
if self._embedding is not None:
tgt_embedding = tf.nn.embedding_lookup(self._embedding, inputs)
else:
tgt_embedding = inputs
num_units = tgt_embedding.get_shape().as_list()[2]
if self.scale:
tgt_embedding = tgt_embedding * num_units**0.5
if self._hparams.sinusoid:
position_dec_embeds = layers.sinusoid_positional_encoding(tgt_embedding,
scope = "dec_pe")
dec_input = tf.layers.dropout(tgt_embedding + position_dec_embeds,
rate = self._hparams.encoder.dropout_rate,
training = context.is_train())
hparams = self._hparams
with tf.variable_scope(self.variable_scope):
for i in range(self._hparams.decoder.num_blocks):
with tf.variable_scope("num_blocks_{}".format(i)):
attended_dec = layers.multihead_attention(queries = dec_input,
keys = dec_input,
num_units = hparams.hidden_units,
num_heads = hparams.num_heads,
is_training = context.is_train(),
dropout_rate = hparams.dropout_rate,
causality = True,
scope = "self_attention")
attended_dec = layers.normalize(attended_dec)

attended_dec = layers.multihead_attention(queries = dec_input,
keys = encoder_output,
num_units = hparams.hidden_units,
num_heads = hparams.num_heads,
dropout_rate = hparams.dropout_rate,
is_training = context.is_train(),
causality = False,
scope = "vanilla_attention")
attended_dec = layers.normalize(attended_dec)

attended_dec = layers.poswise_feedforward(attended_dec)
attended_dec = layers.normalize(attended_dec)
#[batch, seq_len, hidden_units]
self.logits = tf.layers.dense(attended_dec, self._vocab_size)
self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
#[batch, seq_len]
return self.logits, self.preds

@property
def output_size(self):
return TransformerDecoderOutput(
output_logits=tensor_shape.TensorShape([None, None, self._vocab_size]),
sample_ids=tensor_shape.TensorShape([None, None])
)

@property
def output_dtype(self):
return TransformerDecoderOutput(
output_logits=dtypes.float32, sample_ids=dtypes.int32)

2 changes: 1 addition & 1 deletion txtgen/modules/encoders/__init__.py
@@ -12,4 +12,4 @@
from txtgen.modules.encoders.encoder_base import *
from txtgen.modules.encoders.rnn_encoders import *
from txtgen.modules.encoders.hierarchical_encoders import *

from txtgen.modules.encoders.transformer_encoders import *
