# Attention-Based Recurrent Neural Network Models for Joint Intent Detection and Slot Filling

## 模型介绍

![](https://github.com/applenob/RNN-for-Joint-NLU/raw/master/res/arc.png)

形式化表达整理：

- 输入序列：$x = (x_1, \dots, x_T)$
- 输出序列：$y = (y_1, \dots, y_T)$，长度和$x$相同。
- Encoder：时刻i，
- 隐藏状态：$h_i = [fh_i, bh_i]$，前向状态+后向状态。
- Decoder：时刻i，
- 状态：$s_i$，$s_i = f(s_{i-1}, y_{i-1}, h_i, c_i)$
- 其中，context向量：$c_i$，$c_i = \sum^{T}_{j=1}\alpha_{i,j}h_j$
- attention参数：$\alpha_{i,j} = \frac{\exp(e_{i,j})}{\sum^T_{k=1}\exp(e_{i,k})}$
- $e_{i,k} = g(s_{i-1}, h_k)$
- $g$是一个小型神经网络。

In [1]:
import json
import tensorflow as tf
import matplotlib.pyplot as plt
import random
import numpy as np
from collections import Counter
import pickle
%matplotlib inline

* http://www.isca-speech.org/archive/Interspeech_2016/pdfs/1352.PDF
* https://arxiv.org/pdf/1409.0473.pdf

In [2]:
def prepare_sequence(seq, w2ix):
    """Turn a token sequence into a LongTensor of vocabulary indices.

    Every token found in ``w2ix`` is replaced by its index; any other token
    is replaced by the index stored under ``"<UNK>"``. The result is wrapped
    in a ``Variable`` and moved to the GPU when ``USE_CUDA`` is truthy.
    """
    # TODO: PyTorch leftover -- NOTE(review): torch, Variable and USE_CUDA are
    # not imported/defined in the visible part of this file; confirm before use.
    idxs = []
    for w in seq:
        if w in w2ix:
            idxs.append(w2ix[w])
        else:
            idxs.append(w2ix["<UNK>"])
    tensor = Variable(torch.LongTensor(idxs))
    if USE_CUDA:
        tensor = tensor.cuda()
    return tensor


In [3]:
def flatten(l):
    """Flatten a 2-D iterable into a 1-D list (exactly one level deep)."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

## Data load and Preprocessing

In [4]:
# Load the ATIS training set into `train`, a list of [words, slot_tags, intent].
# One line of the file looks like this:
#   'BOS i want to fly from baltimore to dallas round trip EOS\tO O O O O O B-fromloc.city_name O B-toloc.city_name B-round_trip I-round_trip atis_flight'
# i.e. "<BOS words EOS>\t<slot tags ... intent>".
train = []
# `with` guarantees the file handle is closed once the data has been read.
with open("dataset/atis-2.train.w-intent.iob", "r") as f:
    for line in f:
        # Strip only a real newline, so a final line without one keeps its last character.
        line = line.rstrip("\n")
        if not line:
            # Skip blank lines instead of failing on the missing tab.
            continue
        parts = line.split("\t")
        words = parts[0].split(" ")
        labels = parts[1].split(" ")
        # words[1:-1]  drops BOS and EOS.
        # labels[1:-1] drops the tag aligned with BOS and the trailing intent label.
        # labels[-1]   is the intent of the whole sentence.
        train.append([words[1:-1], labels[1:-1], labels[-1]])

In [5]:
# Transpose the [words, slot_tags, intent] rows into three parallel tuples.
seq_in, seq_out, intent = list(zip(*train))

In [6]:
# Distinct symbols seen in the training data.
vocab = set(flatten(seq_in))      # every distinct word
slot_tag = set(flatten(seq_out))  # every distinct slot tag
intent_tag = set(intent)          # every distinct intent label

In [7]:
" ".join(vocab)

"okay cost b operation prefer them reservation 100 toronto on costs c when texas seats afternoons database type please 733 497766 1145 819 730 it services bring closest phoenix following 1055 1993 along located supper flies flights friends april number 281 angeles fare sometime companies kind maximum lunch non stop are midnight different advertises pennsylvania ninth 1505 travels 12 as again next make 530 m transport for mitchell is mean 1220 listing 257 served calling denver dfw 3 equal 723 1200 york serving times quebec capacity class hours taxi taking information using orlando ap57 earlier 163 they i'm cleveland july 1024 qx i'll look 1205 four 645 pearson pm flying 615 after trans i'd 137338 choices belong like arriving sorry canada 515 qo meaning originating this 1222 1110 wednesdays continental more latest what's ticket passengers enroute 1026 thank let oh any second noon least seventeen logan trips each final twa and toward 1017 canadian find 217 mondays limo october sd 21 far 1

In [8]:
len(vocab)

867

In [9]:
len(slot_tag)

120

In [10]:
len(intent_tag)

21

In [11]:
# Fixed length every word / tag sequence is padded or truncated to.
LENGTH=50
# Accumulators for the padded word sequences and padded tag sequences.
sin=[]
sout=[]

In [12]:
# Pad or truncate every sentence and its slot-tag sequence to exactly LENGTH items:
#   words: w_1 .. w_n <EOS> <PAD> <PAD> ...
#   tags : t_1 .. t_n <PAD> <PAD> <PAD> ...
# New lists are built for every row, so the raw sequences held in
# seq_in / seq_out are no longer modified in place.
for words, tags in zip(seq_in, seq_out):
    # Keep at most LENGTH-1 words so there is always room for the <EOS> marker.
    padded_words = list(words[:LENGTH - 1]) + ['<EOS>']
    padded_words += ['<PAD>'] * (LENGTH - len(padded_words))
    sin.append(padded_words)

    # The position aligned with <EOS> carries the '<PAD>' tag, both for short
    # and for truncated sequences. ('<EOS>' is never added to tag2index, so it
    # must not appear among the tags.)
    padded_tags = list(tags[:LENGTH - 1])
    padded_tags += ['<PAD>'] * (LENGTH - len(padded_tags))
    sout.append(padded_tags)

In [13]:
# Build the symbol <-> index lookup tables.
# The source collections are sets, whose iteration order for strings changes
# between interpreter runs; sorting first makes every index reproducible.

# word -> index; the four special tokens own the first four ids.
word2index = {'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3}
for token in sorted(vocab):
    # setdefault leaves an existing entry (e.g. a special token) untouched.
    word2index.setdefault(token, len(word2index))

# index -> word
index2word = {v: k for k, v in word2index.items()}

# slot tag -> index; '<PAD>' is id 0.
tag2index = {'<PAD>': 0}
for tag in sorted(slot_tag):
    tag2index.setdefault(tag, len(tag2index))

# index -> slot tag
index2tag = {v: k for k, v in tag2index.items()}

# intent -> index
intent2index = {}
for ii in sorted(intent_tag):
    intent2index.setdefault(ii, len(intent2index))

# index -> intent
index2intent = {v: k for k, v in intent2index.items()}

In [14]:
# Re-assemble the training set as (padded words, padded tags, intent) triples.
train = list(zip(sin,sout,intent))

In [15]:
train[0][2]

'atis_flight'

In [16]:
def getBatch(batch_size, train_data):
    """Yield shuffled mini-batches of exactly ``batch_size`` items.

    ``train_data`` is shuffled in place and then cut into consecutive slices.
    Every full batch is yielded, including the last one when the data length
    is an exact multiple of ``batch_size``. A trailing slice shorter than
    ``batch_size`` is dropped, so callers only ever receive full batches.

    Raises:
        ValueError: if ``batch_size`` is not positive (raised when iteration
            starts, because this is a generator).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    random.shuffle(train_data)
    # The last valid start index is len - batch_size, hence the "+ 1" bound.
    for sindex in range(0, len(train_data) - batch_size + 1, batch_size):
        yield train_data[sindex:sindex + batch_size]

In [17]:
# train_data=[]

# for tr in train:
    
#     temp = prepare_sequence(tr[0],word2index)
#     temp = temp.view(1,-1)
    
#     temp2 = prepare_sequence(tr[1],tag2index)
#     temp2 = temp2.view(1,-1)
    
#     temp3 = Variable(torch.LongTensor([intent2index[tr[2]]])).cuda() if USE_CUDA else Variable(torch.LongTensor([intent2index[tr[2]]]))
    
#     train_data.append((temp,temp2,temp3))

## Tensorflow的动态rnn

`tf.nn.rnn creates an unrolled graph for a fixed RNN length. That means, if you call tf.nn.rnn with inputs having 200 time steps you are creating a static graph with 200 RNN steps. First, graph creation is slow. Second, you’re unable to pass in longer sequences (> 200) than you’ve originally specified. tf.nn.dynamic_rnn solves this. It uses a tf.While loop to dynamically construct the graph when it is executed. That means graph creation is faster and you can feed batches of variable size.`

摘自[Whats the difference between tensorflow dynamic_rnn and rnn?](https://stackoverflow.com/questions/39734146/whats-the-difference-between-tensorflow-dynamic-rnn-and-rnn)。也就是说，静态的rnn必须提前将图展开，在执行的时候，图是固定的，并且最大长度有限制。而动态rnn可以在执行的时候，循环地复用同一张图。


## Modeling

In [18]:
# Model hyper-parameters.
input_steps = 50      # time steps per padded sequence
embedding_size = 64   # width of one embedding vector
hidden_size = 100     # units per encoder direction
n_layers = 2          # NOTE(review): not referenced in the visible graph code
batch_size = 16       # fixed batch dimension of the placeholders
# Derive the class counts from the lookup tables instead of hard-coding them.
# tag2index holds every slot tag plus '<PAD>', so it is one larger than
# len(slot_tag); a hard-coded slot_size of 120 left the highest tag id
# without an output class or a one-hot column.
vocab_size = len(word2index)
slot_size = len(tag2index)
intent_size = len(intent2index)

In [19]:
# Graph inputs; all of them are batch-major.
# Word ids of the padded input sentences: [batch_size, input_steps].
encoder_inputs = tf.placeholder(tf.int32, [batch_size, input_steps],
                                name='encoder_inputs')
# Real length of every input sentence, padding excluded: [batch_size].
encoder_inputs_actual_length = tf.placeholder(tf.int32, [batch_size],
                                              name='encoder_inputs_actual_length')
# Slot-tag ids aligned with encoder_inputs: [batch_size, input_steps].
# The op used to be named 'encoder_inputs' as well (copy-paste slip).
decoder_targets = tf.placeholder(tf.int32, [batch_size, input_steps],
                                 name='decoder_targets')
# Intent id of every sentence: [batch_size].
intent_targets = tf.placeholder(tf.int32, [batch_size],
                                name='intent_targets')

### embedding

In [20]:
# Trainable embedding matrix [vocab_size, embedding_size], initialised
# uniformly in [-1, 1].
embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_size],
                                                -1.0, 1.0), dtype=tf.float32)

# Look up the embedding of every input word id.
encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, encoder_inputs)

In [21]:
encoder_inputs_embedded

<tf.Tensor 'embedding_lookup:0' shape=(16, 50, 64) dtype=float32>

## Encoder

In [22]:
from tensorflow.contrib.rnn import LSTMCell, LSTMStateTuple

In [23]:
# Use a single LSTM cell with hidden_size units for the encoder.
encoder_cell = LSTMCell(hidden_size)

In [24]:
# Swap the first two axes: batch-major [batch, steps, emb] -> time-major
# [steps, batch, emb], the layout the encoder below is run with.
encoder_inputs_time_major = tf.transpose(encoder_inputs_embedded, perm=[1,0,2])
encoder_inputs_time_major

<tf.Tensor 'transpose:0' shape=(50, 16, 64) dtype=float32>

In [25]:
# Bidirectional encoder over the time-major inputs.
# Each direction gets its own LSTMCell instance. Passing the same cell object
# as cell_fw and cell_bw either raises a variable-scope error or silently ties
# the two directions' weights, depending on the TensorFlow 1.x release.
# sequence_length stops the RNN at the real end of every sentence, so padded
# steps do not affect the final states.
(encoder_fw_outputs, encoder_bw_outputs), (encoder_fw_final_state, encoder_bw_final_state) = \
    tf.nn.bidirectional_dynamic_rnn(cell_fw=encoder_cell,
                                    cell_bw=LSTMCell(hidden_size),
                                    inputs=encoder_inputs_time_major,
                                    sequence_length=encoder_inputs_actual_length,
                                    dtype=tf.float32, time_major=True)

In [26]:
encoder_fw_outputs  # T*B*D

<tf.Tensor 'bidirectional_rnn/fw/fw/TensorArrayStack/TensorArrayGatherV3:0' shape=(50, 16, 100) dtype=float32>

In [27]:
encoder_bw_outputs  # T*B*D

<tf.Tensor 'ReverseSequence:0' shape=(50, 16, 100) dtype=float32>

In [28]:
# Per-step encoder output h_i = [fh_i, bh_i]: forward and backward outputs
# concatenated along the feature axis (axis 2).
encoder_concat_outputs = tf.concat((encoder_fw_outputs, encoder_bw_outputs), 2)

In [29]:
encoder_fw_final_state  # B*D

LSTMStateTuple(c=<tf.Tensor 'bidirectional_rnn/fw/fw/while/Exit_2:0' shape=(16, 100) dtype=float32>, h=<tf.Tensor 'bidirectional_rnn/fw/fw/while/Exit_3:0' shape=(16, 100) dtype=float32>)

In [30]:
encoder_bw_final_state  # B*D

LSTMStateTuple(c=<tf.Tensor 'bidirectional_rnn/bw/bw/while/Exit_2:0' shape=(16, 100) dtype=float32>, h=<tf.Tensor 'bidirectional_rnn/bw/bw/while/Exit_3:0' shape=(16, 100) dtype=float32>)

In [31]:
# Same concatenation as encoder_concat_outputs, kept under a second name.
encoder_outputs = tf.concat((encoder_fw_outputs, encoder_bw_outputs), 2)

# Join the forward and backward final cell states along the feature axis ...
encoder_final_state_c = tf.concat(
    (encoder_fw_final_state.c, encoder_bw_final_state.c), 1)

# ... and likewise the final hidden states.
encoder_final_state_h = tf.concat(
    (encoder_fw_final_state.h, encoder_bw_final_state.h), 1)

# Combined state of width 2 * hidden_size, used as the decoder's initial state.
encoder_final_state = LSTMStateTuple(
    c=encoder_final_state_c,
    h=encoder_final_state_h
)

## Decoder

In [32]:
# The decoder is twice as wide as one encoder direction, so it can start
# from the concatenated forward + backward encoder state.
decoder_cell = LSTMCell(hidden_size*2)

In [33]:
# One slot tag is decoded per input word, so the decoder runs for exactly
# the real length of each input sentence.
decoder_lengths = encoder_inputs_actual_length

In [34]:
# Output projections: weights initialised uniformly in [-1, 1], biases at zero.
# slot_*  : decoder output (2 * hidden_size) -> slot-tag logits.
slot_W = tf.Variable(tf.random_uniform([hidden_size*2, slot_size], -1, 1), dtype=tf.float32, name="slot_W")
slot_b = tf.Variable(tf.zeros([slot_size]), dtype=tf.float32, name="slot_b")
# intent_*: encoder final hidden state (2 * hidden_size) -> intent logits.
intent_W = tf.Variable(tf.random_uniform([hidden_size*2, intent_size], -1, 1), dtype=tf.float32, name="intent_W")
intent_b = tf.Variable(tf.zeros([intent_size]), dtype=tf.float32, name="intent_b")

In [35]:
# Intent prediction from the encoder's final hidden state.
intent_logits = tf.add(tf.matmul(encoder_final_state_h, intent_W), intent_b)
intent_prob = tf.nn.softmax(intent_logits)
# NOTE(review): this rebinds `intent`, which until now held the tuple of
# intent labels produced during preprocessing.
intent = tf.argmax(intent_prob, axis=1)

In [36]:
# A [batch_size] vector filled with 2, the id of '<SOS>' in word2index.
sos_time_slice = tf.ones([batch_size], dtype=tf.int32, name='SOS') * 2

# Embedding of the <SOS> token, fed to the decoder at the first step.
sos_step_embedded = tf.nn.embedding_lookup(embeddings, sos_time_slice)

In [37]:
sos_time_slice

<tf.Tensor 'mul:0' shape=(16,) dtype=int32>

### 开始Hack

像上面Encoder使用的那样，标准的`tf.nn.dynamic_rnn`需要提前将所有的输入包装到一个tensor里传过去。当Decoder需要使用上一个时间节点的输出时，就不可能提前包装好。即标准的动态rnn相当于：$s_i = f(s_{i-1}, x_i)$；但如果这个函数的参数需要扩充，比如我们做的：$s_i = f(s_{i-1}, y_{i-1}, h_i, c_i)$，就需要Hack。

**Loop transition function**：
- 关键点需要解决这个循环转移函数。
- 这个函数是这样的映射：(time, previous_cell_output, previous_cell_state, previous_loop_state) -> (elements_finished, input, cell_state, output, loop_state)。
- 两个调用时机：1.time=0的时候调用，提供初始的cell_state和输入。2.两个时间节点之间调用。


In [38]:
def loop_fn_initial():
    """Return the time-0 tuple for ``tf.nn.raw_rnn``.

    Returns:
        (elements_finished, input, cell_state, output, loop_state) where the
        input is the <SOS> embedding concatenated with the first encoder
        output, the cell state is the encoder's final state, and output and
        loop_state are both None.
    """
    # False for every sequence whose length is greater than 0.
    nothing_finished_yet = (0 >= decoder_lengths)
    # First decoder input: [<SOS> embedding, h_0].
    first_input = tf.concat((sos_step_embedded, encoder_concat_outputs[0]), 1)
    # The decoder starts from the final state of the encoder.
    return (nothing_finished_yet,
            first_input,
            encoder_final_state,
            None,
            None)

In [39]:
def loop_fn_transition(time, previous_output, previous_state, previous_loop_state):
    """Compute the next decoder input between two time steps of ``tf.nn.raw_rnn``.

    Args:
        time: scalar int tensor, index of the step about to be run.
        previous_output: decoder cell output of the previous step.
        previous_state: decoder cell state after the previous step.
        previous_loop_state: unused; no loop state is carried.

    Returns:
        (elements_finished, input, cell_state, output, loop_state): the state
        and output are passed through unchanged and loop_state is None.
    """
    # Class predicted at the previous step; its embedding is fed back as
    # part of the next input.
    output_logits = tf.add(tf.matmul(previous_output, slot_W), slot_b)
    prediction = tf.argmax(output_logits, axis=1)
    next_input = tf.nn.embedding_lookup(embeddings, prediction)

    # Boolean [batch_size] tensor: True where the sequence has already ended.
    elements_finished = (time >= decoder_lengths)

    # raw_rnn calls loop_fn once more after the last step, so `time` can equal
    # the number of encoder steps. Clamp the index so that call does not slice
    # out of range; the input built for it is never consumed.
    last_step = tf.shape(encoder_concat_outputs)[0] - 1
    encoder_step = encoder_concat_outputs[tf.minimum(time, last_step)]

    # Next input is [embedding of the previous prediction, h_i].
    input_ = tf.concat((next_input, encoder_step), 1)

    return (elements_finished,
            input_,
            previous_state,
            previous_output,
            None)


In [40]:
def loop_fn(time, previous_output, previous_state, previous_loop_state):
    """Dispatch for ``tf.nn.raw_rnn``: initial call vs. step-to-step transition."""
    if previous_state is not None:
        return loop_fn_transition(time, previous_output, previous_state, previous_loop_state)
    # previous_state is None only on the time == 0 call; there must be no
    # previous output either.
    assert previous_output is None
    return loop_fn_initial()

In [41]:
# Run the decoder with the custom loop function; outputs come back as a
# TensorArray with one entry per executed step.
decoder_outputs_ta, decoder_final_state, _ = tf.nn.raw_rnn(decoder_cell, loop_fn)
# Stack into one tensor [steps, batch, 2 * hidden_size]; the number of steps
# is only known at run time.
decoder_outputs = decoder_outputs_ta.stack()

In [42]:
decoder_outputs

<tf.Tensor 'TensorArrayStack/TensorArrayGatherV3:0' shape=(?, 16, 200) dtype=float32>

In [43]:
# Project every decoder output onto the slot-tag classes.
decoder_max_steps, decoder_batch_size, decoder_dim = tf.unstack(tf.shape(decoder_outputs))
# Collapse time and batch so a single matmul handles all steps.
decoder_outputs_flat = tf.reshape(decoder_outputs, (-1, decoder_dim))
decoder_logits_flat = tf.add(tf.matmul(decoder_outputs_flat, slot_W), slot_b)
# Restore [steps, batch, slot_size]. The last axis is the width of slot_W,
# i.e. slot_size; reshaping to vocab_size cannot match the element count.
decoder_logits = tf.reshape(decoder_logits_flat, (decoder_max_steps, decoder_batch_size, slot_size))

In [44]:
# Predicted class id per step and sentence: argmax over the last axis.
decoder_prediction = tf.argmax(decoder_logits, 2)

In [45]:
# Slot-filling loss.
# decoder_logits is time-major [steps, batch, slot_size], and its step count
# is decided at run time (the decoder loop stops once every sequence is
# finished). decoder_targets is batch-major [batch, input_steps].
# Transpose the targets and cut them to the decoded number of steps so the
# labels line up with the logits.
decoder_targets_time_major = tf.transpose(decoder_targets, perm=[1, 0])
decoder_targets_aligned = decoder_targets_time_major[:tf.shape(decoder_logits)[0]]

stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    labels=tf.one_hot(decoder_targets_aligned, depth=slot_size, dtype=tf.float32),
    logits=decoder_logits)

# Mean over every (step, sentence) position.
loss_slot = tf.reduce_mean(stepwise_cross_entropy)

In [51]:
# Intent-classification loss: softmax cross-entropy between the one-hot
# intent targets and the intent logits, averaged over the batch.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    labels=tf.one_hot(intent_targets, depth=intent_size, dtype=tf.float32),
    logits=intent_logits)
loss_intent = tf.reduce_mean(cross_entropy)

In [52]:
# Joint objective: slot-filling loss plus intent loss, optimised with Adam.
loss = loss_slot + loss_intent
optimizer = tf.train.AdamOptimizer()
# Clip every available gradient to an L2 norm of 5; variables without a
# gradient keep their (None, variable) pair unchanged.
grads = [
    (g, v) if g is None else (tf.clip_by_norm(g, 5), v)
    for g, v in optimizer.compute_gradients(loss)
]
train_op = optimizer.apply_gradients(grads)