In [4]:
import torch
import torchtext
import torchtext.data as data
import numpy as np
import os

### 1. 数据加载

#### 定义每个键值对应的处理方法

In [5]:
char_field_nesting =  data.Field(batch_first=True, tokenize=list)
char_field = data.NestedField(char_field_nesting)
raw = data.RawField()
raw.is_target = False
word_field = data.Field(batch_first=True)
label_field = data.Field(sequential=False, use_vocab=False)

list_fields = [('id', raw),('s_idx', label_field),('e_idx', label_field),('c_word', word_field), ('c_char', char_field),
              ('q_word', word_field), ('q_char', char_field)]    


#### 加载数据

In [6]:
train_examples_path = './train_examples.pt'
val_examples_path = './val_examples.pt'

train_examples = torch.load(train_examples_path)
val_examples = torch.load(val_examples_path)

train_set = data.Dataset(examples=train_examples, fields=list_fields)
val_set = data.Dataset(examples=val_examples, fields=list_fields)

#### 建立vocab

In [9]:
char_field.build_vocab(train_set, val_set)
word_field.build_vocab(train_set, val_set, vectors=torchtext.vocab.GloVe(dim=100,name='6B'))


In [26]:
print(char_field.vocab.itos)

['<unk>', '<pad>', 'e', 'a', 't', 'i', 'n', 'o', 's', 'r', 'h', 'l', 'd', 'c', 'u', 'm', 'p', 'f', 'g', 'y', 'b', ',', 'w', '.', 'v', 'k', "'", '`', '1', '0', ')', '(', '9', '-', '>', '<', '2', 'j', 'x', 'z', '8', '5', '3', '6', '4', '7', 'q', ';', ':', '/', '%', 'é', '&', 'ö', 'ä', 'ü', 'ó', 'á', 'í', 'ø', '$', 'å', '°', '!', '#', '²', 'è', 'ô', ']', '+', '[', '?', 'ç', 'ú', 'æ', 'ë', 'ñ', 'à', 'î', 'š', '=', 'ã', '*', 'â', 'ð', '×', '€', 'ê', 'ò', 'ß', 'ý', 'ï', 'ž', '£', '_', 'û', '~', 'ì', '\\', 'þ', '±', 'ù', 'œ', 'µ', 'õ', '·', '¹', 'ÿ', '^', '|', '«', '³', '»', '{', '}', '§', '@', '¥', 'º', '¡', '¿', '©', '¢', '¬', 'ª', '®', '\x93', '\x94']


#### 创建迭代器

In [10]:
gpu = torch.device("cuda:0")
cpu = torch.device("cpu")

In [13]:
batch_size = 16
shuffle = False

In [18]:
train_iter, val_iter = data.BucketIterator.splits((train_set, val_set), batch_sizes=[batch_size,batch_size], 
                                                  device=gpu, shuffle=shuffle, sort=False)

#### 查看数据

In [20]:
iterator = iter(val_iter)

In [23]:
batch = next(iterator)
print(batch)


[torchtext.data.batch.Batch of size 16]
	[.id]:['WH_dev_16', 'WH_dev_17', 'WH_dev_18', 'WH_dev_19', 'WH_dev_20', 'WH_dev_21', 'WH_dev_22', 'WH_dev_23', 'WH_dev_24', 'WH_dev_25', 'WH_dev_26', 'WH_dev_27', 'WH_dev_28', 'WH_dev_29', 'WH_dev_30', 'WH_dev_31']
	[.s_idx]:[torch.cuda.LongTensor of size 16 (GPU 0)]
	[.e_idx]:[torch.cuda.LongTensor of size 16 (GPU 0)]
	[.c_word]:[torch.cuda.LongTensor of size 16x3553 (GPU 0)]
	[.c_char]:[torch.cuda.LongTensor of size 16x3553x24 (GPU 0)]
	[.q_word]:[torch.cuda.LongTensor of size 16x8 (GPU 0)]
	[.q_char]:[torch.cuda.LongTensor of size 16x8x14 (GPU 0)]


### 2.模型定义

Bi-Direction attention flow 模型包括以下几层:
1. Embedding Layer
    - Word Embedding
    - Char Embedding
2. Contextual Embedding Layer
    - Highway Network
    - Bi-LSTM
3. Attention Flow Layer
4. Modeling Layer
5. Ouput Layer


#### Embedding Layer

包括Word Embedding 和 Character Embedding.

Word Embedding 使用预训练的Glove vector词向量且不变。

Character Embedding 使用Character CNN，需要训练。

In [25]:
char_input = batch.q_char
word_input = batch.q_word
print(char_input.shape, word_input.shape)

torch.Size([16, 8, 14]) torch.Size([16, 8])


In [33]:
char_vocab_size = len(char_field.vocab.stoi)
word_vocab_size = len(word_field.vocab.stoi)
char_dim = 8
char_channel_width = 5
char_channel_size = 100

In [47]:
import torch.nn as nn
import torch.nn.functional as F

In [41]:
char_emb = nn.Embedding(char_vocab_size, char_dim, padding_idx=1)
char_conv = nn.Conv2d(1, char_channel_size, (char_dim, char_channel_width))
char_emb.to(gpu)
char_conv.to(gpu)

Conv2d(1, 100, kernel_size=(8, 5), stride=(1, 1))

In [42]:
char_input = batch.q_char
print(f'char_input size:{char_input.shape}')
char_input_emb = char_emb(char_input)
print(f'char embedding:{char_input_emb.shape}')

char_input size:torch.Size([16, 8, 14])
char embedding:torch.Size([16, 8, 14, 8])


In [57]:
x = char_input_emb.view(-1, char_dim, char_input_emb.size(2))
print(f'reshape to shape:{x.shape} ->  batch_size*seq_len, char_emb_dim, word_len')
x = x.unsqueeze(1)
x = char_conv(x)
print(f'conv output:{x.shape} : conv_len = word_len - char_channel_width + 1')
x = x.squeeze()
x = F.max_pool1d(x, x.size(2))
print(f'after max pool:{x.shape}')
x = x.view(batch_size,-1, char_channel_size)
print(f'char embedding output shape:{x.shape}')
x_char = x

reshape to shape:torch.Size([128, 8, 14]) ->  batch_size*seq_len, char_emb_dim, word_len
conv output:torch.Size([128, 100, 1, 10]) : conv_len = word_len - char_channel_width + 1
after max pool:torch.Size([128, 100, 1])
char embedding output shape:torch.Size([16, 8, 100])


In [53]:
word_emb = nn.Embedding.from_pretrained(word_field.vocab.vectors, freeze=True)
word_emb.to(gpu)

Embedding(343198, 100)

In [82]:
x = batch.q_word
print(f'word input:{x.shape}')
x = word_emb(x)
print(f'word embedding: {x.shape}')
x_word = x      

word input:torch.Size([16, 8])
word embedding: torch.Size([16, 8, 100])


#### Highway Network

在Highway之前使用一个project layer，以减少hidden size。

因为论文中的baseline将hidden size改为了20，原来是100.但是word vecotr的dim最小是25，所以只能是在embedding之后加一个project layer来降低维度了。

In [87]:
hidden_size = 10
project_layer = nn.Linear(char_channel_size*2, hidden_size*2)
project_layer.to(gpu)

Linear(in_features=200, out_features=20, bias=True)

In [97]:
x = torch.cat([x_char,x_word], dim=-1)
print(f'concat char embedding and word embedding: {x.shape}')
x = project_layer(x)
print(f'reduce dim to {x.shape}')

concat char embedding and word embedding: torch.Size([16, 8, 200])
reduce dim to torch.Size([16, 8, 20])


In [98]:
class Highway(nn.Module):
    
    def __init__(self, input_dim=20, num_layers=2):
        super(Highway, self).__init__()
        self.input_dim = input_dim
        self.layers = nn.ModuleList([nn.Linear(input_dim, input_dim * 2) 
                                     for _ in range(num_layers)])
        for layer in self.layers:
            layer.bias[input_dim:].data.fill_(1)
        
    def forward(self, x):
        current_input = x
        for layer in self.layers:
            project_input = layer(current_input)
            linear_part = current_input
            nonlinear_part, gate = project_input.chunk(2, dim=-1)
            nonlinear_part = F.relu(nonlinear_part)
            gate = torch.sigmoid(gate)
            current_input = gate * linear_part + (1 - gate) * nonlinear_part
        return current_input            
        

In [99]:
highway = Highway(input_dim=20, num_layers=2)
highway.to(gpu)

Highway(
  (layers): ModuleList(
    (0): Linear(in_features=20, out_features=40, bias=True)
    (1): Linear(in_features=20, out_features=40, bias=True)
  )
)

In [100]:
x = highway(x)
print(f'highway did not change the input dims: {x.shape}')

highway did not change the input dims: torch.Size([16, 8, 20])


#### Contextual Embeding Layer

Contextual Embedding Layer 一般用bi-LSTM或者GRU这种变种RNN。

In [101]:
context_LSTM = nn.LSTM(input_size=hidden_size*2, 
                       hidden_size=hidden_size,
                       bidirectional=True,
                       batch_first=True,
                       dropout=0.2
                      )
context_LSTM.to(gpu)

  "num_layers={}".format(dropout, num_layers))


LSTM(20, 10, batch_first=True, dropout=0.2, bidirectional=True)

In [102]:
contextual_output = context_LSTM(x)[0]
print(f'contextual output:{contextual_output.shape}')

contextual output:torch.Size([16, 8, 20])


#### 上述模型层汇总

1. Char Embedding
2. Word Embedding
3. Highway
4. Contextual Embedding

整理Char embedding即可，其他层都比较简单

In [107]:
class CharEmbedding(nn.Module):
    
    def __init__(self, vocab_size, char_dim, conv_out_size, conv_width):
        super(CharEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, char_dim, padding_idx=1)
        self.conv = nn.Conv2d(1, conv_out_size, (char_dim, conv_width))
        self.char_dim = char_dim
        self.conv_out_size = conv_out_size
        
    def forward(self, x):
        batch_size = x.size(0)
        x = self.embedding(x)
        
        x = x.view(-1, self.char_dim, x.size(2)).unsqueeze(1)
        x = self.conv(x).squeeze()
        
        x = F.max_pool1d(x, x.size(2))
        x = x.view(batch_size,-1, self.conv_out_size)
        return x

In [109]:
char_emb_layer = CharEmbedding(char_vocab_size, char_dim, 100, 5)
char_emb_layer.to(gpu)

CharEmbedding(
  (embedding): Embedding(128, 8, padding_idx=1)
  (conv): Conv2d(1, 100, kernel_size=(8, 5), stride=(1, 1))
)

In [112]:
q_char = char_emb_layer(batch.q_char)
print(q_char.shape)

torch.Size([16, 8, 100])


In [115]:
char_emb = CharEmbedding(char_vocab_size, char_dim, 100, 5).to(gpu)
word_emb = nn.Embedding.from_pretrained(word_field.vocab.vectors, freeze=True).to(gpu)
hidden_size = 10
project_layer = nn.Linear(100*2, hidden_size*2).to(gpu)
highway = Highway(input_dim=hidden_size*2, num_layers=2).to(gpu)
context_layer = nn.LSTM(input_size=hidden_size*2, 
                       hidden_size=hidden_size,
                       bidirectional=True,
                       batch_first=True,
                       dropout=0.2
                      ).to(gpu)
def common_layer(x_char, x_word):
    x_char_emb = char_emb(x_char)
    x_word_emb = word_emb(x_word)
    x = torch.cat([x_char_emb,x_word_emb], dim=-1)
    x = project_layer(x)
    x = highway(x)
    x = context_layer(x)[0]
    return x

  "num_layers={}".format(dropout, num_layers))


输出context和question的embdding vector

In [117]:
context_emb = common_layer(batch.c_char, batch.c_word)
question_emb = common_layer(batch.q_char, batch.q_word)
print(f'context output shape:{context_emb.shape}, question :{question_emb.shape}')

context output shape:torch.Size([16, 3553, 20]), question :torch.Size([16, 8, 20])


#### Attention Layer

Attention 有很多种计算相似度矩阵的方法(也叫attention weight)。bidaf用的是linear similarity function

$$
s_{jt} = w^T [h;t;h \odot t]
$$

Attention is All your need 中，用的是dot product 
$$
Attention(Q,K,V)=softmax(\frac {QK^T} {\sqrt{d_k}})V
$$

In [127]:
attnetion_W = nn.Linear(6*d,1,bias=False).to(gpu)


In [136]:
print(f'context_emb: {context_emb.shape}, question_emb: {question_emb.shape}')
c_len = context_emb.size(1)
q_len = question_emb.size(1)
batch_size = context_emb.size(0)
hidden_size = context_emb.size(2)
shape = (batch_size, c_len, q_len, hidden_size)

context_emb_ex = context_emb.unsqueeze(2).expand(shape)
question_emb_ex = question_emb.unsqueeze(1).expand(shape)
print(f'expand to shape: context {context_emb_ex.shape}, question:{question_emb_ex.shape}')

c_mul_q = torch.mul(context_emb_ex, question_emb_ex)
cat_data = torch.cat((context_emb_ex, question_emb_ex, c_mul_q),3)
print(f'cat data shape:{cat_data.shape}')
d = hidden_size // 2

S = attnetion_W(cat_data).squeeze()
print(f'simirity matrix: {S.shape}')

c2q = torch.bmm(F.softmax(S, dim=-1), question_emb)
print(f'context to query: {c2q.shape}')

b = F.softmax(torch.max(S,2)[0], dim=-1)
print(f'attion weights on context: {b.shape}')

q2c = torch.bmm(b.unsqueeze(1), context_emb) # (N, 1, 2d) = bmm( (N, 1, T), (N, T, 2d))
q2c = q2c.repeat(1, c_len, 1)
print(f'question to context :{q2c.shape}')

G = torch.cat((context_emb, c2q, context_emb.mul(c2q), context_emb.mul(q2c)), 2) # (N,T,8d)
print(f'attention layer output : {G.shape}')

context_emb: torch.Size([16, 3553, 20]), question_emb: torch.Size([16, 8, 20])
expand to shape: context torch.Size([16, 3553, 8, 20]), question:torch.Size([16, 3553, 8, 20])
cat data shape:torch.Size([16, 3553, 8, 60])
simirity matrix: torch.Size([16, 3553, 8])
context to query: torch.Size([16, 3553, 20])
attion weights on context: torch.Size([16, 3553])
question to context :torch.Size([16, 3553, 20])
attention layer output : torch.Size([16, 3553, 80])


#### Modeling Layer

In [143]:
modeling_layer = nn.LSTM(input_size=d*8, hidden_size=d, 
                         bidirectional=True, dropout=0.2, batch_first=True,
                        num_layers=2)
modeling_layer.to(gpu)

LSTM(80, 10, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)

In [147]:
M = modeling_layer(G)[0]
print(f'modeling output: {M.shape}')

modeling output: torch.Size([16, 3553, 20])


#### output Lyer

In [151]:
p1_linear = nn.Linear(10*d,1, bias=False).to(gpu)
p2_linear = nn.Linear(10*d,1, bias=False).to(gpu)
p2_lstm = nn.LSTM(2*d,d, bidirectional=True, dropout=0.2, batch_first=True).to(gpu)

G_M = torch.cat((G,M),2)
p1 = F.softmax(p1_linear(G_M).squeeze(), dim=-1)

M2 = p2_lstm(M)[0]
G_M2 = torch.cat((G,M2),2)
p2 = F.softmax(p2_linear(G_M2).squeeze(), dim=-1)
print(f'p1: {p1.shape}, p2:{p2.shape}')

  "num_layers={}".format(dropout, num_layers))


p1: torch.Size([16, 3553]), p2:torch.Size([16, 3553])


#### 预测结果和Ground Truth

In [165]:
print(f'p1 predict: \n{p1.max(1)[1]} \np2 predict: \n{p2.max(1)[1]}')

p1 predict: 
tensor([1887, 1992, 3198, 3223, 2909, 3189, 2681, 1776,  525,  107, 2524,   24,
        2227, 3047,  989,  540], device='cuda:0') 
p2 predict: 
tensor([3552, 3552, 3552, 3552, 3552, 3552, 3552, 1708,  378, 3552, 3552, 3552,
        3552, 3552, 3552, 2583], device='cuda:0')


In [162]:
print(f'ground truth : \nstart:{batch.s_idx} \nend: {batch.e_idx}')

ground truth : 
start:tensor([ 662,  184,    0,  551,  865,    1,   95,  360,  378,   60,    6,  485,
        2510,  684,  413, 1516], device='cuda:0') 
end: tensor([ 662,  184,    1,  552,  866,    3,   96,  362,  379,   60,    7,  486,
        2512,  684,  413, 1516], device='cuda:0')
