In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
print(torch.cuda.is_available())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

True


In [26]:
with open("cmn.txt", encoding='utf8') as f:
    lines = f.readlines()

sep = ".!?,。？！，"
lines = ["".join([char if char not in sep else " " + char + " " for char in line]) for line in lines]
print(lines[0])
lines = [line.split("\t")[:2] for line in lines]
lines = [(line_pair[0].split(), line_pair[1].split()) for line_pair in lines]

source_sequence = []
target_sequence = []
for line in lines:
    source_sequence.append(line[0])
    target = []
    for seq in line[1]:
        if len(seq) == 1:
            target.append(seq)
        else:
            target.extend([char for char in seq])
    target_sequence.append(target)
print(source_sequence[10], target_sequence[10])

source_sequence = [sequence + ["<eos>"] for sequence in source_sequence]
target_sequence = [["<bos>"] + sequence + ["<eos>"] for sequence in target_sequence]

def sequence_length(sequences):
    length_dict = {}
    for sequence in sequences:
        if length_dict.get(len(sequence), -1) == -1:
            length_dict[len(sequence)] = 1
        else:
            length_dict[len(sequence)] += 1
    keys = [k for k in length_dict]
    keys.sort()
    for k in keys:
        print(k, ":", length_dict[k])

sequence_length(source_sequence)
sequence_length(target_sequence)

def padding_sequence(padding_size, sequence):
    if len(sequence) > padding_size:
        return sequence[:padding_size]
    elif len(sequence) < padding_size:
        for i in range(len(sequence), padding_size):
            sequence.append("<pad>")
    return sequence
    
source_sequence = [padding_sequence(20, sequence) for sequence in source_sequence]
target_sequence = [padding_sequence(30, sequence) for sequence in target_sequence]

source_count_dict = {}
target_count_dict = {}

for sequence in source_sequence:
    for word in sequence:
        if source_count_dict.get(word, -1) == -1:
            source_count_dict[word] = 0
        else:
            source_count_dict[word] += 1
for sequence in target_sequence:
    for word in sequence:
        if target_count_dict.get(word, -1) == -1:
            target_count_dict[word] = 0
        else:
            target_count_dict[word] += 1

target_words = ["<unknown>", "<pad>", "<eos>", "<bos>"]
source_words = ["<unknown>", "<pad>", "<eos>", "<bos>"]
target_dict = {"<unknown>":0, "<pad>":1, "<eos>":2, "<bos>":3}
source_dict = {"<unknown>":0, "<pad>":1, "<eos>":2, "<bos>":3}

print(target_dict["<bos>"], target_dict["<eos>"])
for k in source_count_dict:
    if source_count_dict[k] < 2 or source_dict.get(k, -1) != -1:
        continue
    source_dict[k] = len(source_words)
    source_words.append(k)
for k in target_count_dict:
    if target_count_dict[k] < 2 or target_dict.get(k, -1) != -1:
        continue
    target_dict[k] = len(target_words)
    target_words.append(k)

print(target_dict["<bos>"], target_dict["<eos>"])
print(len(source_dict), len(target_dict))

Hi . 	嗨 。 	CC-BY 2 . 0 (France) Attribution: tatoeba . org #538123 (CM) & #891077 (Martha)

['Oh', 'no', '!'] ['不', '会', '吧', '。']
3 : 29
4 : 568
5 : 1741
6 : 3871
7 : 4945
8 : 5708
9 : 5250
10 : 2175
11 : 1376
12 : 847
13 : 596
14 : 329
15 : 216
16 : 140
17 : 69
18 : 53
19 : 30
20 : 15
21 : 11
22 : 5
23 : 5
24 : 3
26 : 1
30 : 1
31 : 1
35 : 1
4 : 9
5 : 117
6 : 426
7 : 1172
8 : 2258
9 : 3271
10 : 3828
11 : 3791
12 : 3516
13 : 2716
14 : 2002
15 : 1540
16 : 966
17 : 671
18 : 502
19 : 328
20 : 264
21 : 180
22 : 127
23 : 80
24 : 60
25 : 38
26 : 21
27 : 29
28 : 20
29 : 12
30 : 9
31 : 6
32 : 5
33 : 5
34 : 7
36 : 2
38 : 2
39 : 1
40 : 2
42 : 1
46 : 2
3 2
3 2
3629 2643


In [28]:
class SeqDataset(Dataset):
    def __init__(self, source_lines, target_lines, source_words, target_words, source_dict, target_dict) -> None:
        super().__init__()
        self.source_lines = source_lines
        self.target_lines = target_lines
        self.source_words = source_words
        self.target_words = target_words
        self.source_dict = source_dict
        self.target_dict = target_dict
    def __getitem__(self, index):
        line = self.source_lines[index]
        source_index = [self.source_dict.get(word, 0) for word in line]
        line = self.target_lines[index]
        target_index = [self.target_dict.get(word, 0)  for word in line]
        # here we use tensor, so we can get [batch_size, sequence_length] tensor. or we will get wierd things.
        return torch.tensor(source_index).cuda(), torch.tensor(target_index).cuda()
    def __len__(self):
        return len(self.source_lines)
seq_data_set = SeqDataset(source_lines=source_sequence,
                          target_lines=target_sequence,
                          source_words=source_words,
                          target_words=target_words,
                          source_dict=source_dict,
                          target_dict=target_dict)
print(seq_data_set[0])
train_data_loader = DataLoader(dataset=seq_data_set, batch_size=2, shuffle=True)
for iter in train_data_loader:
    # print(iter[0].shape, iter[1].shape)
    print(iter)
    break


(tensor([4, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       device='cuda:0'), tensor([3, 4, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1], device='cuda:0'))
[tensor([[  11,  238,  217,  918,  164,  400,   79,    5,    2,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1],
        [3448,  696, 3568,    0,  466, 1565, 3107,  466, 3438,    5,    2,    1,
            1,    1,    1,    1,    1,    1,    1,    1]], device='cuda:0'), tensor([[   3,   19,   23,  386,  555,   36,  207,   78,  672,  466,    5,    2,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1],
        [   3,  989,  312, 2227, 1189,   10, 1938, 1470,   68, 1121,   22, 2596,
         1772,  786,   10,  998, 1569,    5,    2,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1]], device='cuda:0')]


In [23]:
class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, x):
        raise NotImplementedError

class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
    def init_state(self, state):
        raise NotImplementedError
    def forward(self, x, state):
        raise NotImplementedError

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder_ = encoder
        self.decoder_ = decoder
    def forward(self, x, y):
        encoder_output = self.encoder_(x)
        state = self.decoder_.init_state(encoder_output)
        decoder_output = self.decoder_(y, state)
        return decoder_output

In [33]:
class GRUEncoder(Encoder):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.hidden_dim = embedding_dim
        self.num_layers = 2
        self.word2vec = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(input_size=embedding_dim, hidden_size=self.hidden_dim, num_layers=self.num_layers, bias=True, dropout=0.5)
    def forward(self, x):
        hidden_state = torch.zeros((self.num_layers, x.shape[1], self.hidden_dim)).cuda()
        embedding = self.word2vec(x)
        output, hidden = self.gru(embedding, hidden_state)
        return hidden

class GRUDecoder(Decoder):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.hidden_dim = embedding_dim 
        self.num_layers = 2
        self.word2vec = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(input_size=embedding_dim + self.hidden_dim, hidden_size=self.hidden_dim, num_layers=self.num_layers, bias=True, dropout=0.5)
        self.linear = nn.Linear(self.hidden_dim, vocab_size)    
    def init_state(self, state):
        return state
    def forward(self, y, state):
        embedding = self.word2vec(y)
        input = torch.cat((embedding, state[-1].repeat([y.shape[0], 1, 1])), 2)
        output, hidden = self.gru(input, state)
        one_hot = self.linear(output)
        return one_hot

In [42]:
def get_sequence(index):
    seq = []
    for i in index:
        seq.append(target_words[i])
    return seq
        
def train(data_loader, seq2seq, epoch, cross_loss):
    optimizer = torch.optim.Adam(seq2seq.parameters(), lr=0.1)
    count = 0
    for _ in range(epoch):
        for data in data_loader:
            count += 1
            x, y_label = data
            # x, y shape=[batch, sequence length], here we permute to make their shape=[sequence length, batch]
            x = x.permute(1, 0)
            y_label = y_label.permute(1, 0)
            # y_inference shape=[sequence length, batch, vocab_size]
            y_inference = seq2seq(x, y_label[:-1, :])
            # change shape for cross entropy
            # y shape = [batch, sequence length]
            # y inference shape = [batch, vocab size, sequence_length]
            y_label = y_label.permute(1, 0)
            y_inference = y_inference.permute(1, 2, 0)
            mask = (y_label != target_dict["<pad>"]).float()
            arg_max = y_inference.argmax(dim=1)
            # print("argmax", arg_max.shape, arg_max)
            loss = torch.mul(cross_loss(y_inference[:, :, :], y_label[:, 1:]), mask[:, 1:])
            # print("loss", loss, "y_label", y_label)
            loss = loss.sum()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if count % 100 == 0:
                print("Y label: ", y_label.shape, y_label[0])
                print("y inference", y_inference.shape, arg_max[0], y_inference[0])
                print("y label seq: ", get_sequence(y_label[0]))
                print("y inference seq: ", get_sequence(arg_max[0]))
                print("x embedding 0", seq2seq.encoder_.word2vec(torch.tensor(0).cuda()))
                print("y embedding 0", seq2seq.decoder_.word2vec(torch.tensor(0).cuda()))
                print("loss ", loss)

seq_data_set = SeqDataset(source_lines=source_sequence,
                          target_lines=target_sequence,
                          source_words=source_words,
                          target_words=target_words,
                          source_dict=source_dict,
                          target_dict=target_dict)
train_data_loader = DataLoader(dataset=seq_data_set, batch_size=32, shuffle=True)
seq2seq = Seq2Seq(GRUEncoder(len(source_words), 32), GRUDecoder(len(target_words), 32))
seq2seq.to(device)
cross_loss = nn.CrossEntropyLoss(reduction="none")
train(train_data_loader, seq2seq, 10, cross_loss)

Y label:  torch.Size([32, 30]) tensor([  3,  19, 683, 684, 212, 503,   5,   2,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1], device='cuda:0')
y inference torch.Size([32, 2643, 29]) tensor([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5], device='cuda:0') tensor([[ 0.7792,  1.1420,  1.2677,  ...,  1.5943,  1.5274,  1.5312],
        [-2.1227, -2.7920, -3.0048,  ..., -3.2786, -3.2270, -3.1910],
        [ 2.7881,  3.7700,  4.0474,  ...,  4.4602,  4.3835,  4.3593],
        ...,
        [-1.8133, -2.3629, -2.5257,  ..., -2.5716, -2.7218, -2.7439],
        [-1.7221, -1.9065, -1.9314,  ..., -1.4448, -1.5925, -1.5959],
        [-2.0462, -2.2365, -2.3822,  ..., -2.1730, -2.3098, -2.3208]],
       device='cuda:0', grad_fn=<SelectBackward>)
y label seq:  ['<bos>', '我', '准', '备', '晚', '餐', '。', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'

In [None]:
seq_data_set = SeqDataset(source_lines=source_sequence,
                          target_lines=target_sequence,
                          source_words=source_words,
                          target_words=target_words,
                          source_dict=source_dict,
                          target_dict=target_dict)
train_data_loader = DataLoader(dataset=seq_data_set, batch_size=32, shuffle=True)

seq2seq = Seq2Seq(GRUEncoder(len(source_words), 96), GRUDecoder(len(target_words), 96))

In [None]:
embedding = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
class EmbeddingDataset(Dataset):
    def __init__(self) -> None:
        super().__init__()
    def __getitem__(self, index):
        return embedding[index]
    def __len__(self):
        return len(embedding)

data_loader = DataLoader(dataset=EmbeddingDataset(), batch_size=2)
for data in data_loader:
    print(data)

In [None]:
import math
-math.log(math.exp(0) / (math.exp(1) * 2 + math.exp(0)))

In [None]:
def sequence_mask(X, valid_len, value=0):
    """在序列中屏蔽不相关的项"""
    maxlen = X.size(1)
    mask = torch.arange((maxlen), dtype=torch.float32,
                        device=X.device)[None, :] < valid_len[:, None]
    X[~mask] = value
    return X
class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    """带遮蔽的softmax交叉熵损失函数"""
    # pred的形状：(batch_size,num_steps,vocab_size)
    # label的形状：(batch_size,num_steps)
    # valid_len的形状：(batch_size,)
    def forward(self, pred, label, valid_len):
        weights = torch.ones_like(label)
        weights = sequence_mask(weights, valid_len)
        self.reduction='none'
        unweighted_loss = super(MaskedSoftmaxCELoss, self).forward(
            pred.permute(0, 2, 1), label)
        print("unweighted", unweighted_loss)
        weighted_loss = (unweighted_loss * weights).mean(dim=1)
        return weighted_loss
loss = MaskedSoftmaxCELoss()
print(loss(torch.ones(3, 4, 10), torch.ones((3, 4), dtype=torch.long),
     torch.tensor([4, 2, 0])))
print(nn.CrossEntropyLoss(reduction="none")(torch.ones(3, 10, 4), torch.ones((3, 4), dtype=torch.long)))