In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import time
import math
import random

In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [26]:
device

device(type='cuda')

# 处理数据

In [27]:
# 每一行数据如下
# 'Hi.\t嗨。\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #891077 (Martha)'
with open('/content/drive/MyDrive/test.txt', 'r', encoding='utf-8') as f:
    data = f.read()
data = data.strip()
data = data.split('\n')
print('样本数:\n', len(data))
print('\n样本示例:')
data[0]

样本数:
 28190

样本示例:


'这女优叫啥啊\t这个女优叫啥'

In [28]:
# 分割数据
en_data = [line.split('\t')[0] for line in data]
ch_data = [line.split('\t')[1] for line in data]
print('原句子:\n', en_data[:10])
print('\n参考句子:\n', ch_data[:10])

原句子:
 ['这女优叫啥啊', '春节去哪自驾游', '丰胸用什么产品好', '怎么查通话记录', '妖娆什么意思', '千金归来大结局是什么', '生存战争变态版哪个是煤', '韩潮苏海指的什么生肖啊', '设计姓名个性签名', '银行卡可以别人代办吗']

参考句子:
 ['这个女优叫啥', '关于春节自驾游', '什么产品丰胸好', '怎么查别人通话记录', '妖娆是什么意思', '千金归来的结局是什么没看懂', '变态版生存战争', '韩潮苏海是指什么生肖', '姓名设计个性签名', '银行卡可以代办吗']


In [29]:
# 按字符级切割，并添加<eos>
en_token_list = [[char for char in line]+["<eos>"] for line in en_data]
ch_token_list = [[char for char in line]+["<eos>"] for line in ch_data]
print('原句子:\n', en_token_list[:2])
print('\n参考句子:\n', ch_token_list[:2])

原句子:
 [['这', '女', '优', '叫', '啥', '啊', '<eos>'], ['春', '节', '去', '哪', '自', '驾', '游', '<eos>']]

参考句子:
 [['这', '个', '女', '优', '叫', '啥', '<eos>'], ['关', '于', '春', '节', '自', '驾', '游', '<eos>']]


In [30]:
# 基本字典
basic_dict = {'<pad>':0, '<unk>':1, '<bos>':2, '<eos>':3}
# 生成中文字典
with open('/content/drive/MyDrive/all.txt', 'r', encoding='utf-8') as f:
    vol_data = f.read()
vol_data = vol_data.strip()
vol_data = vol_data.split('\n')


all = vol_data

ch_vocab = set(''.join(all))
ch2id = {char:i+len(basic_dict) for i, char in enumerate(ch_vocab)}
ch2id.update(basic_dict)
id2ch = {v:k for k,v in ch2id.items()}

en2id=ch2id
id2en=id2ch

In [31]:
# 利用字典，映射数据 
en_num_data = [[en2id[en] for en in line ] for line in en_token_list]
ch_num_data = [[ch2id[ch] for ch in line] for line in ch_token_list]

print('char:', en_data[1])
print('index:', en_num_data[1])

char: 春节去哪自驾游
index: [9, 1302, 2092, 1607, 2399, 433, 3016, 3]


# 表示为dataset

In [36]:
class TranslationDataset(Dataset):
    def __init__(self, src_data, trg_data):
        self.src_data = src_data
        self.trg_data = trg_data

        assert len(src_data) == len(trg_data), \
            "numbers of src_data  and trg_data must be equal!"

    def __len__(self):
        return len(self.src_data)

    def __getitem__(self, idx):
        src_sample =self.src_data[idx]
        src_len = len(self.src_data[idx])
        trg_sample = self.trg_data[idx]
        trg_len = len(self.trg_data[idx])
        return {"src": src_sample, "src_len": src_len, "trg": trg_sample, "trg_len": trg_len}

In [37]:
def padding_batch(batch):
    """
    input: -> list of dict
        [{'src': [1, 2, 3], 'trg': [1, 2, 3]}, {'src': [1, 2, 2, 3], 'trg': [1, 2, 2, 3]}]
    output: -> dict of tensor 
        {
            "src": [[1, 2, 3, 0], [1, 2, 2, 3]].T
            "trg": [[1, 2, 3, 0], [1, 2, 2, 3]].T
        }
    """
    src_lens = [d["src_len"] for d in batch]
    trg_lens = [d["trg_len"] for d in batch]
    
    src_max = max([d["src_len"] for d in batch])
    trg_max = max([d["trg_len"] for d in batch])
    for d in batch:
        d["src"].extend([ch2id["<pad>"]]*(src_max-d["src_len"]))
        d["trg"].extend([ch2id["<pad>"]]*(trg_max-d["trg_len"]))
    srcs = torch.tensor([pair["src"] for pair in batch], dtype=torch.long, device=device)
    trgs = torch.tensor([pair["trg"] for pair in batch], dtype=torch.long, device=device)
    
    batch = {"src":srcs.T, "src_len":src_lens, "trg":trgs.T, "trg_len":trg_lens}
    return batch

# 模型

In [38]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout=0.5, bidirectional=True):
        super(Encoder, self).__init__()
        
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.gru = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, bidirectional=bidirectional)
        
    def forward(self, input_seqs, input_lengths, hidden):
        # input_seqs = [seq_len, batch]
        embedded = self.embedding(input_seqs)
        # embedded = [seq_len, batch, embed_dim]
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths, enforce_sorted=False)
        
        outputs, hidden = self.gru(packed, hidden)        
        outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # outputs = [seq_len, batch, hid_dim * n directions]
        # output_lengths = [batch]
        return outputs, hidden
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout=0.5, bidirectional=True):
        super(Decoder, self).__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.gru = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, bidirectional=bidirectional)
        
        if bidirectional:
            self.fc_out = nn.Linear(hid_dim*2, output_dim)
        else:
            self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, token_inputs, hidden):
        # token_inputs = [batch]
        batch_size = token_inputs.size(0)
        embedded = self.dropout(self.embedding(token_inputs).view(1, batch_size, -1))
        # embedded = [1, batch, emb_dim]

        output, hidden = self.gru(embedded, hidden)
        # output = [1, batch,  n_directions * hid_dim]
        # hidden = [n_layers * n_directions, batch, hid_dim]
        
        output = self.fc_out(output.squeeze(0))
        output = self.softmax(output)
        # output = [batch, output_dim]
        return output, hidden
class Seq2Seq(nn.Module):
    def __init__(self, 
                 encoder, 
                 decoder, 
                 device, 
                 predict=False, 
                 basic_dict=None,
                 max_len=100
                 ):
        super(Seq2Seq, self).__init__()
        
        self.device = device

        self.encoder = encoder
        self.decoder = decoder

        self.predict = predict  # 训练阶段还是预测阶段
        self.basic_dict = basic_dict  # decoder的字典，存放特殊token对应的id
        self.max_len = max_len  # 翻译时最大输出长度

        self.enc_n_layers = self.encoder.gru.num_layers
        self.enc_n_directions = 2 if self.encoder.gru.bidirectional else 1
        self.dec_n_directions = 2 if self.decoder.gru.bidirectional else 1

        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        assert self.enc_n_directions >= self.dec_n_directions, \
            "If decoder is bidirectional, encoder must be bidirectional either!"
        
    def forward(self, input_batches, input_lengths, target_batches=None, target_lengths=None, teacher_forcing_ratio=0.5):
        # input_batches = target_batches = [seq_len, batch]
        batch_size = input_batches.size(1)
        
        BOS_token = self.basic_dict["<bos>"]
        EOS_token = self.basic_dict["<eos>"]
        PAD_token = self.basic_dict["<pad>"]

        # 初始化
        encoder_hidden = torch.zeros(self.enc_n_layers*self.enc_n_directions, batch_size, self.encoder.hid_dim, device=self.device)
        
        # encoder_output = [seq_len, batch, hid_dim * n directions]
        # encoder_hidden = [n_layers*n_directions, batch, hid_dim]
        encoder_output, encoder_hidden = self.encoder(
            input_batches, input_lengths, encoder_hidden)

        # 初始化
        decoder_input = torch.tensor([BOS_token] * batch_size, dtype=torch.long, device=self.device)
        if self.enc_n_directions == self.dec_n_directions:
            decoder_hidden = encoder_hidden
        else:
            L = encoder_hidden.size(0)
            decoder_hidden = encoder_hidden[range(0, L, 2)] + encoder_hidden[range(1, L, 2)]

        if self.predict:
            # 预测阶段使用
            # 一次只输入一句话
            assert batch_size == 1, "batch_size of predict phase must be 1!"
            output_tokens = []

            while True:
                decoder_output, decoder_hidden = self.decoder(
                    decoder_input, decoder_hidden
                )
                # [1, 1]
                topv, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(1)  # 上一个预测作为下一个输入
                output_token = topi.squeeze().detach().item()
                if output_token == EOS_token or len(output_tokens) == self.max_len:
                    break
                output_tokens.append(output_token)
            return output_tokens

        else:
            # 训练阶段
            max_target_length = max(target_lengths)
            all_decoder_outputs = torch.zeros((max_target_length, batch_size, self.decoder.output_dim), device=self.device)

            for t in range(max_target_length):
                use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
                if use_teacher_forcing:
                    # decoder_output = [batch, output_dim]
                    # decoder_hidden = [n_layers*n_directions, batch, hid_dim]
                    decoder_output, decoder_hidden = self.decoder(
                        decoder_input, decoder_hidden
                    )
                    all_decoder_outputs[t] = decoder_output
                    decoder_input = target_batches[t]  # 下一个输入来自训练数据
                else:
                    decoder_output, decoder_hidden = self.decoder(
                        decoder_input, decoder_hidden
                    )
                    # [batch, 1]
                    topv, topi = decoder_output.topk(1)
                    all_decoder_outputs[t] = decoder_output
                    decoder_input = topi.squeeze(1)  # 下一个输入来自模型预测
            
            loss_fn = nn.NLLLoss(ignore_index=PAD_token)
            loss = loss_fn(
                all_decoder_outputs.reshape(-1,self.decoder.output_dim ),  # [batch*seq_len, output_dim]
                target_batches.reshape(-1)                                                 # [batch*seq_len]
            )
            return loss

# 训练和预测函数

In [39]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs
def train(
    model,
    data_loader, 
    optimizer, 
    clip=1, 
    teacher_forcing_ratio=0.5, 
    print_every=None  # None不打印
    ):
    model.predict = False
    model.train()

    if print_every == 0:
        print_every = 1

    print_loss_total = 0  # 每次打印都重置
    start = time.time()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):

        # shape = [seq_len, batch]
        input_batchs = batch["src"]
        target_batchs = batch["trg"]
        # list
        input_lens = batch["src_len"]
        target_lens = batch["trg_len"]
        
        optimizer.zero_grad()
        
        loss = model(input_batchs, input_lens, target_batchs, target_lens, teacher_forcing_ratio)
        print_loss_total += loss.item()
        epoch_loss += loss.item()
        loss.backward()

        # 梯度裁剪
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        if print_every and (i+1) % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('\tCurrent Loss: %.4f' % print_loss_avg)

    return epoch_loss / len(data_loader)
def evaluate(
    model,
    data_loader, 
    print_every=None
    ):
    model.predict = False
    model.eval()
    if print_every == 0:
        print_every = 1

    print_loss_total = 0  # 每次打印都重置
    start = time.time()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(data_loader):

            # shape = [seq_len, batch]
            input_batchs = batch["src"]
            target_batchs = batch["trg"]
            # list
            input_lens = batch["src_len"]
            target_lens = batch["trg_len"]

            loss = model(input_batchs, input_lens, target_batchs, target_lens, teacher_forcing_ratio=0)
            print_loss_total += loss.item()
            epoch_loss += loss.item()

            if print_every and (i+1) % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('\tCurrent Loss: %.4f' % print_loss_avg)

    return epoch_loss / len(data_loader)
def translate(
    model,
    sample, 
    idx2token=None
    ):
    model.predict = True
    model.eval()

    # shape = [seq_len, 1]
    input_batch = sample["src"]
    # list
    input_len = sample["src_len"]

    output_tokens = model(input_batch, input_len)
    output_tokens = [idx2token[t] for t in output_tokens]

    return "".join(output_tokens)

# 训练

In [40]:
INPUT_DIM = len(ch2id)
OUTPUT_DIM = len(ch2id)

# 超参数
BATCH_SIZE = 32
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
LEARNING_RATE = 1e-4
#N_EPOCHS = 200
N_EPOCHS = 150
CLIP = 1

bidirectional = True
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT, bidirectional)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT, bidirectional)
model = Seq2Seq(enc, dec, device, basic_dict=basic_dict).to(device)

## encoder和encoder设置相同的学习策略
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# ## encoder和encoder设置不同的学习策略
# optimizer_grouped_parameters = [
#         {'params': [p for n, p in model.named_parameters() if 'encoder' in n], 'lr': LEARNING_RATE},
#         {'params': [p for n, p in model.named_parameters() if 'decoder' in n], 'lr': LEARNING_RATE*2}
# ]
# optimizer = optim.Adam(optimizer_grouped_parameters)
# 数据集
train_set = TranslationDataset(en_num_data, ch_num_data)
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, collate_fn=padding_batch)
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    train_loss = train(model, train_loader, optimizer, CLIP)
    valid_loss = evaluate(model, train_loader)
    end_time = time.time()
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'en2ch-model.pt')

    if epoch %2 == 0:
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f}')

Epoch: 01 | Time: 0m 59s
	Train Loss: 5.320 | Val. Loss: 4.998
Epoch: 03 | Time: 0m 59s
	Train Loss: 3.703 | Val. Loss: 3.860
Epoch: 05 | Time: 0m 59s
	Train Loss: 3.046 | Val. Loss: 3.374
Epoch: 07 | Time: 0m 59s
	Train Loss: 2.633 | Val. Loss: 3.075
Epoch: 09 | Time: 0m 59s
	Train Loss: 2.354 | Val. Loss: 2.865
Epoch: 11 | Time: 0m 59s
	Train Loss: 2.135 | Val. Loss: 2.678
Epoch: 13 | Time: 0m 59s
	Train Loss: 1.947 | Val. Loss: 2.532
Epoch: 15 | Time: 0m 59s
	Train Loss: 1.807 | Val. Loss: 2.393
Epoch: 17 | Time: 0m 59s
	Train Loss: 1.677 | Val. Loss: 2.278
Epoch: 19 | Time: 0m 59s
	Train Loss: 1.553 | Val. Loss: 2.164
Epoch: 21 | Time: 0m 59s
	Train Loss: 1.459 | Val. Loss: 2.038
Epoch: 23 | Time: 0m 59s
	Train Loss: 1.368 | Val. Loss: 1.949
Epoch: 25 | Time: 0m 59s
	Train Loss: 1.283 | Val. Loss: 1.831
Epoch: 27 | Time: 0m 59s
	Train Loss: 1.197 | Val. Loss: 1.762
Epoch: 29 | Time: 0m 59s
	Train Loss: 1.116 | Val. Loss: 1.630
Epoch: 31 | Time: 0m 59s
	Train Loss: 1.048 | Val. Loss

In [41]:
torch.save(model.state_dict(), '/content/drive/MyDrive/en2ch-model.pt')

In [42]:
print("best valid loss：", best_valid_loss)
# 加载最优权重
model.load_state_dict(torch.load('en2ch-model.pt'))

best valid loss： 0.2674387472805384


<All keys matched successfully>

In [43]:
test='你好'
test = [char for char in test]+["<eos>"]
seq=[en2id[en] for en in test]
test_sample = {}
test_sample["src"] = torch.tensor(seq, dtype=torch.long, device=device).reshape(-1, 1)
test_sample["src_len"] = [len(seq)]
print(translate(model, test_sample, id2ch), end="\n\n")

你好看



In [44]:
f=open('/content/drive/MyDrive/测试原句子.txt','r',encoding='utf-8')
test_lst=[]
lines = f.readlines()      #读取全部内容 ，并以列表方式返回
for line in lines:
    line=line.replace('\n','')
    line=line.replace('\t','')
    test_lst.append(line)

In [47]:
result2=[]
count=0
for word in test_lst:
  test = word
  test = [char for char in test]+["<eos>"]
  seq=[en2id[en] for en in test]
  test_sample = {}
  test_sample["src"] = torch.tensor(seq, dtype=torch.long, device=device).reshape(-1, 1)
  test_sample["src_len"] = [len(seq)]
  temp = translate(model, test_sample, id2ch)
  print(word,' ',temp)
  result2.append(word+' '+temp)
  count+=1
  if count % 100 == 0:
    f=open('result2.txt','w',encoding='utf-8')
    for line in result2:
      f.write(line+'\n')

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
父亲给我上的一堂课阅读答案   父亲您的父案的阅读答案
怎么看淘宝年度账单   淘宝年度对账单怎么看
带字女生头像   女生头像头像
性价比较高的平板电脑   平性、平板电脑脑哪个名
成龙最新电影叫什么   成龙的最新电影是什么
奶粉什么牌子比较好呀   什么牌子的奶粉好
天涯明月刀电视剧好看吗？大家   天涯明月天视剧，什么时候看
虎落平阳一()歇后语   麦阳安光的歇后语是那些
手机有什么软件可以赚钱   手机上有什么软件赚钱的软件
怎么才能去掉黑痣   黑怎么去掉黑内
怎么查找同名同姓的人   怎么查找同名同姓的人
脸上的麻子怎么去掉   脸上怎么去掉脸上的肉子
镜子和照片哪个更真实些   照片和镜子，哪个更真实的
迅雷继续下载没资源   迅雷下载资源不足
孙俪和邓超离婚吗   孙和和孙俪离婚了吗
油烟机哪个牌子好   甘油烟哪个牌子的好
能看下红包的使用范围么   可以用红包用包装是什么
我的世界电脑版如何下载   我的世界电脑版下载
亵渎是什么意思   他志是什么意思
补肾该吃什么食物啊   补肾该吃什么食物
如何登陆新浪博客   如何登陆新浪博客
呵呵你一脸什么意思   呵呵你一脸，什么意思
如何查询驾驶证真假   怎么查询驾驶证真假
怎样使大腿变瘦   怎样才能使腿变大
情人节送什么礼物给男朋友   情人节送什么礼物给男朋友呢
我怎么才可以注册谷歌账户   怎么才可以注册谷歌账户
天天飞车怎样开挂   天天飞车怎样开挂
魔兽世界为什么进不去啊   我什么我下魔兽世界进不去
最吝啬的人打一成语   最吝啬的一一夜打一成语
这是怎样的漫画风格呢   这是怎样的漫画
财富值怎么获取   财富值怎么获得啊
全民飞机大战宠物哪个好   全民飞机大战都有哪些好
什么时候天气回暖了   什么时候回气回回了
女士手表有哪些品牌   女士手表哪些品牌好
如何才能删除淘宝帐号   怎样才能删除淘宝帐号
穿越火线为什么老是闪退   为什么我老火线是闪线闪
慎皮肤哪个好看   哪个皮肤好看西
南京除了莱迪还哪卖谜尚   南京国越火迪哪哪刷钻石
中通快递快不快   中通快递快不快的
手机谷歌输入法和搜狗输入法哪个更好   手机输入法切法输入法比较好
麦当劳都有些什么好吃的   麦板劳的都有什么好
