In [None]:
#动手练习：Word2vec提取相似文本

In [1]:
import os
import sys
import time
import math
import random
import numpy as np
import collections
import torch
from torch import nn
import torch.utils.data as Data

# Read the corpus and build the vocabulary used by the rest of this cell.
# NOTE(review): no explicit encoding is passed to open(), so the platform
# default is used — confirm the encoding of HarryPotter.txt.
with open('HarryPotter.txt', 'r') as f:
    lines = f.readlines() # sentences in this dataset are separated by newlines
    raw_dataset = [st.split() for st in lines] # "st" is short for sentence; words are split on whitespace

counter = collections.Counter([tk for st in raw_dataset for tk in st]) # "tk" is short for token
counter = dict(filter(lambda x: x[1] >= 5, counter.items())) # keep only words that occur at least 5 times in the dataset

# Index <-> token lookups over the filtered vocabulary.
idx_to_token = [tk for tk, _ in counter.items()]
token_to_idx = {tk: idx for idx, tk in enumerate(idx_to_token)}
dataset = [[token_to_idx[tk] for tk in st if tk in token_to_idx]
           for st in raw_dataset] # words of raw_dataset are converted to their idx here; words outside the vocabulary are dropped
# Total number of in-vocabulary tokens.
num_tokens = sum([len(st) for st in dataset])

# Subsampling. Very frequent words usually carry little meaning, and by the
# formula below the more frequent a word is, the more likely it is dropped.
# Strictly speaking this is down-weighting: very frequent words are not
# removed entirely, but their influence on training is reduced.
def discard(idx):
    """Randomly decide whether one occurrence of token ``idx`` is dropped.

    Returns True with probability ``1 - sqrt(1e-4 / count * num_tokens)``,
    where ``count`` is the corpus count of the token (read from the
    module-level ``counter`` / ``idx_to_token`` / ``num_tokens``).
    """
    count = counter[idx_to_token[idx]]
    drop_threshold = 1 - math.sqrt(1e-4 / count * num_tokens)
    return random.uniform(0, 1) < drop_threshold

# Apply subsampling: keep each token occurrence unless discard() drops it.
subsampled_dataset = [[tk for tk in st if not discard(tk)] for st in dataset]

def get_centers_and_contexts(dataset, max_window_size):
    """Extract (center word, context words) pairs for skip-gram training.

    Args:
        dataset: list of sentences, each a list of token indices.
        max_window_size: upper bound of the context radius; the radius is
            drawn uniformly from ``[1, max_window_size]`` for every center.

    Returns:
        ``(centers, contexts)`` where ``contexts[i]`` is the list of tokens
        surrounding ``centers[i]``. Sentences shorter than two tokens are
        skipped because they cannot form a pair.
    """
    centers, contexts = [], []
    for sentence in dataset:
        n = len(sentence)
        if n < 2:
            continue
        centers.extend(sentence)
        for pos in range(n):
            radius = random.randint(1, max_window_size)
            lo = max(0, pos - radius)
            hi = min(n, pos + 1 + radius)
            # Every position in the window except the center itself.
            contexts.append([sentence[j] for j in range(lo, hi) if j != pos])
    return centers, contexts

# Build the training pairs with a context radius of at most 5.
all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)

# Negative sampling: an approximation that speeds up training.
def get_negatives(all_contexts, sampling_weights, K):
    """Draw ``K`` noise words per context word for every training example.

    Args:
        all_contexts: list of context-word lists (token indices).
        sampling_weights: unnormalised sampling weight of every vocabulary
            index; index ``i`` is drawn proportionally to
            ``sampling_weights[i]``.
        K: number of negatives per context word.

    Returns:
        A list parallel to ``all_contexts``; entry ``i`` holds
        ``len(all_contexts[i]) * K`` indices, none of which appears in
        ``all_contexts[i]``.
    """
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        # Build the membership set once per example instead of once per
        # candidate draw.
        context_set = set(contexts)
        wanted = len(contexts) * K
        negatives = []
        while len(negatives) < wanted:
            if i == len(neg_candidates):
                # Draw candidates in bulk (1e5 at a time) and consume them
                # one by one; refill when the buffer is exhausted.
                i, neg_candidates = 0, random.choices(
                    population, sampling_weights, k=int(1e5))
            neg, i = neg_candidates[i], i + 1
            # A noise word must not be one of the true context words.
            if neg not in context_set:
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives

# Noise distribution: word count raised to the power 0.75, in vocabulary order.
sampling_weights = [counter[w]**0.75 for w in idx_to_token]
# Draw 5 negatives per context word.
all_negatives = get_negatives(all_contexts, sampling_weights, 5)

class MyDataset(torch.utils.data.Dataset):
    """Dataset of parallel (center, contexts, negatives) triples."""

    def __init__(self, centers, contexts, negatives):
        # The three sequences are indexed in lockstep, so they must align.
        assert len(centers) == len(contexts) == len(negatives)
        self.centers, self.contexts, self.negatives = centers, contexts, negatives

    def __len__(self):
        """Number of training examples."""
        return len(self.centers)

    def __getitem__(self, index):
        """Return the ``index``-th ``(center, contexts, negatives)`` tuple."""
        center = self.centers[index]
        context = self.contexts[index]
        negative = self.negatives[index]
        return (center, context, negative)
    
def batchify(data):
    """Collate a list of (center, context, negative) examples into tensors.

    Context and negative words of each example are concatenated and
    right-padded with 0 to the longest concatenation in the batch.

    Args:
        data: non-empty list of ``(center, context, negative)`` tuples, where
            ``context`` and ``negative`` are lists of token indices.

    Returns:
        ``(centers, contexts_negatives, masks, labels)``:
        centers is ``(batch, 1)``; the other three are ``(batch, max_len)``.
        ``masks`` is 1 on real entries and 0 on padding; ``labels`` is 1 on
        context words and 0 on negatives and padding.
    """
    max_len = max(len(c) + len(n) for _, c, n in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center, context, negative in data:
        cur_len = len(context) + len(negative)
        pad = max_len - cur_len
        centers.append(center)
        contexts_negatives.append(context + negative + [0] * pad)
        masks.append([1] * cur_len + [0] * pad)
        labels.append([1] * len(context) + [0] * (max_len - len(context)))
    # Build the tensors once, after all rows are collected (the original
    # rebuilt every tensor on each loop iteration).
    return (torch.tensor(centers).view(-1, 1), torch.tensor(contexts_negatives),
            torch.tensor(masks), torch.tensor(labels))

batch_size = 256
# DataLoader rejects a negative worker count with a ValueError, so use a
# valid positive count off Windows and in-process loading (0) on Windows.
num_workers = 0 if sys.platform.startswith('win32') else 4

dataset = MyDataset(all_centers, all_contexts, all_negatives)
data_iter = Data.DataLoader(dataset, batch_size, shuffle=True,
                            collate_fn=batchify,
                            num_workers=num_workers)
# Sanity check: print the tensor shapes of the first mini-batch only.
for batch in data_iter:
    for name, data in zip(['centers', 'contexts_negatives', 'masks', 'labels'], batch):
        print(name, 'shape:', data.shape)
    break

# Binary cross-entropy loss on logits, with optional masking.
class SigmoidBinaryCrossEntropyLoss(nn.Module):
    """Per-example masked binary cross-entropy computed from logits."""

    def __init__(self):
        super(SigmoidBinaryCrossEntropyLoss, self).__init__()

    def forward(self, inputs, targets, mask=None):
        """Return the mean loss over the unmasked entries of each row.

        Args:
            inputs: logits of shape ``(batch, len)``.
            targets: 0/1 labels with the same shape as ``inputs``.
            mask: optional 0/1 tensor with the same shape; entries with 0
                contribute nothing. When omitted every entry counts (the
                original crashed on the ``None`` default).

        Returns:
            Tensor of shape ``(batch,)``.
        """
        inputs, targets = inputs.float(), targets.float()
        mask = torch.ones_like(inputs) if mask is None else mask.float()
        res = nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, reduction="none", weight=mask)
        # Normalise by the number of real (unmasked) entries in each row.
        return res.sum(dim=1) / mask.sum(dim=1)

# Loss instance used by the word2vec training loop below.
loss = SigmoidBinaryCrossEntropyLoss()

def sigmd(x):
    """Return ``-log(sigmoid(x))`` for a scalar ``x``.

    Computed as ``max(-x, 0) + log1p(exp(-|x|))``, which is algebraically
    the same as ``-log(1 / (1 + exp(-x)))`` but never evaluates ``exp`` on a
    large positive argument, so it does not overflow for very negative x.
    """
    return max(-x, 0.0) + math.log1p(math.exp(-abs(x)))

embed_size = 200
# Two embedding tables over the same vocabulary. The training loop passes
# net[0] as the center-word embedding and net[1] as the context-word one.
net = nn.Sequential(nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size),
                    nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size))

# Skip-gram forward pass.
def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    """Score every context/negative word against its center word.

    Args:
        center: index tensor of shape ``(batch, 1)``.
        contexts_and_negatives: index tensor of shape ``(batch, len)``.
        embed_v: embedding layer applied to ``center``.
        embed_u: embedding layer applied to ``contexts_and_negatives``.

    Returns:
        Tensor of shape ``(batch, 1, len)`` holding the inner product of the
        center vector with each context/negative vector.
    """
    center_vecs = embed_v(center)
    other_vecs = embed_u(contexts_and_negatives)
    # (batch, 1, dim) x (batch, dim, len) -> (batch, 1, len)
    return torch.bmm(center_vecs, other_vecs.transpose(1, 2))

def train(net, lr, num_epochs):
    """Train the skip-gram embeddings with Adam.

    Iterates over the module-level ``data_iter`` for ``num_epochs`` epochs,
    using ``net[0]`` as the center-word embedding and ``net[1]`` as the
    context-word embedding, and prints the mean batch loss and elapsed time
    after every epoch.

    Args:
        net: container whose first two entries are the embedding layers.
        lr: Adam learning rate.
        num_epochs: number of passes over ``data_iter``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("train on", device)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    for epoch in range(num_epochs):
        epoch_start = time.time()
        loss_total, num_batches = 0.0, 0
        for batch in data_iter:
            center, context_negative, mask, label = [t.to(device) for t in batch]

            scores = skip_gram(center, context_negative, net[0], net[1])
            # Scores come out as (batch, 1, len); flatten to the label shape.
            batch_loss = loss(scores.view(label.shape), label, mask).mean()

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            loss_total += batch_loss.cpu().item()
            num_batches += 1

        elapsed = time.time() - epoch_start
        print('epoch %d, loss %.2f, time %.2fs'
              % (epoch + 1, loss_total / num_batches, elapsed))

# Train for 5 epochs with learning rate 0.01.
train(net, 0.01, 5)

# Test the model.
def get_similar_tokens(query_token, k, embed):
    """Print the ``k`` tokens most cosine-similar to ``query_token``.

    Args:
        query_token: token looked up in the module-level ``token_to_idx``.
        k: number of neighbours to print.
        embed: embedding layer whose weight matrix supplies the vectors.
    """
    weights = embed.weight.data
    query = weights[token_to_idx[query_token]]

    # 1e-9 keeps the denominator non-zero.
    sq_norms = torch.sum(weights * weights, dim=1) * torch.sum(query * query)
    cos = torch.matmul(weights, query) / (sq_norms + 1e-9).sqrt()

    # Ask for k + 1 hits and skip the first one.
    nearest = torch.topk(cos, k=k+1)[1].cpu().numpy()
    for idx in nearest[1:]:
        print('余弦相似度 = %.3f: %s' % (cos[idx], (idx_to_token[idx])))
        
# Print the 5 nearest neighbours of 'Dursley' in the center-word embedding (net[0]).
get_similar_tokens('Dursley', 5, net[0])

centers shape: torch.Size([256, 1])
contexts_negatives shape: torch.Size([256, 60])
masks shape: torch.Size([256, 60])
labels shape: torch.Size([256, 60])
train on cpu
epoch 1, loss 5.17, time 30.68s
epoch 2, loss 3.15, time 31.78s
epoch 3, loss 1.92, time 30.77s
epoch 4, loss 1.21, time 29.52s
epoch 5, loss 0.82, time 29.39s
余弦相似度 = 0.256: turban.
余弦相似度 = 0.245: thick,
余弦相似度 = 0.238: crate
余弦相似度 = 0.236: again,
余弦相似度 = 0.223: station


In [2]:
#动手练习：Seq2Seq实现机器翻译

In [3]:
import os
import sys
import math
from collections import Counter
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import nltk

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def load_data(in_file):
    """Load a tab-separated parallel corpus.

    Each non-blank line holds an English sentence, a tab, and a Chinese
    sentence. English is lower-cased and tokenised with
    ``nltk.word_tokenize``; Chinese is split into single characters. Both
    sides are wrapped in ``'BOS'`` / ``'EOS'`` markers.

    Args:
        in_file: path of a UTF-8 text file.

    Returns:
        ``(en, cn)``: two parallel lists of token lists.
    """
    en, cn = [], []
    with open(in_file, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no sentence
                # pair; previously they raised IndexError.
                continue
            fields = line.split('\t')

            en.append(['BOS'] + nltk.word_tokenize(fields[0].lower()) + ['EOS'])
            cn.append(['BOS'] + [c for c in fields[1]] + ['EOS'])

    return en, cn

# Tokenised training and development corpora.
train_file = './data/train.txt'
dev_file = './data/test.txt'
train_en, train_cn = load_data(train_file)
dev_en, dev_cn = load_data(dev_file)

# Reserved vocabulary indices.
UNK_IDX = 0
PAD_IDX = 1


def build_dict(sentences, max_words=50000):
    """Build a word-to-index vocabulary from tokenised sentences.

    Args:
        sentences: iterable of token lists.
        max_words: keep at most this many of the most frequent words.

    Returns:
        ``(word_dict, total_words)``: the mapping, in which kept words are
        numbered from 2 in descending frequency order and ``'UNK'`` /
        ``'PAD'`` map to ``UNK_IDX`` / ``PAD_IDX``, and the vocabulary size
        (kept words plus the two reserved entries).
    """
    frequencies = Counter(word for sentence in sentences for word in sentence)
    kept = frequencies.most_common(max_words)

    word_dict = {}
    # Indices 0 and 1 are reserved, so real words start at 2.
    for position, (word, _) in enumerate(kept, start=2):
        word_dict[word] = position
    word_dict['UNK'] = UNK_IDX
    word_dict['PAD'] = PAD_IDX

    return word_dict, len(kept) + 2

# Vocabularies are built from the training split only.
en_dict, en_total_words = build_dict(train_en)
cn_dict, cn_total_words = build_dict(train_cn)
# Reverse lookups (index -> word) for printing translations.
inv_en_dict = {v: k for k, v in en_dict.items()}
inv_cn_dict = {v: k for k, v in cn_dict.items()}

def encode(en_sentences, cn_sentences, en_dict, cn_dict, sort_by_len=True):
    """Convert tokenised sentence pairs to index sequences.

    Args:
        en_sentences: list of English token lists.
        cn_sentences: list of Chinese token lists, parallel to
            ``en_sentences``.
        en_dict: English word-to-index mapping.
        cn_dict: Chinese word-to-index mapping.
        sort_by_len: when true, reorder both sides by ascending length of
            the English sentence (stable for equal lengths).

    Returns:
        ``(out_en_sentences, out_cn_sentences)`` as lists of index lists.
        Words missing from a dictionary are encoded as 0.
    """
    out_en_sentences = [[en_dict.get(w, 0) for w in sent] for sent in en_sentences]
    out_cn_sentences = [[cn_dict.get(w, 0) for w in sent] for sent in cn_sentences]

    # Sort in ascending order of English length; the Chinese side follows
    # the same permutation so the pairs stay aligned.
    if sort_by_len:
        order = sorted(range(len(out_en_sentences)),
                       key=lambda i: len(out_en_sentences[i]))
        out_en_sentences = [out_en_sentences[i] for i in order]
        out_cn_sentences = [out_cn_sentences[i] for i in order]

    return out_en_sentences, out_cn_sentences

# Replace the token lists with index lists, sorted by English sentence length.
train_en, train_cn = encode(train_en, train_cn, en_dict, cn_dict)
dev_en, dev_cn = encode(dev_en, dev_cn, en_dict, cn_dict)

def get_minibatches(n, minibatch_size, shuffle=True):
    """Split ``range(n)`` into consecutive index chunks.

    Args:
        n: number of examples.
        minibatch_size: maximum chunk length; the last chunk may be shorter.
        shuffle: when true, the order of the chunks is shuffled with
            ``np.random.shuffle`` (indices inside a chunk stay consecutive).

    Returns:
        List of 1-D index arrays.
    """
    starts = np.arange(0, n, minibatch_size)
    if shuffle:
        np.random.shuffle(starts)
    return [np.arange(start, min(start + minibatch_size, n)) for start in starts]

def prepare_data(seqs):
    """Right-pad index sequences with zeros into one int32 matrix.

    Args:
        seqs: non-empty list of index sequences.

    Returns:
        ``(x, x_lengths)``: an int32 array of shape
        ``(len(seqs), longest sequence)`` and an int32 array holding the
        original length of every sequence.
    """
    lengths = [len(seq) for seq in seqs]
    padded = np.zeros((len(seqs), np.max(lengths))).astype('int32')
    for row, (seq, seq_len) in enumerate(zip(seqs, lengths)):
        padded[row, :seq_len] = seq
    return padded, np.array(lengths).astype('int32')

def gen_examples(en_sentences, cn_sentences, batch_size):
    """Group parallel sentences into padded mini-batches.

    Args:
        en_sentences: list of English index sequences.
        cn_sentences: list of Chinese index sequences, parallel to
            ``en_sentences``.
        batch_size: maximum number of sentence pairs per batch.

    Returns:
        List of ``(mb_x, mb_x_len, mb_y, mb_y_len)`` tuples as produced by
        ``prepare_data`` for the English and Chinese side of each batch.
    """
    examples = []
    for indices in get_minibatches(len(en_sentences), batch_size):
        en_batch, en_lengths = prepare_data([en_sentences[i] for i in indices])
        cn_batch, cn_lengths = prepare_data([cn_sentences[i] for i in indices])
        examples.append((en_batch, en_lengths, cn_batch, cn_lengths))
    return examples

# Pre-batch both splits into padded mini-batches of up to 64 sentence pairs.
batch_size = 64
train_data = gen_examples(train_en, train_cn, batch_size)
dev_data = gen_examples(dev_en, dev_cn, batch_size)

# Loss function.
class LanguageModelCriterion(nn.Module):
    """Masked negative log-likelihood averaged over unmasked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, input, target, mask):
        """Compute the masked mean negative log-likelihood.

        Args:
            input: log-probabilities of shape ``(batch, len, vocab)``.
            target: gold indices of shape ``(batch, len)``.
            mask: weights of shape ``(batch, len)``; 0 marks padding.

        Returns:
            Scalar tensor: summed masked loss divided by the mask total.
        """
        vocab_size = input.size(2)
        flat_logp = input.contiguous().view(-1, vocab_size)
        flat_target = target.contiguous().view(-1, 1)
        flat_mask = mask.contiguous().view(-1, 1)

        # Pick the log-probability of the gold token at every position.
        token_nll = -flat_logp.gather(1, flat_target)
        masked_nll = token_nll * flat_mask
        return masked_nll.sum() / flat_mask.sum()

class PlainEncoder(nn.Module):
    """Embedding + single-layer GRU encoder for variable-length batches."""

    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lengths):
        """Encode a padded batch.

        Args:
            x: index tensor of shape ``(batch, max_len)``.
            lengths: true length of every row, shape ``(batch,)``.

        Returns:
            ``(out, hidden)``: GRU outputs of shape
            ``(batch, max_len, hidden)`` and the last layer's final hidden
            state of shape ``(1, batch, hidden)``, both in the original row
            order.
        """
        # Packing is done on rows ordered longest-first.
        lens_desc, order = lengths.sort(0, descending=True)
        order = order.long()
        embedded = self.dropout(self.embed(x[order]))

        packed_in = nn.utils.rnn.pack_padded_sequence(
            embedded, lens_desc.long().cpu().data.numpy(), batch_first=True)
        packed_out, hidden = self.rnn(packed_in)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)

        # Invert the permutation to restore the caller's row order.
        restore = order.sort(0, descending=False)[1].long()
        out = out[restore].contiguous()
        hidden = hidden[:, restore].contiguous()

        return out, hidden[[-1]]

# Shape check: run an encoder on the first training batch.
p = PlainEncoder(en_total_words, 100)

mb_x = torch.from_numpy(train_data[0][0]).long()
mb_x_len = torch.from_numpy(train_data[0][1]).long()
print("数据集:", mb_x.shape, mb_x_len.shape)

o, h = p(mb_x, mb_x_len)

print(o.shape, h.shape)
# Element-wise comparison of the output at the last time step with the
# returned hidden state.
print(o[:, -1].shape, '\n', o[:, -1] == h)

class PlainDecoder(nn.Module):
    """Embedding + single-layer GRU decoder with a log-softmax output."""

    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, y, y_lengths, hid):
        """Decode a padded batch of target prefixes.

        Args:
            y: index tensor of shape ``(batch, max_len)``.
            y_lengths: true length of every row, shape ``(batch,)``.
            hid: initial hidden state of shape ``(1, batch, hidden)``.

        Returns:
            ``(output, hid)``: log-probabilities of shape
            ``(batch, max_len, vocab)`` and the final hidden state, both in
            the original row order.
        """
        # Reorder rows (and the matching hidden states) longest-first.
        lens_desc, order = y_lengths.sort(0, descending=True)
        order = order.long()
        hid = hid[:, order]
        embedded = self.dropout(self.embed(y[order]))

        packed_in = nn.utils.rnn.pack_padded_sequence(
            embedded, lens_desc.long().cpu().data.numpy(), batch_first=True)
        packed_out, hid = self.rnn(packed_in, hid)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)

        # Undo the sort so outputs line up with the input rows.
        restore = order.sort(0, descending=False)[1].long()
        output_seq = unpacked[restore].contiguous()
        hid = hid[:, restore].contiguous()

        return F.log_softmax(self.fc(output_seq), -1), hid
        
class PlainSeq2Seq(nn.Module):
    """Encoder-decoder wrapper without attention."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x, x_lengths, y, y_lengths):
        """Encode ``x`` and decode ``y`` from the encoder's hidden state.

        Returns:
            ``(output, None)``; the second slot is always ``None``.
        """
        _, hid = self.encoder(x, x_lengths)
        output, _ = self.decoder(y, y_lengths, hid)
        return output, None

    def translate(self, x, x_lengths, y, max_length=10):
        """Greedily decode ``max_length`` tokens.

        Args:
            x: source index tensor of shape ``(batch, src_len)``.
            x_lengths: source lengths, shape ``(batch,)``.
            y: first decoder input of shape ``(batch, 1)``.
            max_length: number of decoding steps.

        Returns:
            ``(preds, None)`` where ``preds`` has shape
            ``(batch, max_length)``.
        """
        _, hid = self.encoder(x, x_lengths)
        batch_size = x.shape[0]
        # Every decoding step feeds exactly one token per row.
        step_lengths = torch.ones(batch_size).long().to(device)

        preds = []
        for _ in range(max_length):
            output, hid = self.decoder(y=y, y_lengths=step_lengths, hid=hid)
            # Most probable token becomes the next decoder input.
            y = output.max(2)[1].view(batch_size, 1)
            preds.append(y)

        return torch.cat(preds, 1), None

dropout = 0.2
hidden_size = 100
# Bound to `encoder` rather than `encode`, so the module does not overwrite
# the `encode()` preprocessing function defined earlier in this file.
encoder = PlainEncoder(vocab_size=en_total_words, hidden_size=hidden_size, dropout=dropout)
decoder = PlainDecoder(vocab_size=cn_total_words, hidden_size=hidden_size, dropout=dropout)

model = PlainSeq2Seq(encoder, decoder)
model = model.to(device)

# Loss and optimizer (Adam with its default settings) for the training loop.
loss_fn = LanguageModelCriterion().to(device)
optimizer = torch.optim.Adam(model.parameters())

def train(model, data, num_epochs=20):
    """Train the seq2seq model and save its weights to ``model.pt``.

    Uses the module-level ``device``, ``loss_fn``, ``optimizer`` and
    ``dev_data``. Every epoch prints the word-weighted mean training loss;
    every fifth epoch (0, 5, 10, ...) the model is also evaluated on
    ``dev_data``.

    Args:
        model: module called as ``model(x, x_len, y_in, y_len)`` and
            returning ``(log_probs, attn)``.
        data: iterable of ``(mb_x, mb_x_len, mb_y, mb_y_len)`` numpy batches.
        num_epochs: number of passes over ``data``.
    """
    for epoch in range(num_epochs):
        model.train()
        total_num_words = total_loss = 0.
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()

            # Decoder input drops the last column of the target; the expected
            # output drops the first column (shifted by one position).
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()

            # Lengths shrink by one because of the shift; clamp to at least 1.
            mb_y_len = torch.from_numpy(mb_y_len - 1).to(device).long()
            mb_y_len[mb_y_len <= 0] = 1

            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)

            # 1.0 for positions inside each target's length, 0.0 for padding.
            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()

            loss = loss_fn(mb_pred, mb_output, mb_out_mask)

            # Accumulate a word-weighted loss for the epoch average.
            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words

            # Update the model; gradients are clipped to a norm of 5.
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)
            optimizer.step()

            if it % 100 == 0:
                print("迭代次数: ", epoch, '迭代', it, 'loss:', loss.item())

        print("迭代次数", epoch, "训练损失", total_loss / total_num_words)

        if epoch % 5 == 0:
            evaluate(model, dev_data)

    # Saved once, after the final epoch.
    torch.save(model.state_dict(), 'model.pt')

def evaluate(model, data):
    """Print the word-weighted mean loss of ``model`` on ``data``.

    Puts the model in eval mode and runs without gradient tracking. Uses the
    module-level ``device`` and ``loss_fn``.

    Args:
        model: module called as ``model(x, x_len, y_in, y_len)`` and
            returning ``(log_probs, attn)``.
        data: iterable of ``(mb_x, mb_x_len, mb_y, mb_y_len)`` numpy batches.
    """
    model.eval()
    loss_sum = word_count = 0.

    with torch.no_grad():
        for src, src_len, tgt, tgt_len in data:
            src = torch.from_numpy(src).to(device).long()
            src_len = torch.from_numpy(src_len).to(device).long()
            # Decoder input / expected output are the target shifted by one.
            dec_in = torch.from_numpy(tgt[:, :-1]).to(device).long()
            dec_out = torch.from_numpy(tgt[:, 1:]).to(device).long()
            dec_len = torch.from_numpy(tgt_len-1).to(device).long()
            dec_len[dec_len<=0] = 1

            pred, _ = model(src, src_len, dec_in, dec_len)

            # 1.0 inside each target's length, 0.0 on padding.
            positions = torch.arange(dec_len.max().item(), device=device)
            out_mask = (positions[None, :] < dec_len[:, None]).float()

            batch_loss = loss_fn(pred, dec_out, out_mask)
            batch_words = torch.sum(dec_len).item()
            loss_sum += batch_loss.item() * batch_words
            word_count += batch_words

    print("损失评估", loss_sum / word_count)

# Train the seq2seq model for 10 epochs.
train(model, train_data, num_epochs=10)

def translate_dev(i):
    """Print dev example ``i``: source, reference and model translation.

    Uses the module-level ``model``, ``device``, ``dev_en``, ``dev_cn`` and
    the vocabulary dictionaries. The printed translation stops before the
    first ``"EOS"`` token.

    Args:
        i: index into ``dev_en`` / ``dev_cn``.
    """
    en_sent = " ".join([inv_en_dict[w] for w in dev_en[i]])
    print(en_sent)
    cn_sent = " ".join([inv_cn_dict[w] for w in dev_cn[i]])
    print(cn_sent)

    mb_x = torch.from_numpy(np.array(dev_en[i]).reshape(1, -1)).long().to(device)
    mb_x_len = torch.from_numpy(np.array([len(dev_en[i])])).long().to(device)
    bos = torch.Tensor([[cn_dict["BOS"]]]).long().to(device)

    # Decode in eval mode (dropout off) and without autograd; the model is
    # otherwise left in whatever mode the last training epoch set. The
    # previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            translation, attn = model.translate(mb_x, mb_x_len, bos)
    finally:
        model.train(was_training)

    words = [inv_cn_dict[idx] for idx in translation.data.cpu().numpy().reshape(-1)]
    trans = []
    for word in words:
        if word == "EOS":
            break
        trans.append(word)
    print("".join(trans))

# Load the trained model weights and translate dev examples 1 to 4.
model.load_state_dict(torch.load('model.pt', map_location=device))
for i in range(1, 5):
    translate_dev(i)
    print()

数据集: torch.Size([64, 9]) torch.Size([64])
torch.Size([64, 9, 100]) torch.Size([1, 64, 100])
torch.Size([64, 100]) 
 tensor([[[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]]])
迭代次数:  0 迭代 0 loss: 7.9240288734436035
迭代次数:  0 迭代 100 loss: 4.954286098480225
迭代次数:  0 迭代 200 loss: 4.346655368804932
迭代次数:  0 迭代 300 loss: 4.635125637054443
迭代次数:  0 迭代 400 loss: 4.609460830688477
迭代次数:  0 迭代 500 loss: 4.263648986816406
迭代次数 0 训练损失 4.731006864201709
损失评估 3.1907880306243896
迭代次数:  1 迭代 0 loss: 3.8907952308654785
迭代次数:  1 迭代 100 loss: 3.7558422088623047
迭代次数:  1 迭代 200 loss: 3.2824087142944336
迭代次数:  1 迭代 300 loss: 3.850149393081665
迭代次数:  1 迭代 400 loss: 3.8532021045684814
迭代次数:  1 迭代 500 loss: 3.7996721267700195
迭代次数 1 训练损失 3.

In [4]:
#动手练习：Attention模型实现文本自动分类

In [5]:
import math
import time
import numpy as np
import torch
import torch.nn.functional as F
import torchtext

# Hyper-parameters and a fixed seed for the classifier.
BATCH_SIZE = 128
LEARNING_RATE = 1e-3
EMBEDDING_DIM = 100
torch.manual_seed(99)

# Text is lower-cased and split on whitespace; labels are float-typed.
TEXT = torchtext.legacy.data.Field(tokenize=lambda x: x.split(), lower=True)
LABEL = torchtext.legacy.data.LabelField(dtype=torch.float)

def get_dataset(corpur_path, text_field, label_field):
    """Parse a corpus file into torchtext examples.

    The file is read line by line. Consecutive non-empty lines are collected
    into one record; an empty line (or end of file) closes the record. For
    each record the label is the single character at index 10 of its first
    line and the text is its second line with the first 6 and the last 7
    characters removed.
    NOTE(review): these offsets presumably strip fixed markup around the
    label and the text — confirm against the corpus format.

    Args:
        corpur_path: path of the corpus file.
        text_field: field used for the ``'text'`` column.
        label_field: field used for the ``'label'`` column.

    Returns:
        ``(examples, fields)``: the parsed examples and the field list they
        were built with.
    """
    fields = [('text', text_field), ('label', label_field)]
    examples = []
    with open(corpur_path) as f:
        li = []  # lines of the record currently being collected
        while True:
            content = f.readline().replace('\n', '')
            if not content:
                # Empty string: either a blank separator line or end of file.
                if not li:
                    # Nothing pending, so stop reading.
                    break
                label = li[0][10]
                text = li[1][6:-7]
                examples.append(torchtext.legacy.data.Example.fromlist([text, label], fields))
                li = []
            else:
                li.append(content)

    return examples, fields

train_examples, train_fields = get_dataset("corpurs/trains.txt", TEXT, LABEL)
dev_examples, dev_fields = get_dataset("corpurs/dev.txt", TEXT, LABEL)
test_examples, test_fields = get_dataset("corpurs/tests.txt", TEXT, LABEL)

# Build the datasets.
train_data = torchtext.legacy.data.Dataset(train_examples, train_fields)
dev_data = torchtext.legacy.data.Dataset(dev_examples, dev_fields)
test_data = torchtext.legacy.data.Dataset(test_examples, test_fields)

print('len of train data:', len(train_data))
print('len of dev data:', len(dev_data))
print('len of test data:', len(test_data))

# Build the vocabulary from the training split (at most 5000 words) with
# GloVe 100-d vectors attached.
TEXT.build_vocab(train_data, max_size=5000, vectors='glove.6B.100d')
LABEL.build_vocab(train_data)
print(len(TEXT.vocab))

# Create the batch iterators for the three splits.
train_iterator, dev_iterator, test_iterator = torchtext.legacy.data.BucketIterator.splits(
    (train_data, dev_data, test_data),
    batch_size=BATCH_SIZE,
    sort = False)

len of train data: 1000
len of dev data: 200
len of test data: 300
3287


In [6]:
class BiLSTM_Attention(torch.nn.Module):
    """Bidirectional LSTM with additive attention pooling and one logit."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        feature_dim = hidden_dim * 2  # forward + backward directions

        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.rnn = torch.nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                                 bidirectional=True, dropout=0.5)
        self.fc = torch.nn.Linear(feature_dim, 1)
        self.dropout = torch.nn.Dropout(0.5)

        # Attention parameters, initialised uniformly in [-0.1, 0.1].
        self.w_omega = torch.nn.Parameter(torch.Tensor(feature_dim, feature_dim))
        self.u_omega = torch.nn.Parameter(torch.Tensor(feature_dim, 1))
        torch.nn.init.uniform_(self.w_omega, -0.1, 0.1)
        torch.nn.init.uniform_(self.u_omega, -0.1, 0.1)

    def attention_net(self, x):
        """Pool ``x`` over dim 1 with learned attention weights.

        Args:
            x: tensor of shape ``(batch, seq_len, 2 * hidden_dim)``.

        Returns:
            Tensor of shape ``(batch, 2 * hidden_dim)``.
        """
        projected = torch.tanh(torch.matmul(x, self.w_omega))
        raw_scores = torch.matmul(projected, self.u_omega)
        # Normalise the scores across the sequence positions.
        weights = F.softmax(raw_scores, dim=1)
        return torch.sum(x * weights, dim=1)

    def forward(self, x):
        """Return one logit per example.

        Args:
            x: index tensor of shape ``(seq_len, batch)``.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        embedded = self.dropout(self.embedding(x))
        output, (final_hidden_state, final_cell_state) = self.rnn(embedded)
        # (seq_len, batch, feat) -> (batch, seq_len, feat) for attention.
        batch_first = output.permute(1, 0, 2)
        return self.fc(self.attention_net(batch_first))

In [7]:
# Two-layer BiLSTM-attention classifier with 64 hidden units per direction.
rnn = BiLSTM_Attention(len(TEXT.vocab), EMBEDDING_DIM, hidden_dim=64, n_layers=2)

# Initialise the embedding layer from the vocabulary's pretrained vectors.
pretrained_embedding = TEXT.vocab.vectors
print('pretrained_embedding:', pretrained_embedding.shape)
rnn.embedding.weight.data.copy_(pretrained_embedding)
print('embedding layer inited.')

optimizer = torch.optim.Adam(rnn.parameters(), lr=LEARNING_RATE)
criteon = torch.nn.BCEWithLogitsLoss()

pretrained_embedding: torch.Size([3287, 100])
embedding layer inited.


In [8]:
# Compute accuracy.
def binary_acc(preds, y):
    """Return the fraction of logits whose rounded sigmoid equals ``y``.

    Args:
        preds: logits of shape ``(batch,)``.
        y: 0/1 labels of shape ``(batch,)``.

    Returns:
        0-d tensor with the accuracy in ``[0, 1]``.
    """
    preds = torch.round(torch.sigmoid(preds))
    correct = torch.eq(preds, y).float()
    acc = correct.sum() / len(correct)
    return acc


# Train the model.
def train(rnn, iterator, optimizer, criteon):
    """Run one training epoch.

    Args:
        rnn: model mapping ``batch.text`` to logits of shape ``(batch, 1)``.
        iterator: iterable of batches exposing ``.text`` and ``.label``.
        optimizer: optimizer over the model's parameters.
        criteon: loss taking ``(logits, labels)``.

    Returns:
        ``(avg_loss, avg_acc)``: unweighted means over the batches.
    """
    avg_loss = []
    avg_acc = []
    rnn.train()

    for batch in iterator:
        # Drop only the trailing logit dimension. A bare squeeze() would
        # also drop the batch dimension for a batch of one, which the loss
        # then rejects as a shape mismatch with the labels.
        pred = rnn(batch.text).squeeze(1)
        loss = criteon(pred, batch.label)
        acc = binary_acc(pred, batch.label).item()
        avg_loss.append(loss.item())
        avg_acc.append(acc)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_acc = np.array(avg_acc).mean()
    avg_loss = np.array(avg_loss).mean()
    return avg_loss, avg_acc


# Evaluation function.
def evaluate(rnn, iterator, criteon):
    """Compute mean loss and accuracy without updating the model.

    Args:
        rnn: model mapping ``batch.text`` to logits of shape ``(batch, 1)``.
        iterator: iterable of batches exposing ``.text`` and ``.label``.
        criteon: loss taking ``(logits, labels)``.

    Returns:
        ``(avg_loss, avg_acc)``: unweighted means over the batches.
    """
    avg_loss = []
    avg_acc = []
    rnn.eval()

    with torch.no_grad():
        for batch in iterator:
            # Same shape handling as in train(): keep the batch dimension.
            pred = rnn(batch.text).squeeze(1)
            loss = criteon(pred, batch.label)
            acc = binary_acc(pred, batch.label).item()
            avg_loss.append(loss.item())
            avg_acc.append(acc)

    avg_loss = np.array(avg_loss).mean()
    avg_acc = np.array(avg_acc).mean()
    return avg_loss, avg_acc

# Train the model and print its performance after every epoch.
best_valid_acc = float('-inf')

for epoch in range(30):
    start_time = time.time()
    train_loss, train_acc = train(rnn, train_iterator, optimizer, criteon)
    dev_loss, dev_acc = evaluate(rnn, dev_iterator, criteon)
    end_time = time.time()
    epoch_mins, epoch_secs = divmod(end_time - start_time, 60)

    # Checkpoint whenever the validation accuracy improves.
    if dev_acc > best_valid_acc:
        best_valid_acc = dev_acc
        torch.save(rnn.state_dict(), 'wordavg-model.pt')

    print(f'迭代次数: {epoch+1:02} | 迭代时间: {epoch_mins}m {epoch_secs:.2f}s')
    print(f'\t训练集损失: {train_loss:.3f} | 训练集准确率: {train_acc*100:.2f}%')
    print(f'\t验证集损失: {dev_loss:.3f} | 验证集准确率: {dev_acc*100:.2f}%')

# Reload the saved (best validation accuracy) parameters and score the test set.
rnn.load_state_dict(torch.load("wordavg-model.pt"))
test_loss, test_acc = evaluate(rnn, test_iterator, criteon)
print(f'测试集损失: {test_loss:.3f} |  测试集准确率: {test_acc*100:.2f}%')

迭代次数: 01 | 迭代时间: 0.0m 2.14s
	训练集损失: 0.694 | 训练集准确率: 49.21%
	验证集损失: 0.691 | 验证集准确率: 53.73%
迭代次数: 02 | 迭代时间: 0.0m 2.22s
	训练集损失: 0.690 | 训练集准确率: 53.41%
	验证集损失: 0.683 | 验证集准确率: 57.29%
迭代次数: 03 | 迭代时间: 0.0m 2.28s
	训练集损失: 0.686 | 训练集准确率: 55.20%
	验证集损失: 0.671 | 验证集准确率: 60.33%
迭代次数: 04 | 迭代时间: 0.0m 2.24s
	训练集损失: 0.669 | 训练集准确率: 61.37%
	验证集损失: 0.641 | 验证集准确率: 61.72%
迭代次数: 05 | 迭代时间: 0.0m 2.21s
	训练集损失: 0.638 | 训练集准确率: 63.38%
	验证集损失: 0.620 | 验证集准确率: 66.75%
迭代次数: 06 | 迭代时间: 0.0m 2.15s
	训练集损失: 0.601 | 训练集准确率: 66.89%
	验证集损失: 0.610 | 验证集准确率: 70.62%
迭代次数: 07 | 迭代时间: 0.0m 2.37s
	训练集损失: 0.546 | 训练集准确率: 72.25%
	验证集损失: 0.611 | 验证集准确率: 68.36%
迭代次数: 08 | 迭代时间: 0.0m 2.13s
	训练集损失: 0.517 | 训练集准确率: 73.53%
	验证集损失: 0.592 | 验证集准确率: 72.57%
迭代次数: 09 | 迭代时间: 0.0m 2.25s
	训练集损失: 0.484 | 训练集准确率: 76.01%
	验证集损失: 0.584 | 验证集准确率: 73.74%
迭代次数: 10 | 迭代时间: 0.0m 2.23s
	训练集损失: 0.428 | 训练集准确率: 80.66%
	验证集损失: 0.639 | 验证集准确率: 71.57%
迭代次数: 11 | 迭代时间: 0.0m 2.17s
	训练集损失: 0.412 | 训练集准确率: 80.26%
	验证集损失: 0.571 | 验证集准确率: 73.35%
迭代次数: 12 |