### IMDB 分类模板

In [2]:
# 引入pytorch包
import torch

from torch import nn, optim
from torch.nn import init

import torchtext
from torchtext.vocab import Vectors
from torchtext import data

In [None]:
# 引入其他包
from tqdm import tqdm
import time


In [None]:
# Random-seed layer: fix the PyTorch RNGs so that runs are repeatable.
SEED = 12345

torch.manual_seed(SEED)
# Seed every visible GPU rather than only the current device; this call is
# silently ignored when CUDA is not available.
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and switch off its auto-tuner, which
# may otherwise select different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Constants / parameters layer: constants and tunable parameters are
# written in upper case.

# Use the first CUDA device when one is available, otherwise the CPU.
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [1]:
# Data layer: declare the fields, load the IMDB TSV splits, build the
# vocabularies and create the batch iterators. A few statistics are printed
# as a sanity check that the dataset loaded correctly.

SENTENCE = data.Field(sequential=True, lower=True, include_lengths=True, use_vocab=True, batch_first=False)
LABEL = data.LabelField(sequential=False, use_vocab=True)

trainDataset, valDataset = data.TabularDataset.splits(
    path='.',
    train='IMDBTrain.tsv',
    validation='IMDBTest.tsv',
    format='tsv',
    skip_header=True,
    fields=[('sentence', SENTENCE), ('label', LABEL)],
)

# torchtext calls unk_init on a 1-D tensor (one embedding vector), which the
# Xavier initialisers cannot handle (they need at least 2 dimensions), and
# `init.xavier_normal` is deprecated anyway. Use an in-place normal init.
# NOTE(review): build_vocab appears to forward unk_init only when `vectors`
# is given by name, so it is also set on the Vectors object itself - confirm
# against the installed torchtext version.
vectors = Vectors(name='glove.6B.100d.txt', cache='../../glove', unk_init=torch.Tensor.normal_)
SENTENCE.build_vocab(trainDataset, vectors=vectors, unk_init=torch.Tensor.normal_)
LABEL.build_vocab(trainDataset)

print("SENTENCE.vocab.freqs.most_common(10)\n", SENTENCE.vocab.freqs.most_common(10))
print("SENTENCE.vocab.vectors.shape\n", SENTENCE.vocab.vectors.shape)


# sort_within_batch=True orders each batch by decreasing length.
# NOTE(review): presumably the model packs the padded sequences using the
# lengths produced by include_lengths=True, which needs this ordering -
# confirm against the model definition.
trainIter = data.BucketIterator(
    trainDataset,
    batch_size=16,
    sort_key=lambda x: len(x.sentence),
    sort_within_batch=True,
    shuffle=True,
    device=DEVICE,
)
valIter = data.BucketIterator(
    valDataset,
    batch_size=32,
    sort_key=lambda x: len(x.sentence),
    sort_within_batch=True,
    shuffle=True,
    device=DEVICE,
)
# NOTE: the iterators yield torchtext's custom `torchtext.data.Batch` type,
# which makes the code harder to reuse and makes torchtext awkward to combine
# with other libraries in some use cases.

In [1]:
# Model layer: define the model here.

class IMDBModel(nn.Module):
    """Placeholder model for the IMDB template.

    Layers should be created in ``__init__`` and used in ``forward``.
    """

    def __init__(self):
        # nn.Module has to be initialised first; without this call the
        # instance cannot register parameters or sub-modules, and calls such
        # as .to() or .parameters() fail.
        super().__init__()

    def forward(self, inputs):
        """Placeholder forward pass; returns None until it is implemented."""
        pass
    
# Build the template model and place it on the selected device.
imdbModel = IMDBModel().to(DEVICE)

# Attach extra data to the model here, e.g. pretrained embedding weights.

NameError: name 'nn' is not defined

In [None]:
# Hyper-parameters handed to the RNN2 constructor below.
# NOTE(review): RNN2 is not defined in this part of the notebook; the meaning
# of each positional argument should be checked against its definition.
INPUT_DIM = len(SENTENCE.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 50
OUTPUT_DIM = 2
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.2

# Vocabulary indices of the padding and unknown-word tokens.
PAD_IDX = SENTENCE.vocab.stoi[SENTENCE.pad_token]
UNK_IDX = SENTENCE.vocab.stoi[SENTENCE.unk_token]

model = RNN2(
    INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
    N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX,
).to(DEVICE)

# Overwrite the embedding table with the vectors attached to the vocabulary.
pretrained_embeddings = SENTENCE.vocab.vectors
embedding_weights = model.embedding.weight.data
embedding_weights.copy_(pretrained_embeddings)

# Zero the rows belonging to the unknown and padding tokens.
embedding_weights[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
embedding_weights[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(embedding_weights)

In [3]:
# Optimisation layer: set up the optimiser and the loss function.

# Optimise the parameters of `model`, the network the training loop trains.
# The previous code called the non-existent `imdbModel.parameter()` on the
# placeholder model, which defines no trainable parameters.
optimizer = optim.Adam(model.parameters())

criterion = nn.CrossEntropyLoss()

In [None]:
# Helper-function layer: training, validation and timing helpers.
def train(model, iterator, optimizer, criterion, skip=500):
    """Run one optimisation pass over ``iterator``.

    Every batch must expose ``batch.sentence`` (its items 0 and 1 are passed
    to ``model``) and ``batch.label`` (the targets given to ``criterion``).

    Args:
        model: module called as ``model(sentence[0], sentence[1])``.
        iterator: sized iterable of batches.
        optimizer: optimiser stepped once per batch.
        criterion: loss taking ``(predictions, labels)``.
        skip: the batch loss and accuracy are printed every ``skip`` batches.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``, each averaged over the number
        of batches.
    """
    running_loss, running_acc = 0, 0
    model.train()

    for step, batch in enumerate(iterator):
        optimizer.zero_grad()

        features = batch.sentence
        targets = batch.label
        logits = model(features[0], features[1])
        batch_loss = criterion(logits, targets)

        batch_loss.backward()
        optimizer.step()

        # Predicted class = index of the largest score along dim 1.
        predicted = torch.max(logits.detach(), dim=1)[1]
        batch_acc = (predicted == targets.detach()).double().mean()

        loss_value, acc_value = batch_loss.item(), batch_acc.item()
        running_loss += loss_value
        running_acc += acc_value

        if step % skip == 0:
            print(" Train Mini batch loss ", loss_value)
            print(" Train Mini batch acc  ", acc_value)

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

def evaluate(model, iterator, criterion, skip=500):
    """Measure loss and accuracy over ``iterator`` without updating weights.

    The model is put in evaluation mode and gradients are disabled for the
    whole pass. Batches have the same layout as in training:
    ``batch.sentence`` items 0 and 1 are passed to ``model`` and
    ``batch.label`` holds the targets.

    Args:
        model: module called as ``model(sentence[0], sentence[1])``.
        iterator: sized iterable of batches.
        criterion: loss taking ``(predictions, labels)``.
        skip: the batch loss and accuracy are printed every ``skip`` batches.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``, each averaged over the number
        of batches.
    """
    running_loss, running_acc = 0, 0
    model.eval()

    with torch.no_grad():
        for step, batch in enumerate(iterator):
            features = batch.sentence
            targets = batch.label
            logits = model(features[0], features[1])

            loss_value = criterion(logits, targets).item()
            running_loss += loss_value

            # Predicted class = index of the largest score along dim 1.
            predicted = torch.max(logits, dim=1)[1]
            acc_value = (predicted == targets).double().mean().item()
            running_acc += acc_value

            if step % skip == 0:
                print("Valid Mini batch loss ", loss_value)
                print("Valid Mini batch acc  ", acc_value)

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches


def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into minutes and seconds.

    Args:
        start_time: start timestamp in seconds.
        end_time: end timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    return whole_minutes, int(duration - 60 * whole_minutes)

In [4]:
# Execution layer: run training and validation for N_EPOCHS epochs and keep
# a copy of the weights from the epoch with the best validation accuracy.
# TODO: the TensorBoard logging mentioned by the template is not implemented.
N_EPOCHS = 10
best_valid_acc = 0.0

import copy
best_model_state_dict = copy.deepcopy(model.state_dict())
best_epoch = 0

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, trainIter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valIter, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_acc > best_valid_acc:
        # deepcopy so that later optimiser steps cannot modify the snapshot.
        best_model_state_dict = copy.deepcopy(model.state_dict())
        best_epoch = epoch
        # Record the new best score; previously this assigned the variable to
        # itself, so the threshold never moved from its initial value.
        best_valid_acc = valid_acc

    print(f'Epoch: {epoch+1:2} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\n\tTrain Loss: {train_loss:.3f} ')
    print(f'\tValid Loss: {valid_loss:.3f} \tValid Acc: {valid_acc:.3f} \n')

In [5]:
# 记录层，记录一系列数据
xxx模型
参数：
optim  lr  max_length   final_acc
SGD   1e-3  128         

效果：验证集最佳为91%