In [49]:
import time
import warnings

import torch
import torchdata
import torchtext
from torch import nn
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
from torchtext.data.functional import to_map_style_dataset  # map-style dataset conversion
from torchtext.data.utils import get_tokenizer        # tokenizer factory
from torchtext.vocab import build_vocab_from_iterator # vocabulary builder

warnings.filterwarnings('ignore')

In [17]:
# Load the IMDB train and test splits, using ./data as the dataset root directory.
train_iter, test_iter = torchtext.datasets.IMDB(
    split=('train', 'test'),
    root="./data",
)

In [46]:
# Collect the distinct labels present in the test split; their count sizes the classifier output.
unique_labels = {label for label, _ in test_iter}
num_class = len(unique_labels)

In [19]:
# Tokenization
tokenizer = get_tokenizer("basic_english")  # initialize the tokenizer

In [20]:
# Vocabulary construction helper:
# a generator that yields the tokenized text of every sample.
def yield_tokens(data):
    """Lazily tokenize the text field of each ``(label, text)`` pair.

    Args:
        data: Iterable of 2-tuples. The first element is ignored and the
            second is passed to the module-level ``tokenizer``.

    Yields:
        The result of ``tokenizer(text)`` for each pair, in input order.
    """
    yield from (tokenizer(text) for _, text in data)


In [21]:
# Build the vocabulary from the tokenized training split.
# '<pad>' and '<unk>' are registered as special tokens; min_freq=5 sets the
# minimum frequency a token needs to be included.
vocab = build_vocab_from_iterator(
    yield_tokens(train_iter),
    min_freq=5,
    specials=['<pad>', '<unk>'],
)

In [23]:
# Tokens missing from the vocabulary resolve to the '<unk>' index.
unk_index = vocab['<unk>']
vocab.set_default_index(unk_index)

In [25]:
# Convert both iterable splits into map-style datasets for use with DataLoader.
train_dataset, test_dataset = (
    to_map_style_dataset(split_iter) for split_iter in (train_iter, test_iter)
)

In [32]:
def text_pipline(x):
    """Convert a raw string into a list of vocabulary indices."""
    return vocab(tokenizer(x))


def label_pipline(label):
    """Map a 1-based dataset label to a zero-based class index.

    torchtext documents IMDB labels as 1 and 2, while ``nn.CrossEntropyLoss``
    expects class indices starting at 0.
    """
    return int(label) - 1


# Batch collation
def collate_batch(batch):
    """Collate ``(label, text)`` samples into flat tensors for ``nn.EmbeddingBag``.

    Args:
        batch: Sequence of ``(label, text)`` pairs, where ``label`` is a
            1-based integer class label and ``text`` is a raw string.

    Returns:
        tuple: ``(labels, tokens, offsets)`` where
            * ``labels`` is an int64 tensor of zero-based class indices,
            * ``tokens`` is a 1-D int64 tensor with the token ids of every
              sample concatenated,
            * ``offsets`` holds the start position of each sample in ``tokens``.
    """
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        token_ids = torch.tensor(text_pipline(_text), dtype=torch.int64)
        label_list.append(label_pipline(_label))
        text_list.append(token_ids)
        lengths.append(token_ids.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.cat(text_list)
    # Start offsets are the cumulative lengths of all preceding samples.
    offsets = torch.tensor([0] + lengths[:-1]).cumsum(dim=0)
    return label_list, text_list, offsets

In [33]:
# Mini-batch size shared by both loaders.
BATCHSIZE = 64

# Both loaders use shuffle=True and build batches with collate_batch.
train_dataloader = DataLoader(
    dataset=train_dataset,
    collate_fn=collate_batch,
    shuffle=True,
    batch_size=BATCHSIZE,
)

test_dataloader = DataLoader(
    dataset=test_dataset,
    collate_fn=collate_batch,
    shuffle=True,
    batch_size=BATCHSIZE,
)

In [40]:
# for label,text,offset in train_dataloader:
#     print(len(text))

In [48]:
# Model hyperparameters.
embedding_dim = 100  # size of each word-embedding vector
vocab_size = len(vocab)  # number of entries in the vocabulary

class TextClassificationModel(nn.Module):
    """Text classifier made of an ``nn.EmbeddingBag`` followed by a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Width of each embedding vector and input size of the linear layer.
        num_class: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        # The embedding table is created with sparse=True.
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Fill embedding and linear weights from U(-0.5, 0.5) and zero the linear bias."""
        bound = 0.5
        for weight in (self.embedding.weight, self.fc.weight):
            weight.data.uniform_(-bound, bound)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        """Return logits for the bags described by ``text`` and ``offsets``."""
        return self.fc(self.embedding(text, offsets))

In [61]:
# Training setup: model, loss, optimizer and learning-rate schedule.
model = TextClassificationModel(vocab_size, embedding_dim, num_class)
loss_fn = nn.CrossEntropyLoss()  # loss function for classification
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.1)
# StepLR with step_size=20 and gamma=0.1.
exp_lr = lr_scheduler.StepLR(optimizer=optimizer, gamma=0.1, step_size=20)

In [63]:
def train(dataloader):
    """Run one training epoch over ``dataloader`` and return its averaged metrics.

    Uses the module-level ``model``, ``loss_fn`` and ``optimizer``.

    Args:
        dataloader: Iterable yielding ``(label, text, offsets)`` batches.

    Returns:
        tuple: ``(mean_loss, accuracy)`` computed over every sample seen
        during the epoch.
    """
    total_acc, total_count, total_loss = 0, 0, 0
    model.train()
    for label, text, offsets in dataloader:
        predicted_label = model(text, offsets)
        loss = loss_fn(predicted_label, label)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Bookkeeping
        with torch.no_grad():
            batch_size = label.size(0)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += batch_size
            # Weight the batch loss by batch size so the epoch figure is a per-sample average
            # (assumes loss_fn returns a per-batch mean).
            total_loss += loss.item() * batch_size
    # Return only after the whole dataloader has been consumed; returning inside
    # the loop would end the epoch after the first batch.
    return total_loss / total_count, total_acc / total_count
    
def test(dataloader):
    model.eval()
    total_acc,total_count,total_loss = 0,0,0
    with torch.no_grad():
        for idx,(label,text,offsets) in enumerate(dataloader):
            predicted_label = model(text,offsets)
            loss = loss_fn(predicted_label,label)
            total_loss += loss.item() * label.size(0)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_loss/total_count,total_acc/total_count

In [59]:
def fit(epochs,train_dl,test_dl):
    """Train and evaluate for ``epochs`` epochs, printing metrics after each one.

    Calls the module-level ``train`` and ``test`` functions and steps the
    module-level ``exp_lr`` scheduler once per epoch.

    Args:
        epochs: Number of epochs to run.
        train_dl: Dataloader passed to ``train``.
        test_dl: Dataloader passed to ``test``.

    Returns:
        tuple: ``(train_loss, test_loss, train_acc, test_acc)``, four lists
        with one entry per epoch.
    """
    train_acc = []
    train_loss = []
    test_acc = []
    test_loss = []
    for epoch in range(epochs):
        start = time.time()
        epoch_loss,epoch_acc = train(train_dl)
        epoch_test_loss,epoch_test_acc = test(test_dl)
        end = time.time()
        times = end - start
        train_acc.append(epoch_acc)
        train_loss.append(epoch_loss)
        test_acc.append(epoch_test_acc)
        test_loss.append(epoch_test_loss)
        # The scheduler is defined in this notebook as `exp_lr`.
        exp_lr.step()
        print('训练epoch{},训练集损失值:{:.2f},训练集的准确率:{:.2f}%,测试集损失值:{:.2f},测试集的准确率:{:.2f}%,消耗时间：{:.2f}s'.
              format(epoch+1,epoch_loss,epoch_acc*100,epoch_test_loss,epoch_test_acc*100,times))

    return train_loss,test_loss,train_acc,test_acc

In [None]:
# Train for 50 epochs and keep the per-epoch metric histories.
# fit() requires both dataloaders; calling it with only the epoch count raises TypeError.
epoch = 50
train_loss, test_loss, train_acc, test_acc = fit(epoch, train_dataloader, test_dataloader)