In [1]:
import torch
import torchtext     # pip install torchdata
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display the installed PyTorch version.
torch.__version__

'1.12.1+cpu'

In [3]:
# Display the installed torchtext version.
torchtext.__version__

'0.13.1'

# TorchText文本分类数据集

In [4]:
# Wrap the IMDB training split in an iterator so single samples can be pulled with next().
train_iter = iter(torchtext.datasets.IMDB(split='train'))

In [None]:
# Pull one sample to inspect it. Note: this advances train_iter by one item.
next(train_iter)

In [None]:
# Pull another sample to inspect it. Note: this advances train_iter by one more item.
next(train_iter)

In [7]:
# Collect the distinct labels of the training split.
# Scan a fresh IMDB pipeline rather than `train_iter`: that iterator may already
# have been advanced by earlier next() calls, so scanning it would silently skip
# samples and make the result depend on which cells happened to be executed.
unique_labels = {label for label, _ in torchtext.datasets.IMDB(split='train')}

In [8]:
# Display the set of labels collected from the training split.
unique_labels

{'neg', 'pos'}

In [9]:
# Calling IMDB() without a split yields two objects, bound here as the train and
# test datasets. This rebinds `train_iter`, replacing the iterator created earlier.
train_iter, test_iter = torchtext.datasets.IMDB()

In [10]:
from torchtext.data.utils import get_tokenizer            # 分词工具
from torchtext.vocab import build_vocab_from_iterator     # 创建词表工具

In [11]:
tokenizer = get_tokenizer('basic_english')      # initialize the tokenizer

In [12]:
def yield_tokens(data):
    """Lazily yield the token list of every sample's text.

    ``data`` is iterated as (label, text) pairs; the label is ignored and the
    text is passed through the module-level ``tokenizer``.
    """
    for _label, review in data:
        yield tokenizer(review)

# Build the vocabulary from the tokenized training texts, registering
# "<pad>" and "<unk>" as special tokens.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<pad>", "<unk>"])

In [13]:
# Use the index of "<unk>" as the default index, i.e. the one returned for
# tokens that are not in the vocabulary.
vocab.set_default_index(vocab["<unk>"])

In [14]:
# Sanity check: map a list of tokens to their vocabulary indices.
vocab(['this', 'is', 'a', 'book', 'about', 'pytorch'])

[14, 10, 6, 276, 50, 1]

In [15]:
def text_pipeline(x):
    """Tokenize a raw string and map its tokens to vocabulary indices."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Encode a label as an int: 1 when it equals 'pos', otherwise 0."""
    return int(x == 'pos')

In [16]:
# Sanity check: a raw sentence is tokenized and converted to vocabulary indices.
text_pipeline('this is a book about pytorch')

[14, 10, 6, 276, 50, 1]

In [17]:
# Sanity check: the 'pos' label is encoded as 1.
label_pipeline('pos')

1

In [18]:
# Sanity check: the 'neg' label is encoded as 0.
label_pipeline('neg')

0

In [19]:
from torch.utils.data import DataLoader

In [20]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [21]:
# Display the device selected for tensors and the model.
device

device(type='cpu')

In [22]:
def collate_batch(batch):
    """Collate (label, text) samples into the tensors the model consumes.

    Every text is converted to an int64 tensor of token indices via
    ``text_pipeline`` and all of them are concatenated into one flat tensor.
    ``offsets`` holds the start position of each sample inside that flat
    tensor. Labels are encoded with ``label_pipeline``.

    Returns:
        (labels, flat_token_ids, offsets), each moved to ``device``.
    """
    encoded_labels = []
    token_chunks = []
    chunk_lengths = []
    for raw_label, raw_text in batch:
        encoded_labels.append(label_pipeline(raw_label))
        token_ids = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        token_chunks.append(token_ids)
        chunk_lengths.append(token_ids.size(0))

    labels = torch.tensor(encoded_labels)
    flat_token_ids = torch.cat(token_chunks)
    # A sample starts where the previous ones end: the cumulative sum of
    # [0, len_0, ..., len_{n-2}] gives those start positions (the last length
    # is not needed).
    starts = torch.tensor([0] + chunk_lengths[:-1]).cumsum(dim=0)
    return labels.to(device), flat_token_ids.to(device), starts.to(device)

In [23]:
from torchtext.data.functional import to_map_style_dataset

In [24]:
# Convert both datasets to map-style datasets, which the DataLoaders below
# are built on (they are created with shuffling enabled).
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

In [25]:
# Batch the training data in random order; collate_batch turns each list of
# samples into the (labels, flat token ids, offsets) tensors the model consumes.
train_dataloader = DataLoader(train_dataset, batch_size=64,
                              shuffle=True, collate_fn=collate_batch)
# Evaluation metrics are aggregated over the whole test set, so shuffling adds
# nothing there; keep a fixed order so evaluation batches are repeatable.
test_dataloader = DataLoader(test_dataset, batch_size=64,
                             shuffle=False, collate_fn=collate_batch)

# 创建模型

In [26]:
class TextClassificationModel(nn.Module):
    """Bag-of-embeddings text classifier.

    The token embeddings of each sample are pooled into a single vector by
    ``nn.EmbeddingBag`` (using its default pooling mode), and a linear layer
    maps that vector to ``num_class`` scores.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        # sparse=True makes the embedding table receive sparse gradients.
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw both weight matrices from U(-0.5, 0.5) and zero the linear bias."""
        bound = 0.5
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.uniform_(self.fc.weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text, offsets):
        """Return class scores for the samples packed in ``text``.

        ``text`` is the flat tensor of token indices of all samples and
        ``offsets`` gives the start index of each sample within it.
        """
        pooled = self.embedding(text, offsets)
        return self.fc(pooled)

In [27]:
# Number of output classes; the label set found earlier has two members ('neg', 'pos').
num_class = 2

In [28]:
# One embedding row per vocabulary entry.
vocab_size = len(vocab)
# Embedding dimension.
emsize = 100
# Instantiate the classifier and move it to the selected device.
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

In [29]:
# Cross-entropy loss over the class scores produced by the model.
loss_fn = nn.CrossEntropyLoss()
from torch.optim import lr_scheduler
# Plain SGD over all model parameters with a learning rate of 0.1.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# StepLR with step_size=10 and gamma=0.1: the learning rate is multiplied by
# 0.1 every 10 scheduler steps (fit() steps the scheduler once per epoch).
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

In [30]:
def train(dataloader):
    """Run one training pass of the module-level ``model`` over ``dataloader``.

    Every batch is scored, the loss from ``loss_fn`` is back-propagated and
    ``optimizer`` takes one step. Accuracy is measured on the scores computed
    before that step.

    Returns:
        (mean loss per sample, accuracy as a fraction in [0, 1]).

    Raises:
        ZeroDivisionError: if the dataloader yields no samples.
    """
    loss_sum, n_correct, n_seen = 0, 0, 0
    model.train()
    for label, text, offsets in dataloader:
        scores = model(text, offsets)
        loss = loss_fn(scores, label)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_size = label.size(0)
        with torch.no_grad():
            n_correct += (scores.argmax(1) == label).sum().item()
            n_seen += batch_size
            # loss is a batch mean; weight it by batch size so the final
            # division gives a per-sample average.
            loss_sum += loss.item() * batch_size
    return loss_sum / n_seen, n_correct / n_seen

In [31]:
def test(dataloader):
    """Evaluate the module-level ``model`` on ``dataloader`` without updating it.

    The model is switched to eval mode and gradients are disabled for the
    whole pass.

    Returns:
        (mean loss per sample, accuracy as a fraction in [0, 1]).

    Raises:
        ZeroDivisionError: if the dataloader yields no samples.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0, 0, 0

    with torch.no_grad():
        for label, text, offsets in dataloader:
            scores = model(text, offsets)
            loss = loss_fn(scores, label)
            batch_size = label.size(0)
            n_correct += (scores.argmax(1) == label).sum().item()
            n_seen += batch_size
            # Weight the batch-mean loss by batch size for a per-sample average.
            loss_sum += loss.item() * batch_size
    return loss_sum / n_seen, n_correct / n_seen

In [32]:
def fit(epochs, train_dl, test_dl):
    """Train and evaluate for ``epochs`` epochs, printing one summary line each.

    Each epoch runs ``train(train_dl)``, then ``test(test_dl)``, then advances
    the module-level learning-rate scheduler by one step.

    Returns:
        (train_loss, test_loss, train_acc, test_acc): four lists holding one
        value per epoch.
    """
    report = ("epoch:{:2d}, train_loss: {:.5f}, train_acc: {:.1f}% ,"
              "test_loss: {:.5f}, test_acc: {:.1f}%")
    history = {"train_loss": [], "train_acc": [], "test_loss": [], "test_acc": []}

    for epoch in range(epochs):
        tr_loss, tr_acc = train(train_dl)
        te_loss, te_acc = test(test_dl)

        history["train_loss"].append(tr_loss)
        history["train_acc"].append(tr_acc)
        history["test_loss"].append(te_loss)
        history["test_acc"].append(te_acc)

        exp_lr_scheduler.step()
        # Accuracies are fractions; scale to percent for display only.
        print(report.format(epoch, tr_loss, tr_acc * 100, te_loss, te_acc * 100))
    print("Done!")

    return (history["train_loss"], history["test_loss"],
            history["train_acc"], history["test_acc"])

In [33]:
# Number of training epochs passed to fit().
EPOCHS = 30

In [34]:
# Run the full training loop and keep the per-epoch loss/accuracy histories.
train_loss, test_loss, train_acc, test_acc = fit(EPOCHS, 
                                                 train_dataloader, 
                                                 test_dataloader)

epoch: 0, train_loss: 0.68971, train_acc: 54.0% ,test_loss: 0.68506, test_acc: 57.3%


KeyboardInterrupt: 