# 通过torchtext获取数据

数据说明

- train.csv 由三列组成，分别为标签、新闻标题、新闻简述；其中标签用 1、2、3、4 表示，依次对应 classes 中的内容；
- test.csv与train.csv内容格式相同；

In [1]:
import torch
import torchtext

from torchtext.datasets import text_classification
import os

# Directory into which the AG_NEWS files are downloaded and cached.
load_data_path = '../data/'

# Create the directory (and any missing parents) when it is absent.
# exist_ok=True avoids the check-then-create race of a separate isdir() test.
os.makedirs(load_data_path, exist_ok=True)

# NOTE(review): `torchtext.datasets.text_classification` appears to exist only
# in older torchtext releases — confirm the installed version provides it.
train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](root=load_data_path)

120000lines [00:04, 24515.08lines/s]
120000lines [00:09, 13236.62lines/s]
7600lines [00:00, 13982.22lines/s]


# 构建带有embedding层的文本分类模型

In [2]:
import torch.nn as nn
import torch.nn.functional as F

BATCH_SIZE = 32

device = torch.device('cpu')

class TextSentiment(nn.Module):
    """Text classifier: embedding lookup -> average pooling -> linear layer.

    :param vocab_size: number of rows in the embedding table
    :param embed_dim: width of each embedding vector
    :param num_class: number of output classes
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        # sparse=True makes the gradient of the embedding weight a sparse
        # tensor, so only the rows that were looked up get updated.
        self.embedding = nn.Embedding(vocab_size, embed_dim, sparse=True)
        # Linear layer mapping a pooled embedding to per-class scores.
        self.fc = nn.Linear(embed_dim, num_class)
        # Initialise the weights of both layers.
        self.init_weights()

    def init_weights(self):
        """Initialise weights uniformly in [-0.5, 0.5] and zero the bias."""
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, batch_size=None):
        """Return one row of class scores per equal-sized chunk of ``text``.

        The embedded tokens are cut into ``batch_size`` consecutive chunks of
        equal length; each chunk is averaged and passed through the linear
        layer. Chunks are positional, so they line up with the individual
        samples only approximately when samples differ in length.

        :param text: 1-D tensor of token indices for the whole batch
            (the samples concatenated, as produced by ``generate_batch``)
        :param batch_size: number of output rows; defaults to the module-level
            ``BATCH_SIZE`` read at call time
        :return: tensor of shape ``(batch_size, num_class)``
        :raises ValueError: if ``batch_size`` is not positive or ``text`` has
            fewer than ``batch_size`` tokens
        """
        if batch_size is None:
            batch_size = BATCH_SIZE
        if batch_size < 1:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        embedded = self.embedding(text)

        # Tokens per chunk. The original used the total token count here,
        # which collapsed every batch into a single pooled row.
        c = embedded.size(0) // batch_size
        if c == 0:
            raise ValueError(
                f"text has {embedded.size(0)} tokens, fewer than "
                f"batch_size={batch_size}"
            )
        # Drop trailing tokens that do not fill a whole chunk.
        embedded = embedded[:batch_size * c]

        # avg_pool1d pools along the last axis and expects a 3-D input, so
        # move the token axis last and add a leading dimension.
        embedded = embedded.transpose(1, 0).unsqueeze(0)
        embedded = F.avg_pool1d(embedded, kernel_size=c)

        # Remove the extra dimension, transpose back, and classify.
        return self.fc(embedded[0].transpose(1, 0))

# 实例化模型

In [3]:
# Vocabulary size, taken from the training set's vocabulary.
VOCAB_SIZE = len(train_dataset.get_vocab())
# Width of each embedding vector.
EMBED_DIM = 32
# Number of target classes, taken from the training set's labels.
NUM_CLASS = len(train_dataset.get_labels())

# Build the classifier, then place it on the configured device.
model = TextSentiment(
    vocab_size=VOCAB_SIZE,
    embed_dim=EMBED_DIM,
    num_class=NUM_CLASS,
)
model = model.to(device)

# 对数据进行batch处理

In [4]:
def generate_batch(batch):
    """Collate a list of samples into one token tensor and one label tensor.

    :param batch: list of ``(label, sample)`` tuples, where ``sample`` is a
        tensor of token indices, e.g.
        ``[(label1, sample1), (label2, sample2), ..., (labelN, sampleN)]``
    :return: ``(text, label)`` where ``text`` is all samples concatenated
        end to end and ``label`` is a tensor of the N labels
    """
    labels = []
    samples = []
    for entry in batch:
        labels.append(entry[0])
        samples.append(entry[1])

    label = torch.tensor(labels)
    # Samples are joined end to end; no per-sample boundaries are kept.
    text = torch.cat(samples)
    return text, label

In [5]:
# Smoke-test the collate function on two hand-made (label, tokens) samples.
batch = [
    (1, torch.tensor([3, 23, 2, 8])),
    (0, torch.tensor([3, 45, 21, 6])),
]
res = generate_batch(batch)
print(res)

(tensor([ 3, 23,  2,  8,  3, 45, 21,  6]), tensor([1, 0]))


# 构建训练和验证函数

In [6]:
from torch.utils.data import DataLoader
BATCH_SIZE = 32
def train(train_data):
    """Run one training epoch over ``train_data`` and step the LR scheduler.

    Uses the module-level ``model``, ``optimizer``, ``criterion``,
    ``scheduler``, ``device`` and ``BATCH_SIZE``.

    :param train_data: dataset of ``(label, token tensor)`` samples
    :return: ``(average loss per sample, accuracy)`` for the epoch
    """
    train_loss = 0.0
    train_acc = 0

    # Shuffled mini-batches, collated into (text, label) tensors.
    data = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch)

    model.train()
    for text, cls in data:
        # Move inputs and labels once so the loss and the accuracy comparison
        # both run on the same device as the model output.
        text, cls = text.to(device), cls.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        output = model(text)
        loss = criterion(output, cls)
        # criterion returns the batch mean; weight it by the batch size so the
        # final division by the sample count gives a true per-sample average.
        train_loss += loss.item() * cls.size(0)
        loss.backward()
        optimizer.step()
        # Accumulate the count of correct predictions, not a ratio.
        train_acc += (output.argmax(1) == cls).sum().item()

    # Adjust the learning rate once per epoch.
    scheduler.step()

    return train_loss / len(train_data), train_acc / len(train_data)


# Validation routine.
def valid(valid_data):
    """Evaluate the model on ``valid_data`` without updating any parameter.

    Uses the module-level ``model``, ``criterion``, ``device`` and
    ``BATCH_SIZE``.

    :param valid_data: dataset of ``(label, token tensor)`` samples
    :return: ``(average loss per sample, accuracy)`` over the dataset
    """
    valid_loss = 0.0
    valid_acc = 0

    # Unshuffled mini-batches, collated into (text, label) tensors.
    data = DataLoader(valid_data, batch_size=BATCH_SIZE, collate_fn=generate_batch)

    # Evaluate in eval mode, then restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for text, cls in data:
                # Keep inputs and labels on the model's device, as train() does.
                text, cls = text.to(device), cls.to(device)
                output = model(text)
                loss = criterion(output, cls)
                # Weight the batch-mean loss by batch size (see return value).
                valid_loss += loss.item() * cls.size(0)
                # Accumulate the count of correct predictions.
                valid_acc += (output.argmax(1) == cls).sum().item()
    finally:
        model.train(was_training)

    return valid_loss / len(valid_data), valid_acc / len(valid_data)

# 模型的训练和验证

In [None]:
import time 
from torch.utils.data.dataset import random_split

N_EPOCHS = 1
# BATCH_SIZE is a module-level global: train() and valid() pass it to
# DataLoader, and TextSentiment.forward reads it at call time.
BATCH_SIZE = 1


# Loss function, optimizer and per-epoch learning-rate decay.
min_valid_loss = float('inf')
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

# Hold out 5% of the training set as a validation set.
train_len = int(len(train_dataset) * 0.95)
sub_train_, sub_valid_ = random_split(
    train_dataset, [train_len, len(train_dataset) - train_len]
)

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss, train_acc = train(sub_train_)
    valid_loss, valid_acc = valid(sub_valid_)

    # Whole seconds elapsed, split into integer minutes and leftover seconds.
    elapsed = int(time.time() - start_time)
    mins, secs = divmod(elapsed, 60)

    print('Epoch: %d' % (epoch + 1), " | time in %d minutes, %d seconds" % (mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')