In [1]:
import torch
from torch.utils import data
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from transformers import BertModel
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pick the compute device once: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [3]:
# Read the raw training file with no header row, so columns get integer names.
# NOTE(review): this uses read_csv's default comma delimiter although the next
# cell splits column 0 on tabs; a comma inside a sentence would be treated as a
# field break -- confirm the data contains none.
Data = pd.read_csv(
    filepath_or_buffer='train.csv',
    header=None,
)

In [4]:
# Break every raw line (held in the first column) at tab characters,
# producing one output column per tab-separated piece.
Data = Data.iloc[:, 0].str.split(pat='\t', expand=True)

In [5]:
# Give the two positional columns readable names: 0 -> sentence, 1 -> label.
Data = Data.rename({0: 'sentence', 1: 'label'}, axis='columns')

In [6]:
# Distinct label values, in the order pandas' unique() reports them.
label_lst = [label for label in Data['label'].unique()]

In [7]:
# Map each label to its position in label_lst; that position is the class index.
label2idx = {label: idx for idx, label in enumerate(label_lst)}

In [8]:
# Number of distinct labels found in the data. NUM_CLASSES in the model cell is
# hard-coded, so it has to agree with the value displayed here.
len(label2idx)

12

In [9]:
def transform_label_2_idx(x):
    """Look up the integer class index assigned to label ``x``.

    The lookup goes through the module-level ``label2idx`` mapping and raises
    ``KeyError`` when ``x`` is not one of its keys.
    """
    class_index = label2idx[x]
    return class_index

In [10]:
# Swap every label string for its integer class index.
# NOTE(review): this overwrites the column in place, so running the cell a
# second time would look up integers in label2idx and fail.
Data['label'] = Data['label'].map(transform_label_2_idx)

In [11]:
# Load the BERT word-piece tokenizer from the local checkpoint directory.
# NOTE(review): the Dataset class below builds its own tokenizer, so this
# module-level instance is not used by any code visible in this notebook.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='./Bert_Model/bert-base-uncased',
)

In [12]:
class Dataset(data.Dataset):
    """Map-style dataset that turns (sentence, label) rows into fixed-length BERT inputs.

    Column 0 of ``Data`` is read as the sentence text and column 1 as an
    integer-convertible label. Every item is padded or truncated to
    ``TEXT_LEN`` token ids.

    Args:
        Data: pandas DataFrame with the text in column 0 and the label in column 1.
        tokenizer: optional, already-loaded tokenizer. When omitted,
            ``bert-base-uncased`` is loaded, exactly as before.
        max_len: optional upper bound for ``TEXT_LEN``. Defaults to
            ``MAX_BERT_LEN``.
    """

    # bert-base-uncased has 512 position embeddings; feeding a longer sequence
    # to the encoder fails, so the padded length must never exceed this.
    MAX_BERT_LEN = 512

    def __init__(self, Data, tokenizer=None, max_len=None):
        super().__init__()
        self.data = Data
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # base BERT tokenizer
        self.tokenizer = tokenizer

        # The longest sentence is measured in characters, which is only an
        # upper-bound proxy for its token count. It is therefore capped:
        # previously a sentence longer than 512 characters produced inputs
        # that the encoder cannot process.
        longest_chars = int(self.data.iloc[:, 0].apply(len).max())
        cap = self.MAX_BERT_LEN if max_len is None else max_len
        self.TEXT_LEN = min(longest_chars, cap)

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, target)`` tensors for row ``index``.

        ``input_ids`` and ``attention_mask`` both have length ``TEXT_LEN``;
        padded positions carry the tokenizer's pad id and a mask value of 0.
        """
        text = self.data.iloc[index, 0]
        label = self.data.iloc[index, 1]

        # Let the tokenizer truncate so that the closing special token is kept;
        # plain slicing would cut it off on over-long sentences.
        encoded = self.tokenizer(text, truncation=True, max_length=self.TEXT_LEN)

        # Copy before padding so the tokenizer's own lists are not mutated; the
        # slice is a defensive guard in case truncation was not applied.
        input_ids = list(encoded['input_ids'])[:self.TEXT_LEN]
        mask = list(encoded['attention_mask'])[:self.TEXT_LEN]

        pad_len = self.TEXT_LEN - len(input_ids)
        input_ids += [self.tokenizer.pad_token_id] * pad_len
        mask += [0] * pad_len

        return torch.tensor(input_ids), torch.tensor(mask), torch.tensor(int(label))

In [13]:
EMBEDDING_DIM = 768       # width of each token vector; used as the convolution kernel width
NUM_FILTERS = 256         # output channels of every convolution
NUM_CLASSES = 12          # size of the classifier output
FILTER_SIZES = [2, 3, 4]  # kernel heights, i.e. how many consecutive tokens each filter spans


class TextCNN(nn.Module):
    """Frozen BERT encoder followed by a multi-width convolutional classifier.

    Token representations from BERT are treated as a one-channel image of
    shape (sequence, EMBEDDING_DIM). One convolution per entry of
    ``FILTER_SIZES`` slides over the sequence axis, each result is max-pooled
    over the whole sequence, and the concatenated features feed a linear layer
    producing ``NUM_CLASSES`` logits.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('./Bert_Model/bert-base-uncased')
        # The encoder is used as a fixed feature extractor: none of its weights train.
        for param in self.bert.parameters():
            param.requires_grad = False
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, NUM_FILTERS, (size, EMBEDDING_DIM)) for size in FILTER_SIZES]
        )
        # One NUM_FILTERS-wide feature vector per convolution; derived from
        # FILTER_SIZES so editing that list cannot desynchronise the layer.
        self.linear = nn.Linear(NUM_FILTERS * len(FILTER_SIZES), NUM_CLASSES)

    def conv_and_pool(self, conv, input):
        """Apply ``conv`` + ReLU and max-pool over all remaining positions.

        Args:
            conv: one of the ``nn.Conv2d`` layers in ``self.convs``.
            input: tensor of shape (batch, 1, sequence, EMBEDDING_DIM).

        Returns:
            Tensor of shape (batch, out_channels).
        """
        out = F.relu(conv(input))
        pooled = F.max_pool2d(out, (out.shape[2], out.shape[3]))
        # Drop only the two pooled axes. A bare squeeze() would also remove the
        # batch axis when the batch holds one sample, which breaks the
        # concatenation along dim=1 in forward().
        return pooled.squeeze(3).squeeze(2)

    def forward(self, input, mask):
        """Return class logits of shape (batch, NUM_CLASSES).

        Args:
            input: token id tensor passed to BERT as its first argument.
            mask: attention mask tensor passed to BERT as its second argument.
        """
        # [0] selects the per-token hidden states; unsqueeze adds the channel axis.
        out = self.bert(input, mask)[0].unsqueeze(1)
        out = torch.cat([self.conv_and_pool(conv, out) for conv in self.convs], dim=1)
        return self.linear(out)

In [14]:
from torch.utils.data.dataset import random_split 

In [15]:
# Split / batching settings, named so they can be tuned in one place.
SPLIT_SEED = 42        # fixes the train/validation partition across runs
TRAIN_FRACTION = 0.8   # share of samples used for training
BATCH_SIZE = 10

train_dataset = Dataset(Data)

# The validation part takes whatever remains, so the two sizes always sum to
# the dataset length even when the product is not an integer.
n_train = int(len(train_dataset) * TRAIN_FRACTION)
n_valid = len(train_dataset) - n_train

# A seeded generator makes the partition reproducible; without it every kernel
# restart yields a different validation set and the metrics are not comparable.
# (`spilt_train` keeps its original spelling because it is a notebook-level name.)
spilt_train, split_valid = random_split(
    train_dataset,
    [n_train, n_valid],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_loader = data.DataLoader(spilt_train, batch_size=BATCH_SIZE, shuffle=True)
# The training cell only looks at the first batch of this loader, so shuffling
# is kept to vary which validation samples get probed.
dev_loader = data.DataLoader(split_valid, batch_size=BATCH_SIZE, shuffle=True)

Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 311kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 2.52kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 317kB/s]


In [20]:
# Training hyper-parameters: number of passes over the data and Adam step size.
EPOCH, LR = 10, 1e-3

# Build the classifier and move it to the selected device.
model = TextCNN()
model = model.to(DEVICE)

# Multi-class objective computed from raw logits and integer targets.
loss_fn = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)
# NOTE(review): no cell visible in this notebook calls scheduler.step(), so as
# written the learning rate stays at LR for the whole run -- confirm intent.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=1.0, gamma=0.1)

Some weights of the model checkpoint at ./Bert_Model/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
from sklearn.metrics import classification_report
def evaluate(pred, true, target_names=None, output_dict=False):
    """Build a classification report for predictions against ground truth.

    Args:
        pred: predicted labels.
        true: ground-truth labels.
        target_names: optional display names forwarded to the report.
        output_dict: forwarded to the report; selects dict instead of text output.

    Returns:
        Whatever ``classification_report`` returns for these arguments.
        ``zero_division=0`` is always passed.
    """
    report_options = {
        'target_names': target_names,
        'output_dict': output_dict,
        'zero_division': 0,
    }
    # classification_report takes the ground truth first and the predictions second.
    return classification_report(true, pred, **report_options)

In [22]:
# Directory prefix for saved checkpoints; the training cell appends "<epoch>.pth" to it.
MODEL_DIR = "./OUTPUT/"

In [23]:
import os  # imported in this cell so it stays self-contained; used for the checkpoint path only

# Checkpoint cadence in epochs. The final epoch is always saved as well: with
# EPOCH smaller than this interval, only the epoch-0 weights used to reach disk.
SAVE_EVERY = 50
# Print a progress line every LOG_EVERY training batches.
LOG_EVERY = 500

# torch.save does not create missing directories.
os.makedirs(MODEL_DIR, exist_ok=True)

# NOTE(review): `scheduler` is created in the setup cell but is not stepped in
# this loop, so the learning rate stays at LR throughout -- confirm intent.
for e in tqdm(range(EPOCH)):
    # `input_ids` rather than `input`: at notebook scope the latter would
    # shadow the builtin for every later cell.
    for b, (input_ids, mask, target) in enumerate(train_loader):
        input_ids = input_ids.to(DEVICE)
        mask = mask.to(DEVICE)
        target = target.to(DEVICE)

        pred = model(input_ids, mask)
        loss = loss_fn(pred, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if b % LOG_EVERY != 0:
            continue

        # Accuracy on the batch that was just used for the update.
        y_pred = torch.argmax(pred, dim=1)
        report = evaluate(
            y_pred.detach().cpu().numpy(),
            target.detach().cpu().numpy(),
            output_dict=True,
        )

        # Cheap validation probe: only the first batch the dev loader yields is
        # scored, so this figure is a noisy estimate over BATCH-sized samples.
        dev_acc = float('nan')  # reported when the dev loader yields nothing
        with torch.no_grad():
            for dev_input, dev_mask, dev_target in dev_loader:
                dev_input = dev_input.to(DEVICE)
                dev_mask = dev_mask.to(DEVICE)
                dev_target = dev_target.to(DEVICE)
                dev_pred = model(dev_input, dev_mask)
                dev_pred_ = torch.argmax(dev_pred, dim=1)
                dev_report = evaluate(
                    dev_pred_.cpu().numpy(),
                    dev_target.cpu().numpy(),
                    output_dict=True,
                )
                dev_acc = dev_report['accuracy']
                break

        print(
            '>> epoch:', e,
            'batch:', b,
            'loss:', round(loss.item(), 5),
            'train_acc:', report['accuracy'],
            'dev_acc:', dev_acc
        )

    # Keep the periodic checkpoints and additionally persist the last epoch.
    # The whole module is pickled, as before, so existing loaders keep working.
    if e % SAVE_EVERY == 0 or e == EPOCH - 1:
        torch.save(model, os.path.join(MODEL_DIR, f'{e}.pth'))

  0%|          | 0/10 [00:00<?, ?it/s]

>> epoch: 0 batch: 0 loss: 2.3414 train_acc: 0.2 dev_acc: 0.1
>> epoch: 0 batch: 500 loss: 2.11987 train_acc: 0.3 dev_acc: 0.9


 10%|█         | 1/10 [00:39<05:54, 39.42s/it]

>> epoch: 1 batch: 0 loss: 0.9267 train_acc: 0.7 dev_acc: 0.8
>> epoch: 1 batch: 500 loss: 1.53099 train_acc: 0.5 dev_acc: 0.7


 20%|██        | 2/10 [01:13<04:50, 36.27s/it]

>> epoch: 2 batch: 0 loss: 0.94175 train_acc: 0.6 dev_acc: 0.5
>> epoch: 2 batch: 500 loss: 1.08657 train_acc: 0.6 dev_acc: 0.6


 30%|███       | 3/10 [01:47<04:07, 35.29s/it]

>> epoch: 3 batch: 0 loss: 0.87594 train_acc: 0.7 dev_acc: 0.4
>> epoch: 3 batch: 500 loss: 0.54531 train_acc: 0.8 dev_acc: 0.9


 40%|████      | 4/10 [02:21<03:29, 34.85s/it]

>> epoch: 4 batch: 0 loss: 0.61071 train_acc: 0.7 dev_acc: 0.7
>> epoch: 4 batch: 500 loss: 1.03577 train_acc: 0.6 dev_acc: 0.7


 50%|█████     | 5/10 [02:55<02:53, 34.61s/it]

>> epoch: 5 batch: 0 loss: 0.97601 train_acc: 0.6 dev_acc: 0.8
>> epoch: 5 batch: 500 loss: 0.71175 train_acc: 0.7 dev_acc: 0.7


 60%|██████    | 6/10 [03:30<02:17, 34.48s/it]

>> epoch: 6 batch: 0 loss: 1.0109 train_acc: 0.7 dev_acc: 0.8
>> epoch: 6 batch: 500 loss: 0.38359 train_acc: 0.8 dev_acc: 0.2


 70%|███████   | 7/10 [04:04<01:43, 34.39s/it]

>> epoch: 7 batch: 0 loss: 0.55266 train_acc: 0.8 dev_acc: 0.9
>> epoch: 7 batch: 500 loss: 0.6743 train_acc: 0.8 dev_acc: 0.7


 80%|████████  | 8/10 [04:38<01:08, 34.34s/it]

>> epoch: 8 batch: 0 loss: 0.11041 train_acc: 1.0 dev_acc: 0.8
>> epoch: 8 batch: 500 loss: 0.40319 train_acc: 0.8 dev_acc: 0.6


 90%|█████████ | 9/10 [05:12<00:34, 34.30s/it]

>> epoch: 9 batch: 0 loss: 0.18773 train_acc: 0.9 dev_acc: 0.8
>> epoch: 9 batch: 500 loss: 0.72395 train_acc: 0.7 dev_acc: 0.6


100%|██████████| 10/10 [05:47<00:00, 34.71s/it]
