In [1]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from hanziconv import HanziConv
from transformers import *
from transformers.optimization import AdamW
import os
import time
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
roberta_tokenizer =BertTokenizer.from_pretrained('./preTrainModel/roberta_chinese_large/')

In [3]:
device=torch.device("cuda")

In [4]:
target_dir='./models/'
if not os.path.exists(target_dir):
    os.makedirs(target_dir)

In [5]:
class RobertaModel(nn.Module):
    def __init__(self):
        super(RobertaModel,self).__init__()
        self.roberta=BertForSequenceClassification.from_pretrained('./preTrainModel/roberta_chinese_large/')
        self.device=torch.device("cuda")
        for param in self.roberta.parameters():
            param.requires_grad=True
    
    def forward(self, batch_seqs, batch_seq_masks, batch_seq_segments, labels):
        loss, logits = self.roberta(input_ids = batch_seqs, attention_mask = batch_seq_masks, 
                              token_type_ids=batch_seq_segments, labels = labels)[:2]
        probabilities = nn.functional.softmax(logits, dim=-1)
        return loss, logits, probabilities

In [5]:
class DataPrecessForSentence(Dataset):
    """
    对文本进行处理
    """
    def __init__(self, bert_tokenizer, file, max_char_len = 103):
        """
        bert_tokenizer :分词器
        file     :语料文件
        """
        self.bert_tokenizer = bert_tokenizer
        self.max_seq_len = max_char_len
        self.seqs, self.seq_masks, self.seq_segments, self.labels = self.get_input(file)
        
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.seqs[idx], self.seq_masks[idx], self.seq_segments[idx], self.labels[idx]
        
    # 获取文本与标签
    def get_input(self, file,test=False):
        """
        通对输入文本进行分词、ID化、截断、填充等流程得到最终的可用于模型输入的序列。
        入参:
            dataset     : pandas的dataframe格式，包含三列，第一,二列为文本，第三列为标签。标签取值为{0,1}，其中0表示负样本，1代表正样本。
            max_seq_len : 目标序列长度，该值需要预先对文本长度进行分别得到，可以设置为小于等于512（BERT的最长文本序列长度为512）的整数。
        出参:
            seq         : 在入参seq的头尾分别拼接了'CLS'与'SEP'符号，如果长度仍小于max_seq_len，则使用0在尾部进行了填充。
            seq_mask    : 只包含0、1且长度等于seq的序列，用于表征seq中的符号是否是有意义的，如果seq序列对应位上为填充符号，
                          那么取值为1，否则为0。
            seq_segment : shape等于seq，因为是单句，所以取值都为0。
            labels      : 标签取值为{0,1}，其中0表示负样本，1代表正样本。
        """
        df = pd.read_csv(file)
        sentences_1 = map(HanziConv.toSimplified, df['query'].values)
        sentences_2 = map(HanziConv.toSimplified, df['reply'].values)
        labels = df['label'].values
        # 切词
        tokens_seq_1 = list(map(self.bert_tokenizer.tokenize, sentences_1))
        tokens_seq_2 = list(map(self.bert_tokenizer.tokenize, sentences_2))
        # 获取定长序列及其mask
        result = list(map(self.trunate_and_pad, tokens_seq_1, tokens_seq_2))
        seqs = [i[0] for i in result]
        seq_masks = [i[1] for i in result]
        seq_segments = [i[2] for i in result]
        return torch.Tensor(seqs).type(torch.long), torch.Tensor(seq_masks).type(torch.long), torch.Tensor(seq_segments).type(torch.long), torch.Tensor(labels).type(torch.long)
    
    def trunate_and_pad(self, tokens_seq_1, tokens_seq_2):
        """
        1. 如果是单句序列，按照BERT中的序列处理方式，需要在输入序列头尾分别拼接特殊字符'CLS'与'SEP'，
           因此不包含两个特殊字符的序列长度应该小于等于max_seq_len-2，如果序列长度大于该值需要那么进行截断。
        2. 对输入的序列 最终形成['CLS',seq,'SEP']的序列，该序列的长度如果小于max_seq_len，那么使用0进行填充。
        入参: 
            seq_1       : 输入序列，在本处其为单个句子。
            seq_2       : 输入序列，在本处其为单个句子。
            max_seq_len : 拼接'CLS'与'SEP'这两个特殊字符后的序列长度
        
        出参:
            seq         : 在入参seq的头尾分别拼接了'CLS'与'SEP'符号，如果长度仍小于max_seq_len，则使用0在尾部进行了填充。
            seq_mask    : 只包含0、1且长度等于seq的序列，用于表征seq中的符号是否是有意义的，如果seq序列对应位上为填充符号，
                          那么取值为1，否则为0。
            seq_segment : shape等于seq，单句，取值都为0 ，双句按照01切分
           
        """
        # 对超长序列进行截断
        if len(tokens_seq_1) > ((self.max_seq_len - 3)//2):
            tokens_seq_1 = tokens_seq_1[0:(self.max_seq_len - 3)//2]
        if len(tokens_seq_2) > ((self.max_seq_len - 3)//2):
            tokens_seq_2 = tokens_seq_2[0:(self.max_seq_len - 3)//2]
        # 分别在首尾拼接特殊符号
        seq = ['[CLS]'] + tokens_seq_1 + ['[SEP]'] + tokens_seq_2 + ['[SEP]']
        seq_segment = [0] * (len(tokens_seq_1) + 2) + [1] * (len(tokens_seq_2) + 1)
        # ID化
        seq = self.bert_tokenizer.convert_tokens_to_ids(seq)
        # 根据max_seq_len与seq的长度产生填充序列
        padding = [0] * (self.max_seq_len - len(seq))
        # 创建seq_mask
        seq_mask = [1] * len(seq) + padding
        # 创建seq_segment
        seq_segment = seq_segment + padding
        # 对seq拼接填充序列
        seq += padding
        assert len(seq) == self.max_seq_len
        assert len(seq_mask) == self.max_seq_len
        assert len(seq_segment) == self.max_seq_len
        return seq, seq_mask, seq_segment

In [6]:
batch_size=16
epochs=10
lr=2e-05
patience=3
max_grad_norm=10.0

PATH='./'
train_file = PATH+'train.csv'
dev_file = PATH+'dev.csv'
test_file = PATH+'test.csv'

print("\t* Loading training data...")
train_data = DataPrecessForSentence(roberta_tokenizer, train_file)
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
print("\t* Loading validation data...")
dev_data = DataPrecessForSentence(roberta_tokenizer,dev_file)
dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)

	* Loading training data...
	* Loading validation data...


In [8]:
# -------------------- Model definition ------------------- #
print("\t* Building model...")
model = RobertaModel().to(device)

	* Building model...


In [9]:
# 待优化的参数
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
        {
                'params':[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                'weight_decay':0.01
        },
        {
                'params':[p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                'weight_decay':0.0
        }
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", 
                                                           factor=0.85, patience=0)

In [10]:
def correct_predictions(output_probabilities, targets):

    _, out_classes = output_probabilities.max(dim=1)
    correct = (out_classes == targets).sum()
    return correct.item()


def train(model, dataloader, optimizer, epoch_number, max_gradient_norm):

    # Switch the model to train mode.
    model.train()
    device = model.device
    epoch_start = time.time()
    batch_time_avg = 0.0
    running_loss = 0.0
    correct_preds = 0
    tqdm_batch_iterator = tqdm(dataloader)
    for batch_index, (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in enumerate(tqdm_batch_iterator):
        batch_start = time.time()
        # Move input and output data to the GPU if it is used.
        seqs, masks, segments, labels = batch_seqs.to(device), batch_seq_masks.to(device), batch_seq_segments.to(device), batch_labels.to(device)
        optimizer.zero_grad()
        loss, logits, probabilities = model(seqs, masks, segments, labels)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_gradient_norm)
        optimizer.step()
        batch_time_avg += time.time() - batch_start
        running_loss += loss.item()
        correct_preds += correct_predictions(probabilities, labels)
        description = "Avg. batch proc. time: {:.4f}s, loss: {:.4f}"\
                      .format(batch_time_avg/(batch_index+1), running_loss/(batch_index+1))
        tqdm_batch_iterator.set_description(description)
    epoch_time = time.time() - epoch_start
    epoch_loss = running_loss / len(dataloader)
    epoch_accuracy = correct_preds / len(dataloader.dataset)
    return epoch_time, epoch_loss, epoch_accuracy


def validate(model, dataloader):

    # Switch to evaluate mode.
    model.eval()
    device = model.device
    epoch_start = time.time()
    running_loss = 0.0
    running_accuracy = 0.0
    all_prob = []
    all_labels = []
    # Deactivate autograd for evaluation.
    with torch.no_grad():
        for (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in dataloader:
            # Move input and output data to the GPU if one is used.
            seqs = batch_seqs.to(device)
            masks = batch_seq_masks.to(device)
            segments = batch_seq_segments.to(device)
            labels = batch_labels.to(device)
            loss, logits, probabilities = model(seqs, masks, segments, labels)
            running_loss += loss.item()
            running_accuracy += correct_predictions(probabilities, labels)
            all_prob.extend(probabilities[:,1].cpu().numpy())
            all_labels.extend(batch_labels)
    epoch_time = time.time() - epoch_start
    epoch_loss = running_loss / len(dataloader)
    epoch_accuracy = running_accuracy / (len(dataloader.dataset))
    return epoch_time, epoch_loss, epoch_accuracy, roc_auc_score(all_labels, all_prob)

In [11]:
best_score = 0.0
start_epoch = 1
# Data for loss curves plot
epochs_count = []
train_losses = []
valid_losses = []
_, valid_loss, valid_accuracy, auc = validate(model, dev_loader)
print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}".format(valid_loss, (valid_accuracy*100), auc))

	* Validation loss before training: 0.6989, accuracy: 49.1500%, auc: 0.4799


In [12]:
# -------------------- Training epochs ------------------- #
print("\n", 20 * "=", "Training Roberta model on device: {}".format(device), 20 * "=")
patience_counter = 0
for epoch in range(start_epoch, epochs + 1):
    epochs_count.append(epoch)
    print("* Training epoch {}:".format(epoch))
    epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader, optimizer, epoch, max_grad_norm)
    train_losses.append(epoch_loss)
    print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
          .format(epoch_time, epoch_loss, (epoch_accuracy*100)))
    print("* Validation for epoch {}:".format(epoch))
    epoch_time, epoch_loss, epoch_accuracy , epoch_auc= validate(model, dev_loader)
    valid_losses.append(epoch_loss)
    print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}\n"
          .format(epoch_time, epoch_loss, (epoch_accuracy*100), epoch_auc))
    # Update the optimizer's learning rate with the scheduler.
    scheduler.step(epoch_accuracy)
    # Early stopping on validation accuracy.
    if epoch_accuracy < best_score:
        patience_counter += 1
    else:
        best_score = epoch_accuracy
        patience_counter = 0
        torch.save({"epoch": epoch, 
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses},
                    os.path.join(target_dir, "pytorch_roberta_large_best.pth.tar"))
    if patience_counter >= patience:
        print("-> Early stopping: patience limit reached, stopping...")
        break

  0%|          | 0/1138 [00:00<?, ?it/s]


* Training epoch 1:


Avg. batch proc. time: 0.5886s, loss: 0.3870: 100%|██████████| 1138/1138 [11:13<00:00,  1.69it/s]


-> Training time: 673.1605s, loss = 0.3870, accuracy: 83.2473%
* Validation for epoch 1:
-> Valid. time: 22.0590s, loss: 0.2986, accuracy: 87.3500%, auc: 0.9239



  0%|          | 0/1138 [00:00<?, ?it/s]

* Training epoch 2:


Avg. batch proc. time: 0.5879s, loss: 0.2695: 100%|██████████| 1138/1138 [11:12<00:00,  1.69it/s]


-> Training time: 672.3335s, loss = 0.2695, accuracy: 88.9231%
* Validation for epoch 2:
-> Valid. time: 22.5458s, loss: 0.3095, accuracy: 88.2000%, auc: 0.9270



  0%|          | 0/1138 [00:00<?, ?it/s]

* Training epoch 3:


Avg. batch proc. time: 0.5895s, loss: 0.1950: 100%|██████████| 1138/1138 [11:14<00:00,  1.69it/s]


-> Training time: 674.1590s, loss = 0.1950, accuracy: 92.3626%
* Validation for epoch 3:


  0%|          | 0/1138 [00:00<?, ?it/s]

-> Valid. time: 22.4643s, loss: 0.3403, accuracy: 87.6000%, auc: 0.9142

* Training epoch 4:


Avg. batch proc. time: 0.5906s, loss: 0.1290: 100%|██████████| 1138/1138 [11:15<00:00,  1.69it/s]


-> Training time: 675.3398s, loss = 0.1290, accuracy: 95.1374%
* Validation for epoch 4:


  0%|          | 0/1138 [00:00<?, ?it/s]

-> Valid. time: 22.4590s, loss: 0.3553, accuracy: 87.8500%, auc: 0.9223

* Training epoch 5:


Avg. batch proc. time: 0.5915s, loss: 0.0872: 100%|██████████| 1138/1138 [11:16<00:00,  1.68it/s]


-> Training time: 676.5126s, loss = 0.0872, accuracy: 96.8956%
* Validation for epoch 5:
-> Valid. time: 22.0874s, loss: 0.4572, accuracy: 87.6500%, auc: 0.9150

-> Early stopping: patience limit reached, stopping...


In [10]:
label_res=[]
def test(model, dataloader):
    # Switch the model to eval mode.
    model.eval()
    device = model.device
    time_start = time.time()
    batch_time = 0.0
    
    # Deactivate autograd for evaluation.
    with torch.no_grad():
        for (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in dataloader:
            batch_start = time.time()
            # Move input and output data to the GPU if one is used.
            seqs, masks, segments, labels = batch_seqs.to(device), batch_seq_masks.to(device), batch_seq_segments.to(device), batch_labels.to(device)
            _, _, probabilities = model(seqs, masks, segments, labels)
            print(out_classes)
            _, out_classes = probabilities.max(dim=1)
            label_res.extend(out_classes.cpu().numpy())
            batch_time += time.time() - batch_start

    batch_time /= len(dataloader)
    total_time = time.time() - time_start
#     accuracy /= (len(dataloader.dataset))
    return batch_time, total_time,label_res

class RobertaModelTest(nn.Module):
    def __init__(self):
        super(RobertaModelTest, self).__init__()
        config = BertConfig.from_pretrained('./preTrainModel/roberta_chinese_large/')
        print(config)
        self.bert = BertForSequenceClassification(config)  # /bert_pretrain/
        self.device = torch.device("cuda")

    def forward(self, batch_seqs, batch_seq_masks, batch_seq_segments, labels):
        loss, logits = self.bert(input_ids = batch_seqs, attention_mask = batch_seq_masks, 
                              token_type_ids=batch_seq_segments, labels = labels)[:2]
        probabilities = nn.functional.softmax(logits, dim=-1)
        return loss, logits, probabilities

In [11]:
batch_size=16
pretrained_file='./models/pytorch_roberta_large_best.pth.tar'

PATH='./'
test_file = PATH+'test.csv'

print("\t* Loading test data...")
test_data = DataPrecessForSentence(roberta_tokenizer,test_file)
test_loader = DataLoader(test_data, batch_size=batch_size)

device = torch.device("cuda")
# albert_tokenizer = BertTokenizer.from_pretrained("voidful/albert_chinese_base",cache_dir='./preTrainModel/albert_chinese_base')
print(20 * "=", " Preparing for testing ", 20 * "=")
# if platform == "linux" or platform == "linux2":
checkpoint = torch.load(pretrained_file)
# else:
#     checkpoint = torch.load(pretrained_file, map_location=device)

	* Loading test data...


RuntimeError: CUDA error: an illegal memory access was encountered

In [9]:
print("\t* Building model...")
model = RobertaModelTest().to(device)
# model.load_state_dict(checkpoint["model"])
model.load_state_dict(new_state_dict)
print(20 * "=", " Testing Roberta model on device: {} ".format(device), 20 * "=")
batch_time, total_time, label_res= test(model, test_loader)
print("\n-> Average batch processing time: {:.4f}s, total test time: {:.4f}s\n".
      format(batch_time, total_time))
print(label_res)

	* Building model...
BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "directionality": "bidi",
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 21128
}



RuntimeError: CUDA error: an illegal memory access was encountered

In [None]:
len(label_res)

In [None]:
sample_submission=pd.read_csv('./sample_submission.tsv',sep='\t',header=None)
sample_submission.columns=['id','idx','label_UNK']
sample_submission['label']=label_res

In [None]:
sample_submission

In [18]:
sample_submission.drop('label_UNK',axis=1,inplace=True)

NameError: name 'sample_submission' is not defined

In [None]:
sample_submission.to_csv('./submission_file/sample_submission_roberta_base.tsv',index=False,sep='\t', encoding='utf-8',header=0)

In [8]:
from collections import OrderedDict
# 初始化一个空 dict
new_state_dict = OrderedDict()
for k,v in checkpoint["model"].items():
    if 'roberta.bert' in k:
        k = k.replace('roberta.bert', 'bert.bert')
    elif k=='roberta.classifier.weight':
        k='bert.classifier.weight'
    elif k=='roberta.classifier.bias':
        k='bert.classifier.bias'
    new_state_dict[k]=v 