In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from transformers import *
from transformers.optimization import AdamW
import os
import time
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
bertwwm_tokenizer =BertTokenizer.from_pretrained('./preTrainModel/chinese-roberta-wwm-ext-large/')
device=torch.device("cuda")
target_dir='./models/'
if not os.path.exists(target_dir):
    os.makedirs(target_dir)

In [3]:
train_left = pd.read_csv('./train/train.query.tsv',sep='\t',header=None)
train_left.columns=['id','query']
train_right = pd.read_csv('./train/train.reply.tsv',sep='\t',header=None)
train_right.columns=['id','id_sub','reply','label']
train_data = train_left.merge(train_right, how='left')
train_data['reply'] = train_data['reply'].fillna('好的')

In [4]:
test_left = pd.read_csv('./test/test.query.tsv',sep='\t',header=None, encoding='gbk')
test_left.columns = ['id','query']
test_right =  pd.read_csv('./test/test.reply.tsv',sep='\t',header=None, encoding='gbk')
test_right.columns=['id','id_sub','reply']
df_test = test_left.merge(test_right, how='left')
df_test['label']=666

In [5]:
train_data 

Unnamed: 0,id,query,id_sub,reply,label
0,0,采荷一小是分校吧,0,杭州市采荷第一小学钱江苑校区，杭州市钱江新城实验学校。,1
1,0,采荷一小是分校吧,1,是的,0
2,0,采荷一小是分校吧,2,这是5楼,0
3,1,毛坯吗？,0,因为公积金贷款贷的少,0
4,1,毛坯吗？,1,是呢,0
...,...,...,...,...,...
21580,5998,您好，我正在看尚林家园的房子,1,有啊,0
21581,5998,您好，我正在看尚林家园的房子,2,我带你看看,0
21582,5999,今天可以安排看房子吗？,0,我约下房东，稍后回你,1
21583,5999,今天可以安排看房子吗？,1,可以看，你几点有时间过来呢？,1


In [6]:
df_test

Unnamed: 0,id,query,id_sub,reply,label
0,0,东区西区？什么时候下证？,0,我在给你发套,666
1,0,东区西区？什么时候下证？,1,您看下我发的这几套,666
2,0,东区西区？什么时候下证？,2,这两套也是金源花园的,666
3,0,东区西区？什么时候下证？,3,价钱低,666
4,0,东区西区？什么时候下证？,4,便宜的房子，一般都是顶楼,666
...,...,...,...,...,...
53752,13998,这套房子有啥问题吗 我看价格不高,3,租约还有两年,666
53753,13998,这套房子有啥问题吗 我看价格不高,4,都有学位的,666
53754,13999,我看看时间吧,0,没有呢,666
53755,13999,我看看时间吧,1,今天新上的,666


In [7]:
class DataPrecessForSentence(Dataset):
    """
    对文本进行处理
    """
    def __init__(self, bert_tokenizer, df, input_categories,max_char_len = 103):
        """
        bert_tokenizer :分词器
        file     :语料文件
        """
        self.bert_tokenizer = bert_tokenizer
        self.max_seq_len = max_char_len
        self.seqs, self.seq_masks, self.seq_segments, self.labels = self.get_input(df,input_categories, self.bert_tokenizer, self.max_seq_len)
        
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.seqs[idx], self.seq_masks[idx], self.seq_segments[idx], self.labels[idx]
    
    def _convert_to_transformer_inputs(self,question, answer, tokenizer, max_sequence_length):
        def return_id(str1, str2, truncation_strategy, length):

            inputs = tokenizer.encode_plus(str1, str2,
                add_special_tokens=True,
                max_length=length,
                truncation_strategy=truncation_strategy,
                #truncation=True
                )

            input_ids =  inputs["input_ids"]
            input_masks = [1] * len(input_ids)
            input_segments = inputs["token_type_ids"]
            padding_length = length - len(input_ids)
            padding_id = tokenizer.pad_token_id
            input_ids = input_ids + ([padding_id] * padding_length)
            input_masks = input_masks + ([0] * padding_length)
            input_segments = input_segments + ([0] * padding_length)

            return [input_ids, input_masks, input_segments]
    
        input_ids_q, input_masks_q, input_segments_q = return_id(
            question, answer, 'longest_first', max_sequence_length)

        return [input_ids_q, input_masks_q, input_segments_q]
        
    # 获取文本与标签
    def get_input(self, df,columns, tokenizer, max_sequence_length,test=False):

        input_ids_q, input_masks_q, input_segments_q = [], [], []
        input_ids_a, input_masks_a, input_segments_a = [], [], []
        for _, instance in tqdm(df[columns].iterrows()):
            query,reply = instance.query, instance.reply

            ids_q, masks_q, segments_q= \
            self._convert_to_transformer_inputs(query, reply, tokenizer, max_sequence_length)

            input_ids_q.append(ids_q)
            input_masks_q.append(masks_q)
            input_segments_q.append(segments_q)
            
        labels = df['label'].values
        return torch.Tensor(input_ids_q).type(torch.long),torch.Tensor(input_masks_q).type(torch.long),torch.Tensor(input_segments_q).type(torch.long),torch.Tensor(labels).type(torch.long)


In [8]:
class BertwwmModel(nn.Module):
    def __init__(self,dropout=0.5,num_classes=2):
        super(BertwwmModel,self).__init__()
        config = BertConfig.from_pretrained('./preTrainModel/chinese-roberta-wwm-ext-large/') 
        config.output_hidden_states = False 
        self.bertwwm = BertModel.from_pretrained('./preTrainModel/chinese-roberta-wwm-ext-large/', 
                                             config=config)
        self.dropout=dropout
        self.device=torch.device("cuda")
        self.num_classes = num_classes
        self.linear = nn.Linear(1024*4, num_classes)
        for param in self.bertwwm.parameters():
            param.requires_grad=True
    
    def forward(self, q_id, q_mask, q_atn):
        q_embedding = self.bertwwm(q_id, attention_mask=q_mask, token_type_ids=q_atn)[0]
        q = nn.AdaptiveAvgPool2d((1,q_embedding.shape[-1]))(q_embedding).squeeze(1)
        a = nn.AdaptiveMaxPool2d((1,q_embedding.shape[-1]))(q_embedding).squeeze(1)
        t = q_embedding[:,-1]
        e = q_embedding[:, 0]
        merged = torch.cat([q, a, t, e], dim=1)
        x = nn.Dropout(self.dropout)(merged)
        logits=self.linear(x)
        probabilities =F.softmax(logits, dim=-1)
        return logits,probabilities
    

In [9]:
#FGM:对抗性训练

class FGM():
    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self, epsilon=1e-6, emb_name='bertwwm.embeddings.'):
        # emb_name这个参数要换成你模型中embedding的参数名
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='bertwwm.embeddings.'):
        # emb_name这个参数要换成你模型中embedding的参数名
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name: 
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}
        
    def attack_multi_emd(self, epsilon=1e-6, emd_names = ['bertwwm.embeddings.']):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                is_update = False
                for emd_name in emd_names:
                    if emd_name in name:
                        is_update = True
                        break
                if is_update:
                    self.backup[name] = param.data.clone()
                    norm = torch.norm(param.grad)
                    if norm != 0 and not torch.isnan(norm):
                        r_at = epsilon * param.grad / norm
                        param.data.add_(r_at)

    def restore_multi_emd(self, emd_names = ['bertwwm.embeddings.']):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                is_update = False
                for emd_name in emd_names:
                    if emd_name in name:
                        is_update = True
                        break
                if is_update:
                    assert name in self.backup
                    param.data = self.backup[name]
        self.backup = {}

In [10]:
#focalloss计算损失
#gamma 2,alpha 0.25

class FocalLoss(nn.Module):
    def __init__(self, gamma=0, alpha=None, size_average=True):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        if isinstance(alpha,(float,int)): self.alpha = torch.Tensor([alpha,1-alpha])
        if isinstance(alpha,list): self.alpha = torch.Tensor(alpha)
        self.size_average = size_average

    def forward(self, input, target):
        if input.dim()>2:
            input = input.view(input.size(0),input.size(1),-1)  # N,C,H,W => N,C,H*W
            input = input.transpose(1,2)    # N,C,H*W => N,H*W,C
            input = input.contiguous().view(-1,input.size(2))   # N,H*W,C => N*H*W,C
        target = target.view(-1,1)

        logpt = F.log_softmax(input)
        logpt = logpt.gather(1,target)
        logpt = logpt.view(-1)
        pt = Variable(logpt.data.exp())

        if self.alpha is not None:
            if self.alpha.type()!=input.data.type():
                self.alpha = self.alpha.type_as(input.data)
            at = self.alpha.gather(0,target.data.view(-1))
            logpt = logpt * Variable(at)

        loss = -1 * (1-pt)**self.gamma * logpt
        if self.size_average: return loss.mean()
        else: return loss.sum()

In [11]:
def correct_predictions(output_probabilities, targets):

    _, out_classes = output_probabilities.max(dim=1)
    correct = (out_classes == targets).sum()
    return correct.item()


def train(model, fgm,dataloader,optimizer, criterion,epoch_number, max_gradient_norm):

    # Switch the model to train mode.
    model.train()
    device = model.device
    epoch_start = time.time()
    batch_time_avg = 0.0
    running_loss = 0.0
    correct_preds = 0
    tqdm_batch_iterator = tqdm(dataloader)
    for batch_index, (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in enumerate(tqdm_batch_iterator):
        batch_start = time.time()
        # Move input and output data to the GPU if it is used.
        seqs, masks, segments, labels = batch_seqs.to(device), batch_seq_masks.to(device), batch_seq_segments.to(device), batch_labels.to(device)
        optimizer.zero_grad()
        # 正常训练
        logits, probs  = model(seqs, masks, segments)
        loss = criterion(logits, labels)
        loss.backward()
        
        # 对抗训练
#         fgm.attack() # 在embedding上添加对抗扰动
#         logits_adv, probs_adv = model(seqs, masks, segments)
#         loss_adv = criterion(logits_adv, labels)
#         loss_adv.backward() # 反向传播，并在正常的grad基础上，累加对抗训练的梯度
#         fgm.restore() # 恢复embedding参数
        
        nn.utils.clip_grad_norm_(model.parameters(), max_gradient_norm)
        optimizer.step()
        batch_time_avg += time.time() - batch_start
        running_loss += loss.item()
        correct_preds += correct_predictions(probs, labels)
        description = "Avg. batch proc. time: {:.4f}s, loss: {:.4f}"\
                      .format(batch_time_avg/(batch_index+1), running_loss/(batch_index+1))
        tqdm_batch_iterator.set_description(description)
    epoch_time = time.time() - epoch_start
    epoch_loss = running_loss / len(dataloader)
    epoch_accuracy = correct_preds / len(dataloader.dataset)
    return epoch_time, epoch_loss, epoch_accuracy


def validate(model, dataloader, criterion):

    # Switch to evaluate mode.
    model.eval()
    device = model.device
    epoch_start = time.time()
    running_loss = 0.0
    running_accuracy = 0.0
    all_prob = []
    all_labels = []
    # Deactivate autograd for evaluation.
    with torch.no_grad():
        for (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in dataloader:
            # Move input and output data to the GPU if one is used.
            seqs = batch_seqs.to(device)
            masks = batch_seq_masks.to(device)
            segments = batch_seq_segments.to(device)
            labels = batch_labels.to(device)
            logits, probs = model(seqs, masks, segments)
            loss = criterion(logits, labels)
            running_loss += loss.item()
            running_accuracy += correct_predictions(probs, labels)
            all_prob.extend(probs[:,1].cpu().numpy())
            all_labels.extend(batch_labels)
    epoch_time = time.time() - epoch_start
    epoch_loss = running_loss / len(dataloader)
    epoch_accuracy = running_accuracy / (len(dataloader.dataset))
    return epoch_time, epoch_loss, epoch_accuracy, roc_auc_score(all_labels, all_prob)



def test(model, dataloader):
    # Switch the model to eval mode.
    label_res=[]
    model.eval()
    device = model.device
    time_start = time.time()
    batch_time = 0.0
    
    # Deactivate autograd for evaluation.
    with torch.no_grad():
        for (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in dataloader:
            batch_start = time.time()
            # Move input and output data to the GPU if one is used.
            seqs, masks, segments, labels = batch_seqs.to(device), batch_seq_masks.to(device), batch_seq_segments.to(device), batch_labels.to(device)
            _, probabilities = model(seqs, masks, segments)
            _, out_classes = probabilities.max(dim=1)
#             print(out_classes)
            label_res.extend(out_classes.cpu().numpy())
            batch_time += time.time() - batch_start

    batch_time /= len(dataloader)
    total_time = time.time() - time_start
#     accuracy /= (len(dataloader.dataset))
    return batch_time, total_time,label_res

In [12]:
input_categories = ['query','reply']
output_categories = 'label'
MAX_SEQUENCE_LENGTH = 100
batch_size=16
print("\t* Loading test data...")
test_data = DataPrecessForSentence(bertwwm_tokenizer,df_test,input_categories,MAX_SEQUENCE_LENGTH)
test_loader = DataLoader(test_data, batch_size=batch_size)

145it [00:00, 1445.78it/s]

	* Loading test data...


53757it [00:35, 1503.79it/s]


In [13]:
#N折交叉验证

gkf = GroupKFold(n_splits=5).split(X=train_data.reply, groups=train_data.id)

valid_preds = [0,0,0,0,0]
test_preds = [0,0,0,0,0]

batch_size=16
epochs=3
lr=2e-05
patience=3
max_grad_norm=10.0

# criterion = nn.CrossEntropyLoss()

criterion = FocalLoss(gamma=0)
oof = np.zeros((len(train_data),1))
for fold, (train_idx, valid_idx) in enumerate(gkf):
    dev_res=[]
    test_res=[]
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []
    #训练集
    t_data = DataPrecessForSentence(bertwwm_tokenizer, train_data.iloc[train_idx],input_categories,MAX_SEQUENCE_LENGTH)
    train_loader = DataLoader(t_data, shuffle=True, batch_size=batch_size)
    #验证集
    d_data = DataPrecessForSentence(bertwwm_tokenizer, train_data.iloc[valid_idx],input_categories,MAX_SEQUENCE_LENGTH)
    dev_loader = DataLoader(d_data, shuffle=False, batch_size=batch_size)
    #开始训练
    # -------------------- Model definition ------------------- #
    print("\t* Building model:{}...".format(fold))
    model = BertwwmModel().to(device)
    
    #初始化FGM
    fgm = FGM(model)
    
    # 待优化的参数
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
            {
                    'params':[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                    'weight_decay':0.01
            },
            {
                    'params':[p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                    'weight_decay':0.0
            }
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    
#     optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
    
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", 
                                                               factor=0.85, patience=0)
    
    print("\n", 20 * "=", "Training Albert model on device: {},fold:{}".format(device,fold), 20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model,fgm, train_loader, optimizer, criterion, epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy*100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy , epoch_auc= validate(model, dev_loader,criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy*100), epoch_auc))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1 
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            
            batch_time, total_time, dev_res= test(model, dev_loader)
            oof[valid_idx] =[[i] for i in dev_res]
            valid_preds[fold]=dev_res
            batch_time, total_time, test_res=test(model, test_loader)
            test_preds[fold]=test_res
#             f1,t = search_f1(valid_outputs, valid_preds[-1])
#             print('validation score = ', f1)
        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break


17268it [00:10, 1667.52it/s]
4317it [00:02, 1680.35it/s]


	* Building model:0...


  0%|          | 0/1080 [00:00<?, ?it/s]


* Training epoch 1:


Avg. batch proc. time: 0.8041s, loss: 0.3855: 100%|██████████| 1080/1080 [14:35<00:00,  1.23it/s]


-> Training time: 875.8299s, loss = 0.3855, accuracy: 84.4626%
* Validation for epoch 1:
-> Valid. time: 69.8246s, loss: 0.3240, accuracy: 87.6303%, auc: 0.9269



  0%|          | 0/1080 [00:00<?, ?it/s]

* Training epoch 2:


Avg. batch proc. time: 1.0876s, loss: 0.2451: 100%|██████████| 1080/1080 [19:49<00:00,  1.10s/it]


-> Training time: 1189.0196s, loss = 0.2451, accuracy: 90.5027%
* Validation for epoch 2:
-> Valid. time: 76.1443s, loss: 0.3031, accuracy: 88.4410%, auc: 0.9371



  0%|          | 0/1080 [00:00<?, ?it/s]

* Training epoch 3:


Avg. batch proc. time: 1.1221s, loss: 0.1630: 100%|██████████| 1080/1080 [20:26<00:00,  1.14s/it]


-> Training time: 1226.8280s, loss = 0.1630, accuracy: 94.0120%
* Validation for epoch 3:


118it [00:00, 1175.87it/s]

-> Valid. time: 71.6313s, loss: 0.3578, accuracy: 87.9546%, auc: 0.9353



17268it [00:54, 319.36it/s]
4317it [00:18, 233.78it/s] 


	* Building model:1...


  0%|          | 0/1080 [00:00<?, ?it/s]


* Training epoch 1:


Avg. batch proc. time: 1.0475s, loss: 0.3897: 100%|██████████| 1080/1080 [19:04<00:00,  1.06s/it]


-> Training time: 1144.4041s, loss = 0.3897, accuracy: 84.2136%
* Validation for epoch 1:
-> Valid. time: 71.3436s, loss: 0.3351, accuracy: 85.6845%, auc: 0.9227



  0%|          | 0/1080 [00:00<?, ?it/s]

* Training epoch 2:


Avg. batch proc. time: 1.1011s, loss: 0.2549: 100%|██████████| 1080/1080 [20:04<00:00,  1.12s/it]


-> Training time: 1204.6013s, loss = 0.2549, accuracy: 90.2536%
* Validation for epoch 2:
-> Valid. time: 76.6887s, loss: 0.3058, accuracy: 87.6535%, auc: 0.9301



  0%|          | 0/1080 [00:00<?, ?it/s]

* Training epoch 3:


Avg. batch proc. time: 1.1495s, loss: 0.1666: 100%|██████████| 1080/1080 [20:57<00:00,  1.16s/it]


-> Training time: 1257.7982s, loss = 0.1666, accuracy: 93.8673%
* Validation for epoch 3:


141it [00:00, 1406.24it/s]

-> Valid. time: 71.2313s, loss: 0.3554, accuracy: 87.1207%, auc: 0.9245



17268it [00:37, 461.05it/s] 
4317it [00:06, 666.45it/s] 


	* Building model:2...


  0%|          | 0/1080 [00:00<?, ?it/s]


* Training epoch 1:


Avg. batch proc. time: 1.1248s, loss: 0.3734: 100%|██████████| 1080/1080 [20:30<00:00,  1.14s/it]


-> Training time: 1230.1680s, loss = 0.3734, accuracy: 84.7058%
* Validation for epoch 1:
-> Valid. time: 82.7830s, loss: 0.3157, accuracy: 87.0280%, auc: 0.9219



  0%|          | 0/1080 [00:00<?, ?it/s]

* Training epoch 2:


Avg. batch proc. time: 1.0851s, loss: 0.2451: 100%|██████████| 1080/1080 [19:46<00:00,  1.10s/it]


-> Training time: 1186.6395s, loss = 0.2451, accuracy: 90.6416%
* Validation for epoch 2:
-> Valid. time: 74.9429s, loss: 0.3219, accuracy: 87.1438%, auc: 0.9228



  0%|          | 0/1080 [00:00<?, ?it/s]

* Training epoch 3:


Avg. batch proc. time: 0.7670s, loss: 0.1611: 100%|██████████| 1080/1080 [13:55<00:00,  1.29it/s]


-> Training time: 835.3343s, loss = 0.1611, accuracy: 93.9136%
* Validation for epoch 3:
-> Valid. time: 57.4065s, loss: 0.3458, accuracy: 88.7190%, auc: 0.9267



17268it [00:10, 1675.37it/s]
4317it [00:02, 1687.75it/s]


	* Building model:3...


  0%|          | 0/1080 [00:00<?, ?it/s]


* Training epoch 1:


Avg. batch proc. time: 0.7341s, loss: 0.4300: 100%|██████████| 1080/1080 [13:17<00:00,  1.35it/s]


-> Training time: 797.1097s, loss = 0.4300, accuracy: 81.9666%
* Validation for epoch 1:
-> Valid. time: 59.4487s, loss: 0.3050, accuracy: 88.0009%, auc: 0.9192



  0%|          | 0/1080 [00:00<?, ?it/s]

* Training epoch 2:


Avg. batch proc. time: 0.6946s, loss: 0.2768: 100%|██████████| 1080/1080 [12:34<00:00,  1.43it/s]


-> Training time: 754.9611s, loss = 0.2768, accuracy: 89.2576%
* Validation for epoch 2:


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Valid. time: 55.5610s, loss: 0.3165, accuracy: 87.7924%, auc: 0.9354

* Training epoch 3:


Avg. batch proc. time: 0.7029s, loss: 0.1839: 100%|██████████| 1080/1080 [12:43<00:00,  1.41it/s]


-> Training time: 763.6416s, loss = 0.1839, accuracy: 93.3287%
* Validation for epoch 3:
-> Valid. time: 58.5930s, loss: 0.3036, accuracy: 89.5298%, auc: 0.9361



17268it [00:11, 1536.28it/s]
4317it [00:02, 1613.13it/s]


	* Building model:4...


  0%|          | 0/1080 [00:00<?, ?it/s]


* Training epoch 1:


Avg. batch proc. time: 0.7468s, loss: 0.3660: 100%|██████████| 1080/1080 [13:31<00:00,  1.33it/s]


-> Training time: 811.2535s, loss = 0.3660, accuracy: 84.8216%
* Validation for epoch 1:
-> Valid. time: 59.9028s, loss: 0.3130, accuracy: 87.4913%, auc: 0.9208



  0%|          | 0/1080 [00:00<?, ?it/s]

* Training epoch 2:


Avg. batch proc. time: 0.6912s, loss: 0.2341: 100%|██████████| 1080/1080 [12:30<00:00,  1.44it/s]


-> Training time: 750.7724s, loss = 0.2341, accuracy: 90.7980%
* Validation for epoch 2:
-> Valid. time: 57.7384s, loss: 0.3132, accuracy: 88.7653%, auc: 0.9299



  0%|          | 0/1080 [00:00<?, ?it/s]

* Training epoch 3:


Avg. batch proc. time: 0.7675s, loss: 0.1455: 100%|██████████| 1080/1080 [13:54<00:00,  1.29it/s]


-> Training time: 834.3293s, loss = 0.1455, accuracy: 94.7823%
* Validation for epoch 3:
-> Valid. time: 58.8226s, loss: 0.3554, accuracy: 88.5569%, auc: 0.9278



In [14]:
from sklearn.metrics import f1_score
def search_f1(y_true, y_pred):
    best = 0
    best_t = 0
    for i in range(30,60):
        tres = i / 100
        y_pred_bin =  (y_pred > tres).astype(int)
        score = f1_score(y_true, y_pred_bin)
        if score > best:
            best = score
            best_t = tres
    print('best', best)
    print('thres', best_t)
    return best, best_t

def compute_output_arrays(df, columns):
    return np.asarray(df[columns])

In [15]:
outputs = compute_output_arrays(train_data, output_categories)
best_score, best_t = search_f1(outputs,oof)
sub = np.average(test_preds, axis=0)
sub = sub > best_t

best 0.7637795275590551
thres 0.3


In [19]:
df_test['label'] = sub.astype(int)
df_test[['id','id_sub','label']].to_csv('./submission_file/submission_roberta_large_new.csv',index=False, header=None,sep='\t')

In [17]:
# sub = np.average(test_preds, axis=0)
# sub = sub > 0.5

In [18]:
# df_test['label'] = sub.astype(int)
# df_test[['id','id_sub','label']].to_csv('./submission_file/submission_bert_wwm_focalloss_fgm.csv',index=False, header=None,sep='\t')