In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from transformers import *
from transformers.optimization import AdamW
import os
import time
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

In [2]:
bertwwm_tokenizer =BertTokenizer.from_pretrained('./preTrainModel/chinese-bert-wwm-ext/')
device=torch.device("cuda")
target_dir='./models/'
if not os.path.exists(target_dir):
    os.makedirs(target_dir)

In [3]:
train_left = pd.read_csv('./train/train.query.tsv',sep='\t',header=None)
train_left.columns=['id','query']
train_right = pd.read_csv('./train/train.reply.tsv',sep='\t',header=None)
train_right.columns=['id','id_sub','reply','label']
train_data = train_left.merge(train_right, how='left')
train_data['reply'] = train_data['reply'].fillna('好的')

In [4]:
test_left = pd.read_csv('./test/test.query.tsv',sep='\t',header=None, encoding='gbk')
test_left.columns = ['id','query']
test_right =  pd.read_csv('./test/test.reply.tsv',sep='\t',header=None, encoding='gbk')
test_right.columns=['id','id_sub','reply']
df_test = test_left.merge(test_right, how='left')
df_test['label']=666

In [5]:
train_data 

Unnamed: 0,id,query,id_sub,reply,label
0,0,采荷一小是分校吧,0,杭州市采荷第一小学钱江苑校区，杭州市钱江新城实验学校。,1
1,0,采荷一小是分校吧,1,是的,0
2,0,采荷一小是分校吧,2,这是5楼,0
3,1,毛坯吗？,0,因为公积金贷款贷的少,0
4,1,毛坯吗？,1,是呢,0
...,...,...,...,...,...
21580,5998,您好，我正在看尚林家园的房子,1,有啊,0
21581,5998,您好，我正在看尚林家园的房子,2,我带你看看,0
21582,5999,今天可以安排看房子吗？,0,我约下房东，稍后回你,1
21583,5999,今天可以安排看房子吗？,1,可以看，你几点有时间过来呢？,1


In [6]:
df_test

Unnamed: 0,id,query,id_sub,reply,label
0,0,东区西区？什么时候下证？,0,我在给你发套,666
1,0,东区西区？什么时候下证？,1,您看下我发的这几套,666
2,0,东区西区？什么时候下证？,2,这两套也是金源花园的,666
3,0,东区西区？什么时候下证？,3,价钱低,666
4,0,东区西区？什么时候下证？,4,便宜的房子，一般都是顶楼,666
...,...,...,...,...,...
53752,13998,这套房子有啥问题吗 我看价格不高,3,租约还有两年,666
53753,13998,这套房子有啥问题吗 我看价格不高,4,都有学位的,666
53754,13999,我看看时间吧,0,没有呢,666
53755,13999,我看看时间吧,1,今天新上的,666


In [7]:
class DataPrecessForSentence(Dataset):
    """
    对文本进行处理
    """
    def __init__(self, bert_tokenizer, df, input_categories,max_char_len = 103):
        """
        bert_tokenizer :分词器
        file     :语料文件
        """
        self.bert_tokenizer = bert_tokenizer
        self.max_seq_len = max_char_len
        self.seqs, self.seq_masks, self.seq_segments, self.labels = self.get_input(df,input_categories, self.bert_tokenizer, self.max_seq_len)
        
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.seqs[idx], self.seq_masks[idx], self.seq_segments[idx], self.labels[idx]
    
    def _convert_to_transformer_inputs(self,question, answer, tokenizer, max_sequence_length):
        def return_id(str1, str2, truncation_strategy, length):

            inputs = tokenizer.encode_plus(str1, str2,
                add_special_tokens=True,
                max_length=length,
                truncation_strategy=truncation_strategy,
                #truncation=True
                )

            input_ids =  inputs["input_ids"]
            input_masks = [1] * len(input_ids)
            input_segments = inputs["token_type_ids"]
            padding_length = length - len(input_ids)
            padding_id = tokenizer.pad_token_id
            input_ids = input_ids + ([padding_id] * padding_length)
            input_masks = input_masks + ([0] * padding_length)
            input_segments = input_segments + ([0] * padding_length)

            return [input_ids, input_masks, input_segments]
    
        input_ids_q, input_masks_q, input_segments_q = return_id(
            question, answer, 'longest_first', max_sequence_length)

        return [input_ids_q, input_masks_q, input_segments_q]
        
    # 获取文本与标签
    def get_input(self, df,columns, tokenizer, max_sequence_length,test=False):
        """
        通对输入文本进行分词、ID化、截断、填充等流程得到最终的可用于模型输入的序列。
        入参:
            dataset     : pandas的dataframe格式，包含三列，第一,二列为文本，第三列为标签。标签取值为{0,1}，其中0表示负样本，1代表正样本。
            max_seq_len : 目标序列长度，该值需要预先对文本长度进行分别得到，可以设置为小于等于512（BERT的最长文本序列长度为512）的整数。
        出参:
            seq         : 在入参seq的头尾分别拼接了'CLS'与'SEP'符号，如果长度仍小于max_seq_len，则使用0在尾部进行了填充。
            seq_mask    : 只包含0、1且长度等于seq的序列，用于表征seq中的符号是否是有意义的，如果seq序列对应位上为填充符号，
                          那么取值为1，否则为0。
            seq_segment : shape等于seq，因为是单句，所以取值都为0。
            labels      : 标签取值为{0,1}，其中0表示负样本，1代表正样本。
        """
#         df = pd.read_csv(file)
#         sentences_1 = map(HanziConv.toSimplified, df['query'].values)
#         sentences_2 = map(HanziConv.toSimplified, df['reply'].values)
#         labels = df['label'].values
#         # 切词
#         tokens_seq_1 = list(map(self.bert_tokenizer.tokenize, sentences_1))
#         tokens_seq_2 = list(map(self.bert_tokenizer.tokenize, sentences_2))
#         # 获取定长序列及其mask
#         result = list(map(self.trunate_and_pad, tokens_seq_1, tokens_seq_2))
#         seqs = [i[0] for i in result]
#         seq_masks = [i[1] for i in result]
#         seq_segments = [i[2] for i in result]
#         return torch.Tensor(seqs).type(torch.long), torch.Tensor(seq_masks).type(torch.long), torch.Tensor(seq_segments).type(torch.long), torch.Tensor(labels).type(torch.long)
        input_ids_q, input_masks_q, input_segments_q = [], [], []
        input_ids_a, input_masks_a, input_segments_a = [], [], []
        for _, instance in tqdm(df[columns].iterrows()):
            query,reply = instance.query, instance.reply

            ids_q, masks_q, segments_q= \
            self._convert_to_transformer_inputs(query, reply, tokenizer, max_sequence_length)

            input_ids_q.append(ids_q)
            input_masks_q.append(masks_q)
            input_segments_q.append(segments_q)
            
        labels = df['label'].values
        return torch.Tensor(input_ids_q).type(torch.long),torch.Tensor(input_masks_q).type(torch.long),torch.Tensor(input_segments_q).type(torch.long),torch.Tensor(labels).type(torch.long)
#         return [np.asarray(input_ids_q, dtype=np.int32), 
#                 np.asarray(input_masks_q, dtype=np.int32), 
#                 np.asarray(input_segments_q, dtype=np.int32)]
        
    
#     def trunate_and_pad(self, tokens_seq_1, tokens_seq_2):
#         """
#         1. 如果是单句序列，按照BERT中的序列处理方式，需要在输入序列头尾分别拼接特殊字符'CLS'与'SEP'，
#            因此不包含两个特殊字符的序列长度应该小于等于max_seq_len-2，如果序列长度大于该值需要那么进行截断。
#         2. 对输入的序列 最终形成['CLS',seq,'SEP']的序列，该序列的长度如果小于max_seq_len，那么使用0进行填充。
#         入参: 
#             seq_1       : 输入序列，在本处其为单个句子。
#             seq_2       : 输入序列，在本处其为单个句子。
#             max_seq_len : 拼接'CLS'与'SEP'这两个特殊字符后的序列长度
        
#         出参:
#             seq         : 在入参seq的头尾分别拼接了'CLS'与'SEP'符号，如果长度仍小于max_seq_len，则使用0在尾部进行了填充。
#             seq_mask    : 只包含0、1且长度等于seq的序列，用于表征seq中的符号是否是有意义的，如果seq序列对应位上为填充符号，
#                           那么取值为1，否则为0。
#             seq_segment : shape等于seq，单句，取值都为0 ，双句按照01切分
           
#         """
#         # 对超长序列进行截断
#         if len(tokens_seq_1) > ((self.max_seq_len - 3)//2):
#             tokens_seq_1 = tokens_seq_1[0:(self.max_seq_len - 3)//2]
#         if len(tokens_seq_2) > ((self.max_seq_len - 3)//2):
#             tokens_seq_2 = tokens_seq_2[0:(self.max_seq_len - 3)//2]
#         # 分别在首尾拼接特殊符号
#         seq = ['[CLS]'] + tokens_seq_1 + ['[SEP]'] + tokens_seq_2 + ['[SEP]']
#         seq_segment = [0] * (len(tokens_seq_1) + 2) + [1] * (len(tokens_seq_2) + 1)
#         # ID化
#         seq = self.bert_tokenizer.convert_tokens_to_ids(seq)
#         # 根据max_seq_len与seq的长度产生填充序列
#         padding = [0] * (self.max_seq_len - len(seq))
#         # 创建seq_mask
#         seq_mask = [1] * len(seq) + padding
#         # 创建seq_segment
#         seq_segment = seq_segment + padding
#         # 对seq拼接填充序列
#         seq += padding
#         assert len(seq) == self.max_seq_len
#         assert len(seq_mask) == self.max_seq_len
#         assert len(seq_segment) == self.max_seq_len
#         return seq, seq_mask, seq_segment

In [8]:
class BertwwmModel(nn.Module):
    def __init__(self,dropout=0.5,num_classes=2):
        super(BertwwmModel,self).__init__()
#       self.albert=AlbertForSequenceClassification.from_pretrained('./preTrainModel/albert_chinese_base/')
        config = BertConfig.from_pretrained('./preTrainModel/chinese-bert-wwm-ext/') 
        config.output_hidden_states = False 
        self.bertwwm = BertModel.from_pretrained('./preTrainModel/chinese-bert-wwm-ext/', 
                                             config=config)
        self.dropout=dropout
        self.device=torch.device("cuda")
        self.num_classes = num_classes
        self.linear = nn.Linear(768*4, num_classes)
        for param in self.bertwwm.parameters():
            param.requires_grad=True
    
    def forward(self, q_id, q_mask, q_atn):
        q_embedding = self.bertwwm(q_id, attention_mask=q_mask, token_type_ids=q_atn)[0]
#         q = tf.keras.layers.GlobalAveragePooling1D()(q_embedding) 
        q = nn.AdaptiveAvgPool2d((1,q_embedding.shape[-1]))(q_embedding).squeeze(1)
#         a = tf.keras.layers.GlobalMaxPooling1D()(q_embedding)
        a = nn.AdaptiveMaxPool2d((1,q_embedding.shape[-1]))(q_embedding).squeeze(1)
        t = q_embedding[:,-1]
        e = q_embedding[:, 0]
#         x = tf.keras.layers.Concatenate()([q, a,t,e])
        merged = torch.cat([q, a, t, e], dim=1)
#         print(merged.shape)
        x = nn.Dropout(self.dropout)(merged)
#         x = tf.keras.layers.Dense(1, activation='sigmoid')(x)
        logits=self.linear(x)
#         print(logits.shape)
        probabilities =F.softmax(logits, dim=-1)
        return logits,probabilities
    
    
# # -------------------- Model definition ------------------- #
# print("\t* Building model...")
# model = AlbertModel().to(device)

In [9]:


# PATH='./'
# train_file = PATH+'train.csv'
# dev_file = PATH+'dev.csv'
# test_file=PATH+'test.csv'
# test_file = PATH+'test.csv'
# bert_tokenizer, df, input_categories,max_char_len = 103
# input_categories = ['query','reply']
# output_categories = 'label'
# MAX_SEQUENCE_LENGTH = 100
# print("\t* Loading training data...")
# df_train = pd.read_csv(train_file)
# df_dev = pd.read_csv(dev_file)
# inputs = DataPrecessForSentence(albert_tokenizer, train_data,input_categories,MAX_SEQUENCE_LENGTH)
# train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
# print("\t* Loading validation data...")
# dev_data = DataPrecessForSentence(albert_tokenizer,df_dev,input_categories,MAX_SEQUENCE_LENGTH)
# dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)

In [10]:
# len(inputs)

In [11]:
# # -------------------- Model definition ------------------- #
# print("\t* Building model...")
# model = AlbertModel().to(device)

In [None]:
class FocalLoss(nn.Module):
    def __init__(self, gamma=0, alpha=None, size_average=True):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        if isinstance(alpha,(float,int)): self.alpha = torch.Tensor([alpha,1-alpha])
        if isinstance(alpha,list): self.alpha = torch.Tensor(alpha)
        self.size_average = size_average

    def forward(self, input, target):
        if input.dim()>2:
            input = input.view(input.size(0),input.size(1),-1)  # N,C,H,W => N,C,H*W
            input = input.transpose(1,2)    # N,C,H*W => N,H*W,C
            input = input.contiguous().view(-1,input.size(2))   # N,H*W,C => N*H*W,C
        target = target.view(-1,1)

        logpt = F.log_softmax(input)
        logpt = logpt.gather(1,target)
        logpt = logpt.view(-1)
        pt = Variable(logpt.data.exp())

        if self.alpha is not None:
            if self.alpha.type()!=input.data.type():
                self.alpha = self.alpha.type_as(input.data)
            at = self.alpha.gather(0,target.data.view(-1))
            logpt = logpt * Variable(at)

        loss = -1 * (1-pt)**self.gamma * logpt
        if self.size_average: return loss.mean()
        else: return loss.sum()

In [12]:
def correct_predictions(output_probabilities, targets):

    _, out_classes = output_probabilities.max(dim=1)
    correct = (out_classes == targets).sum()
    return correct.item()


def train(model, dataloader,optimizer, criterion,epoch_number, max_gradient_norm):

    # Switch the model to train mode.
    model.train()
    device = model.device
    epoch_start = time.time()
    batch_time_avg = 0.0
    running_loss = 0.0
    correct_preds = 0
    tqdm_batch_iterator = tqdm(dataloader)
    for batch_index, (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in enumerate(tqdm_batch_iterator):
        batch_start = time.time()
        # Move input and output data to the GPU if it is used.
        seqs, masks, segments, labels = batch_seqs.to(device), batch_seq_masks.to(device), batch_seq_segments.to(device), batch_labels.to(device)
        optimizer.zero_grad()
        logits, probs  = model(seqs, masks, segments)
        loss = criterion(logits, labels)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_gradient_norm)
        optimizer.step()
        batch_time_avg += time.time() - batch_start
        running_loss += loss.item()
        correct_preds += correct_predictions(probs, labels)
        description = "Avg. batch proc. time: {:.4f}s, loss: {:.4f}"\
                      .format(batch_time_avg/(batch_index+1), running_loss/(batch_index+1))
        tqdm_batch_iterator.set_description(description)
    epoch_time = time.time() - epoch_start
    epoch_loss = running_loss / len(dataloader)
    epoch_accuracy = correct_preds / len(dataloader.dataset)
    return epoch_time, epoch_loss, epoch_accuracy


def validate(model, dataloader, criterion):

    # Switch to evaluate mode.
    model.eval()
    device = model.device
    epoch_start = time.time()
    running_loss = 0.0
    running_accuracy = 0.0
    all_prob = []
    all_labels = []
    # Deactivate autograd for evaluation.
    with torch.no_grad():
        for (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in dataloader:
            # Move input and output data to the GPU if one is used.
            seqs = batch_seqs.to(device)
            masks = batch_seq_masks.to(device)
            segments = batch_seq_segments.to(device)
            labels = batch_labels.to(device)
            logits, probs = model(seqs, masks, segments)
            loss = criterion(logits, labels)
            running_loss += loss.item()
            running_accuracy += correct_predictions(probs, labels)
            all_prob.extend(probs[:,1].cpu().numpy())
            all_labels.extend(batch_labels)
    epoch_time = time.time() - epoch_start
    epoch_loss = running_loss / len(dataloader)
    epoch_accuracy = running_accuracy / (len(dataloader.dataset))
    return epoch_time, epoch_loss, epoch_accuracy, roc_auc_score(all_labels, all_prob)



def test(model, dataloader):
    # Switch the model to eval mode.
    label_res=[]
    model.eval()
    device = model.device
    time_start = time.time()
    batch_time = 0.0
    
    # Deactivate autograd for evaluation.
    with torch.no_grad():
        for (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in dataloader:
            batch_start = time.time()
            # Move input and output data to the GPU if one is used.
            seqs, masks, segments, labels = batch_seqs.to(device), batch_seq_masks.to(device), batch_seq_segments.to(device), batch_labels.to(device)
            _, probabilities = model(seqs, masks, segments)
            _, out_classes = probabilities.max(dim=1)
#             print(out_classes)
            label_res.extend(out_classes.cpu().numpy())
            batch_time += time.time() - batch_start

    batch_time /= len(dataloader)
    total_time = time.time() - time_start
#     accuracy /= (len(dataloader.dataset))
    return batch_time, total_time,label_res

In [13]:
# best_score = 0.0
# start_epoch = 1
# # Data for loss curves plot
# epochs_count = []
# train_losses = []
# valid_losses = []
# criterion = nn.CrossEntropyLoss()
# _, valid_loss, valid_accuracy, auc = validate(model, dev_loader,criterion)
# print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}".format(valid_loss, (valid_accuracy*100), auc))

In [14]:
# # -------------------- Training epochs ------------------- #
# print("\n", 20 * "=", "Training Albert model on device: {}".format(device), 20 * "=")
# patience_counter = 0
# for epoch in range(start_epoch, epochs + 1):
#     epochs_count.append(epoch)
#     print("* Training epoch {}:".format(epoch))
#     epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader, optimizer, criterion, epoch, max_grad_norm)
#     train_losses.append(epoch_loss)
#     print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
#           .format(epoch_time, epoch_loss, (epoch_accuracy*100)))
#     print("* Validation for epoch {}:".format(epoch))
#     epoch_time, epoch_loss, epoch_accuracy , epoch_auc= validate(model, dev_loader,criterion)
#     valid_losses.append(epoch_loss)
#     print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}\n"
#           .format(epoch_time, epoch_loss, (epoch_accuracy*100), epoch_auc))
#     # Update the optimizer's learning rate with the scheduler.
#     scheduler.step(epoch_accuracy)
#     # Early stopping on validation accuracy.
#     if epoch_accuracy < best_score:
#         patience_counter += 1
#     else:
#         best_score = epoch_accuracy
#         patience_counter = 0
#         torch.save({"epoch": epoch, 
#                     "model": model.state_dict(),
#                     "best_score": best_score,
#                     "epochs_count": epochs_count,
#                     "train_losses": train_losses,
#                     "valid_losses": valid_losses},
#                     os.path.join(target_dir, "pytorch_albert_best.pth.tar"))
#     if patience_counter >= patience:
#         print("-> Early stopping: patience limit reached, stopping...")
#         break

In [15]:
# label_res=[]
# def test(model, dataloader):
#     # Switch the model to eval mode.
#     model.eval()
#     device = model.device
#     time_start = time.time()
#     batch_time = 0.0
    
#     # Deactivate autograd for evaluation.
#     with torch.no_grad():
#         for (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in dataloader:
#             batch_start = time.time()
#             # Move input and output data to the GPU if one is used.
#             seqs, masks, segments, labels = batch_seqs.to(device), batch_seq_masks.to(device), batch_seq_segments.to(device), batch_labels.to(device)
#             _, probabilities = model(seqs, masks, segments)
#             _, out_classes = probabilities.max(dim=1)
# #             print(out_classes)
#             label_res.extend(out_classes.cpu().numpy())
#             batch_time += time.time() - batch_start

#     batch_time /= len(dataloader)
#     total_time = time.time() - time_start
# #     accuracy /= (len(dataloader.dataset))
#     return batch_time, total_time,label_res

In [16]:
input_categories = ['query','reply']
output_categories = 'label'
MAX_SEQUENCE_LENGTH = 100
batch_size=128
print("\t* Loading test data...")
test_data = DataPrecessForSentence(bertwwm_tokenizer,df_test,input_categories,MAX_SEQUENCE_LENGTH)
test_loader = DataLoader(test_data, batch_size=batch_size)

169it [00:00, 1684.51it/s]

	* Loading test data...


53757it [00:33, 1625.25it/s]


In [17]:
# batch_time, total_time, label_res= test(model, test_loader)

In [18]:
# input_categories = ['query','reply']
# output_categories = 'label'
# MAX_SEQUENCE_LENGTH = 100

#N折交叉验证

gkf = GroupKFold(n_splits=5).split(X=train_data.reply, groups=train_data.id)

valid_preds = [0,0,0,0,0]
test_preds = [0,0,0,0,0]

batch_size=64
epochs=3
lr=2e-05
patience=3
max_grad_norm=10.0

# criterion = nn.CrossEntropyLoss()

criterion = FocalLoss(gamma=0)
# label_res=[]
oof = np.zeros((len(train_data),1))
for fold, (train_idx, valid_idx) in enumerate(gkf):
    dev_res=[]
    test_res=[]
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []
    #训练集
    t_data = DataPrecessForSentence(bertwwm_tokenizer, train_data.iloc[train_idx],input_categories,MAX_SEQUENCE_LENGTH)
    train_loader = DataLoader(t_data, shuffle=True, batch_size=batch_size)
    #验证集
    d_data = DataPrecessForSentence(bertwwm_tokenizer, train_data.iloc[valid_idx],input_categories,MAX_SEQUENCE_LENGTH)
    dev_loader = DataLoader(d_data, shuffle=True, batch_size=batch_size)
    #开始训练
    # -------------------- Model definition ------------------- #
    print("\t* Building model:{}...".format(fold))
    model = BertwwmModel().to(device)
    
    # 待优化的参数
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
            {
                    'params':[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                    'weight_decay':0.01
            },
            {
                    'params':[p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                    'weight_decay':0.0
            }
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    
#     optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
    
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", 
                                                               factor=0.85, patience=0)
    
    print("\n", 20 * "=", "Training Albert model on device: {},fold:{}".format(device,fold), 20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader, optimizer, criterion, epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy*100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy , epoch_auc= validate(model, dev_loader,criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy*100), epoch_auc))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1 
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            
            batch_time, total_time, dev_res= test(model, dev_loader)
            oof[valid_idx] =[[i] for i in dev_res]
            valid_preds[fold]=dev_res
            batch_time, total_time, test_res=test(model, test_loader)
            test_preds[fold]=test_res
#             f1,t = search_f1(valid_outputs, valid_preds[-1])
#             print('validation score = ', f1)
#             torch.save({"epoch": epoch, 
#                         "model": model.state_dict(),
#                         "best_score": best_score,
#                         "epochs_count": epochs_count,
#                         "train_losses": train_losses,
#                         "valid_losses": valid_losses},
#                         os.path.join(target_dir, "pytorch_albert_best.pth.tar"))
        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
        
#     train_inputs = [inputs[i][train_idx] for i in range(len(inputs))]
#     train_outputs = outputs[train_idx]
#     valid_inputs = [inputs[i][valid_idx] for i in range(len(inputs))]
#     valid_outputs = outputs[valid_idx]

17268it [00:10, 1644.17it/s]
4317it [00:02, 1644.19it/s]


	* Building model:0...


  0%|          | 0/270 [00:00<?, ?it/s]


* Training epoch 1:


Avg. batch proc. time: 0.7735s, loss: 0.3836: 100%|██████████| 270/270 [03:30<00:00,  1.28it/s]


-> Training time: 210.4318s, loss = 0.3836, accuracy: 83.4260%
* Validation for epoch 1:
-> Valid. time: 19.2867s, loss: 0.2887, accuracy: 87.3987%, auc: 0.9278



  0%|          | 0/270 [00:00<?, ?it/s]

* Training epoch 2:


Avg. batch proc. time: 0.8496s, loss: 0.2562: 100%|██████████| 270/270 [03:51<00:00,  1.16it/s]


-> Training time: 231.8562s, loss = 0.2562, accuracy: 89.6630%
* Validation for epoch 2:
-> Valid. time: 20.1672s, loss: 0.2746, accuracy: 88.3252%, auc: 0.9378



  0%|          | 0/270 [00:00<?, ?it/s]

* Training epoch 3:


Avg. batch proc. time: 0.8544s, loss: 0.1885: 100%|██████████| 270/270 [03:53<00:00,  1.16it/s]


-> Training time: 233.5977s, loss = 0.1885, accuracy: 92.7322%
* Validation for epoch 3:


170it [00:00, 1697.14it/s]

-> Valid. time: 19.6511s, loss: 0.3015, accuracy: 88.0704%, auc: 0.9372



17268it [00:15, 1105.55it/s]
4317it [00:04, 1033.61it/s]


	* Building model:1...


  0%|          | 0/270 [00:00<?, ?it/s]


* Training epoch 1:


Avg. batch proc. time: 0.8478s, loss: 0.3625: 100%|██████████| 270/270 [03:52<00:00,  1.16it/s]


-> Training time: 232.0757s, loss = 0.3625, accuracy: 84.7985%
* Validation for epoch 1:
-> Valid. time: 19.7465s, loss: 0.3013, accuracy: 87.3292%, auc: 0.9274



  0%|          | 0/270 [00:00<?, ?it/s]

* Training epoch 2:


Avg. batch proc. time: 0.8782s, loss: 0.2460: 100%|██████████| 270/270 [04:00<00:00,  1.12it/s]


-> Training time: 240.9432s, loss = 0.2460, accuracy: 90.0162%
* Validation for epoch 2:
-> Valid. time: 18.5847s, loss: 0.2889, accuracy: 88.0704%, auc: 0.9349



  0%|          | 0/270 [00:00<?, ?it/s]

* Training epoch 3:


Avg. batch proc. time: 0.8547s, loss: 0.1764: 100%|██████████| 270/270 [03:53<00:00,  1.15it/s]


-> Training time: 233.8138s, loss = 0.1764, accuracy: 93.3055%
* Validation for epoch 3:
-> Valid. time: 18.3110s, loss: 0.3204, accuracy: 88.2789%, auc: 0.9330



17268it [00:14, 1156.15it/s]
4317it [00:03, 1247.70it/s]


	* Building model:2...


  0%|          | 0/270 [00:00<?, ?it/s]


* Training epoch 1:


Avg. batch proc. time: 0.8184s, loss: 0.3596: 100%|██████████| 270/270 [03:44<00:00,  1.21it/s]


-> Training time: 224.0291s, loss = 0.3596, accuracy: 84.4394%
* Validation for epoch 1:
-> Valid. time: 17.8186s, loss: 0.2880, accuracy: 88.0009%, auc: 0.9323



  0%|          | 0/270 [00:00<?, ?it/s]

* Training epoch 2:


Avg. batch proc. time: 0.8424s, loss: 0.2369: 100%|██████████| 270/270 [03:50<00:00,  1.17it/s]


-> Training time: 230.8411s, loss = 0.2369, accuracy: 90.3753%
* Validation for epoch 2:
-> Valid. time: 18.5753s, loss: 0.2869, accuracy: 88.1399%, auc: 0.9357



  0%|          | 0/270 [00:00<?, ?it/s]

* Training epoch 3:


Avg. batch proc. time: 1.0776s, loss: 0.1772: 100%|██████████| 270/270 [04:55<00:00,  1.09s/it]


-> Training time: 295.4238s, loss = 0.1772, accuracy: 93.0276%
* Validation for epoch 3:
-> Valid. time: 29.2646s, loss: 0.3021, accuracy: 88.2789%, auc: 0.9379



17268it [00:47, 361.83it/s] 
4317it [00:08, 529.93it/s] 


	* Building model:3...


  0%|          | 0/270 [00:00<?, ?it/s]


* Training epoch 1:


Avg. batch proc. time: 1.3639s, loss: 0.3470: 100%|██████████| 270/270 [06:18<00:00,  1.40s/it]


-> Training time: 378.1935s, loss = 0.3470, accuracy: 84.9954%
* Validation for epoch 1:
-> Valid. time: 27.0962s, loss: 0.2658, accuracy: 89.3213%, auc: 0.9388



  0%|          | 0/270 [00:00<?, ?it/s]

* Training epoch 2:


Avg. batch proc. time: 1.2039s, loss: 0.2296: 100%|██████████| 270/270 [05:34<00:00,  1.24s/it]


-> Training time: 334.4795s, loss = 0.2296, accuracy: 90.6532%
* Validation for epoch 2:


  0%|          | 0/270 [00:00<?, ?it/s]

-> Valid. time: 22.7884s, loss: 0.2751, accuracy: 89.2981%, auc: 0.9395

* Training epoch 3:


Avg. batch proc. time: 1.2602s, loss: 0.1628: 100%|██████████| 270/270 [05:47<00:00,  1.29s/it]


-> Training time: 347.4081s, loss = 0.1628, accuracy: 93.5951%
* Validation for epoch 3:


7it [00:00, 69.36it/s]

-> Valid. time: 29.0279s, loss: 0.3085, accuracy: 88.6959%, auc: 0.9400



17268it [01:07, 254.24it/s] 
4317it [00:11, 373.76it/s] 


	* Building model:4...


  0%|          | 0/270 [00:00<?, ?it/s]


* Training epoch 1:


Avg. batch proc. time: 1.2491s, loss: 0.3427: 100%|██████████| 270/270 [05:44<00:00,  1.27s/it]


-> Training time: 344.1886s, loss = 0.3427, accuracy: 85.1228%
* Validation for epoch 1:
-> Valid. time: 27.3842s, loss: 0.2988, accuracy: 87.0512%, auc: 0.9288



  0%|          | 0/270 [00:00<?, ?it/s]

* Training epoch 2:


Avg. batch proc. time: 1.2000s, loss: 0.2356: 100%|██████████| 270/270 [05:32<00:00,  1.23s/it]


-> Training time: 332.3501s, loss = 0.2356, accuracy: 90.3811%
* Validation for epoch 2:
-> Valid. time: 27.5362s, loss: 0.2956, accuracy: 88.2557%, auc: 0.9344



  0%|          | 0/270 [00:00<?, ?it/s]

* Training epoch 3:


Avg. batch proc. time: 1.3358s, loss: 0.1701: 100%|██████████| 270/270 [06:09<00:00,  1.37s/it]


-> Training time: 369.2492s, loss = 0.1701, accuracy: 93.3055%
* Validation for epoch 3:
-> Valid. time: 29.2513s, loss: 0.3118, accuracy: 88.3484%, auc: 0.9336



In [28]:
from sklearn.metrics import f1_score
def search_f1(y_true, y_pred):
    best = 0
    best_t = 0
    for i in range(30,60):
        tres = i / 100
        y_pred_bin =  (y_pred > tres).astype(int)
        score = f1_score(y_true, y_pred_bin)
        if score > best:
            best = score
            best_t = tres
    print('best', best)
    print('thres', best_t)
    return best, best_t

In [22]:
def compute_output_arrays(df, columns):
    return np.asarray(df[columns])

In [23]:
outputs = compute_output_arrays(train_data, output_categories)

In [29]:
best_score, best_t = search_f1(outputs,oof)

best 0.25062286610685613
thres 0.3


In [37]:
sub = np.average(test_preds, axis=0)
sub = sub > best_t

In [38]:
df_test['label'] = sub.astype(int)
df_test[['id','id_sub','label']].to_csv('./submission_file/submission_bert_wwm.csv',index=False, header=None,sep='\t')

In [20]:
len(oof)

21585

In [21]:
len(train_data)

21585

In [31]:
len(test_preds[0])

53757