In [7]:
import os
import time

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm import tqdm
# from transformers import *
from transformers import BertConfig, BertModel, BertTokenizer
from transformers.optimization import AdamW
# Restrict this process to the GPU with index 1 (set before any CUDA call is made).
os.environ.update({"CUDA_VISIBLE_DEVICES": '1'})

In [None]:
# Tokenizer stored next to the pretrained ALBERT-small weights (loaded via the BERT tokenizer class).
albertsmall_tokenizer = BertTokenizer.from_pretrained(
    './preTrainModel/albert_chinese_small/'
)

In [3]:
# Prefer the GPU, but fall back to the CPU so the notebook can still be run
# (slowly) on a machine without CUDA instead of failing when the model is moved.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Directory that receives saved model checkpoints.
target_dir = './models/'
# exist_ok=True makes the cell idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(target_dir, exist_ok=True)

In [5]:
class DataPrecessForSentence(Dataset):
    """
    Turn (query, reply) sentence pairs into fixed-length BERT-style inputs.

    Each row of the input frame becomes one sample made of four ``torch.long``
    tensors: token ids, attention mask, segment (token type) ids and the label.
    All rows are encoded eagerly in ``__init__``.
    """
    def __init__(self, bert_tokenizer, df, input_categories, max_char_len = 103, test=False):
        """
        bert_tokenizer   : tokenizer exposing ``encode_plus`` and ``pad_token_id``
        df               : DataFrame with the text columns (and ``label`` unless ``test``)
        input_categories : columns to select from ``df``; must include ``query`` and ``reply``
        max_char_len     : ``max_length`` handed to the tokenizer and the length rows are padded to
        test             : if True, no ``label`` column is required and all-zero
                           placeholder labels are produced instead
        """
        self.bert_tokenizer = bert_tokenizer
        self.max_seq_len = max_char_len
        self.seqs, self.seq_masks, self.seq_segments, self.labels = self.get_input(
            df, input_categories, self.bert_tokenizer, self.max_seq_len, test=test)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.seqs[idx], self.seq_masks[idx], self.seq_segments[idx], self.labels[idx]

    def _convert_to_transformer_inputs(self, question, answer, tokenizer, max_sequence_length):
        """
        Encode one sentence pair and right-pad it to ``max_sequence_length``.

        Returns ``[input_ids, input_masks, input_segments]`` as plain Python lists.
        The mask is 1 on real tokens and 0 on padding; padded segment ids are 0.
        """
        encoded = tokenizer.encode_plus(
            question, answer,
            add_special_tokens=True,
            max_length=max_sequence_length,
            truncation_strategy='longest_first',
            #truncation=True
        )

        input_ids = list(encoded["input_ids"])
        input_segments = list(encoded["token_type_ids"])
        input_masks = [1] * len(input_ids)

        # A non-positive padding length multiplies to an empty list, so nothing is appended.
        padding_length = max_sequence_length - len(input_ids)
        input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
        input_masks = input_masks + [0] * padding_length
        input_segments = input_segments + [0] * padding_length

        return [input_ids, input_masks, input_segments]

    # Build the text tensors and the label tensor
    def get_input(self, df, columns, tokenizer, max_sequence_length, test=False):
        """
        Encode every row of ``df[columns]`` and return
        ``(input_ids, input_masks, input_segments, labels)`` as ``torch.long`` tensors.

        When ``test`` is True the ``label`` column is not read; a zero tensor with
        one entry per row is returned as placeholder labels.
        """
        input_ids, input_masks, input_segments = [], [], []
        for _, instance in tqdm(df[columns].iterrows(), total=len(df)):
            # Index by key: attribute access would silently pick up any Series
            # method that happened to share a column's name.
            ids, masks, segments = self._convert_to_transformer_inputs(
                instance['query'], instance['reply'], tokenizer, max_sequence_length)

            input_ids.append(ids)
            input_masks.append(masks)
            input_segments.append(segments)

        if test:
            labels = torch.zeros(len(df), dtype=torch.long)
        else:
            labels = torch.tensor(df['label'].values, dtype=torch.long)

        # Build integer tensors directly: going through torch.Tensor (float32)
        # first cannot represent every integer above 2**24 exactly.
        return (torch.tensor(input_ids, dtype=torch.long),
                torch.tensor(input_masks, dtype=torch.long),
                torch.tensor(input_segments, dtype=torch.long),
                labels)


In [6]:
# ---- Training hyper-parameters ----
# batch_size is defined exactly once; 32 is the value the loaders below have always used.
batch_size = 32
epochs = 10
lr = 2e-05
patience = 3
max_grad_norm = 10.0

# ---- Data locations ----
PATH = './'
train_file = PATH + 'train.csv'
dev_file = PATH + 'dev.csv'
test_file = PATH + 'test.csv'

# ---- Column names / sequence length ----
input_categories = ['query', 'reply']
output_categories = 'label'
MAX_SEQUENCE_LENGTH = 100

df_train = pd.read_csv(train_file)
df_dev = pd.read_csv(dev_file)

print("\t* Loading training data...")
train_data = DataPrecessForSentence(albertsmall_tokenizer, df_train, input_categories, MAX_SEQUENCE_LENGTH)
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
print("\t* Loading validation data...")
dev_data = DataPrecessForSentence(albertsmall_tokenizer, df_dev, input_categories, MAX_SEQUENCE_LENGTH)
# Evaluation does not benefit from shuffling; a fixed order keeps runs comparable.
dev_loader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)

103it [00:00, 1022.23it/s]

	* Loading training data...


18200it [00:27, 669.83it/s] 
141it [00:00, 1406.24it/s]

	* Loading validation data...


2000it [00:01, 1066.42it/s]


In [7]:
#self.seqs, self.seq_masks, self.seq_segments



In [8]:
# # 待优化的参数
# param_optimizer = list(model.named_parameters())
# no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#         {
#                 'params':[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
#                 'weight_decay':0.01
#         },
#         {
#                 'params':[p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
#                 'weight_decay':0.0
#         }
# ]
# optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", 
#                                                            factor=0.85, patience=0)

In [9]:
def correct_predictions(output_probabilities, targets):
    """
    Count how many rows of ``output_probabilities`` have their largest entry
    (along dim 1) at the index given by ``targets``.

    Returns the count as a plain Python int.
    """
    predicted = torch.max(output_probabilities, dim=1)[1]
    hits = torch.eq(predicted, targets)
    return hits.sum().item()


# def train(model, dataloader,optimizer, criterion,epoch_number, max_gradient_norm):

#     # Switch the model to train mode.
#     model.train()
# #     device = model.device
#     epoch_start = time.time()
#     batch_time_avg = 0.0
#     running_loss = 0.0
#     correct_preds = 0
#     tqdm_batch_iterator = tqdm(dataloader)
#     for batch_index, (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in enumerate(tqdm_batch_iterator):
#         batch_start = time.time()
#         # Move input and output data to the GPU if it is used.
#         seqs, masks, segments, labels = batch_seqs.cuda(), batch_seq_masks.cuda(), batch_seq_segments.cuda(), batch_labels.cuda()
#         optimizer.zero_grad()
#         logits, probs  = model(seqs, masks, segments)
#         loss = criterion(logits, labels)
#         loss.backward()
#         nn.utils.clip_grad_norm_(model.parameters(), max_gradient_norm)
#         optimizer.step()
#         batch_time_avg += time.time() - batch_start
#         running_loss += loss.item()
#         correct_preds += correct_predictions(probs, labels)
#         description = "Avg. batch proc. time: {:.4f}s, loss: {:.4f}"\
#                       .format(batch_time_avg/(batch_index+1), running_loss/(batch_index+1))
#         tqdm_batch_iterator.set_description(description)
#     epoch_time = time.time() - epoch_start
#     epoch_loss = running_loss / len(dataloader)
#     epoch_accuracy = correct_preds / len(dataloader.dataset)
#     return epoch_time, epoch_loss, epoch_accuracy


def validate(model, dataloader, criterion):
    """
    Evaluate ``model`` on every batch of ``dataloader`` without gradient tracking.

    Args:
        model: module called as ``model(seqs, masks, segments)`` and returning
            ``(logits, probabilities)``.
        dataloader: yields ``(seqs, masks, segments, labels)`` batches.
        criterion: loss called as ``criterion(logits, labels)``.

    Returns:
        ``(epoch_time, epoch_loss, epoch_accuracy, auc)`` where ``epoch_loss`` is
        the mean of the per-batch losses, ``epoch_accuracy`` is the number of
        correct predictions divided by ``len(dataloader.dataset)`` and ``auc`` is
        ``roc_auc_score`` of the column-1 probabilities against the labels.
    """
    # Switch to evaluate mode.
    model.eval()
    # Run on the device the model's parameters live on instead of assuming CUDA;
    # a parameter-less model keeps the previous behaviour (CUDA).
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cuda")

    epoch_start = time.time()
    running_loss = 0.0
    running_accuracy = 0.0
    all_prob = []
    all_labels = []
    # Deactivate autograd for evaluation.
    with torch.no_grad():
        for (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in dataloader:
            seqs = batch_seqs.to(run_device)
            masks = batch_seq_masks.to(run_device)
            segments = batch_seq_segments.to(run_device)
            labels = batch_labels.to(run_device)
            logits, probs = model(seqs, masks, segments)
            loss = criterion(logits, labels)
            running_loss += loss.item()
            running_accuracy += correct_predictions(probs, labels)
            # Collect plain Python numbers rather than 0-d tensors for the metric.
            all_prob.extend(probs[:, 1].cpu().tolist())
            all_labels.extend(batch_labels.tolist())
    epoch_time = time.time() - epoch_start
    epoch_loss = running_loss / len(dataloader)
    epoch_accuracy = running_accuracy / (len(dataloader.dataset))
    return epoch_time, epoch_loss, epoch_accuracy, roc_auc_score(all_labels, all_prob)



def test(model, dataloader):
    # Switch the model to eval mode.
    label_res=[]
    model.eval()
#     device = model.device
    time_start = time.time()
    batch_time = 0.0
    
    # Deactivate autograd for evaluation.
    with torch.no_grad():
        for (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in dataloader:
            batch_start = time.time()
            # Move input and output data to the GPU if one is used.
            seqs, masks, segments, labels = batch_seqs.cuda(), batch_seq_masks.cuda(), batch_seq_segments.cuda(), batch_labels.cuda()
            _, probabilities = model(seqs, masks, segments)
            _, out_classes = probabilities.max(dim=1)
#             print(out_classes)
            label_res.extend(out_classes.cpu().numpy())
            batch_time += time.time() - batch_start

    batch_time /= len(dataloader)
    total_time = time.time() - time_start
#     accuracy /= (len(dataloader.dataset))
    return batch_time, total_time,label_res

In [10]:
class AlbertSmallModel(nn.Module):
    """
    Sentence-pair classifier on top of the pretrained encoder stored in
    ``./preTrainModel/albert_chinese_small/`` (loaded through the BERT classes).

    ``forward`` concatenates ten pooled feature vectors and feeds them through
    dropout and a single linear layer:
      * average / max pool, last position and first position of the final layer,
      * average / max pool of the sum of hidden states ``[-2]`` and ``[-3]``,
      * average / max pool of each segment after a soft-alignment (cross
        attention) between the two segments followed by a shared projection.
    """
    def __init__(self,dropout=0.5,num_classes=2):
        """
        dropout     : dropout probability applied to the merged feature vector
                      (and inside the unused ``classification`` head)
        num_classes : number of output logits
        """
        super(AlbertSmallModel,self).__init__()
        config = BertConfig.from_pretrained('./preTrainModel/albert_chinese_small/')
        # All layer outputs are needed because forward() reads hidden_states[-2] and [-3].
        config.output_hidden_states = True
        self.AlbertSmall = BertModel.from_pretrained('./preTrainModel/albert_chinese_small/',
                                             config=config)
        # Take the width from the loaded config rather than hard-coding it, so the
        # heads below always match the encoder's output dimension.
        self.hidden_size = config.hidden_size
        self.dropout=dropout
        # Not read by forward(); kept because it is part of the public attributes.
        self.device=torch.device("cuda")
        self.num_classes = num_classes
        # 10 = number of pooled vectors concatenated in forward().
        self.linear = nn.Linear(10*self.hidden_size, num_classes)
        self.projection = nn.Sequential(nn.Linear(4*self.hidden_size, self.hidden_size),
                                        nn.ReLU())
        # Not used by forward(); kept so the module's parameters / state_dict keys stay the same.
        self.classification = nn.Sequential(nn.Linear(4*4*self.hidden_size, self.hidden_size),
                                            nn.ReLU(),
                                            nn.Dropout(p=self.dropout),
                                            nn.Linear(self.hidden_size, self.hidden_size//2),
                                            nn.ReLU(),
                                            nn.Dropout(p=self.dropout),
                                            nn.Linear(self.hidden_size//2, self.num_classes))

        # Fine-tune the whole encoder.
        for param in self.AlbertSmall.parameters():
            param.requires_grad=True

    @staticmethod
    def _reduce_mean_with_mask(q, mask):
        """Sum ``q`` over dim 1 and divide by the number of 1-entries in ``mask``.

        NOTE(review): the sum runs over every position of ``q`` while the
        divisor only counts masked-in positions; this matches the original
        computation and is left unchanged.
        """
        # clamp avoids a division by zero (NaN/inf features) when a row has an empty mask.
        seq_len = torch.sum(mask, 1).unsqueeze(1).clamp(min=1.0)  # batch_size*1
        return torch.sum(q, 1) / seq_len  # batch_size*dim

    def forward(self, q_id, q_mask, q_atn):
        """
        q_id   : token ids, passed to the encoder as input ids
        q_mask : passed to the encoder as ``attention_mask``
        q_atn  : passed to the encoder as ``token_type_ids``

        Returns ``(logits, probabilities)``; ``probabilities`` is the softmax of
        ``logits`` over the last dimension.
        """
        # mask1 = attention mask minus segment ids, mask2 = segment ids.
        mask1=q_mask.to(torch.float32)-q_atn.to(torch.float32) #batch_size*seq_max
        mask2=q_atn.to(torch.float32)   #batch_size*seq_max

        # Index instead of unpacking: works for a plain tuple as well as for
        # output objects that support integer indexing, and does not break if
        # the encoder returns more than three items.
        outputs = self.AlbertSmall(q_id, attention_mask=q_mask, token_type_ids=q_atn)
        q_embedding, hidden_states = outputs[0], outputs[2]

        # Also use the second- and third-to-last hidden layers.
        hidden_feature=hidden_states[-2]+hidden_states[-3]
        dim = hidden_feature.shape[-1]
        hidden_avg = F.adaptive_avg_pool2d(hidden_feature, (1, dim)).squeeze(1)  #batch_size*dim
        hidden_max = F.adaptive_max_pool2d(hidden_feature, (1, dim)).squeeze(1)  #batch_size*dim

        q1=q_embedding*mask1.unsqueeze(-1)  #batch_size*seq_max*dim
        q2=q_embedding*mask2.unsqueeze(-1)  #batch_size*seq_max*dim

        # ---- attention ----
        mask_attn1=mask1.unsqueeze(-1)  #batch_size*seqlen1*1
        mask_attn2=mask2.unsqueeze(1)   #batch_size*1*seqlen2
        mask_similarity_matrix=torch.bmm(mask_attn1,mask_attn2)  #batch_size*seqlen1*seqlen2
        # 0 where both positions are masked in, -10000 elsewhere (additive softmax mask).
        mask_similarity_matrix=(mask_similarity_matrix-1.)*10000  #batch_size*seqlen1*seqlen2
        similarity_matrix=torch.bmm(q1,q2.permute(0,2,1))   #batch_size*seqlen1*seqlen2
        similarity_matrix=similarity_matrix+mask_similarity_matrix  #batch_size*seqlen1*seqlen2
        similarity_matrix_transpose=similarity_matrix.permute(0,2,1)   #batch_size*seqlen2*seqlen1

        alpha1=F.softmax(similarity_matrix_transpose,dim=-1)  #batch_size*seqlen2*seqlen1
        alpha2=F.softmax(similarity_matrix,dim=-1)    #batch_size*seqlen1*seqlen2

        q1_tilde=torch.bmm(alpha2,q2)    #batch_size*seqlen1*dim
        q2_tilde=torch.bmm(alpha1,q1)    #batch_size*seqlen2*dim

        # ---- composition ----
        q1_combined=torch.cat([q1,q1_tilde,torch.abs(q1-q1_tilde),torch.mul(q1,q1_tilde)],dim=-1)  #batch_size*seqlen1*4dim
        q2_combined=torch.cat([q2,q2_tilde,torch.abs(q2-q2_tilde),torch.mul(q2,q2_tilde)],dim=-1)   #batch_size*seqlen2*4dim

        # Project 4*dim back to dim.
        projected_q1 = self.projection(q1_combined)  #batch_size*seqlen1*dim
        projected_q2 = self.projection(q2_combined)  #batch_size*seqlen2*dim

        # ---- average pooling + max pooling ----
        q1_avg=self._reduce_mean_with_mask(projected_q1, mask1)   #batch_size*dim
        q1_max=torch.max(projected_q1,1)[0]   #batch_size*dim
        q2_avg=self._reduce_mean_with_mask(projected_q2, mask2)  #batch_size*dim
        q2_max=torch.max(projected_q2,1)[0]   #batch_size*dim

        # ---- final feature vector ----
        emb_dim = q_embedding.shape[-1]
        q = F.adaptive_avg_pool2d(q_embedding, (1, emb_dim)).squeeze(1)
        a = F.adaptive_max_pool2d(q_embedding, (1, emb_dim)).squeeze(1)
        t = q_embedding[:,-1]
        e = q_embedding[:, 0]

        merged = torch.cat([q,a,t,e,hidden_avg,hidden_max,q1_avg, q1_max, q2_avg, q2_max], dim=1)  #batch_size*10dim

        # ---- classification ----
        # Functional dropout tied to self.training: a Dropout module created
        # inside forward() is always in training mode, which kept dropout
        # active (and predictions random) even after model.eval().
        x = F.dropout(merged, p=self.dropout, training=self.training)
        logits=self.linear(x)
        probabilities =F.softmax(logits, dim=-1)
        return logits,probabilities
    
    
    
# -------------------- Model definition ------------------- #
print("\t* Building model...")
# Instantiate with the default dropout / class count, then move the weights to the selected device.
model = AlbertSmallModel()
model = model.to(device)

	* Building model...


In [11]:
# Bookkeeping for the training loop and for the loss-curve plot.
best_score, start_epoch = 0.0, 1
epochs_count, train_losses, valid_losses = [], [], []

# Baseline: score the freshly built model on the dev set before any training step.
criterion = nn.CrossEntropyLoss()
_, valid_loss, valid_accuracy, auc = validate(model, dev_loader, criterion)
print(
    "\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}".format(
        valid_loss, (valid_accuracy*100), auc
    )
)

	* Validation loss before training: 0.7635, accuracy: 72.9000%, auc: 0.4969


In [12]:
# Two small float vectors used to check element-wise multiplication.
a, b = torch.Tensor([-1, -2, 3]), torch.Tensor([5, -2, 33])

In [13]:
# Element-wise product of the two vectors (the op forward() uses in its composition step).
torch.mul(a,b)

tensor([-5.,  4., 99.])

In [14]:
# Scratch 3x3 matrix of standard-normal samples; no seed is set, so the values differ between runs.
a = torch.randn(3,3)

In [15]:
# Show the matrix.
a

tensor([[ 1.9669, -0.0629, -1.3784],
        [ 0.4956, -0.6945, -1.9827],
        [-0.3600, -1.0455,  0.3489]])

In [16]:
# Maximum along dim 0; index [0] keeps the values and drops the argmax indices.
torch.max(a,0)[0]

tensor([ 1.9669, -0.0629,  0.3489])

In [17]:
# Shape check of the scratch matrix.
a.shape

torch.Size([3, 3])

In [18]:
# Scratch integer tensor of shape (4, 3, 2) for checking slicing along dim 1.
a = torch.tensor([
    [[1, 2], [3, 4], [3, 4]],
    [[8, 9], [88, 12], [3, 7]],
    [[18, 29], [838, 122], [663, 4]],
    [[10, 26], [8368, 12552], [53, 4]],
])

In [19]:
# Last entry along dim 1 for every leading index (same indexing as `t = q_embedding[:,-1]` in forward()).
a[:,-1]

tensor([[  3,   4],
        [  3,   7],
        [663,   4],
        [ 53,   4]])

In [20]:
# Shape check of the scratch tensor.
a.shape

torch.Size([4, 3, 2])

In [21]:
# First entry along dim 1 for every leading index (same indexing as `e = q_embedding[:, 0]` in forward()).
a[:,0]

tensor([[ 1,  2],
        [ 8,  9],
        [18, 29],
        [10, 26]])

In [22]:
# Rebind the scratch name to a plain 3-tuple.
a = 1, 2, 3