In [1]:
import numpy as np
import pandas as pd
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from sklearn.model_selection import StratifiedKFold
from bert4keras.backend import keras, set_gelu
from keras.layers import Lambda, Dense
from keras import backend as K
from sklearn import metrics

Using TensorFlow backend.


In [2]:
data = pd.read_csv("Train.csv")
test_data = pd.read_csv("Test_A.csv")

In [3]:
data

Unnamed: 0,SessionId,Role,Text,HighRiskFlag
0,session0,坐席,诶您好PRD您是NAME先生是吧,0
1,session0,客户,诶是是,0
2,session0,坐席,还没的时候我会逾期了您今天app还款NUM块钱先处理NUM下好吗,0
3,session0,客户,okokok信息今天还尽量今天还掉啊,0
4,session0,坐席,NUM天了,0
...,...,...,...,...
2615984,session135655,坐席,PRD请问是NAME先生吗,0
2615985,session135655,客户,是,0
2615986,session135655,坐席,先生您贷款逾期NUM天今天app处理下NUM元PRD的不要忘了好吧,0
2615987,session135655,客户,现在说没钱没,0


In [4]:
train = data[['SessionId','HighRiskFlag']].drop_duplicates().reset_index(drop=True)
test = test_data[['SessionId']].drop_duplicates().reset_index(drop=True)

In [5]:
def get_text(row):
    return " ".join(list(row['Role']+":"+row['Text']))
tmp = data.groupby('SessionId').apply(get_text).rename('text')
train = pd.merge(train, tmp, on=['SessionId'], how='left', copy=False)
tmp = test_data.groupby('SessionId').apply(get_text).rename('text')
test = pd.merge(test, tmp, on=['SessionId'], how='left', copy=False)
test['HighRiskFlag'] = 0

In [6]:
# 正样本占比
len(train[train['HighRiskFlag'] == 1]) / len(train)

0.008801674824556231

In [26]:
# 统计长度
train['len'] = train['text'].map(lambda x: len(x))
len(train[train['len']>510]) / len(train)

0.1402149554756148

In [27]:
# 风险对话文本长度均值
train[train['HighRiskFlag'] == 1]['len'].median()

1372.5

In [28]:
num_classes = 2
maxlen = 512
batch_size = 3

config_path = 'D:\\NLP\\chinese_wwm_L-12_H-768_A-12\\bert_config.json'
checkpoint_path = 'D:\\NLP\\chinese_wwm_L-12_H-768_A-12\\bert_model.ckpt'
dict_path = 'D:\\NLP\\chinese_wwm_L-12_H-768_A-12\\vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)
def load_data(data):
    D = []
    for text,label in zip(data['text'],data['HighRiskFlag']):
        D.append((text, int(label)))
    return D

In [29]:
class data_generator(DataGenerator):
    """数据生成器
    """
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []

In [None]:
def evaluate(data):
    total, right = 0., 0.
    y_trues = []
    y_preds = []
    for x_true, y_true in data:
        y_preds = np.r_[y_preds, model.predict(x_true)[:,1]]
        y_trues = np.r_[y_trues, y_true[:, 0]]
    fpr, tpr, thresholds = metrics.roc_curve(y_trues, y_preds)
    return  metrics.auc(fpr, tpr)


class Evaluator(keras.callbacks.Callback):
    """评估与保存
    """
    def __init__(self):
        self.best_val_acc = 0

    def on_epoch_end(self, epoch, logs=None):
        val_acc = evaluate(valid_generator)
        if (val_acc > self.best_val_acc) or (epoch == 1):
            print('1')
            self.best_val_acc = val_acc
            model.save_weights(r'best_model{}.weights'.format(fold))
        print(
            u'val_auc: %.5f, best_val_auc: %.5f' %
            (val_acc, self.best_val_acc)
        )
# 5折
kold = StratifiedKFold(random_state=2020,shuffle=True,n_splits=5).split(train, train['HighRiskFlag'])
probs = [0 for x in range(5)]
for fold, (train_idx, valid_idx) in enumerate(kold):

    K.clear_session()
    bert = build_transformer_model(
        config_path=config_path,
        checkpoint_path=checkpoint_path,
        model='bert',
        return_keras_model=False,
    )

    output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
    output = Dense(
        units=num_classes,
        activation='softmax',
        kernel_initializer=bert.initializer
    )(output)

    model = keras.models.Model(bert.model.input, output)
    model.summary()

    # 派生为带分段线性学习率的优化器。
    # 其中name参数可选，但最好填入，以区分不同的派生优化器。
    AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')

    model.compile(
        loss='sparse_categorical_crossentropy',
        # optimizer=Adam(1e-5),  # 用足够小的学习率
        optimizer=AdamLR(lr =1e-4, lr_schedule={
            1000: 1,
            2000: 0.1
        }),
        metrics=['accuracy'],
    )

    # 转换数据集
    train_data = load_data(train.iloc[train_idx])
    valid_data = load_data(train.iloc[valid_idx])
    test_data = load_data(test)
    train_generator = data_generator(train_data, batch_size) 
    valid_generator = data_generator(valid_data, batch_size)
    test_generator = data_generator(test_data, batch_size)


    evaluator = Evaluator()

    model.fit_generator(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=2,
        callbacks=[evaluator]
    )

    model.load_weights(r'best_model{}.weights'.format(fold))
    probs[fold] = model.predict_generator(test_generator.__iter__(),
        steps=len(test_generator))[:,1]
#     if fold == 0:
#         sub = test[['SessionId','HighRiskFlag']].copy()
#         sub['HighRiskFlag'] = probs[fold]
#         sub.to_csv("./result/sub1.csv", index=None)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, None)         0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     (None, None, 768)    16226304    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 768)    1536        Input-Segment[0][0]              
____________________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/2
1
val_auc: 0.90330, best_val_auc: 0.90330
Epoch 2/2
1
val_auc: 0.87101, best_val_auc: 0.87101
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, None)         0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     (None, None, 768)    16226304    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 768)    1536        Input-Segment[0][0]   

Epoch 1/2
1
val_auc: 0.28962, best_val_auc: 0.28962
Epoch 2/2
1
val_auc: 0.85602, best_val_auc: 0.85602
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, None)         0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     (None, None, 768)    16226304    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 768)    1536        Input-Segment[0][0]   

Epoch 1/2
1
val_auc: 0.89513, best_val_auc: 0.89513
Epoch 2/2
1
val_auc: 0.83092, best_val_auc: 0.83092


In [67]:
np.savetxt('./result/probs1.csv', probs)

In [74]:
test['Probability'] = np.mean(probs, axis=0)
sub = test[['SessionId','Probability']].copy()
sub.to_csv("./result/sub1.csv", index=None)