In [1]:
import logging
logging.basicConfig(level=logging.ERROR)

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import tensorflow as tf
import tensorflow.keras.backend as K
import os
from transformers import *
print(tf.__version__)
from sklearn.metrics import f1_score

ModuleNotFoundError: No module named 'tensorflow'

In [2]:
# Queries: one row per conversation id.
train_left = pd.read_csv('./train/train.query.tsv', sep='\t', header=None)
train_left.columns = ['id', 'query']

# Candidate replies: several rows per id, each with a binary label.
train_right = pd.read_csv('./train/train.reply.tsv', sep='\t', header=None)
train_right.columns = ['id', 'id_sub', 'reply', 'label']

# 'id' is the only column the two frames share, so it is the join key.
# Missing reply text is replaced with the placeholder '好的'.
train_data = (
    train_left
    .merge(train_right, how='left', on='id')
    .assign(reply=lambda frame: frame['reply'].fillna('好的'))
)

In [3]:
# train_data = pd.read_csv('./train/train.csv')
# train_data['reply'] = train_data['reply'].fillna('好的')

In [4]:
# Show the merged training frame (each query joined with its replies and labels).
train_data

Unnamed: 0,id,query,id_sub,reply,label
0,0,采荷一小是分校吧,0,杭州市采荷第一小学钱江苑校区，杭州市钱江新城实验学校。,1
1,0,采荷一小是分校吧,1,是的,0
2,0,采荷一小是分校吧,2,这是5楼,0
3,1,毛坯吗？,0,因为公积金贷款贷的少,0
4,1,毛坯吗？,1,是呢,0
...,...,...,...,...,...
21580,5998,您好，我正在看尚林家园的房子,1,有啊,0
21581,5998,您好，我正在看尚林家园的房子,2,我带你看看,0
21582,5999,今天可以安排看房子吗？,0,我约下房东，稍后回你,1
21583,5999,今天可以安排看房子吗？,1,可以看，你几点有时间过来呢？,1


In [5]:
# train_data

In [6]:
# train_left = pd.read_csv('./train/train.query.tsv',sep='\t',header=None)
# train_left.columns=['id','q1']
# train_right = pd.read_csv('./train/train.reply.tsv',sep='\t',header=None)
# train_right.columns=['id','id_sub','q2','label']
# df_train = train_left.merge(train_right, how='left')
# df_train['q2'] = df_train['q2'].fillna('好的')

# The test TSVs are read with encoding='gbk'; the training files above are
# read with pandas' default encoding.
test_left = pd.read_csv('./test/test.query.tsv',sep='\t',header=None, encoding='gbk')
test_left.columns = ['id','query']
test_right =  pd.read_csv('./test/test.reply.tsv',sep='\t',header=None, encoding='gbk')
test_right.columns=['id','id_sub','reply']
# Left join on the shared 'id' column keeps every test query.
df_test = test_left.merge(test_right, how='left')
# Apply the same placeholder the training frame uses for a missing reply, so
# the tokenizer never receives NaN and train/test preprocessing stay aligned.
df_test['reply'] = df_test['reply'].fillna('好的')

In [7]:
# Show the merged test frame (each query joined with its candidate replies; no labels).
df_test

Unnamed: 0,id,query,id_sub,reply
0,0,东区西区？什么时候下证？,0,我在给你发套
1,0,东区西区？什么时候下证？,1,您看下我发的这几套
2,0,东区西区？什么时候下证？,2,这两套也是金源花园的
3,0,东区西区？什么时候下证？,3,价钱低
4,0,东区西区？什么时候下证？,4,便宜的房子，一般都是顶楼
...,...,...,...,...
53752,13998,这套房子有啥问题吗 我看价格不高,3,租约还有两年
53753,13998,这套房子有啥问题吗 我看价格不高,4,都有学位的
53754,13999,我看看时间吧,0,没有呢
53755,13999,我看看时间吧,1,今天新上的


In [8]:
# * Validation loss before training: 0.6397, accuracy: 68.7500%, auc: 0.5097


# Base path; not referenced by the other cells shown in this notebook.
PATH = './'
# BERT_PATH = './'
# WEIGHT_PATH = './'
# Fixed length every encoded (query, reply) pair is padded/truncated to;
# also the width of the model's three input layers.
MAX_SEQUENCE_LENGTH = 100
# Text columns fed to the tokenizer, in (first segment, second segment) order.
input_categories = ['query','reply']
# Target column.
output_categories = 'label'

# Sanity check on the loaded frames.
print('train shape =', train_data.shape)
print('test shape =', df_test.shape)

train shape = (21585, 5)
test shape = (53757, 4)


In [9]:
def _convert_to_transformer_inputs(question, answer, tokenizer, max_sequence_length):
    """Converts tokenized input to ids, masks and segments for transformer (including bert)"""
    
    def return_id(str1, str2, truncation_strategy, length):

        inputs = tokenizer.encode_plus(str1, str2,
            add_special_tokens=True,
            max_length=length,
            truncation_strategy=truncation_strategy,
            #truncation=True
            )
        
        input_ids =  inputs["input_ids"]
        input_masks = [1] * len(input_ids)
        input_segments = inputs["token_type_ids"]
        padding_length = length - len(input_ids)
        padding_id = tokenizer.pad_token_id
        input_ids = input_ids + ([padding_id] * padding_length)
        input_masks = input_masks + ([0] * padding_length)
        input_segments = input_segments + ([0] * padding_length)
        
        return [input_ids, input_masks, input_segments]
    
    input_ids_q, input_masks_q, input_segments_q = return_id(
        question, answer, 'longest_first', max_sequence_length)
    

    
    return [input_ids_q, input_masks_q, input_segments_q]

def compute_input_arrays(df, columns, tokenizer, max_sequence_length):
    """Tokenise every row of ``df`` into fixed-length model input arrays.

    Args:
        df: DataFrame holding the text columns.
        columns: Sequence whose first two entries name the first-segment and
            second-segment text columns (``['query', 'reply']`` in this
            notebook).
        tokenizer: Tokenizer forwarded to ``_convert_to_transformer_inputs``.
        max_sequence_length: Length each encoded pair is padded/truncated to.

    Returns:
        ``[input_ids, input_masks, input_segments]``, each an ``int32`` array
        built from one list of length ``max_sequence_length`` per row.
    """
    # Honour the `columns` argument instead of hard-coding attribute access
    # (`instance.query` only worked because a Series has no `query` method).
    first_col, second_col = columns[0], columns[1]

    input_ids, input_masks, input_segments = [], [], []

    # Zipping the two columns avoids building a Series per row (iterrows);
    # `total` lets tqdm show a real progress bar.
    text_pairs = zip(df[first_col], df[second_col])
    for query, reply in tqdm(text_pairs, total=len(df)):
        ids, masks, segments = _convert_to_transformer_inputs(
            query, reply, tokenizer, max_sequence_length)

        input_ids.append(ids)
        input_masks.append(masks)
        input_segments.append(segments)

    return [np.asarray(input_ids, dtype=np.int32),
            np.asarray(input_masks, dtype=np.int32),
            np.asarray(input_segments, dtype=np.int32)]

def compute_output_arrays(df, columns):
    """Return ``df[columns]`` converted to a NumPy array.

    Args:
        df: Source DataFrame.
        columns: Column label (or list of labels) to select.
    """
    targets = df[columns]
    return np.asarray(targets)


def search_f1(y_true, y_pred, thresholds=None):
    """Grid-search the score cut-off that gives the highest F1.

    Args:
        y_true: Ground-truth binary labels, as accepted by ``f1_score``.
        y_pred: Array of predicted scores; a prediction is positive when its
            score is strictly greater than the threshold.
        thresholds: Optional iterable of cut-offs to evaluate. Defaults to
            0.30, 0.31, ..., 0.59, the grid this function always used. Pass a
            wider grid when the reported best threshold sits on the boundary.

    Returns:
        ``(best, best_t)``: the best F1 found and the threshold producing it.
        Both stay 0 if no threshold scores above 0. Ties keep the earliest
        threshold because only a strictly better score replaces the best.
    """
    if thresholds is None:
        thresholds = [i / 100 for i in range(30, 60)]

    best = 0
    best_t = 0
    for tres in thresholds:
        y_pred_bin = (y_pred > tres).astype(int)
        score = f1_score(y_true, y_pred_bin)
        if score > best:
            best = score
            best_t = tres
    print('best', best)
    print('thres', best_t)
    return best, best_t

In [10]:
# Tokenizer built from the local bert-base-chinese vocabulary file.
tokenizer = BertTokenizer.from_pretrained(
    './preTrainModel/bert_base_chinese_tf/bert-base-chinese-vocab.txt'
)

# Encode the training frame first, then the test frame, with identical settings.
inputs, test_inputs = [
    compute_input_arrays(frame, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
    for frame in (train_data, df_test)
]

# Training targets taken from the label column.
outputs = compute_output_arrays(train_data, output_categories)

21585it [00:12, 1784.46it/s]
53757it [00:30, 1761.62it/s]


In [11]:
def create_model():
    """Build the BERT-based (query, reply) classifier.

    Three ``int32`` inputs of length ``MAX_SEQUENCE_LENGTH`` (token ids,
    attention mask, token type ids) are passed to a ``TFBertModel`` loaded
    from the local bert-base-chinese files. The first BERT output is pooled
    with both global average and global max pooling; the two pooled vectors
    are concatenated, passed through dropout (0.3) and a single sigmoid unit.

    Returns:
        An uncompiled ``tf.keras.models.Model`` whose inputs are
        ``[token ids, attention mask, token type ids]``.
    """
    layers = tf.keras.layers

    token_ids = layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    attention_mask = layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    token_type_ids = layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    bert_config = BertConfig.from_pretrained(
        './preTrainModel/bert_base_chinese_tf/bert-base-chinese-config.json')
    bert_config.output_hidden_states = False
    bert = TFBertModel.from_pretrained(
        './preTrainModel/bert_base_chinese_tf/bert-base-chinese-tf_model.h5',
        config=bert_config)

    # Element [0] of the BERT output is used as the per-token embedding tensor.
    token_embeddings = bert(
        token_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[0]

    # NOTE(review): both poolings run over every position, padding included —
    # confirm this is intended.
    avg_pooled = layers.GlobalAveragePooling1D()(token_embeddings)
    max_pooled = layers.GlobalMaxPooling1D()(token_embeddings)

    features = layers.Concatenate()([avg_pooled, max_pooled])
    features = layers.Dropout(0.3)(features)
    probability = layers.Dense(1, activation='sigmoid')(features)

    return tf.keras.models.Model(
        inputs=[token_ids, attention_mask, token_type_ids], outputs=probability)

In [12]:
# 5-fold cross-validated fine-tuning. Each fold builds a fresh model, trains it,
# stores out-of-fold (OOF) predictions for the held-out rows and predicts the
# whole test set; the per-fold test predictions are averaged in a later cell.
from sklearn.model_selection import GroupKFold

# Folds are grouped by query id. `split` is consumed once by the loop below,
# so this line must be re-run before the loop can be repeated.
gkf = GroupKFold(n_splits=5).split(X=train_data.reply, groups=train_data.id)

# Per-fold validation predictions and per-fold test predictions.
valid_preds = []
test_preds = []

# OOF prediction for every training row, filled fold by fold; shape (n_rows, 1).
oof = np.zeros((len(train_data),1))
for fold, (train_idx, valid_idx) in enumerate(gkf):
    # `inputs` is a list of three arrays (ids, masks, segments); slice each one.
    train_inputs = [inputs[i][train_idx] for i in range(len(inputs))]
    train_outputs = outputs[train_idx]
    valid_inputs = [inputs[i][valid_idx] for i in range(len(inputs))]
    valid_outputs = outputs[valid_idx]

    # Clear Keras state left by the previous fold before building a new model.
    K.clear_session()
    model = create_model()
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
    model.compile(loss='binary_crossentropy', optimizer=optimizer,metrics=[tf.keras.metrics.AUC()])
    model.fit(train_inputs, train_outputs, validation_data = (valid_inputs, valid_outputs), epochs=3, batch_size=64)
    # Predictions for the held-out rows go into their positions in `oof`.
    oof_p = model.predict(valid_inputs, batch_size=512)
    oof[valid_idx] = oof_p
    valid_preds.append(oof_p)
    test_preds.append(model.predict(test_inputs, batch_size=512))
    # Per-fold F1 at its own best threshold, for monitoring only; `t` is not
    # used by the later cells shown here (they use the OOF-wide threshold).
    f1,t = search_f1(valid_outputs, valid_preds[-1])
    print('validation score = ', f1)

Epoch 1/3
Epoch 2/3
Epoch 3/3
best 0.7736441057821605
thres 0.58
validation score =  0.7736441057821605
Epoch 1/3
Epoch 2/3
Epoch 3/3
best 0.7718978102189782
thres 0.3
validation score =  0.7718978102189782
Epoch 1/3
Epoch 2/3
Epoch 3/3
best 0.7816387816387816
thres 0.38
validation score =  0.7816387816387816
Epoch 1/3
Epoch 2/3
Epoch 3/3
best 0.785451197053407
thres 0.43
validation score =  0.785451197053407
Epoch 1/3
Epoch 2/3
Epoch 3/3
best 0.768
thres 0.3
validation score =  0.768


In [13]:
# Pick one threshold from the pooled out-of-fold predictions; `best_t` is the
# cut-off applied to the averaged test predictions in the submission cell.
best_score, best_t = search_f1(outputs,oof)

best 0.7701951090315176
thres 0.33


In [14]:
# Average the per-fold test predictions, then binarise them with the
# threshold chosen on the out-of-fold predictions.
sub = np.average(test_preds, axis=0)
sub = sub > best_t
# Flatten to 1-D so the label column receives one value per row.
df_test['label'] = sub.astype(int).ravel()

# Create the output directory up front so a missing folder cannot make the
# write fail after the full training run.
submission_path = './submission_file/submission_beike_bert_base_enhancedata.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
df_test[['id','id_sub','label']].to_csv(submission_path,index=False, header=None,sep='\t')

In [15]:
# Scratch assignment; `a` is only echoed by the next cell and is not used by
# any other cell shown in this notebook.
a=1

In [16]:
# Scratch cell: echoes `a`.
a

1