In [1]:
# /usr/bin/env python
# coding=utf-8

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from keras.optimizers import *
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

from params_config import ParamsConfig as config
from utils.score import *
from utils.data_convert import *
from utils.models import get_model

Using TensorFlow backend.
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.254 seconds.
Prefix dict has been built succesfully.


In [2]:
def _texts_to_padded(texts_q1, texts_q2, vocab_size, seq_length):
    """Tokenize two parallel text columns with one shared vocabulary.

    A single ``Tokenizer`` (capped at ``vocab_size`` tokens) is fitted on the
    concatenation of both columns, then each column is converted to id
    sequences and padded/truncated to ``seq_length``.

    Returns:
        ``(padded_q1, padded_q2, word_index)`` where ``word_index`` is the
        fitted tokenizer's ``word_index`` mapping.
    """
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(list(texts_q1) + list(texts_q2))
    padded_q1 = pad_sequences(tokenizer.texts_to_sequences(texts_q1), maxlen=seq_length)
    padded_q2 = pad_sequences(tokenizer.texts_to_sequences(texts_q2), maxlen=seq_length)
    return padded_q1, padded_q2, tokenizer.word_index


def _print_scores(y_true, y_hat, fold=None):
    """Print recall, precision, accuracy and F1 for hard 0/1 predictions.

    When ``fold`` is given it is printed in front of every metric label so
    per-fold lines can be told apart from the overall summary.
    """
    scorers = (
        ("recal", recall_score),
        ("precision", precision_score),
        # The per-fold accuracy label carries a trailing space in the
        # original output; keep both spellings so logs stay comparable.
        ("accuracy" if fold is None else "accuracy ", accuracy_score),
        ("f1", f1_score),
    )
    for label, scorer in scorers:
        value = scorer(y_true, y_hat)
        if fold is None:
            print(label, value)
        else:
            print(fold, label, value)


def train(config):
    """Evaluate previously saved per-fold models with stratified K-fold CV.

    Despite its name, this function does not fit anything: the fitting code
    is commented out and each fold's weights are loaded from
    ``config.model_dir2 + config.kernel_name2 + '_weight_<fold>.h5'``.
    For every fold the validation rows are scored and recall / precision /
    accuracy / F1 are printed; afterwards the same metrics are printed for
    the out-of-fold predictions over the whole dataset.

    Args:
        config: settings object. Attributes read here: ``input_file_word``,
            ``input_file_char``, ``word_vocab_size``, ``char_vocab_size``,
            ``word_seq_length``, ``char_seq_length``, ``cv_folds``, ``seed``,
            ``model_dir2``, ``kernel_name2``, ``is_load_embedding``,
            ``w2v_w_file``, ``w2v_c_file`` and ``embedding_dims``. The object
            is also passed on to the model factory.

    Returns:
        None. Results are reported through ``print`` only.
    """
    # Per the original comments: process_word.csv holds the word-segmented
    # questions and process_char.csv the character-segmented ones.
    df_w = pd.read_csv(config.input_file_word, encoding="utf-8")
    df_c = pd.read_csv(config.input_file_char, encoding="utf-8")
    y = df_w['label'].values  # one label per row

    # Word-level and character-level inputs go through the same pipeline,
    # each with its own vocabulary size and sequence length.
    X_train_w_q1, X_train_w_q2, word_index = _texts_to_padded(
        df_w['question1'].values, df_w['question2'].values,
        config.word_vocab_size, config.word_seq_length)
    X_train_c_q1, X_train_c_q2, char_index = _texts_to_padded(
        df_c['question1'].values, df_c['question2'].values,
        config.char_vocab_size, config.char_seq_length)

    # Stratified K-fold split; the number of folds comes from config.cv_folds.
    skf = StratifiedKFold(n_splits=config.cv_folds, random_state=config.seed, shuffle=True)

    # Every row is in exactly one validation fold, so one slot per row is
    # enough to collect the out-of-fold probabilities.
    pred_oob = np.zeros(len(y))

    if not os.path.exists(config.model_dir2):
        os.makedirs(config.model_dir2)

    # Optionally load pre-trained word and character embedding matrices.
    if config.is_load_embedding:
        embedding_matrix_w = load_embedding(config.w2v_w_file, word_index, config.embedding_dims, config.word_vocab_size)
        embedding_matrix_c = load_embedding(config.w2v_c_file, char_index, config.embedding_dims, config.char_vocab_size)
    else:
        embedding_matrix_w, embedding_matrix_c = None, None

    for count, (ind_tr, ind_te) in enumerate(skf.split(X_train_w_q1, y)):
        # Only the validation split is materialised; the training split
        # (rows ``ind_tr``) is needed solely by the disabled recipe below.
        x_val_w_q1 = X_train_w_q1[ind_te]
        x_val_w_q2 = X_train_w_q2[ind_te]
        x_val_c_q1 = X_train_c_q1[ind_te]
        x_val_c_q2 = X_train_c_q2[ind_te]
        y_val = y[ind_te]

        model = get_model(config.kernel_name2)(config, embedding_matrix_w, embedding_matrix_c)
        bst_model_path = config.model_dir2 + config.kernel_name2 + '_weight_%d.h5' % count

        # Fitting is commented out in this notebook; the fold's weights are
        # loaded from disk instead. The disabled recipe is kept for reference
        # (x_train_* = X_train_*[ind_tr], y_train = y[ind_tr]):
        # model.compile(optimizer=Adam(lr = config.learning_rate), loss="binary_crossentropy", metrics=["accuracy", f1_score_metrics])
        # early_stopping = EarlyStopping(monitor='val_f1_score_metrics', patience=3, mode='max', verbose=1)
        # model_checkpoint = ModelCheckpoint(bst_model_path, monitor='val_f1_score_metrics', mode='max',
        #                                    save_best_only=True, verbose=1, save_weights_only=True)
        # hist = model.fit([x_train_w_q1, x_train_w_q2, x_train_c_q1, x_train_c_q2], y_train,
        #                  validation_data=([x_val_w_q1, x_val_w_q2, x_val_c_q1, x_val_c_q2], y_val),
        #                  epochs=config.num_epochs, batch_size=config.batch_size, shuffle=True,
        #                  class_weight=config.class_weight,
        #                  callbacks=[early_stopping, model_checkpoint, F1ScoreCallback()])
        model.load_weights(bst_model_path)

        # Assumes the model emits one probability per row, i.e. shape (n, 1);
        # the previous (n, cv_folds) buffer relied on the same shape through
        # broadcasting. TODO confirm if the model head ever changes.
        y_prob = np.ravel(model.predict([x_val_w_q1, x_val_w_q2, x_val_c_q1, x_val_c_q2],
                                        batch_size=256, verbose=1))
        pred_oob[ind_te] = y_prob
        _print_scores(y_val, (y_prob > 0.5).astype(int), fold=count)

    # Overall metrics on the out-of-fold predictions of the whole dataset.
    _print_scores(y, (pred_oob > 0.5).astype(int))

In [3]:
# Run the cross-validated evaluation using the ParamsConfig settings imported as `config`.
train(config)

('word embedding', 13571)
('word embedding', 1988)
(0, 'recal', 0.6800428036383093)
(0, 'precision', 0.47692307692307695)
(0, 'accuracy ', 0.8056395745926432)
(0, 'f1', 0.5606528451698279)
(1, 'recal', 0.6682718031032637)
(1, 'precision', 0.49445764053840063)
(1, 'accuracy ', 0.814908771587472)
(1, 'f1', 0.5683731513083048)
(2, 'recal', 0.7265917602996255)
(2, 'precision', 0.4666666666666667)
(2, 'accuracy ', 0.7986924277907884)
(2, 'f1', 0.5683197321615401)
(3, 'recal', 0.6725521669341894)
(3, 'precision', 0.4805045871559633)
(3, 'accuracy ', 0.8076697892271663)
(3, 'f1', 0.5605351170568562)
(4, 'recal', 0.6837881219903692)
(4, 'precision', 0.48045112781954885)
(4, 'accuracy ', 0.8074746291959407)
(4, 'f1', 0.564362994038419)
(5, 'recal', 0.6354389721627409)
(5, 'precision', 0.5098797250859106)
(5, 'accuracy ', 0.8221918610324973)
(5, 'f1', 0.5657769304099142)
(6, 'recal', 0.6531049250535332)
(6, 'precision', 0.5045492142266336)
(6, 'accuracy ', 0.8198497121108617)
(6, 'f1', 0.5692953