In [None]:
# Standard library
import gc
import os
import time

# Third-party. The relative order of the lines below is kept exactly as it was,
# because the wildcard imports can shadow names brought in by earlier lines.
from keras.models import Model
from keras.optimizers import Nadam, Adam
from keras.layers import *
from keras.initializers import *
from keras.activations import softmax
from keras.layers import InputSpec, Layer, Input, Dense, merge, Conv1D
from keras.layers import Lambda, Activation, Dropout, Embedding, TimeDistributed
from keras.layers.pooling import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.merge import concatenate
from keras.preprocessing.sequence import pad_sequences
from keras.layers.normalization import BatchNormalization
import keras.backend as K
from keras.regularizers import l2
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Model
import gensim
from keras.callbacks import *
from keras.optimizers import *
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

In [None]:
class EpochSaver(gensim.models.callbacks.CallbackAny2Vec):
    '''Training callback that prints the per-epoch loss and saves the best model.

    At the end of every epoch the loss of that epoch alone is derived from the
    cumulative loss reported by the model, printed together with the elapsed
    wall-clock time, and the model is written to disk whenever that epoch loss
    is lower than any seen so far.

    Args:
        savedir: Directory to save into; created if it does not exist.
        save_name: File name of the saved model inside ``savedir``.
    '''
    def __init__(self, savedir, save_name="word2vector.model"):
        os.makedirs(savedir, exist_ok=True)
        self.save_path = os.path.join(savedir, save_name)
        self.epoch = 0
        # Cumulative loss observed at the end of the previous epoch.
        self.pre_loss = 0
        # Start at +inf so the first finite epoch loss always counts as an
        # improvement, whatever its magnitude.
        self.best_loss = float("inf")
        # Start time of the epoch currently being timed.
        self.since = time.time()

    def on_epoch_end(self, model):
        '''Report the loss of the epoch that just ended and save on improvement.

        Args:
            model: The model being trained; must provide
                ``get_latest_training_loss()`` and ``save(path)``.
        '''
        self.epoch += 1
        # The returned value is accumulated since the first epoch, so the loss
        # of this epoch is the difference to the previous reading.
        # NOTE(review): gensim only tracks this when training is started with
        # compute_loss=True — confirm at the call site.
        cum_loss = model.get_latest_training_loss()
        epoch_loss = cum_loss - self.pre_loss
        time_taken = time.time() - self.since
        print("Epoch %d, loss: %.2f, time: %dmin %ds" % 
                    (self.epoch, epoch_loss, time_taken//60, time_taken%60))
        if self.best_loss > epoch_loss:
            self.best_loss = epoch_loss
            print("Better model. Best loss: %.2f" % self.best_loss)
            model.save(self.save_path)
            print("Model %s save done!" % self.save_path)

        # Reset the baselines for the next epoch.
        self.pre_loss = cum_loss
        self.since = time.time()

In [None]:
# --- Load the raw question/answer pairs --------------------------------------
train = pd.read_csv(
    '/home/kesci/input/bytedance/train_final.csv',
    names=['qId', 'q', 'aId', 'a', 'target'],
    nrows=62500000,
)
test = pd.read_csv(
    '/home/kesci/input/bytedance/bytedance_contest.final_2.csv',
    names=['qId', 'q', 'aId', 'a'],
)
# Drop the last character of every test answer.
# NOTE(review): presumably a trailing artifact specific to this file — confirm.
test['a'] = test['a'].apply(lambda x: x[:-1])
target = train['target']
embed_size = 200

# --- Tokenise ------------------------------------------------------------------
# A single tokenizer is fitted on all four text columns so that train and test
# share one vocabulary; text is split on single spaces and case is preserved.
tokenizer = Tokenizer(lower=False, char_level=False, split=' ')
tokenizer.fit_on_texts(
    train['q'].tolist() + train['a'].tolist() + test['q'].tolist() + test['a'].tolist()
)
train_q = tokenizer.texts_to_sequences(train['q'])
train_a = tokenizer.texts_to_sequences(train['a'])
test_q = tokenizer.texts_to_sequences(test['q'])
test_a = tokenizer.texts_to_sequences(test['a'])
# The raw frames are no longer needed; release them before padding.
del train, test
gc.collect()

# --- Pad every sequence to a fixed length, filling with index 0 ----------------
maxlen = 40
train_q = pad_sequences(train_q, maxlen=maxlen, value=0)
train_a = pad_sequences(train_a, maxlen=maxlen, value=0)
test_q = pad_sequences(test_q, maxlen=maxlen, value=0)
test_a = pad_sequences(test_a, maxlen=maxlen, value=0)

# --- Build the embedding matrix from the pre-trained word2vec model ------------
max_features = 750000
embedding_matrix = np.zeros((max_features, embed_size))
from gensim.models import Word2Vec
w2vmodel = Word2Vec.load('/home/kesci/work/w2vfinal_all.model')
# Look vectors up through the KeyedVectors object: indexing the model itself is
# deprecated and no longer supported by newer gensim releases.
word_vectors = w2vmodel.wv
num_oov = 0
for word, idx in tokenizer.word_index.items():
    if idx >= max_features:
        # The row does not exist in the matrix; count the word as
        # out-of-vocabulary, as before, instead of hiding an IndexError.
        # NOTE(review): such indices still occur in the padded sequences and
        # exceed the embedding size — confirm the vocabulary fits max_features.
        num_oov += 1
        continue
    try:
        embedding_matrix[idx] = word_vectors[word]
    except KeyError:
        # Word is missing from the word2vec vocabulary: its row stays all-zero.
        # Only KeyError is caught so that genuine problems (API changes, a
        # vector-size mismatch, Ctrl-C) surface instead of inflating num_oov.
        num_oov += 1
print('oov num is '+str(num_oov))

In [None]:
def _read_feature_table(csv_path, **read_kwargs):
    """Read a feature CSV and zero out every NaN and infinite entry.

    Extra keyword arguments are forwarded unchanged to ``pd.read_csv``.
    """
    table = pd.read_csv(csv_path, **read_kwargs).fillna(0)
    # NaNs are already gone; overwrite the remaining +/-inf cells in place.
    table[np.isinf(table)] = 0
    return table


# Hand-crafted feature tables; the train table is capped at the same number of
# rows as the training text.
train_fea = _read_feature_table('/home/kesci/work/train_feature_freq_final.csv', nrows=62500000)
test_fea = _read_feature_table('/home/kesci/work/test_feature_freq_final.csv')

In [None]:

def build_model(emb_matrix, max_sequence_length, feature_shape):
    """Build and compile the siamese question/answer matching network.

    Both token sequences go through the same frozen embedding and the same
    two-layer recurrent encoder (bidirectional CuDNNLSTM followed by a
    bidirectional CuDNNGRU). The two resulting vectors are compared through
    their element-wise squared difference and element-wise product, joined
    with a dense projection of the extra feature vector, and passed through a
    small dense head ending in a single sigmoid unit.

    Args:
        emb_matrix: 2-D array of pre-trained embeddings; its first dimension is
            used as the vocabulary size and its second as the embedding size.
        max_sequence_length: Length of each padded token sequence.
        feature_shape: Number of values in the extra feature vector.

    Returns:
        A compiled Keras ``Model`` taking ``[q, a, features]`` and trained with
        binary cross-entropy, the ``adam`` optimizer and an accuracy metric.
    """
    question_in = Input(shape=(max_sequence_length,))
    answer_in = Input(shape=(max_sequence_length,))

    # One embedding layer, initialised from emb_matrix and kept frozen,
    # serves both inputs.
    shared_embedding = Embedding(
        input_dim=emb_matrix.shape[0],
        output_dim=emb_matrix.shape[1],
        weights=[emb_matrix],
        input_length=max_sequence_length,
        trainable=False,
    )
    question_emb = shared_embedding(question_in)
    answer_emb = shared_embedding(answer_in)

    # Recurrent layers whose weights are shared between the two branches.
    seq_encoder = Bidirectional(CuDNNLSTM(128, return_sequences=True))
    vec_encoder = Bidirectional(CuDNNGRU(128))

    def encode(embedded):
        """Run one branch: shared LSTM -> its own dropout -> shared GRU."""
        states = seq_encoder(embedded)
        states = Dropout(0.4)(states)
        return vec_encoder(states)

    question_vec = encode(question_emb)
    answer_vec = encode(answer_emb)

    # Interaction features between the two encoded vectors.
    difference = Subtract()([question_vec, answer_vec])
    squared_difference = Multiply()([difference, difference])
    product = Multiply()([question_vec, answer_vec])

    # Extra feature branch (no dropout is applied on this branch).
    magic_input = Input(shape=(feature_shape,))
    magic_dense = Dense(64, activation='relu')(magic_input)

    hidden = concatenate([squared_difference, product, magic_dense])
    hidden = Dropout(0.3)(hidden)
    # Dense head: 256 -> 64, each followed by dropout.
    for units in (256, 64):
        hidden = Dense(units, activation='relu')(hidden)
        hidden = Dropout(0.3)(hidden)

    prediction = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=[question_in, answer_in, magic_input], outputs=prediction)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# --- K-fold training and test-set prediction -----------------------------------
feature_dim = train_fea.shape[1]
num_class = 1
# Placeholder with one row per test sample; replaced by the model output below.
sub1 = np.zeros((test_a.shape[0], 1))

count = 0
folds = KFold(n_splits=5, shuffle=True, random_state=15)
for i, (trn_idx, val_idx) in enumerate(folds.split(train_q)):
    # Keep the fold counter in step with the loop so that the banner and the
    # checkpoint path stay distinct per fold if the early `break` is removed.
    count = i
    print("FOLD_ ", count + 1)
    filepath = "/home/kesci/work/nnsima_best_model%d.h5" % count

    # Keep only the best weights (lowest val_loss), shrink the learning rate
    # after one epoch without improvement, stop after two.
    checkpoint = ModelCheckpoint(
        filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss', factor=0.8, patience=1, min_lr=0.00001, verbose=1)
    earlystopping = EarlyStopping(
        monitor='val_loss', min_delta=0.0001, patience=2, verbose=1, mode='auto')
    callbacks = [checkpoint, reduce_lr, earlystopping]

    model_nn = build_model(embedding_matrix, max_sequence_length=maxlen, feature_shape=feature_dim)

    # The fold indices are positions, so every pandas object is sliced with
    # .iloc (the padded sequences are indexed directly).
    X_tr, X_vl = train_q[trn_idx], train_q[val_idx]
    X_tr2, X_vl2 = train_a[trn_idx], train_a[val_idx]
    y_tr, y_vl = target.iloc[trn_idx], target.iloc[val_idx]
    X_fea_tr, X_fea_vl = train_fea.iloc[trn_idx], train_fea.iloc[val_idx]

    hist = model_nn.fit([X_tr, X_tr2, X_fea_tr], y_tr, batch_size=1024, epochs=10,
                        validation_data=([X_vl, X_vl2, X_fea_vl], y_vl),
                        callbacks=callbacks, verbose=1, shuffle=True)

    # Restore the best checkpoint of this fold before predicting.
    print('load_wight')
    model_nn.load_weights(filepath)
    print('start to predict')
    sub1 = model_nn.predict([test_q, test_a, test_fea], batch_size=2024)
    # Only the first fold is trained; its prediction is used as-is (no
    # averaging across folds).
    break

# One prediction per row, single column, written without the index.
sub = pd.DataFrame(sub1.reshape(-1, 1))
sub.to_csv('/home/kesci/work/nnsima_final.csv', index=False)