In [None]:
import gc
import os
import time

import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

from keras.models import Model
from keras.optimizers import Nadam, Adam
from keras.layers import *
from keras.initializers import *
from keras.activations import softmax
from keras.layers import InputSpec, Layer, Input, Dense, merge, Conv1D
from keras.layers import Lambda, Activation, Dropout, Embedding, TimeDistributed
from keras.layers.pooling import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.merge import concatenate
from keras.preprocessing.sequence import pad_sequences
from keras.layers.normalization import BatchNormalization
import keras.backend as K
from keras.regularizers import l2
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Model
from keras.callbacks import *
from keras.optimizers import *
# Explicit imports for names that were previously only reachable through the
# star imports above (used by the Attention layer).
from keras import constraints, initializers, regularizers

In [None]:
class EpochSaver(gensim.models.callbacks.CallbackAny2Vec):
    """Training callback that prints the per-epoch loss and saves the best model.

    After every epoch the loss of that epoch alone is derived from the
    cumulative loss reported by the model. Whenever it is lower than any
    loss seen so far, the model is saved to ``savedir/save_name``.

    Args:
        savedir: Directory the model is written to; created if missing.
        save_name: File name of the saved model inside ``savedir``.
    """
    def __init__(self, savedir, save_name="word2vector.model"):
        os.makedirs(savedir, exist_ok=True)
        self.save_path = os.path.join(savedir, save_name)
        self.epoch = 0
        # Cumulative loss at the end of the previous epoch.
        self.pre_loss = 0
        # Start at +inf so the first epoch is always saved, whatever the
        # magnitude of its loss (a fixed large sentinel could be exceeded).
        self.best_loss = float("inf")
        self.since = time.time()

    def on_epoch_end(self, model):
        """Report the loss/time of the finished epoch and checkpoint if it improved."""
        self.epoch += 1
        # The returned value is accumulated since the first epoch, so the
        # previous cumulative value is subtracted to get this epoch's loss.
        cum_loss = model.get_latest_training_loss()
        epoch_loss = cum_loss - self.pre_loss
        time_taken = time.time() - self.since
        print("Epoch %d, loss: %.2f, time: %dmin %ds" % 
                    (self.epoch, epoch_loss, time_taken//60, time_taken%60))
        if self.best_loss > epoch_loss:
            self.best_loss = epoch_loss
            print("Better model. Best loss: %.2f" % self.best_loss)
            model.save(self.save_path)
            print("Model %s save done!" % self.save_path)

        self.pre_loss = cum_loss
        # Restart the timer so the next report covers only the next epoch.
        self.since = time.time()

In [None]:
# Load the question/answer pairs; column names are supplied explicitly.
train = pd.read_csv(
    '/home/kesci/input/bytedance/train_final.csv',
    names=['qId', 'q', 'aId', 'a', 'target'],
    nrows=62500000,
)
test = pd.read_csv(
    '/home/kesci/input/bytedance/bytedance_contest.final_2.csv',
    names=['qId', 'q', 'aId', 'a'],
)
# Drop the last character of every test answer
# (presumably a trailing separator in the raw file -- TODO confirm).
test['a'] = test['a'].apply(lambda text: text[:-1])
target = train['target']
# Width of the word vectors used for the embedding matrix.
embed_size = 200

In [None]:
# Fit one shared, case-sensitive, space-split vocabulary over every text column
# of both train and test, then turn each column into integer id sequences.
tokenizer = Tokenizer(lower=False, char_level=False, split=' ')
tokenizer.fit_on_texts(
    train['q'].tolist()
    + train['a'].tolist()
    + test['q'].tolist()
    + test['a'].tolist()
)
train_q = tokenizer.texts_to_sequences(train['q'])
train_a = tokenizer.texts_to_sequences(train['a'])
test_q = tokenizer.texts_to_sequences(test['q'])
test_a = tokenizer.texts_to_sequences(test['a'])

In [None]:
# The raw frames are no longer needed once the id sequences exist; release them.
del train, test
gc.collect()

# Every sequence is padded / truncated to this many token ids (pad id 0).
maxlen = 40
train_q = pad_sequences(train_q, maxlen=maxlen, value=0)
train_a = pad_sequences(train_a, maxlen=maxlen, value=0)
test_q = pad_sequences(test_q, maxlen=maxlen, value=0)
test_a = pad_sequences(test_a, maxlen=maxlen, value=0)

In [None]:
# Build the (max_features, embed_size) embedding matrix from the pretrained
# word2vec model. Row i holds the vector of the word whose tokenizer id is i;
# rows of words without a pretrained vector stay all-zero.
# NOTE(review): ids >= max_features get no row here, yet the tokenizer is not
# capped, so such ids can still occur in the padded sequences -- confirm the
# vocabulary really is smaller than max_features.
max_features=750000
embedding_matrix=np.zeros((max_features,embed_size))
from gensim.models import Word2Vec
w2vmodel=Word2Vec.load('/home/kesci/work/w2vfinal_all.model')
# Look vectors up through the KeyedVectors object: indexing the model itself
# (`w2vmodel[word]`) is deprecated and fails on gensim >= 4, which the former
# bare `except:` silently turned into an all-zero matrix.
word_vectors = w2vmodel.wv
num_oov=0
for word, word_id in tokenizer.word_index.items():
    if word_id >= max_features:
        # No row available for this id; counted as OOV, as before.
        num_oov+=1
        continue
    try:
        embedding_matrix[word_id]=word_vectors[word]
    except KeyError:
        # Word is missing from the pretrained vocabulary.
        num_oov+=1
print('oov num is '+str(num_oov))


In [None]:

# Load the precomputed feature tables and replace missing values with 0.
train_fea = pd.read_csv(
    '/home/kesci/work/train_feature_freq_final.csv',
    nrows=62500000,
).fillna(0)
test_fea = pd.read_csv('/home/kesci/work/test_feature_freq_final.csv').fillna(0)

# Infinite values are zeroed as well (done in place on the filled frames).
train_fea[np.isinf(train_fea)] = 0
test_fea[np.isinf(test_fea)] = 0

In [None]:
class Attention(Layer):
    """Attention pooling over the time axis of a 3D tensor.

    For an input of shape (batch, step_dim, features_dim) the layer scores
    every time step with a learned weight vector (plus an optional per-step
    bias), squashes the scores with tanh, exponentiates and normalises them
    over the time axis, and returns the weighted sum of the time steps, i.e.
    a tensor of shape (batch, features_dim).

    Args:
        step_dim: Number of time steps of the input (size of axis 1).
        W_regularizer, b_regularizer: Optional regularizers for the weight
            vector and the bias.
        W_constraint, b_constraint: Optional constraints for the weight
            vector and the bias.
        bias: Whether to add a learned per-step bias to the scores.
    """
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Set for real in build(), once the input shape is known.
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring vector W (features_dim,) and the bias b (steps,)."""
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='%s_W'%self.name,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='%s_b'%self.name,
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim
        # Flatten to 2D for the dot product (a direct K.dot(x, W) is noted as
        # unsupported on the TF backend), then restore (batch, step_dim).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)
        a = K.exp(eij)
        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # (batch, steps) -> (batch, steps, 1) so the weights broadcast over features.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created.

        Without this, a saved model containing the layer cannot be rebuilt
        from its config because the required `step_dim` would be missing.
        """
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [None]:
def DSSM(emb_matrix, max_sequence_length,feature_shape, lstmsize=90):
    """Build and compile the two-tower sentence matching model.

    Both token-id inputs share one frozen embedding and the same encoders.
    Each sentence is encoded as [attention-pooled embeddings, BiLSTM state]
    and, separately, by a two-layer stacked LSTM. The pair is compared via
    element-wise product, absolute difference and maximum of squares, joined
    with a dense projection of the extra features, and mapped to a single
    sigmoid output.

    Args:
        emb_matrix: Pretrained embedding matrix of shape (vocab, emb_dim).
        max_sequence_length: Length of the padded token-id sequences.
        feature_shape: Number of extra per-pair features.
        lstmsize: Number of units of every LSTM layer.

    Returns:
        A compiled Keras model taking [input1, input2, features] and trained
        with Adam on binary cross-entropy.
    """
    embedding = Embedding(
        input_dim=emb_matrix.shape[0],
        output_dim=emb_matrix.shape[1],
        weights=[emb_matrix],
        input_length=max_sequence_length,
        trainable=False
    )

    input1 = Input(shape=(max_sequence_length,))
    input2 = Input(shape=(max_sequence_length,))

    # Encoders; each instance is applied to both inputs, so weights are shared.
    lstm0 = CuDNNLSTM(lstmsize,return_sequences = True)
    lstm1 = Bidirectional(CuDNNLSTM(lstmsize))
    lstm2 = CuDNNLSTM(lstmsize)
    att1 = Attention(max_sequence_length)

    emb1 = embedding(input1)
    emb2 = embedding(input2)
    bi1 = lstm1(emb1)
    bi2 = lstm1(emb2)
    # Stacked LSTM encoding: lstm0 emits the full sequence, lstm2 its last state.
    v1ls = lstm2(lstm0(emb1))
    v2ls = lstm2(lstm0(emb2))
    # Sentence vector = attention pooling over raw embeddings + BiLSTM state.
    v1 = Concatenate(axis=1)([att1(emb1),bi1])
    v2 = Concatenate(axis=1)([att1(emb2),bi2])

    fea = Input(shape=(feature_shape,))
    feax = Dense(128, activation='relu')(fea)

    # Pairwise interaction features between the two sentence vectors.
    mul = Multiply()([v1,v2])
    sub = Lambda(lambda x: K.abs(x))(Subtract()([v1,v2]))
    maximum = Maximum()([Multiply()([v1,v1]),Multiply()([v2,v2])])
    sub2 = Lambda(lambda x: K.abs(x))(Subtract()([v1ls,v2ls]))

    matchlist = Concatenate(axis=1)([mul,sub,maximum,sub2,feax])
    matchlist = Dropout(0.05)(matchlist)

    # Two parallel dense heads (relu and sigmoid) feed the final classifier.
    matchlist = Concatenate(axis=1)([Dense(32,activation = 'relu')(matchlist),Dense(128,activation = 'sigmoid')(matchlist)])
    res = Dense(1, activation = 'sigmoid')(matchlist)

    model = Model(inputs=[input1, input2,fea ], outputs=res)
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [None]:
# Train the DSSM model on the first fold of a 5-fold split, restore the best
# checkpoint (lowest validation loss) and write the test predictions to CSV.
feature_dim=train_fea.shape[1]
num_class=1
sub1 = np.zeros((test_a.shape[0],1))

count = 0
folds = KFold(n_splits=5, shuffle=True, random_state=15)
for i, (trn_idx, val_idx) in enumerate(folds.split(train_q)):
    print("FOLD_ ", count + 1)
    filepath = "/home/kesci/work/nnDA_best_model%d.h5" % count
    checkpoint = ModelCheckpoint(
        filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss', factor=0.8, patience=1, min_lr=0.00001, verbose=1)
    earlystopping = EarlyStopping(
        monitor='val_loss', min_delta=0.0001, patience=2, verbose=1, mode='auto')
    callbacks = [checkpoint, reduce_lr, earlystopping]
    # The model builder defined in this notebook is DSSM; the previously
    # called `decomposable_attention` is not defined anywhere here.
    model_nn = DSSM(embedding_matrix, max_sequence_length=maxlen,feature_shape=feature_dim)
    X_tr, X_vl = train_q[trn_idx], train_q[val_idx]
    X_tr2, X_vl2 = train_a[trn_idx], train_a[val_idx]
    # Positional indexing, matching how the feature frame is indexed.
    y_tr, y_vl = target.iloc[trn_idx], target.iloc[val_idx]
    X_fea_tr, X_fea_vl = train_fea.iloc[trn_idx], train_fea.iloc[val_idx]
    hist = model_nn.fit([X_tr, X_tr2,X_fea_tr], y_tr, batch_size=1024, epochs=10, validation_data=([X_vl, X_vl2,X_fea_vl], y_vl),
                         callbacks=callbacks, verbose=1, shuffle=True)
    print('load_wight')
    # Roll back to the best epoch saved by the checkpoint callback.
    model_nn.load_weights(filepath)
    print('start to predict')
    sub1 = model_nn.predict([test_q, test_a,test_fea], batch_size=2024)
    # Only the first fold is trained and used for the submission.
    break
sub=pd.DataFrame(sub1.reshape(-1,1))
sub.to_csv('/home/kesci/work/nndssm_final.csv',index=False)