In [None]:
import gc
import os
import time

# Third-party imports keep their original relative order on purpose: several
# of them are star imports, and a later one may rebind names exported by an
# earlier one.
from keras.models import Model
from keras.optimizers import Nadam, Adam
from keras.layers import *
from keras.initializers import *
from keras.activations import softmax
from keras.layers import InputSpec, Layer, Input, Dense, merge, Conv1D
from keras.layers import Lambda, Activation, Dropout, Embedding, TimeDistributed
from keras.layers.pooling import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.merge import concatenate
from keras.preprocessing.sequence import pad_sequences
from keras.layers.normalization import BatchNormalization
import keras.backend as K
from keras.regularizers import l2
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Model
import gensim
from keras.callbacks import *
from keras.optimizers import *
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

In [None]:
class EpochSaver(gensim.models.callbacks.CallbackAny2Vec):
    """gensim callback that logs the loss of each epoch and saves the best model.

    After every epoch the per-epoch loss and the elapsed time are printed.
    Whenever the per-epoch loss is lower than the best one seen so far, the
    model is saved to ``<savedir>/<save_name>``.

    NOTE(review): the reported loss comes from
    ``model.get_latest_training_loss()``; presumably the model must be trained
    with loss computation enabled for it to be meaningful -- confirm.
    """

    def __init__(self, savedir, save_name="word2vector.model"):
        """Create the output directory and reset the bookkeeping state.

        Args:
            savedir: Directory the model is saved into (created if missing).
            save_name: File name of the saved model inside ``savedir``.
        """
        os.makedirs(savedir, exist_ok=True)
        self.save_path = os.path.join(savedir, save_name)
        self.epoch = 0
        # Cumulative loss reported at the end of the previous epoch.
        self.pre_loss = 0
        # Infinity (not a large finite sentinel) so the first epoch is always
        # treated as an improvement, however large its loss is.
        self.best_loss = float('inf')
        self.since = time.time()

    def on_epoch_end(self, model):
        """Print epoch statistics and save ``model`` if its loss improved."""
        self.epoch += 1
        # The value returned is accumulated since the first epoch, so the
        # loss of this epoch alone is the difference to the previous reading.
        cum_loss = model.get_latest_training_loss()
        epoch_loss = cum_loss - self.pre_loss
        time_taken = time.time() - self.since
        print("Epoch %d, loss: %.2f, time: %dmin %ds" %
                    (self.epoch, epoch_loss, time_taken//60, time_taken%60))
        if self.best_loss > epoch_loss:
            self.best_loss = epoch_loss
            print("Better model. Best loss: %.2f" % self.best_loss)
            model.save(self.save_path)
            print("Model %s save done!" % self.save_path)

        self.pre_loss = cum_loss
        # Restart the timer so the next report covers only the next epoch.
        self.since = time.time()

In [None]:
# Column names are supplied explicitly for both files; only the first
# 62,500,000 rows of the training file are read.
train = pd.read_csv(
    '/home/kesci/input/bytedance/train_final.csv',
    names=['qId', 'q', 'aId', 'a', 'target'],
    nrows=62500000,
)
test = pd.read_csv(
    '/home/kesci/input/bytedance/bytedance_contest.final_2.csv',
    names=['qId', 'q', 'aId', 'a'],
)
# Drop the last character of every test answer.
# NOTE(review): presumably a trailing artifact in this file -- confirm.
test['a'] = test['a'].apply(lambda answer: answer[:-1])

In [None]:
# Keep the labels in their own variable; the text columns are converted to
# integer sequences below.
target = train['target']
embed_size = 200

# A single tokenizer is fitted on the questions and answers of both splits so
# that all four sequence sets share one word index.
tokenizer = Tokenizer(lower=False, char_level=False, split=' ')
tokenizer.fit_on_texts(
    train['q'].tolist()
    + train['a'].tolist()
    + test['q'].tolist()
    + test['a'].tolist()
)

train_q, train_a = (tokenizer.texts_to_sequences(train[col]) for col in ('q', 'a'))
test_q, test_a = (tokenizer.texts_to_sequences(test[col]) for col in ('q', 'a'))

In [None]:
# The raw frames are no longer needed once the text has been tokenized;
# drop the names and ask the garbage collector to run.
del train
del test
gc.collect()

# Sequence length used for padding and for the model inputs.
maxlen = 40

In [None]:
# Pad/truncate every sequence to `maxlen` tokens, filling with index 0.
# `maxlen` (rather than a repeated literal) keeps the padded width in sync
# with the input length the model is built with.
train_q = pad_sequences(train_q, maxlen=maxlen, value=0)
train_a = pad_sequences(train_a, maxlen=maxlen, value=0)
test_q = pad_sequences(test_q, maxlen=maxlen, value=0)
test_a = pad_sequences(test_a, maxlen=maxlen, value=0)

In [None]:
from gensim.models import Word2Vec

# Row i of the matrix holds the vector of the word with tokenizer index i;
# rows of words without a pretrained vector stay all-zero.
# The tokenizer has no vocabulary cap, so the sequences may contain any index
# up to len(word_index). Keep the historical 750000-row size as a floor, but
# grow the matrix when the vocabulary is larger so that no index falls outside
# the embedding table.
max_features = max(750000, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((max_features, embed_size))

w2vmodel = Word2Vec.load('/home/kesci/work/w2vfinal_all.model')
# Look words up through the KeyedVectors object rather than the model itself.
word_vectors = w2vmodel.wv

num_oov = 0
for word, index in tokenizer.word_index.items():
    try:
        embedding_matrix[index] = word_vectors[word]
    except KeyError:
        # Only a missing word counts as out-of-vocabulary; any other failure
        # (e.g. a vector-size mismatch) is a real error and must surface.
        num_oov += 1
print('oov num is ' + str(num_oov))

In [None]:
# Hand-crafted feature tables; the training one is limited to the same number
# of rows as the training text.
train_fea = pd.read_csv(
    '/home/kesci/work/train_feature_freq_final.csv',
    nrows=62500000,
)
test_fea = pd.read_csv(
    '/home/kesci/work/test_feature_freq_final.csv',
)

In [None]:
# Replace missing values, then infinite values, with 0 in each feature table.
train_fea = train_fea.fillna(0)
train_fea.mask(np.isinf(train_fea), 0, inplace=True)

test_fea = test_fea.fillna(0)
test_fea.mask(np.isinf(test_fea), 0, inplace=True)

In [None]:
def unchanged_shape(input_shape):
    """Return ``input_shape`` as-is.

    Used as the ``output_shape`` callback of ``Lambda`` layers whose
    operation does not alter the shape of its input.
    """
    return input_shape


def substract(input_1, input_2):
    """Return ``input_1 - input_2`` element-wise.

    The difference is built as ``input_1 + (-input_2)``: ``input_2`` is
    negated by a ``Lambda`` layer and the result is summed with ``input_1``
    by an ``Add`` layer.
    """
    negate = Lambda(lambda tensor: -tensor, output_shape=unchanged_shape)
    negated_input_2 = negate(input_2)
    return Add()([input_1, negated_input_2])


def submult(input_1, input_2):
    """Concatenate the element-wise difference and product of two inputs.

    Returns the concatenation ``[input_1 - input_2, input_1 * input_2]``,
    in that order.
    """
    product = Multiply()([input_1, input_2])
    difference = substract(input_1, input_2)
    return Concatenate()([difference, product])


def apply_multiple(input_, layers):
    """Feed ``input_`` to every layer and concatenate the outputs.

    Args:
        input_: Value passed to each layer.
        layers: Sequence of at least two callables (layers).

    Returns:
        The concatenation of the individual layer outputs, in the order of
        ``layers``.

    Raises:
        ValueError: If ``layers`` holds fewer than two layers.
    """
    if len(layers) <= 1:
        raise ValueError('Layers list should contain more than 1 layer')
    branches = [layer(input_) for layer in layers]
    return Concatenate()(branches)


def time_distributed(input_, layers):
    """Chain ``layers`` over ``input_``, each wrapped in ``TimeDistributed``.

    The output of one wrapped layer is the input of the next one. With an
    empty ``layers`` sequence, ``input_`` is returned unchanged.
    """
    current = input_
    for inner_layer in layers:
        wrapped = TimeDistributed(inner_layer)
        current = wrapped(current)
    return current


def soft_attention_alignment(input_1, input_2):
    """Softly align two sequence representations with dot-product attention.

    A score tensor is computed as the dot product of the two inputs over their
    last axis. It is soft-maxed once over axis 1 and once over axis 2 (the
    latter is then transposed), and each weight tensor is contracted with the
    corresponding input over axis 1.

    NOTE(review): assumes both inputs are (batch, steps, dim) tensors --
    confirm against the callers.

    Returns:
        A pair ``(in1_aligned, in2_aligned)``: ``input_1`` weighted by the
        axis-1 softmax, and ``input_2`` weighted by the transposed axis-2
        softmax.
    """
    scores = Dot(axes=-1)([input_1, input_2])

    normalize_axis_1 = Lambda(lambda x: softmax(x, axis=1),
                              output_shape=unchanged_shape)
    weights_1 = normalize_axis_1(scores)

    transpose = Permute((2, 1))
    normalize_axis_2 = Lambda(lambda x: softmax(x, axis=2),
                              output_shape=unchanged_shape)
    weights_2 = transpose(normalize_axis_2(scores))

    aligned_1 = Dot(axes=1)([weights_1, input_1])
    aligned_2 = Dot(axes=1)([weights_2, input_2])
    return aligned_1, aligned_2

In [None]:
def decomposable_attention(emb_matrix,maxlen,feature_shape, 
                           projection_dim=300, projection_hidden=0, projection_dropout=0.2,
                           compare_dim=300, compare_dropout=0.2,
                           dense_dim=300, dense_dropout=0.2,
                           lr=1e-3, activation='elu'):
    """Build and compile a decomposable-attention model with extra features.

    Two token-id sequences are embedded with a shared, frozen embedding,
    projected, softly aligned against each other, compared, pooled, and merged
    with a dense projection of a hand-crafted feature vector. The result goes
    through a two-layer classifier ending in a single sigmoid unit.

    Args:
        emb_matrix: 2-D array of pretrained embeddings; ``shape[0]`` is used
            as the vocabulary size and ``shape[1]`` as the embedding size.
        maxlen: Length of each input sequence.
        feature_shape: Number of hand-crafted features per sample.
        projection_dim: Units of the projection layer.
        projection_hidden: Units of an optional hidden projection layer;
            ``0`` disables it.
        projection_dropout: Dropout rate used after each projection layer.
        compare_dim: Units of each comparison layer.
        compare_dropout: Dropout rate used after each comparison layer.
        dense_dim: Units of each classifier layer.
        dense_dropout: Dropout rate used after each classifier layer.
        lr: Learning rate of the Adam optimizer.
        activation: Activation of the hidden dense layers.

    Returns:
        A compiled Keras ``Model`` with inputs ``[q1, q2, fea]`` and one
        sigmoid output, trained with binary cross-entropy.
    """
    q1 = Input(name='q1',shape=(maxlen,))
    q2 = Input(name='q2',shape=(maxlen,))

    # One frozen embedding layer shared by both sequences.
    embedding = Embedding(
        input_dim=emb_matrix.shape[0],
        output_dim=emb_matrix.shape[1],
        weights=[emb_matrix],
        input_length=maxlen,
        trainable=False
    )
    q1_embed = embedding(q1)
    q2_embed = embedding(q2)

    # Projection: the same layer instances are applied to both sequences, so
    # their weights are shared.
    projection_layers = []
    if projection_hidden > 0:
        projection_layers.extend([
                Dense(projection_hidden, activation=activation),
                Dropout(rate=projection_dropout),
            ])
    projection_layers.extend([
            Dense(projection_dim, activation=None),
            Dropout(rate=projection_dropout),
        ])
    q1_encoded = time_distributed(q1_embed, projection_layers)
    q2_encoded = time_distributed(q2_embed, projection_layers)

    # Attention
    q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded)

    # Compare: each sequence is paired with the aligned view of the other one.
    q1_combined = Concatenate()([q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)])
    q2_combined = Concatenate()([q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)])
    compare_layers = [
        Dense(compare_dim, activation=activation),
        Dropout(compare_dropout),
        Dense(compare_dim, activation=activation),
        Dropout(compare_dropout),
    ]
    q1_compare = time_distributed(q1_combined, compare_layers)
    q2_compare = time_distributed(q2_combined, compare_layers)

    # Aggregate: average- and max-pool each compared sequence.
    q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
    q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])

    # Hand-crafted features get their own dense projection before merging.
    fea = Input(shape=(feature_shape,))
    feax = Dense(128, activation='relu')(fea)
    merged = Concatenate()([q1_rep, q2_rep,feax])
    dense = Dense(dense_dim, activation=activation)(merged)
    dense = Dropout(dense_dropout)(dense)
    dense = Dense(dense_dim, activation=activation)(dense)
    dense = Dropout(dense_dropout)(dense)
    out_ = Dense(1, activation='sigmoid')(dense)
    model = Model(inputs=[q1, q2,fea], outputs=out_)
    # Build the optimizer explicitly so that the `lr` argument takes effect;
    # the string shortcut 'adam' would ignore it.
    model.compile(optimizer=Adam(lr=lr), loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [None]:
feature_dim = train_fea.shape[1]
num_class = 1
# Placeholder; replaced by the model predictions below.
sub1 = np.zeros((test_a.shape[0], 1))

folds = KFold(n_splits=5, shuffle=True, random_state=15)
# `count` is the fold index taken from enumerate, so the log line and the
# checkpoint file name stay distinct per fold if more than one fold is run.
for count, (trn_idx, val_idx) in enumerate(folds.split(train_q)):
    print("FOLD_ ", count + 1)
    filepath = "/home/kesci/work/nnDA_best_model%d.h5" % count

    # Keep the best weights (by validation loss), shrink the learning rate
    # when validation loss stalls, and stop early after two stale epochs.
    checkpoint = ModelCheckpoint(
        filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss', factor=0.8, patience=1, min_lr=0.00001, verbose=1)
    earlystopping = EarlyStopping(
        monitor='val_loss', min_delta=0.0001, patience=2, verbose=1, mode='auto')
    callbacks = [checkpoint, reduce_lr, earlystopping]

    model_nn = decomposable_attention(
        embedding_matrix, maxlen=maxlen, feature_shape=feature_dim)

    # KFold yields positional indices, so every container is sliced by
    # position (`.iloc` for the pandas objects).
    X_tr, X_vl = train_q[trn_idx], train_q[val_idx]
    X_tr2, X_vl2 = train_a[trn_idx], train_a[val_idx]
    y_tr, y_vl = target.iloc[trn_idx], target.iloc[val_idx]
    X_fea_tr, X_fea_vl = train_fea.iloc[trn_idx], train_fea.iloc[val_idx]

    hist = model_nn.fit([X_tr, X_tr2, X_fea_tr], y_tr, batch_size=1024, epochs=10,
                        validation_data=([X_vl, X_vl2, X_fea_vl], y_vl),
                        callbacks=callbacks, verbose=1, shuffle=True)

    # Predict with the best checkpoint, not with the weights of the last epoch.
    print('load_wight')
    model_nn.load_weights(filepath)
    print('start to predict')
    sub1 = model_nn.predict([test_q, test_a, test_fea], batch_size=2024)
    # Only the first fold is trained and used for the submission.
    break

sub = pd.DataFrame(sub1.reshape(-1, 1))
sub.to_csv('/home/kesci/work/nnda_final.csv', index=False)