In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
from collections import OrderedDict, namedtuple
from itertools import chain
import tensorflow as tf
from tensorflow.python.keras.utils import np_utils
from deepctr.layers.utils import Hash,concat_fun
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.regularizers import l2
from deepctr.layers import *
from tensorflow.python.keras.callbacks import *
from tensorflow.python.keras.layers import *
from tensorflow.python.keras.models import *
from tensorflow.python.keras.initializers import *
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tqdm import tqdm
warnings.filterwarnings('ignore')

In [2]:
def reduce_sum(input_tensor,axis=None,keep_dims=False,name=None,reduction_indices=None):
    """Version-tolerant wrapper around ``tf.reduce_sum``.

    Args:
        input_tensor: tensor to reduce.
        axis: axis (or axes) to sum over; ``None`` reduces everything.
        keep_dims: whether reduced axes are retained with length 1.
        name: optional op name.
        reduction_indices: forwarded only on TensorFlow 1.x; ignored on 2.x.

    Returns:
        Whatever ``tf.reduce_sum`` returns for the given arguments.
    """
    # Compare the numeric major version: a plain string comparison is
    # lexicographic and would classify e.g. '10.0.0' as older than '2.0.0'.
    major_version = int(tf.__version__.split('.')[0])
    if major_version < 2:
        return tf.reduce_sum(input_tensor,
                             axis=axis,
                             keep_dims=keep_dims,
                             name=name,
                             reduction_indices=reduction_indices)
    # TF >= 2 renamed the keyword to `keepdims` and dropped `reduction_indices`.
    return tf.reduce_sum(input_tensor,
                         axis=axis,
                         keepdims=keep_dims,
                         name=name)
class Linear(tf.keras.layers.Layer):
    """Linear-term layer producing ``logit + bias``.

    ``mode`` selects how the logit is formed from ``inputs``:

    * ``0`` - ``inputs`` is summed over its last axis (axis kept).
    * ``1`` - ``inputs`` is passed through a bias-free 1-unit dense layer.
    * any other value - ``inputs`` is a ``(sparse, dense)`` pair; the sparse
      part is summed over its last axis (axis dropped) and added to the dense
      projection of the dense part.
    """

    def __init__(self, l2_reg=0.0, mode=0, **kwargs):
        self.l2_reg = l2_reg
        self.mode = mode
        super().__init__(**kwargs)

    def build(self, input_shape):
        # Scalar bias shared by every mode.
        self.bias = self.add_weight(
            name='linear_bias',
            shape=(1,),
            initializer=tf.keras.initializers.Zeros(),
            trainable=True,
        )
        # Bias-free projection with L2 penalty; only used when mode != 0.
        self.dense = tf.keras.layers.Dense(
            units=1,
            activation=None,
            use_bias=False,
            kernel_regularizer=tf.keras.regularizers.l2(self.l2_reg),
        )
        super().build(input_shape)

    def call(self, inputs , **kwargs):
        if self.mode == 0:
            logit = reduce_sum(inputs, axis=-1, keep_dims=True)
        elif self.mode == 1:
            logit = self.dense(inputs)
        else:
            sparse_part, dense_part = inputs
            logit = reduce_sum(sparse_part, axis=-1, keep_dims=False) + self.dense(dense_part)
        return logit + self.bias

    def compute_output_shape(self, input_shape):
        return (None, 1)

    def get_config(self, ):
        own_config = {'mode': self.mode, 'l2_reg': self.l2_reg}
        merged = dict(super().get_config())
        # Own keys win over identically named base keys, as in the original merge.
        merged.update(own_config)
        return merged
class SparseFeat(namedtuple('SparseFeat', 'name dimension use_hash dtype embedding_name embedding')):
    """Immutable description of a single-valued categorical feature.

    When ``embedding`` is truthy and no ``embedding_name`` is supplied, the
    embedding name defaults to the feature ``name``.
    """
    __slots__ = ()

    def __new__(cls, name, dimension, use_hash=False, dtype="int32", embedding_name=None,embedding=True):
        needs_default_name = embedding and embedding_name is None
        resolved_name = name if needs_default_name else embedding_name
        return super().__new__(cls, name, dimension, use_hash, dtype, resolved_name, embedding)
class DenseFeat(namedtuple('DenseFeat', 'name dimension dtype')):
    """Immutable description of a numeric feature (``dimension`` values per row)."""
    __slots__ = ()

    def __new__(cls, name, dimension=1, dtype="float32"):
        return super().__new__(cls, name, dimension, dtype)
class VarLenSparseFeat(namedtuple('VarLenFeat', 'name dimension maxlen combiner use_hash dtype embedding_name embedding')):
    """Immutable description of a variable-length (sequence) categorical feature.

    ``embedding_name`` falls back to ``name`` whenever it is not supplied,
    regardless of the ``embedding`` flag.
    """
    __slots__ = ()

    def __new__(cls, name, dimension, maxlen, combiner="mean", use_hash=False, dtype="float32", embedding_name=None,embedding=True):
        resolved_name = name if embedding_name is None else embedding_name
        return super().__new__(cls, name, dimension, maxlen, combiner, use_hash, dtype, resolved_name, embedding)
def get_fixlen_feature_names(feature_columns):
    """Return the input names of the fixed-length columns, in creation order."""
    fixlen_inputs = build_input_features(feature_columns, include_varlen=False, include_fixlen=True)
    return [input_name for input_name in fixlen_inputs.keys()]
def get_varlen_feature_names(feature_columns):
    """Return the input names of the variable-length columns, in creation order."""
    varlen_inputs = build_input_features(feature_columns, include_varlen=True, include_fixlen=False)
    return [input_name for input_name in varlen_inputs.keys()]
def get_inputs_list(inputs):
    """Flatten the values of several mappings into one list, skipping ``None`` entries."""
    flattened = []
    for mapping in inputs:
        if mapping is None:
            continue
        flattened.extend(mapping.values())
    return flattened
def build_input_features(feature_columns, include_varlen=True, mask_zero=True, prefix='',include_fixlen=True):
    """Create one Keras ``Input`` per feature column.

    Args:
        feature_columns: iterable of ``SparseFeat`` / ``DenseFeat`` /
            ``VarLenSparseFeat`` descriptors.
        include_varlen: create inputs for ``VarLenSparseFeat`` columns.
        mask_zero: when False, each variable-length column additionally gets a
            ``<name>_seq_length`` input and a ``<name>_seq_max_length`` entry
            holding the column's ``maxlen``.
        prefix: string prepended to every input name.
        include_fixlen: create inputs for ``SparseFeat`` / ``DenseFeat`` columns.

    Returns:
        OrderedDict mapping feature name to its input (fixed-length first).
    """
    input_features = OrderedDict()
    if include_fixlen:
        for fc in feature_columns:
            if isinstance(fc, SparseFeat):
                input_features[fc.name] = Input(
                    shape=(1,), name=prefix + fc.name, dtype=fc.dtype)
            elif isinstance(fc, DenseFeat):
                input_features[fc.name] = Input(
                    shape=(fc.dimension,), name=prefix + fc.name, dtype=fc.dtype)
    if include_varlen:
        varlen_columns = [fc for fc in feature_columns if isinstance(fc, VarLenSparseFeat)]
        for fc in varlen_columns:
            input_features[fc.name] = Input(shape=(fc.maxlen,), name=prefix + 'seq_' + fc.name,
                                            dtype=fc.dtype)
        if not mask_zero:
            # Only sequence columns carry `maxlen`; iterating every column here
            # raised AttributeError as soon as a fixed-length column was present.
            for fc in varlen_columns:
                input_features[fc.name + "_seq_length"] = Input(
                    shape=(1,), name=prefix + 'seq_length_' + fc.name)
                input_features[fc.name + "_seq_max_length"] = fc.maxlen

    return input_features
def create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, embedding_size, init_std, seed, l2_reg,prefix='sparse_', seq_mask_zero=True):
    """Create one ``Embedding`` layer per column, keyed by ``embedding_name``.

    With ``embedding_size == 'auto'`` each layer's width is
    ``6 * int(dimension ** 0.25)``; otherwise ``embedding_size`` is used as is.
    Layers for variable-length columns are created after the fixed-length
    ones (overwriting any entry with the same key) and receive
    ``mask_zero=seq_mask_zero``.
    """
    use_auto_size = embedding_size == 'auto'
    if use_auto_size:
        print("Notice:Do not use auto embedding in models other than DCN")

    def _width(feat):
        # Output dimension of the embedding for one column.
        if use_auto_size:
            return 6 * int(pow(feat.dimension, 0.25))
        return embedding_size

    sparse_embedding = {}
    for feat in sparse_feature_columns:
        sparse_embedding[feat.embedding_name] = Embedding(
            feat.dimension, _width(feat),
            embeddings_initializer=RandomNormal(mean=0.0, stddev=init_std, seed=seed),
            embeddings_regularizer=l2(l2_reg),
            name=prefix + '_emb_' + feat.name)

    if varlen_sparse_feature_columns:
        for feat in varlen_sparse_feature_columns:
            sparse_embedding[feat.embedding_name] = Embedding(
                feat.dimension, _width(feat),
                embeddings_initializer=RandomNormal(mean=0.0, stddev=init_std, seed=seed),
                embeddings_regularizer=l2(l2_reg),
                name=prefix + '_seq_emb_' + feat.name,
                mask_zero=seq_mask_zero)

    return sparse_embedding
def get_embedding_vec_list(embedding_dict, input_dict, sparse_feature_columns, return_feat_list=(), mask_feat_list=()):
    """Look up embeddings by feature name for the selected sparse columns.

    An empty ``return_feat_list`` selects every column. Columns flagged with
    ``use_hash`` have their input passed through a ``Hash`` layer first.
    """
    select_all = len(return_feat_list) == 0
    vectors = []
    for column in sparse_feature_columns:
        name = column.name
        if not (select_all or name in return_feat_list):
            continue
        if column.use_hash:
            indices = Hash(column.dimension, mask_zero=(name in mask_feat_list))(input_dict[name])
        else:
            indices = input_dict[name]
        vectors.append(embedding_dict[name](indices))
    return vectors
def create_embedding_matrix(feature_columns,l2_reg,init_std,seed,embedding_size, prefix="",seq_mask_zero=True):
    """Build the embedding-layer dict for every column whose ``embedding`` flag is set."""
    columns = feature_columns if feature_columns else []
    sparse_feature_columns = [
        fc for fc in columns if isinstance(fc, SparseFeat) and fc.embedding]
    varlen_sparse_feature_columns = [
        fc for fc in columns if isinstance(fc, VarLenSparseFeat) and fc.embedding]
    return create_embedding_dict(
        sparse_feature_columns,
        varlen_sparse_feature_columns,
        embedding_size,
        init_std,
        seed,
        l2_reg,
        prefix=prefix + 'sparse',
        seq_mask_zero=seq_mask_zero,
    )
def get_linear_logit(features, feature_columns, units=1, l2_reg=0, init_std=0.0001, seed=1024, prefix='linear'):
    """Build ``units`` linear logits and concatenate them.

    Each unit gets its own set of size-1 embeddings (layer-name prefix
    ``prefix + str(i)``); dense inputs are shared between units.

    Args:
        features: mapping of feature name to input tensor.
        feature_columns: feature descriptors used by the linear part.
        units: number of independent linear logits.
        l2_reg: L2 strength for embeddings and the dense projection.
        init_std: std-dev of the embedding initialiser.
        seed: initialiser seed.
        prefix: layer-name prefix.

    Returns:
        The result of ``concat_fun`` over the per-unit logits.

    Raises:
        NotImplementedError: if there are neither sparse nor dense inputs.
    """
    linear_emb_list = [
        input_from_feature_columns(features, feature_columns, 1, l2_reg, init_std, seed,
                                   prefix=prefix + str(i))[0]
        for i in range(units)]
    # Dense inputs are read straight from `features`. The previous extra call to
    # input_from_feature_columns built a whole unused set of embedding layers
    # only to return this same list.
    dense_input_list = get_dense_input(features, feature_columns)

    linear_logit_list = []
    for linear_emb in linear_emb_list:
        has_sparse = len(linear_emb) > 0
        has_dense = len(dense_input_list) > 0
        if has_sparse and has_dense:
            sparse_input = concat_fun(linear_emb)
            dense_input = concat_fun(dense_input_list)
            linear_logit = Linear(l2_reg, mode=2)([sparse_input, dense_input])
        elif has_sparse:
            sparse_input = concat_fun(linear_emb)
            linear_logit = Linear(l2_reg, mode=0)(sparse_input)
        elif has_dense:
            dense_input = concat_fun(dense_input_list)
            linear_logit = Linear(l2_reg, mode=1)(dense_input)
        else:
            raise NotImplementedError
        linear_logit_list.append(linear_logit)

    return concat_fun(linear_logit_list)
def embedding_lookup(sparse_embedding_dict,sparse_input_dict,sparse_feature_columns,return_feat_list=(), mask_feat_list=()):
    """Look up embeddings (by ``embedding_name``) for the selected sparse columns.

    Args:
        sparse_embedding_dict: mapping of embedding name to embedding layer.
        sparse_input_dict: mapping of feature name to input tensor.
        sparse_feature_columns: column descriptors to consider.
        return_feat_list: feature names to include; empty means all.
        mask_feat_list: feature names whose ``Hash`` layer uses ``mask_zero=True``.

    Returns:
        List of embedded tensors, in column order. Columns whose ``embedding``
        flag is false are always skipped.
    """
    select_all = len(return_feat_list) == 0
    embedding_vec_list = []
    for fc in sparse_feature_columns:
        feature_name = fc.name
        embedding_name = fc.embedding_name
        # Parenthesised on purpose: `a or b and c` parses as `a or (b and c)`,
        # which let columns with embedding=False through whenever the filter
        # list was empty and then failed on the missing embedding layer.
        if (select_all or feature_name in return_feat_list) and fc.embedding:
            if fc.use_hash:
                lookup_idx = Hash(fc.dimension, mask_zero=(feature_name in mask_feat_list))(sparse_input_dict[feature_name])
            else:
                lookup_idx = sparse_input_dict[feature_name]

            embedding_vec_list.append(sparse_embedding_dict[embedding_name](lookup_idx))

    return embedding_vec_list
def varlen_embedding_lookup(embedding_dict, sequence_input_dict, varlen_sparse_feature_columns):
    """Embed every variable-length column; returns ``{feature name: embedded sequence}``.

    Hashed columns go through ``Hash(dimension, mask_zero=True)`` before lookup.
    """
    embedded_sequences = {}
    for column in varlen_sparse_feature_columns:
        name = column.name
        indices = (
            Hash(column.dimension, mask_zero=True)(sequence_input_dict[name])
            if column.use_hash
            else sequence_input_dict[name]
        )
        embedded_sequences[name] = embedding_dict[column.embedding_name](indices)
    return embedded_sequences
def get_varlen_pooling_list(embedding_dict, features, varlen_sparse_feature_columns):
    """Pool each embedded sequence with its column's ``combiner``.

    If ``features`` contains ``<name>_seq_length`` the pooling layer receives
    the explicit length and masking is disabled; otherwise it relies on masking.
    """
    pooled = []
    for column in varlen_sparse_feature_columns:
        name = column.name
        length_key = name + '_seq_length'
        if length_key in features:
            pooling_layer = SequencePoolingLayer(column.combiner, supports_masking=False)
            pooled.append(pooling_layer([embedding_dict[name], features[length_key]]))
        else:
            pooling_layer = SequencePoolingLayer(column.combiner, supports_masking=True)
            pooled.append(pooling_layer(embedding_dict[name]))
    return pooled
def get_dense_input(features,feature_columns):
    """Return the input tensors of all ``DenseFeat`` columns, in column order."""
    if not feature_columns:
        return []
    return [features[fc.name] for fc in feature_columns if isinstance(fc, DenseFeat)]
def input_from_feature_columns(features,feature_columns, embedding_size, l2_reg, init_std, seed,prefix='',seq_mask_zero=True,support_dense=True):
    """Create embeddings for ``feature_columns`` and look them up in ``features``.

    Returns:
        ``(sparse_embedding_list, dense_value_list)`` - embedded sparse columns
        followed by pooled sequence columns, and the raw dense inputs.

    Raises:
        ValueError: if dense columns are present while ``support_dense`` is False.
    """
    columns = feature_columns if feature_columns else []
    sparse_feature_columns = [fc for fc in columns if isinstance(fc, SparseFeat)]
    varlen_sparse_feature_columns = [fc for fc in columns if isinstance(fc, VarLenSparseFeat)]

    embedding_dict = create_embedding_matrix(
        feature_columns, l2_reg, init_std, seed, embedding_size,
        prefix=prefix, seq_mask_zero=seq_mask_zero)
    sparse_embedding_list = embedding_lookup(embedding_dict, features, sparse_feature_columns)

    dense_value_list = get_dense_input(features, feature_columns)
    if len(dense_value_list) > 0 and not support_dense:
        raise ValueError("DenseFeat is not supported in dnn_feature_columns")

    sequence_embed_dict = varlen_embedding_lookup(
        embedding_dict, features, varlen_sparse_feature_columns)
    pooled_sequences = get_varlen_pooling_list(
        sequence_embed_dict, features, varlen_sparse_feature_columns)
    sparse_embedding_list.extend(pooled_sequences)

    return sparse_embedding_list, dense_value_list
def combined_dnn_input(sparse_embedding_list,dense_value_list):
    """Flatten and concatenate sparse embeddings and dense values into one DNN input.

    Raises:
        NotImplementedError: if both lists are empty.
    """
    has_sparse = len(sparse_embedding_list) > 0
    has_dense = len(dense_value_list) > 0
    if not has_sparse and not has_dense:
        raise NotImplementedError

    flattened_parts = []
    if has_sparse:
        flattened_parts.append(Flatten()(concat_fun(sparse_embedding_list)))
    if has_dense:
        flattened_parts.append(Flatten()(concat_fun(dense_value_list)))

    if len(flattened_parts) == 2:
        return concat_fun(flattened_parts)
    return flattened_parts[0]

In [3]:
# Invite logs are whitespace-delimited; column names are supplied explicitly.
# The separator is a regex, so it is written as a raw string: '\s' in a normal
# string literal is an invalid escape sequence.
train = pd.read_csv('invite_info.txt', sep=r'\s+', names=['qid', 'uid', 'time', 'target'])
test = pd.read_csv('invite_info_evaluate_1.txt', sep=r'\s+', names=['qid', 'uid', 'time'])
feature_answer_base = pd.read_csv('feature_answer_base.csv')
# member_info=pd.read_csv('member_info.txt',sep=r'\s+',names=['uid','sex','key_word','num_level','hot_level','regis_type','regis_platform',
#                                                           'look_freq','a','b','c','d','e','A','B','C','D','E','salt','l_topic','topic_n'])
# ques_info=pd.read_csv('question_info.txt',sep=r'\s+',names=['qid','qtime','qtitle','qtitlec','qinfo','qinfoc','qtopic'])

the JSON object must be str, not 'bytes'


In [4]:
# Drop the invite timestamp; only the ids (and the label for train) are kept.
train = train.loc[:, ['qid', 'uid', 'target']]
test = test.loc[:, ['qid', 'uid']]

In [5]:
# Columns of the per-user answer statistics that are excluded before merging.
drop_col = [
    'picture_max', 'video_max', 'video_min', 'collect_max', 'collect_min',
    'report_mean', 'report_max', 'report_sum', 'report_min', 'report_std',
    'nohelp_mean', 'nohelp_max', 'nohelp_min', 'nohelp_std',
    'oppose_max', 'oppose_sum', 'oppose_std',
]
feature_answer_base = feature_answer_base.drop(drop_col, axis=1)
# Attach the remaining per-user statistics to every invite row.
train = train.merge(feature_answer_base, how='left', on='uid')
test = test.merge(feature_answer_base, how='left', on='uid')

In [6]:
feature_topic_count = pd.read_csv('feature_topic_count.csv')
# The feature table is split purely by row position: the first len(train) rows
# go to train, the rest to test. A length mismatch would silently misalign
# rows or pad with NaN, so fail loudly instead.
n_train = len(train)
assert len(feature_topic_count) == n_train + len(test), (
    "feature_topic_count.csv has %d rows, expected %d (train + test)"
    % (len(feature_topic_count), n_train + len(test)))
train = pd.concat([train, feature_topic_count.iloc[:n_train, :]], axis=1)
test = pd.concat([test, feature_topic_count.iloc[n_train:, :].reset_index(drop=True)], axis=1)

In [7]:
feature_topic_tfidf = pd.read_csv('feature_topic_tfidf.csv')
feature_topic_w2vdis = pd.read_csv('feature_topic_w2vdis.csv')
feature_qid_freq = pd.read_csv('feature_qid_freq.csv')
feature_uid_freq = pd.read_csv('feature_uid_freq.csv')
# feature_uid_ctr.csv was previously read twice in a row; once is enough.
feature_uid_ctr = pd.read_csv('feature_uid_ctr.csv')
feature_member_info = pd.read_csv('feature_member_info.csv')

# Tables that are aligned with train+test purely by row position.
rowwise_features = [
    feature_topic_tfidf,
    feature_topic_w2vdis,
    feature_qid_freq,
    feature_uid_freq,
    feature_uid_ctr,
    feature_member_info,
]
n_train = len(train)
n_total = n_train + len(test)
# A table of the wrong length would silently misalign rows or pad with NaN.
assert all(len(frame) == n_total for frame in rowwise_features), (
    "every row-wise feature table must have exactly len(train) + len(test) rows")

train = pd.concat(
    [train] + [frame.iloc[:n_train, :] for frame in rowwise_features],
    axis=1)
test = pd.concat(
    [test] + [frame.iloc[n_train:, :].reset_index(drop=True) for frame in rowwise_features],
    axis=1)

In [8]:
# Key-based feature tables, left-joined onto both splits.
# NOTE(review): a left merge repeats invite rows whenever the key is not unique
# in the feature table - confirm each table has one row per key.

# Per-user answer counts.
feature_answer_count = pd.read_csv('feature_answer_count.csv')
train = train.merge(feature_answer_count, how='left', on='uid')
test = test.merge(feature_answer_count, how='left', on='uid')

# Per-question topic/answer features.
feature_topic_answer = pd.read_csv('feature_topic_answer.csv')
train = train.merge(feature_topic_answer, how='left', on='qid')
test = test.merge(feature_topic_answer, how='left', on='qid')

# Per-user topic statistics.
feature_topicn_tongji = pd.read_csv('feature_topicn_tongji.csv')
train = train.merge(feature_topicn_tongji, how='left', on='uid')
test = test.merge(feature_topicn_tongji, how='left', on='uid')

# Per-user answer "percount" features.
feature_answer_percount = pd.read_csv('feature_answer_percount.csv')
train = train.merge(feature_answer_percount, how='left', on='uid')
test = test.merge(feature_answer_percount, how='left', on='uid')

In [9]:
# Pair-level features: keep one row per (qid, uid) so the left merge cannot
# repeat invite rows.
feature_topic_meta = pd.read_csv('feature_topic_meta.csv').drop_duplicates(subset=['qid', 'uid'])
train = train.merge(feature_topic_meta, how='left', on=['qid', 'uid'])
test = test.merge(feature_topic_meta, how='left', on=['qid', 'uid'])

In [10]:
# Total number of rows across both splits after all merges.
train.shape[0] + test.shape[0]

11421517

In [11]:
# Label column, kept separately before it is dropped from the feature frame.
target = train.loc[:, 'target']

In [12]:
# Stack train (without its label) on top of test so both are transformed together.
# The original row labels are kept, so index values repeat across the two parts.
alldata = pd.concat([train.drop('target', axis=1), test])

In [13]:
# Columns treated as categorical (embedded) features.
cat_col = ['sex', 'look_freq'] + list('abcde') + list('ABCDE')

In [14]:
# Every column that is neither categorical nor an id is a dense feature;
# whatever remains (besides the ids) is sparse.
dense_features = [
    col for col in alldata.columns
    if col not in cat_col and col not in ('uid', 'qid')
]
sparse_features = [
    col for col in alldata.columns
    if col not in dense_features and col not in ('uid', 'qid')
]

In [15]:
# Display the columns that ended up classified as sparse (sanity check).
sparse_features

['sex', 'look_freq', 'a', 'b', 'c', 'd', 'e', 'A', 'B', 'C', 'D', 'E']

In [16]:
# Standardise the dense columns in place. +/-inf is replaced by 0 and NaN is
# filled with 0 before the scaler is fitted.
# NOTE(review): the scaler is fitted on `alldata`, i.e. train and test rows
# together - confirm that this is intended.
mm = StandardScaler()
alldata[dense_features] = mm.fit_transform(alldata[dense_features].replace([np.inf, -np.inf], 0).fillna(0))

In [17]:
# One SparseFeat per categorical column (vocabulary size = number of distinct
# values in alldata), followed by one 1-dimensional DenseFeat per dense column.
fixlen_feature_columns = [
    SparseFeat(name, alldata[name].nunique()) for name in sparse_features
]
fixlen_feature_columns.extend(DenseFeat(name, 1) for name in dense_features)

# The linear and DNN parts share the very same column list.
linear_feature_columns = fixlen_feature_columns #+ varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns #+ varlen_feature_columns

fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
varlen_feature_names = get_varlen_feature_names(linear_feature_columns + dnn_feature_columns)
len(sparse_features), len(dense_features)

(12, 68)

In [31]:
def xDeepFM(linear_feature_columns, dnn_feature_columns, embedding_size=8, dnn_hidden_units=(512, 256),
            cin_layer_size=(256, 256,), cin_split_half=True, cin_activation='relu', l2_reg_linear=0.00001,
            l2_reg_embedding=0.00001, l2_reg_dnn=0, l2_reg_cin=0, init_std=0.0001, seed=2019, dnn_dropout=0,
            dnn_activation='relu', dnn_use_bn=False, task='binary', logit_units=6):
    """Build a customised xDeepFM-style Keras model (linear + CIN + DNN branches).

    The linear and CIN branches each emit ``logit_units`` values and the DNN
    branch emits one; the three are averaged, concatenated with an extra
    1-unit CIN output, and fed through ``Dense(256) -> PReLU -> Dense(1, sigmoid)``.

    Args:
        linear_feature_columns: feature descriptors for the linear branch.
        dnn_feature_columns: feature descriptors for the CIN and DNN branches.
        embedding_size: embedding width for the CIN/DNN branches.
        dnn_hidden_units: hidden layer sizes of the DNN branch.
        cin_layer_size: CIN layer sizes; an empty tuple disables the CIN branch.
        cin_split_half, cin_activation, l2_reg_cin: forwarded to ``CIN``.
        l2_reg_linear: L2 strength for the linear embeddings / dense projection.
        l2_reg_embedding: L2 strength for the CIN/DNN embeddings.
        l2_reg_dnn, dnn_dropout, dnn_activation, dnn_use_bn: forwarded to ``DNN``.
        init_std: std-dev of the embedding initialiser.
        seed: seed forwarded to the initialisers, ``CIN`` and ``DNN``.
        task: accepted for signature compatibility; not used (the output is
            always a single sigmoid unit).
        logit_units: width of the linear and CIN logits (previously a
            hard-coded 6 in two separate places).

    Returns:
        A ``tf.keras.models.Model`` whose inputs are the created feature inputs.

    Raises:
        ValueError: if ``logit_units`` < 1 or the linear branch has no inputs.
    """
    if logit_units < 1:
        raise ValueError("logit_units must be a positive integer")

    features = build_input_features(linear_feature_columns + dnn_feature_columns)
    datas_list = list(features.values())

    sparse_embedding_list, dense_value_list = input_from_feature_columns(
        features, dnn_feature_columns, embedding_size, l2_reg_embedding, init_std, seed)

    # ---- linear branch: one set of size-1 embeddings per logit unit ----
    linear_emb_list = [
        input_from_feature_columns(features, linear_feature_columns, 1, l2_reg_linear, init_std, seed,
                                   prefix='linear' + str(i))[0]
        for i in range(logit_units)]
    # Dense inputs come straight from `features`; the former extra call to
    # input_from_feature_columns built an unused set of embedding layers.
    dense_data_list = get_dense_input(features, linear_feature_columns)

    n_linear_emb = len(linear_emb_list[0])
    if n_linear_emb > 1:
        linear_term = concat_fun([tf.keras.layers.add(linear_emb) for linear_emb in linear_emb_list])
    elif n_linear_emb == 1:
        linear_term = concat_fun([linear_emb[0] for linear_emb in linear_emb_list])
    else:
        linear_term = None

    if len(dense_data_list) > 0:
        if len(dense_data_list) == 1:
            dense_data__ = dense_data_list[0]
        else:
            dense_data__ = tf.keras.layers.Concatenate()(dense_data_list)
        linear_dense_logit = tf.keras.layers.Dense(
            logit_units, activation='softplus', use_bias=True,
            kernel_regularizer=l2(l2_reg_linear))(dense_data__)
        if linear_term is not None:
            linear_term = tf.keras.layers.add([linear_dense_logit, linear_term])
        else:
            linear_term = linear_dense_logit

    if linear_term is None:
        # Previously this surfaced as an opaque failure inside Flatten.
        raise ValueError("linear_feature_columns must contain at least one sparse or dense feature")

    linear_logit = tf.keras.layers.Flatten()(linear_term)

    # ---- CIN branch (optional) ----
    exFM_logit = None
    exFM_logit_reg = None
    if len(cin_layer_size) > 0:
        fm_data = concat_fun(sparse_embedding_list, axis=1)
        exFM_out = CIN(cin_layer_size, cin_activation,
                       cin_split_half, l2_reg_cin, seed)(fm_data)
        exFM_logit = tf.keras.layers.Dense(logit_units, activation='softplus', )(exFM_out)
        exFM_logit_reg = tf.keras.layers.Dense(1, activation='relu')(exFM_out)

    # ---- DNN branch ----
    dnn_data_1 = combined_dnn_input(sparse_embedding_list, dense_value_list)
    deep_out_1 = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
                     dnn_use_bn, seed)(dnn_data_1)
    deep_logit_1 = tf.keras.layers.Dense(
        1, use_bias=False, activation='sigmoid')(deep_out_1)

    # ---- combine ----
    # With an empty cin_layer_size the CIN tensors do not exist; the original
    # referenced them unconditionally and raised NameError.
    branch_logits = [linear_logit, deep_logit_1]
    if exFM_logit is not None:
        branch_logits.insert(0, exFM_logit)
    x = tf.keras.layers.average(branch_logits)
    if exFM_logit_reg is not None:
        x = tf.keras.layers.concatenate([x, exFM_logit_reg])
    x = tf.keras.layers.Dense(256)(x)
    x = tf.keras.layers.PReLU()(x)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    model = tf.keras.models.Model(inputs=datas_list, outputs=output)
    return model

In [19]:
def make_data(JB, index):
    """Return the model inputs for the rows of ``JB`` at positions ``index``.

    The result is a list with one value array per name in the module-level
    ``fixlen_feature_names``, in that order.
    """
    selected_rows = JB.iloc[index]
    columns_as_arrays = []
    for feature_name in fixlen_feature_names:
        columns_as_arrays.append(selected_rows[feature_name].values)
    return columns_as_arrays

In [20]:
# Split the stacked frame back into its train / test parts (by row position).
X_train = alldata.iloc[:train.shape[0], :].reset_index(drop=True)
y = target.astype(int)
X_test = alldata.iloc[train.shape[0]:, :].reset_index(drop=True)
print(X_train.shape, X_test.shape)

# Accumulators for the cross-validation run.
cv_pred, test_pred, cv_score, cv_model = [], [], [], []
sub1 = np.zeros((len(test), 1))         # summed test predictions over folds
oof_pref1 = np.zeros((len(train), 1))   # out-of-fold predictions for train rows
count = 0
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1996)

(10166100, 82) (1255417, 82)


In [38]:
# The held-out rows are the same for every fold, so their inputs are built once
# here instead of being rebuilt inside the loop.
test_data = make_data(alldata, range(len(train), len(alldata)))

# Only the first 1,000,000 training rows are used for the CV splits; rows of
# oof_pref1 beyond that are never written.
for index, (train_index, test_index) in enumerate(skf.split(X_train.iloc[0:1000000], y.iloc[0:1000000])):
    # NOTE(review): a new model is built every fold and K.clear_session() was
    # left commented out - confirm whether session state needs clearing.
    filepath = "xdeepfm_best_model%d.h5" % count
    checkpoint = ModelCheckpoint(
        filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss', factor=0.8, patience=1, min_lr=0.0001, verbose=1)
    earlystopping = EarlyStopping(
        monitor='val_loss', min_delta=0.0001, patience=1, verbose=1, mode='auto')
    callbacks = [checkpoint, reduce_lr, earlystopping]
    print(index)
    model = xDeepFM(linear_feature_columns, dnn_feature_columns, embedding_size=8,
                    task='binary')  # xDeepFM DeepFM AFM NFM
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'], )

    # Build the per-fold inputs directly; the former X_train.iloc[...] slices
    # were large copies that were overwritten straight away.
    train_x = make_data(X_train, train_index)
    train_y = y.iloc[train_index]
    test_x = make_data(X_train, test_index)
    test_y = y.iloc[test_index]

    history = model.fit(train_x, train_y, batch_size=512, epochs=10, verbose=1,
                        validation_data=(test_x, test_y), callbacks=callbacks, )
    # Restore the best checkpoint of this fold before predicting.
    model.load_weights(filepath)
    sub1 += model.predict(test_data, batch_size=2048)
    oof_pref1[test_index] = model.predict(test_x, batch_size=2048)
    count += 1

0
Train on 799999 samples, validate on 200001 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.39076, saving model to xdeepfm_best_model0.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.39076 to 0.38180, saving model to xdeepfm_best_model0.h5
Epoch 3/10
 99328/799999 [==>...........................] - ETA: 1:46 - loss: 0.3785 - acc: 0.8371

KeyboardInterrupt: 

In [35]:
# Debug check. train_x is the list returned by make_data (one array per feature
# name), so slicing it and taking len() counts feature arrays, not rows.
len(train_x[0:1000000])

80

In [28]:
# Debug display of the validation inputs left over from the most recently run fold.
# NOTE(review): this dumps every feature array; consider removing it or showing a small slice.
test_x

[array([0., 0., 0., ..., 2., 1., 1.]),
 array([1., 0., 2., ..., 1., 1., 0.]),
 array([0., 0., 0., ..., 1., 0., 0.]),
 array([0., 0., 0., ..., 1., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 1., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([ 8.,  0.,  0., ...,  0., 64., 26.]),
 array([3., 0., 0., ..., 0., 5., 6.]),
 array([ 7., 15.,  7., ..., 37.,  3.,  7.]),
 array([87.,  5., 84., ...,  0.,  0., 83.]),
 array([0., 0., 0., ..., 0., 1., 0.]),
 array([ 2.12825867, -0.40645519, -0.40645519, ..., -0.40645519,
        -0.01649921, -0.40645519]),
 array([ 0.15598875, -0.21933451, -0.21933451, ..., -0.21933451,
        -0.03167288, -0.21933451]),
 array([-0.14243624, -0.14243624, -0.14243624, ..., -0.14243624,
        -0.14243624, -0.14243624]),
 array([ 2.60485737, -0.52011453, -0.52011453, ..., -0.52011453,
         0.98107341, -0.52011453]),
 array([-0.10439391, -0.10439391, -0.10439391, ..., -0.10439391,
         1.73123637, -0.10439391]),
 array([-