In [46]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from tensorflow.keras.layers import *
from tensorflow.keras import Model

from tensorflow.keras.initializers import RandomNormal,glorot_uniform
import tensorflow as tf
from tensorflow.keras.utils import multi_gpu_model

模型的核心思路

In [23]:
# Basic feature columns.
all_col = [
    'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id',
    'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id',
    'tags', 'app_first_class', 'app_second_class', 'age',
    'city', 'city_rank', 'device_name', 'device_size',
    'career', 'gender', 'net_type', 'residence',
    'his_app_size', 'his_on_shelf_time', 'app_score', 'emui_dev',
    'list_time', 'device_price', 'up_life_duration', 'up_membership_grade',
    'membership_life_duration', 'consume_purchase',
    'communication_avgonline_30d', 'indu_name',
]
# Sequence features:
#   his_adv_prim_id, his_adv_id
#   shape = (m, 20), where m is the number of users and 20 is the sequence
#   length; sequences shorter than 20 are padded with 0.

In [57]:
# Define embedding_dim and vocabulary_size for every categorical feature as a dictionary.
# The values were computed with the following code:
# all_data = pd.concat([train_data, test_data]).reset_index(drop=True)
# params = {'feature_config':{}}
# for i in tqdm(all_col):
#     params['feature_config'][i] = {}
#     params['feature_config'][i]['vocabulary_size'] = all_data[i].max()+1
#     params['feature_config'][i]['embedding_dim'] = 16


In [25]:
# To avoid recomputing them, the per-feature settings are given directly as a
# dictionary: every feature maps to its vocabulary_size and an embedding_dim of 16.
params = {
    'feature_config': {
        feature_name: {'vocabulary_size': vocabulary_size, 'embedding_dim': 16}
        for feature_name, vocabulary_size in [
            ('task_id', 4932), ('adv_id', 5956),
            ('creat_type_cd', 8), ('adv_prim_id', 114),
            ('dev_id', 61), ('inter_type_cd', 4),
            ('slot_id', 12), ('spread_app_id', 79),
            ('tags', 32), ('app_first_class', 3),
            ('app_second_class', 19), ('age', 8),
            ('city', 344), ('city_rank', 4),
            ('device_name', 94), ('device_size', 231),
            ('career', 9), ('gender', 3),
            ('net_type', 5), ('residence', 36),
            ('his_app_size', 21), ('his_on_shelf_time', 4),
            ('app_score', 2), ('emui_dev', 18),
            ('list_time', 19), ('device_price', 8),
            ('up_life_duration', 21), ('up_membership_grade', 4),
            ('membership_life_duration', 21), ('consume_purchase', 9),
            ('communication_avgonline_30d', 14), ('indu_name', 42),
        ]
    }
}

In [26]:
# Input feature construction.
# (1) For every feature described in `params`, create an Input and an
#     Embedding, then apply the embedding to the input.
features = params['feature_config']
input_dict = {}        # feature name -> Input tensor
embedding_dict = {}    # feature name -> Embedding layer
embedding_lookup = {}  # feature name -> result of applying the Embedding to the Input

for fea, fea_cfg in features.items():
    # Build the Input for this feature.
    input_dict[fea] = Input(shape=(1,), name=fea)

    # Build the Embedding, sized from the feature's configuration.
    embedding_dict[fea] = Embedding(
        fea_cfg['vocabulary_size'],
        fea_cfg['embedding_dim'],
        embeddings_initializer=RandomNormal(mean=0.0, stddev=0.0001, seed=1996),
        name="emb_" + fea,
    )

    # Calling the Embedding layer on the Input performs the lookup.
    embedding_lookup[fea] = embedding_dict[fea](input_dict[fea])

# Inputs for the sequence features.
input_dict['his_adv_id'] = Input(shape=(20,), name='his_adv_id')
input_dict['his_adv_prim_id'] = Input(shape=(20,), name='his_adv_prim_id')


In [53]:
# FM cross layer.
class FM(Layer):
    """Cross term between two groups of embeddings.

    The layer is called on a list whose first two entries are the ad-side and
    user-side tensors. Each is summed over axis 1, the two sums are multiplied
    element-wise, and the product is summed over axis 1 with that axis kept,
    so the result has a trailing dimension of 1.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        # The layer has no weights of its own; defer to the base class.
        super().build(input_shape)

    def call(self, inputs, **kwargs):
        ad_sum = tf.reduce_sum(inputs[0], axis=1)
        user_sum = tf.reduce_sum(inputs[1], axis=1)
        return tf.reduce_sum(ad_sum * user_sum, axis=1, keepdims=True)

    def compute_output_shape(self, input_shape):
        return (None, 1)
class Add(Layer):
    """Return the sum of the first two entries of the input list.

    NOTE(review): this file wildcard-imports tensorflow.keras.layers, which
    presumably also exports a layer called `Add`; once this cell runs, this
    definition is the one the name refers to. Confirm that is intended.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, input, **kwargs):
        # Only entries 0 and 1 are read.
        first, second = input[0], input[1]
        return first + second

In [54]:
# Model part 1.
# Cross the embedding lookups of the ad features with those of the user
# features through the FM layer.
# NOTE(review): 'slot_id' appears in neither group; confirm this is intentional.
ad_group = concatenate(
    [
        embedding_lookup[name]
        for name in [
            'adv_id', 'task_id', 'creat_type_cd', 'adv_prim_id', 'dev_id',
            'inter_type_cd', 'spread_app_id', 'tags', 'app_first_class',
            'app_second_class', 'his_app_size', 'his_on_shelf_time',
            'app_score', 'indu_name',
        ]
    ],
    axis=1,
)

user_group = concatenate(
    [
        embedding_lookup[name]
        for name in [
            'age', 'city', 'city_rank', 'device_name', 'device_size',
            'career', 'gender', 'net_type', 'residence', 'emui_dev',
            'list_time', 'device_price', 'up_life_duration',
            'up_membership_grade', 'membership_life_duration',
            'consume_purchase', 'communication_avgonline_30d',
        ]
    ],
    axis=1,
)

FMout = FM(name='FM')([ad_group, user_group])

# Pass the cross term through a dense layer to obtain the FM branch output.
FMout = Dense(1)(FMout)

    def call(self, inputs, **kwargs):
        
        ad_group = tf.reduce_sum(inputs[0], axis=1)
        user_group = tf.reduce_sum(inputs[1], axis=1)
        cross_term = tf.reduce_sum(ad_group*user_group,axis=1,keepdims=True)
#         cross_term = tf.keras.backend.sum(ad_group*user_group,axis=1)
        return cross_term

This may be caused by multiline strings or comments not indented at the same level as the code.
    def call(self, inputs, **kwargs):
        
        ad_group = tf.reduce_sum(inputs[0], axis=1)
        user_group = tf.reduce_sum(inputs[1], axis=1)
        cross_term = tf.reduce_sum(ad_group*user_group,axis=1,keepdims=True)
#         cross_term = tf.keras.backend.sum(ad_group*user_group,axis=1)
        return cross_term

This may be caused by multiline strings or comments not indented at the same level as the code.


In [40]:
# Model part 2.
# Model the sequence features with bidirectional GRUs. Each sequence input is
# embedded with the Embedding layer already created for 'adv_id' /
# 'adv_prim_id' respectively.
his_adv_id_embedded = embedding_dict['adv_id'](input_dict['his_adv_id'])
his_adv_id = Bidirectional(GRU(128))(his_adv_id_embedded)
his_adv_prim_id_embedded = embedding_dict['adv_prim_id'](input_dict['his_adv_prim_id'])
his_adv_prim_id = Bidirectional(GRU(128))(his_adv_prim_id_embedded)

In [41]:
# Concatenate the embedding lookups of all basic features, then flatten.
basic_embeddings = concatenate(list(embedding_lookup.values()))
embedding_concat = Flatten()(basic_embeddings)
# Append the two sequence encodings to the flattened basic features.
embedding_concat = concatenate([embedding_concat, his_adv_id, his_adv_prim_id])

In [42]:
# Fully connected tower: two Dropout -> Dense -> BatchNormalization -> ReLU
# stages (1024 units, then 512), followed by a single-unit Dense layer.
dense = embedding_concat
for units in (1024, 512):
    dense = Dropout(0.3)(dense)
    dense = Dense(units)(dense)
    dense = BatchNormalization()(dense)
    dense = Activation(activation="relu")(dense)
dense = Dense(1)(dense)

In [43]:
# Combine the dense tower with the FM branch: add the two single-unit outputs
# (exFM_logit) and squash the sum with a sigmoid.
# The Add layer is instantiated exactly once so that no unused layer is created.
output = Add()([dense, FMout])
output = Activation(activation="sigmoid")(output)

In [56]:
# Build the model from every registered input and compile it.
model_inputs = list(input_dict.values())
model = Model(inputs=model_inputs, outputs=output)
# Multi-GPU wrapping is left disabled:
# model = multi_gpu_model(model, gpus=4)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [17]:
 "ffm_model%d.h5" % 2

'ffm_model2.h5'

In [None]:
# Training driver.
#   mode == 1: 5-fold stratified cross-validation that produces out-of-fold
#              predictions for the training set and accumulated predictions
#              for the test set.
#   otherwise: a single fixed train/validation split.
target = train_data['label']
mode = 1
# stack_test holds the SUM of the per-fold test predictions (it is not divided
# by the number of folds here); stack_train holds the out-of-fold predictions.
stack_test = np.zeros((len(test_data), 1))
stack_train = np.zeros((train_data.shape[0], 1))
if mode == 1:
    # Snapshot the weights the model has on entry so every fold can start from
    # the same state.
    initial_weights = model.get_weights()
    skf = StratifiedKFold(n_splits=5, random_state=1996, shuffle=True)
    for index, (train_index, test_index) in enumerate(skf.split(train_data, target)):
        # Restore the starting weights and rebuild the optimizer state.
        # Without this the single shared model keeps what it learned in earlier
        # folds, whose training rows include this fold's validation rows, so
        # the out-of-fold predictions would be contaminated.
        model.set_weights(initial_weights)
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['acc'])

        # Callbacks.
        # Checkpoint path for this fold.
        filepath = "ffm_model%d.h5" % index

        # Save the model only when val_loss improves.
        checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        # When val_loss has not decreased for `patience` epochs, shrink the
        # learning rate: new_lr = lr * factor.
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=1, min_lr=0.0001, verbose=1)
        # Early stopping.
        earlystopping = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=2, verbose=1, mode='auto')
        callbacks = [checkpoint, reduce_lr, earlystopping]

        # Build the training and validation sets for this fold.
        train_x = make_data(train_index)
        valid_x = make_data(test_index)
        # skf.split yields positional indices, so select the labels by position
        # rather than by index label.
        train_y = np.array(target.iloc[train_index])
        valid_y = np.array(target.iloc[test_index])

        # Train.
        model.fit(train_x, train_y, batch_size=4096, epochs=10, verbose=1,
                  validation_data=(valid_x, valid_y), callbacks=callbacks)

        # Reload the best checkpoint of this fold into the same model that was
        # just trained, then predict with it.
        model.load_weights(filepath)
        stack_test += model.predict(make_test())
        stack_train[test_index] = model.predict(valid_x)

        if index == 0:
            # After the first fold, write a submission file from the test
            # predictions accumulated so far.
            # NOTE(review): rows from 1000000 onward are presumably the ones
            # that line up with lgbres.csv; confirm against the data preparation.
            reslgb = pd.read_csv('lgbres.csv')
            reslgb['probability'] = stack_test[1000000:]
            reslgb[['id', 'probability']].to_csv('subnn.csv', index=False)
else:
    train_x = make_train()
    valid_x = make_valid()
    train_y = np.array(target.iloc[0:35897957])
    valid_y = np.array(target.iloc[35897957:])
    # A Keras Model has no `train` method, so this branch uses `fit`.
    # NOTE(review): 4096 and 1 are taken to be the batch size and the number of
    # epochs; confirm.
    model.fit(train_x, train_y, batch_size=4096, epochs=1,
              validation_data=(valid_x, valid_y))