In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import *
from sklearn.preprocessing import LabelEncoder
import tensorflow.keras.backend as K 
from tensorflow.keras.models import Model

In [2]:
train = pd.read_csv('./criteo_sampled_data.csv')

In [3]:
train.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,181.0,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,4.0,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655
2,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,245.0,...,8efede7f,3412118d,,,e587c466,ad3062eb,3a171ecb,3b183c5c,,
3,0,,893,,,4392.0,,0.0,0.0,0.0,...,1e88c74f,74ef3502,,,6b3a5ca6,,3a171ecb,9117a34a,,
4,0,3.0,-1,,0.0,2.0,0.0,3.0,0.0,0.0,...,1e88c74f,26b3c7a7,,,21c9516a,,32c7478e,b34f3128,,


In [4]:
len(train)

600000

In [5]:
cols = train.columns.values

# 数据预处理

In [6]:
# Define the feature groups by column-name prefix:
#   names starting with 'I' are the numeric (dense) features,
#   names starting with 'C' are the categorical (sparse) features.
# str.startswith is used rather than f[0] so that an empty column name
# cannot raise IndexError.
dense_feats = [f for f in cols if f.startswith('I')]
sparse_feats = [f for f in cols if f.startswith('C')]


In [7]:
def process_dense_feats(data, feats):
    """Log-transform the numeric feature columns.

    Missing values are replaced by 0.0 first. Every value ``x > -1`` is then
    mapped to ``log(x + 1)``; every value ``x <= -1`` (where the logarithm is
    undefined) is mapped to the sentinel ``-1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains at least the columns listed in ``feats``.
        It is not modified.
    feats : list of str
        Names of the numeric columns to transform.

    Returns
    -------
    pandas.DataFrame
        New float frame holding only the ``feats`` columns, same index as ``data``.
    """
    # Column selection plus fillna already yields a new frame, so the input
    # is never mutated and no explicit copy of the whole frame is needed.
    dense = data[feats].fillna(0.0)
    in_domain = dense > -1
    # Park out-of-domain entries at 0 so the vectorised log never sees a
    # non-positive argument (no NaN / RuntimeWarning); they are overwritten
    # with the sentinel right after.
    safe = dense.where(in_domain, 0.0)
    return np.log(safe + 1).where(in_domain, -1.0)
data_dense = process_dense_feats(train, dense_feats)

In [8]:
def process_spares_feats(data, feats):
    """Label-encode the categorical feature columns.

    Missing values are replaced by the string ``'-1'`` so that they form a
    category of their own; each column is then encoded independently with a
    fresh ``LabelEncoder``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains at least the columns listed in ``feats``.
        It is not modified.
    feats : list of str
        Names of the categorical columns to encode.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the ``feats`` columns, with integer codes.
    """
    encoded = data[feats].fillna('-1')
    for column in feats:
        encoder = LabelEncoder()
        encoded[column] = encoder.fit_transform(encoded[column])
    return encoded
data_sparse = process_spares_feats(train,sparse_feats)

In [9]:
total_data = pd.concat([data_dense,data_sparse],axis=1)
total_data['label'] = train['label']

In [10]:
total_data.head()

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,...,C18,C19,C20,C21,C22,C23,C24,C25,C26,label
0,0.693147,0.693147,1.791759,0.0,7.23201,1.609438,2.772589,1.098612,5.204007,0.693147,...,3439,213,3,4954,0,3,24768,52,14364,0
1,1.098612,0.0,3.806662,0.693147,4.634729,2.197225,1.098612,1.098612,1.609438,0.693147,...,2465,213,1,60664,0,3,8432,52,10835,0
2,1.098612,0.0,0.693147,2.70805,6.64379,4.49981,1.609438,1.098612,5.505332,0.693147,...,738,0,0,143786,9,3,7344,0,0,0
3,0.0,6.795706,0.0,0.0,8.387768,0.0,0.0,0.0,0.0,0.0,...,1648,0,0,67107,0,3,18107,0,0,0
4,1.386294,-1.0,0.0,0.0,1.098612,0.0,1.386294,0.0,0.0,0.693147,...,556,0,0,21257,0,2,22439,0,0,0


# 模型的构建与训练

## 一阶特征

### dense特征  

In [11]:
# One scalar Keras input per dense (numeric) feature, named after its column.
dense_inputs = [Input([1], name=feat) for feat in dense_feats]

In [12]:
dense_inputs

[<tf.Tensor 'I1:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I2:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I3:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I4:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I5:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I6:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I7:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I8:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I9:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I10:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I11:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I12:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I13:0' shape=(None, 1) dtype=float32>]

In [13]:
# First-order term of the dense features: one linear unit over all dense inputs.
concat_dense_inputs = Concatenate(axis=1)(dense_inputs)  # (batch, 13): the 13 scalar inputs side by side
fst_order_dense_layer = Dense(1)(concat_dense_inputs) # (batch, 1)

### sparse特征

In [14]:
# One scalar Keras input per sparse (categorical) feature, named after its
# column; it is fed the label-encoded category id.
sparse_inputs = [Input([1], name=feat) for feat in sparse_feats]

In [40]:
# First-order weights of the sparse features: every category id is looked up
# in a 1-dimensional embedding, i.e. one scalar weight per category.
sparse_ld_embed = []
for f, _input in zip(sparse_feats, sparse_inputs):
    # nunique() ignores NaN in the raw column; the extra "+1" row leaves room
    # for the id taken by the '-1' missing-value placeholder after encoding.
    voc_size = train[f].nunique()
    reg = tf.keras.regularizers.l2(0.5)
    weight_lookup = Embedding(voc_size + 1, 1, embeddings_regularizer=reg)
    _embed = weight_lookup(_input)
    # The embedding output keeps a length-1 sequence axis, so it has to be
    # flattened before it can be combined with Dense-style (batch, 1) tensors.
    _embed = Flatten()(_embed)
    sparse_ld_embed.append(_embed)

In [41]:
# Sum the per-feature scalar weights into one first-order sparse term, shape (None, 1).
fst_order_sparse_layer = Add()(sparse_ld_embed)

In [42]:
fst_order_sparse_layer

<tf.Tensor 'add_3/Identity:0' shape=(None, 1) dtype=float32>

### Linear 部分合并


In [43]:
# Linear part of the model = dense first-order term + sparse first-order term.
linear_part = Add()([fst_order_dense_layer, fst_order_sparse_layer])

## 二阶特征

In [44]:
k = 8  # embedding size
# Only the sparse features take part in the second-order crosses.
sparse_kd_embed = []
for f, _input in zip(sparse_feats, sparse_inputs):
    # nunique() ignores NaN; "+1" keeps a row for the missing-value category.
    voc_size = train[f].nunique()
    latent_lookup = Embedding(
        voc_size + 1,
        k,
        embeddings_regularizer=tf.keras.regularizers.l2(0.7),
    )
    _embed = latent_lookup(_input)  # (None, 1, k)
    sparse_kd_embed.append(_embed)

In [45]:
sparse_kd_embed

[<tf.Tensor 'embedding_80/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_81/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_82/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_83/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_84/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_85/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_86/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_87/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_88/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_89/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_90/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_91/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_92/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_93/Identity:0' shape=(None, 

In [46]:
# 1. Concatenate the embeddings of all sparse features into an (n, k) matrix per
#    sample, where n is the number of features and k is the embedding size.

concat_sparse_kd_embed = Concatenate(axis=1)(sparse_kd_embed) # (None, n, k) = (None, 26, 8)




In [47]:
# 2. Sum first, then square: add the embeddings over the feature axis and
#    square the result element-wise -> (None, k).
sum_kd_embed = Lambda(lambda x: K.sum(x,axis=1))(concat_sparse_kd_embed)
square_sum_kd_embed = Multiply()([sum_kd_embed, sum_kd_embed])


In [48]:
square_sum_kd_embed


<tf.Tensor 'multiply_2/Identity:0' shape=(None, 8) dtype=float32>

In [49]:
# 3. Square first, then sum: square every embedding element-wise and add the
#    squares over the feature axis -> (None, k).
square_kd_embed = Multiply()([concat_sparse_kd_embed, concat_sparse_kd_embed])
sum_square_kd_embed = Lambda(lambda x: K.sum(x,axis=1))(square_kd_embed)

In [50]:
# 4. Subtract and halve: 0.5 * ((sum v)^2 - sum(v^2)) equals the sum of all
#    pairwise products v_i * v_j (i < j) in each embedding dimension.
sub = Subtract()([square_sum_kd_embed,sum_square_kd_embed])
sub = Lambda(lambda x: x*0.5)(sub)
# Collapse the k embedding dimensions into one second-order score per sample;
# keepdims=True keeps the result 2-D so it can be added to the other parts.
snd_order_sparse_layer = Lambda(lambda x: K.sum(x, axis=1,keepdims=True))(sub)

## DNN


In [51]:
# Deep part: the k-dimensional sparse embeddings, flattened to (None, n*k),
# go through three 256-unit ReLU layers with decreasing dropout.
flatten_sparse_embed = Flatten()(concat_sparse_kd_embed)
fc_layer = flatten_sparse_embed
for drop_rate in (0.5, 0.3, 0.1):
    fc_layer = Dropout(drop_rate)(Dense(256, activation='relu')(fc_layer))  # (None, 256)
# Single linear unit that turns the last hidden layer into the DNN score.
fc_layer_output = Dense(1)(fc_layer)

# 输出结果

In [52]:
# Final score = linear part + second-order sparse crosses + DNN output,
# squashed by a sigmoid into a value in (0, 1).
output_layer = Add()([linear_part, snd_order_sparse_layer, fc_layer_output])
output_layer = Activation('sigmoid')(output_layer)
output_layer

<tf.Tensor 'activation_1/Identity:0' shape=(None, 1) dtype=float32>

In [53]:
# Model inputs are ordered dense features first, then sparse features; the
# arrays passed to fit() must follow the same order.
model = Model(inputs=dense_inputs + sparse_inputs, outputs=output_layer)

In [54]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
C1 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
C2 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
C3 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
C4 (InputLayer)                 [(None, 1)]          0                                            
____________________________________________________________________________________________

In [55]:
# Binary click / no-click target: train with Adam on binary cross-entropy and
# additionally report cross-entropy and AUC as metrics.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['binary_crossentropy',tf.keras.metrics.AUC(name='auc')])

In [57]:
# Hold-out split by position: the first 500000 rows are used for training,
# the remaining rows for validation.
train_data = total_data.iloc[:500000]
valid_data = total_data.iloc[500000:]

In [58]:
def _as_arrays(frame, feats):
    """Return one NumPy array per listed column of ``frame``, in ``feats`` order."""
    return [frame[name].values for name in feats]


# The model takes one array per input layer, so every split is turned into
# a list of per-feature arrays; labels are wrapped in a one-element list.
train_dense_x = _as_arrays(train_data, dense_feats)
train_sparse_x = _as_arrays(train_data, sparse_feats)
train_label = [train_data['label'].values]

val_dense_x = _as_arrays(valid_data, dense_feats)
val_sparse_x = _as_arrays(valid_data, sparse_feats)
val_label = [valid_data['label'].values]

In [59]:
# Inputs are concatenated dense-first, sparse-second, matching the order in
# which the input layers were handed to Model().
model.fit(
    x=train_dense_x + train_sparse_x,
    y=train_label,
    batch_size=256,
    epochs=5,
    validation_data=(val_dense_x + val_sparse_x, val_label),
)

Train on 500000 samples, validate on 100000 samples
Epoch 1/5


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x22eac3ffac8>