In [63]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

### Methods

In [15]:
def process_dense_feats(df, cols):
    ops_df = df[cols].fillna(0.0)
    for c in cols:
        ops_df[c] = ops_df[c].apply(lambda x: np.log(x+1) if x > -1 else -1)
    return ops_df



def process_sparse_feats(df, cols):
    ops_df = df[cols].fillna("UNK")
    for c in cols:
        ops_df[c] = LabelEncoder().fit_transform(ops_df[c])
    return ops_df


### Load data

In [16]:
data = pd.read_csv('./raw_data/criteo_sampled_data.csv')
data.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,181.0,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,4.0,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655
2,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,245.0,...,8efede7f,3412118d,,,e587c466,ad3062eb,3a171ecb,3b183c5c,,
3,0,,893,,,4392.0,,0.0,0.0,0.0,...,1e88c74f,74ef3502,,,6b3a5ca6,,3a171ecb,9117a34a,,
4,0,3.0,-1,,0.0,2.0,0.0,3.0,0.0,0.0,...,1e88c74f,26b3c7a7,,,21c9516a,,32c7478e,b34f3128,,


In [17]:
data.columns

Index(['label', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10',
       'I11', 'I12', 'I13', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8',
       'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18',
       'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26'],
      dtype='object')

In [18]:
dense_feats = [f for f in data.columns if f[0] == 'I']
sparse_feats = [f for f in data.columns if f[0] == 'C']

In [19]:
%%time
dense_data = process_dense_feats(data, dense_feats)
sparse_data = process_sparse_feats(data, sparse_feats)

CPU times: user 22.2 s, sys: 1.47 s, total: 23.6 s
Wall time: 28.4 s


In [20]:
ecd_data = pd.concat([
    dense_data,
    sparse_data,
    data[["label"]]
],axis=1)

In [22]:
ecd_data.head()

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,...,C18,C19,C20,C21,C22,C23,C24,C25,C26,label
0,0.693147,0.693147,1.791759,0.0,7.23201,1.609438,2.772589,1.098612,5.204007,0.693147,...,3439,212,3,4953,8,3,24768,52,14363,0
1,1.098612,0.0,3.806662,0.693147,4.634729,2.197225,1.098612,1.098612,1.609438,0.693147,...,2465,212,0,60663,8,3,8431,52,10834,0
2,1.098612,0.0,0.693147,2.70805,6.64379,4.49981,1.609438,1.098612,5.505332,0.693147,...,738,1097,1,143786,9,3,7343,38,15160,0
3,0.0,6.795706,0.0,0.0,8.387768,0.0,0.0,0.0,0.0,0.0,...,1648,1097,1,67106,8,3,18106,38,15160,0
4,1.386294,-1.0,0.0,0.0,1.098612,0.0,1.386294,0.0,0.0,0.693147,...,556,1097,1,21256,8,2,22439,38,15160,0


### Split train & test

In [167]:
X_full = ecd_data[dense_feats+sparse_feats].copy()
y_full = ecd_data[["label"]].copy()

In [168]:
test_ratio = 0.3

X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full,  
    test_size=test_ratio, random_state=42
)

In [169]:
# X_train_dense  = X_train[dense_feats].values.T.tolist()
# X_train_sparse = X_train[sparse_feats].values.T.tolist()

# X_test_dense  = X_test[dense_feats].values.T.tolist()
# X_test_sparse = X_test[sparse_feats].values.T.tolist()

In [170]:
# X_train_dense.values.T

In [171]:
train_dense_x  = [X_train[f].values for f in dense_feats]
train_sparse_x = [X_train[f].values for f in sparse_feats]

train_label = [y_train['label'].values]


val_dense_x = [X_test[f].values for f in dense_feats]
val_sparse_x = [X_test[f].values for f in sparse_feats]

val_label = [y_test['label'].values]

In [172]:
np.shape(y_train_input)

(420000, 1)

In [211]:
X_train_input = [X_train[f].values for f in dense_feats] + [X_train[f].values for f in sparse_feats]
X_test_input  = [X_test[f].values for f in dense_feats] + [X_test[f].values for f in sparse_feats]
y_train_input = y_train["label"].values
y_test_input  = y_test["label"].values



In [212]:
np.shape(train_dense_x + train_sparse_x)

(39, 420000)

In [213]:
y_train_input

array([0, 0, 0, ..., 1, 0, 1])

### Embeddings 

### FM

#### Linear part

In [90]:
# build tf2 input for sparse features one by one
sparse_inputs = []
for f in sparse_feats:
    _input = tf.keras.layers.Input([1], name=f)
    sparse_inputs.append(_input)

    
dense_inputs = []
for f in dense_feats:
    _input = tf.keras.layers.Input([1], name=f)
    dense_inputs.append(_input)

In [91]:
# 1 dimension embedding as dense layer.

sparse_1d_embed = []
for i, _input in enumerate(sparse_inputs):
    feat = sparse_feats[i]
    vocab_size = ecd_data[feat].nunique() + 1
    # 使用 l2 正则化防止过拟合
    reg = tf.keras.regularizers.l2(0.5)
    _embed = tf.keras.layers.Embedding(vocab_size, 1, embeddings_regularizer=reg)(_input)
    
    _flat = tf.keras.layers.Flatten()(_embed)
    sparse_1d_embed.append(_flat)

linear_sparse_part = tf.keras.layers.Add()(sparse_1d_embed)


In [92]:
concat_dense_inputs = tf.keras.layers.Concatenate(axis=1)(dense_inputs)  # ?, 13
linear_dense_part = tf.keras.layers.Dense(1)(concat_dense_inputs)  # ?, 1

In [93]:
# Add up linear dense part and linear sparse part.
linear_part = tf.keras.layers.Add()([linear_sparse_part, linear_dense_part])

In [215]:
sparse_1d_embed[0].shape

TensorShape([None, 1])

#### Second order part

In [95]:
# embedding size
k = 8

# 只考虑sparse的二阶交叉
sparse_kd_embed = []
for i, _input in enumerate(sparse_inputs):
    feat = sparse_feats[i]
    vocab_size = ecd_data[feat].nunique() + 1
    reg = tf.keras.regularizers.l2(0.7)
    _embed = tf.keras.layers.Embedding(vocab_size, k, embeddings_regularizer=reg)(_input)
    sparse_kd_embed.append(_embed)


In [96]:
# 1.将所有 sparse 特征 (?, 1, k)的embedding拼接起来，
# 得到 (?, n, k)的矩阵，其中n为特征数，k为embedding大小
concat_sparse_kd_embed = tf.keras.layers.Concatenate(axis=1)(sparse_kd_embed)  # ?, n, k

# 2.先求和再平方
sum_kd_embed = tf.keras.layers.Lambda(lambda x: tf.keras.backend.sum(x, axis=1))(concat_sparse_kd_embed)  # ?, k
square_sum_kd_embed = tf.keras.layers.Multiply()([sum_kd_embed, sum_kd_embed])  # ?, k

# 3.先平方再求和
square_kd_embed = tf.keras.layers.Multiply()([concat_sparse_kd_embed, concat_sparse_kd_embed]) # ?, n, k
sum_square_kd_embed = tf.keras.layers.Lambda(lambda x: tf.keras.backend.sum(x, axis=1))(square_kd_embed)  # ?, k

# 4.相减除以2
sub = tf.keras.layers.Subtract()([square_sum_kd_embed, sum_square_kd_embed])  # ?, k
sub = tf.keras.layers.Lambda(lambda x: x*0.5)(sub)  # ?, k
sec_order_part = tf.keras.layers.Lambda(lambda x: tf.keras.backend.sum(x, axis=1, keepdims=True))(sub)  # ?, 1


In [217]:
concat_sparse_kd_embed.shape

TensorShape([None, 26, 8])

### DNN

In [97]:
flatten_sparse_embed = tf.keras.layers.Flatten()(concat_sparse_kd_embed)  # ?, n*k
fc_layer = tf.keras.layers.Dropout(0.5)(tf.keras.layers.Dense(256, activation='relu')(flatten_sparse_embed))  # ?, 256
fc_layer = tf.keras.layers.Dropout(0.3)(tf.keras.layers.Dense(256, activation='relu')(fc_layer))  # ?, 256
fc_layer = tf.keras.layers.Dropout(0.1)(tf.keras.layers.Dense(256, activation='relu')(fc_layer))  # ?, 256
dnn_part = tf.keras.layers.Dense(1)(fc_layer)  # ?, 1

### Add up

In [191]:
output_layer = tf.keras.layers.Add()([linear_part, sec_order_part, dnn_part])
output_layer = tf.keras.layers.Activation("sigmoid")(output_layer)

model = tf.keras.Model(dense_inputs+sparse_inputs, output_layer)
model.compile(
    optimizer="adam", 
    loss="binary_crossentropy", 
    metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='auc')]
)


In [192]:
model.summary()

Model: "functional_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
C1 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
C2 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
C3 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
C4 (InputLayer)                 [(None, 1)]          0                                            
_______________________________________________________________________________________

In [193]:
# tf.keras.utils.plot_model(model, show_shapes=True)

### Fitting

In [194]:
EPOCHS = 5
BATCH_SIZE = 32

In [214]:
model.fit(

    X_train_input,
    y_train_input,
    # train_dense_x+train_sparse_x, train_label,
    epochs=EPOCHS, 
    batch_size=BATCH_SIZE,
    validation_data=(X_test_input, y_test_input),
    # validation_data=(val_dense_x+val_sparse_x, val_label),
    verbose=1
)

Epoch 1/5


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7ff0096476d0>