In [1]:
!git clone https://github.com/shenweichen/DeepCTR.git
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

import tensorflow as tf
import sys

fatal: destination path 'DeepCTR' already exists and is not an empty directory.


In [2]:
sys.path.append('/content/DeepCTR')

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
data = pd.read_csv('/content/drive/MyDrive/data/train.csv', nrows=10000000)

In [5]:
data.head(5)

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157


In [6]:
sparse_features = ['site_id','site_domain','site_category','app_id','app_domain','app_category','device_id','device_ip','device_model' ]
dense_features = ['id','hour','C1','device_type','device_conn_type','C14','C15','C16','C17','C18','C19','C20','C21']
target = ['click']

In [7]:
# 1) Label Encoding and normalization

for feature in sparse_features:
    label_encoding = LabelEncoder()
    data[feature] = label_encoding.fit_transform(data[feature])
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])


In [8]:
# From: https://deepctr-doc.readthedocs.io/en/v0.8.5/Features.html
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names 

all_feature_columns = [SparseFeat(feature, vocabulary_size=data[feature].max() + 1, embedding_dim=4) for i, feature in enumerate(sparse_features)] \
                         + [DenseFeat(feature, 1, ) for feature in dense_features]

In [9]:
non_linear_columns = all_feature_columns
linear_columns = all_feature_columns

In [10]:
name_of_features =  get_feature_names(linear_columns + non_linear_columns)

In [11]:
all_feature_columns, len(all_feature_columns)

([SparseFeat(name='site_id', vocabulary_size=3496, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7f2b921277d0>, embedding_name='site_id', group_name='default_group', trainable=True),
  SparseFeat(name='site_domain', vocabulary_size=4585, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7f2b7f32c590>, embedding_name='site_domain', group_name='default_group', trainable=True),
  SparseFeat(name='site_category', vocabulary_size=23, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7f2b7e695410>, embedding_name='site_category', group_name='default_group', trainable=True),
  SparseFeat(name='app_id', vocabulary_size=5469, embed

In [12]:
train_data, test_data = train_test_split(data, test_size=0.2)
train_input = [train_data[name] for name in name_of_features]
test_input = [test_data[name] for name in name_of_features]

In [13]:
import itertools

import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.backend import batch_dot

try:
    from tensorflow.python.ops.init_ops import Zeros, Ones, Constant, TruncatedNormal, \
        glorot_normal_initializer as glorot_normal, \
        glorot_uniform_initializer as glorot_uniform
except ImportError:
    from tensorflow.python.ops.init_ops_v2 import Zeros, Ones, Constant, TruncatedNormal, glorot_normal, glorot_uniform

from tensorflow.python.keras.layers import Layer, MaxPooling2D, Conv2D, Dropout, Lambda, Dense, Flatten
from tensorflow.python.keras.regularizers import l2
from tensorflow.python.layers import utils

from deepctr.layers.activation import activation_layer
from deepctr.layers.utils import concat_func, reduce_sum, softmax, reduce_mean

try:
    from tensorflow.python.keras.layers import BatchNormalization
except ImportError:
    BatchNormalization = tf.keras.layers.BatchNormalization

In [14]:
class MultiAttentionMechanism(Layer):
    # Initialasing multi head attention mechanism
    def __init__(self, attention_embedding=8, number_of_heads=4, residual=True, use_scale =False, seed=1024, **kwargs):

        self.attention_embedding = attention_embedding
        self.number_of_heads = number_of_heads
        self.residual = residual
        self.seed = seed
        self.use_scale = use_scale

        super(MultiAttentionMechanism, self).__init__(**kwargs)

    # Used to create weights that depend on the input shape
    def build(self, input_shape):

        embedding_vector = int(input_shape[-1])
        self.q = self.add_weight(name='query', shape=[embedding_vector, self.attention_embedding * self.number_of_heads],
                                       dtype=tf.float32,
                                       initializer=TruncatedNormal(seed=self.seed))
        self.k = self.add_weight(name='key', shape=[embedding_vector, self.attention_embedding * self.number_of_heads],
                                     dtype=tf.float32,
                                     initializer=TruncatedNormal(seed=self.seed + 1))
        self.v = self.add_weight(name='value', shape=[embedding_vector, self.attention_embedding * self.number_of_heads],
                                       dtype=tf.float32,
                                       initializer=TruncatedNormal(seed=self.seed + 2))
        if self.residual:
            self.r = self.add_weight(name='res', shape=[embedding_vector, self.attention_embedding * self.number_of_heads],
                                         dtype=tf.float32,
                                         initializer=TruncatedNormal(seed=self.seed))


        super(MultiAttentionMechanism, self).build(input_shape)

    # Performs the logic of applying the layer to the inputs
    def call(self, inputs, **kwargs):

        query = tf.tensordot(inputs, self.q,axes=(-1, 0))
        key = tf.tensordot(inputs, self.k, axes=(-1, 0))
        value = tf.tensordot(inputs, self.v, axes=(-1, 0))


        query = tf.stack(tf.split(query, self.number_of_heads, axis=2))
        key = tf.stack(tf.split(key, self.number_of_heads, axis=2))
        value = tf.stack(tf.split(value, self.number_of_heads, axis=2))

        dot_product = tf.matmul(query, key, transpose_b=True)
        if self.use_scale:
            dot_product /= self.attention_embedding ** 0.5

        self.attention_output = softmax(dot_product)

        output = tf.matmul(self.attention_output,value)
        output = tf.concat(tf.split(output, self.number_of_heads, ), axis=-1)
        output = tf.squeeze(output, axis=0)

        if self.residual:
            output += tf.tensordot(inputs, self.r, axes=(-1, 0))
        output = tf.nn.relu(output)

        return output

    def compute_output_shape(self, input_shape):

        return (None, input_shape[1], self.attention_embedding * self.number_of_heads)

    #  Returns a dictionary containing the configuration used to initialize this layer
    def get_config(self, ):
        config = {'attention_embedding': self.attention_embedding, 'number_of_heads': self.number_of_heads, 'residual': self.residual,
                  'seed': self.seed}
        base_config = super(MultiAttentionMechanism, self).get_config()
        base_config.update(config)
        return base_config

class ElementWiseFM(Layer):
    def __init__(self, **kwargs):

        super(ElementWiseFM, self).__init__(**kwargs)

    def build(self, input_size):

        super(ElementWiseFM, self).build(
            input_size)  

    def call(self, inputs_features, **kwargs):

        combined_value = inputs_features
        square_sum = tf.square(reduce_sum(
            combined_value, axis=1, keep_dims=True))
        sum_square = reduce_sum(
            combined_value * combined_value, axis=1, keep_dims=True)
        result = 0.5 * (square_sum - sum_square)

        return result

    def compute_output_shape(self, input_shape):
        return (None, 1, input_shape[-1])

class DNN(Layer):

    def __init__(self, hidden_layers, activation_function='relu', l2_regularizer=0, dropout=0, batch_norm=True, output_activation=None,
                 seed=1024, **kwargs):
        self.hidden_layers = hidden_layers
        self.activation_function = activation_function
        self.l2_regularizer = l2_regularizer
        self.dropout = dropout
        self.batch_norm = batch_norm
        self.output_activation = output_activation
        self.seed = seed

        super(DNN, self).__init__(**kwargs)

    def build(self, input_shape):

        embedding_vector = input_shape[-1]
        hidden_layers = [int(embedding_vector)] + list(self.hidden_layers)
        self.kernel = [self.add_weight(name='kernel' + str(i),
                                        shape=(
                                            hidden_layers[i], hidden_layers[i + 1]),
                                        initializer=glorot_normal(
                                            seed=self.seed),
                                        regularizer=l2(self.l2_regularizer),
                                        trainable=True) for i in range(len(self.hidden_layers))]
        self.bias = [self.add_weight(name='bias' + str(i),
                                     shape=(self.hidden_layers[i],),
                                     initializer=Zeros(),
                                     trainable=True) for i in range(len(self.hidden_layers))]
        if self.batch_norm:
            self.batch_norm_layers = [BatchNormalization() for _ in range(len(self.hidden_layers))]

        self.dropout_layers = [Dropout(self.dropout, seed=self.seed + i) for i in
                               range(len(self.hidden_layers))]

        self.activation_layers = [activation_layer(self.activation_function) for _ in range(len(self.hidden_layers))]

        if self.output_activation:
            self.activation_layers[-1] = activation_layer(self.output_activation)

        super(DNN, self).build(input_shape)  

    def call(self, inputs, training=None, **kwargs):

        dnn_input = inputs

        for i in range(len(self.hidden_layers)):
            result = tf.nn.bias_add(tf.tensordot(
                dnn_input, self.kernel[i], axes=(-1, 0)), self.bias[i])

            if self.batch_norm:
                result = self.batch_norm_layers[i](result, training=training)
            try:
                result = self.activation_layers[i](result, training=training)
            except TypeError as e:  
                print("make sure the activation function use training flag properly", e)
                result = self.activation_layers[i](result)

            result = self.dropout_layers[i](result, training=training)
            dnn_input = result

        return dnn_input

    def compute_output_shape(self, input_shape):
        if len(self.hidden_layers) > 0:
            shape = input_shape[:-1] + (self.hidden_layers[-1],)
        else:
            shape = input_shape

        return tuple(shape)

    def get_config(self, ):
        configuration = {'activation_function': self.activation_function, 'hidden_layers': self.hidden_layers,
                  'l2_regularizer': self.l2_regularizer, 'batch_norm': self.batch_norm, 'dropout': self.dropout,
                  'output_activation': self.output_activation, 'seed': self.seed}
        base_configuration = super(DNN, self).get_config()
        return dict(list(base_configuration.items()) + list(configuration.items()))


In [15]:
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Flatten, Concatenate, Dense
from deepctr.layers.utils import concat_func, add_func, combined_dnn_input
from deepctr.feature_column import build_input_features, get_linear_logit, input_from_feature_columns
from deepctr.layers.core import PredictionLayer




def MANFM(linear_columns, non_linear_columns, number_attention_layer=4, attention_embedding_size=8, attention_head_number=4,
            residual_con=True,
            dnn_hidden_units=(300, 300, 300), dnn_activation='relu', l2_reg_linear=1e-5,
            l2_reg_embedding=1e-5, l2_reg_dnn=0, dnn_use_bn=True, dnn_dropout=0, seed=1024,
            task='binary', ):

    #Returns the input features
    features = build_input_features(non_linear_columns)
    #The function returns a view object that shows a list of all the dictionary's values.
    input = list(features.values())

    # Logistic Regression Model
    linear_output = get_linear_logit(features, linear_columns, seed=seed, prefix='linear', l2_reg=l2_reg_linear)

    sparse_embedding_vectors, dense_value_vectors = input_from_feature_columns(features, non_linear_columns,l2_reg_embedding, seed)

    # joins a collection of sparse vectors together and pass to att_input object.
    att_input = concat_func(sparse_embedding_vectors, axis=1)

    # For the number of attention layer in the model, do:
    for i in range(number_attention_layer):
        # the output of multi head attention fed to FM component.
        attention_input = MultiAttentionMechanism(attention_embedding_size, attention_head_number, residual_con)(att_input)
    # The last dimension of the output tensor is of size units instead of the same form as the inputs.
    att_output = ElementWiseFM()(attention_input)
    attention_output = Dense(1, use_bias=False)(att_output)

    # sparse and dense vectors combined together
    mlp_input = combined_dnn_input(sparse_embedding_vectors, dense_value_vectors)
    # Vectors feed to hidden layers
    mlp_output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, dnn_use_bn, seed=seed)(mlp_input)
    # The last dimension of the output tensor is of size units instead of the same form as the inputs.
    mlp_output = Dense(1, use_bias=False)(mlp_output)

    # The outcomes of the three components combined together.
    final_output = add_func([linear_output, mlp_output, attention_output])
    # Sigmoid function
    output = PredictionLayer('binary')(final_output)

    final_model = Model(inputs=input, outputs=output)

    return final_model

In [16]:
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names

model = MANFM(linear_columns, non_linear_columns, task='binary')
model.compile("adam", "binary_crossentropy",metrics=['binary_crossentropy'], )

history = model.fit(train_input, train_data[target].values,batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
prediction = model.predict(test_input, batch_size=256)
print("test LogLoss", round(log_loss(test_data[target].values, prediction), 4))
print("test AUC", round(roc_auc_score(test_data[target].values, prediction), 4))

Epoch 1/10
25000/25000 - 194s - loss: 0.3987 - binary_crossentropy: 0.3874 - val_loss: 0.3960 - val_binary_crossentropy: 0.3831
Epoch 2/10
25000/25000 - 191s - loss: 0.3946 - binary_crossentropy: 0.3802 - val_loss: 0.3949 - val_binary_crossentropy: 0.3809
Epoch 3/10
25000/25000 - 191s - loss: 0.3941 - binary_crossentropy: 0.3782 - val_loss: 0.3949 - val_binary_crossentropy: 0.3799
Epoch 4/10
25000/25000 - 190s - loss: 0.3935 - binary_crossentropy: 0.3766 - val_loss: 0.3955 - val_binary_crossentropy: 0.3797
Epoch 5/10
25000/25000 - 191s - loss: 0.3934 - binary_crossentropy: 0.3757 - val_loss: 0.3956 - val_binary_crossentropy: 0.3793
Epoch 6/10
25000/25000 - 190s - loss: 0.3933 - binary_crossentropy: 0.3750 - val_loss: 0.3954 - val_binary_crossentropy: 0.3786
Epoch 7/10
25000/25000 - 190s - loss: 0.3929 - binary_crossentropy: 0.3742 - val_loss: 0.3958 - val_binary_crossentropy: 0.3788
Epoch 8/10
25000/25000 - 191s - loss: 0.3927 - binary_crossentropy: 0.3738 - val_loss: 0.3959 - val_bina