In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/criteo-dataset/dac/test.txt
/kaggle/input/criteo-dataset/dac/readme.txt
/kaggle/input/criteo-dataset/dac/train.txt


In [2]:
!pip install deepctr[gpu]

Collecting deepctr[gpu]
  Downloading deepctr-0.8.5-py3-none-any.whl (116 kB)
[K     |████████████████████████████████| 116 kB 2.9 MB/s eta 0:00:01
Collecting tensorflow-gpu!=1.7.*,!=1.8.*,>=1.4.0
  Downloading tensorflow_gpu-2.4.1-cp37-cp37m-manylinux2010_x86_64.whl (394.3 MB)
[K     |████████████████████████████████| 394.3 MB 12 kB/s s eta 0:00:01    |███████▎                        | 89.5 MB 49.7 MB/s eta 0:00:07     |██████████████▍                 | 177.2 MB 60.4 MB/s eta 0:00:04     |███████████████▉                | 195.2 MB 63.3 MB/s eta 0:00:04     |████████████████▉               | 207.6 MB 63.3 MB/s eta 0:00:03     |█████████████████▎              | 212.5 MB 63.3 MB/s eta 0:00:03��█▋            | 242.0 MB 54.8 MB/s eta 0:00:03     |████████████████████            | 245.7 MB 54.8 MB/s eta 0:00:03██████████████▋           | 253.8 MB 54.8 MB/s eta 0:00:03
Installing collected packages: tensorflow-gpu, deepctr
Successfully installed deepctr-0.8.5 tensorflow-gpu-2.4.1


In [3]:
import pandas as pd
import tensorflow as tf
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr.estimator import DeepFMEstimator, DCNEstimator
from deepctr.models import *
from deepctr.estimator.inputs import input_fn_pandas
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names

import time

In [4]:
import warnings
warnings.simplefilter('ignore', FutureWarning)
warnings.simplefilter('ignore', UserWarning)

## Notes
- Using the first one million records for analysis
- Using the next 250,000 as testing data
- for simplicity of model fitting, no cross-validation process, directly train on df_train_100 and test for performance on df_test_25

In [5]:
col_names_train = ['label'] + \
["I"+str(i) for i in range(1, 14)] + \
['C'+str(i) for i in range(1,27)]

#col_names_test = col_names_train[1:]

df_train = pd.read_csv('/kaggle/input/criteo-dataset/dac/train.txt', 
                       sep='\t', names=col_names_train,
                       chunksize=100000) # ten chunks: first 1,000,000

# df_test = pd.read_csv('/kaggle/input/criteo-dataset/dac/test.txt', 
#                       sep='\t', names=col_names_test,
#                       chunksize=100000)

# don't re-run, getting without replacement
df = df_train.get_chunk(1250000)

# Using the first one million records for analysis
# use the next 250,000 as testing data

## DeepCTR
ref: 
https://zhuanlan.zhihu.com/p/53231955

https://github.com/shenweichen/DeepCTR

### DeepFM

- https://www.cnblogs.com/xiaoqi/p/deepfm.html
- https://booking.ai/dont-be-tricked-by-the-hashing-trick-192a6aae3087

In [6]:
data = df.copy()

In [7]:
sparse_features = ['C' + str(i) for i in range(1, 27)]
dense_features = ['I' + str(i) for i in range(1, 14)]
data[sparse_features] = data[sparse_features].fillna('-1')
data[dense_features] = data[dense_features].fillna(0)
target = ['label']

# label encoding
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
# min-max scaler
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

In [8]:
#[SparseFeat(feat, vocabulary_size=data[feat].max()+1, embedding_dim=2) for i,feat in enumerate(sparse_features)]
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max()+1, embedding_dim=3) 
                          for i,feat in enumerate(sparse_features)] + \
[SparseFeat(feat, vocabulary_size=data[feat].max()+1, embedding_dim=4, use_hash=True)
 for feat in sparse_features] + \
[DenseFeat(feat, 1,) for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [9]:
train, test = data[:1000000], data[1000000:]

In [10]:
train_model_input = {name:train[name] for name in feature_names}
test_model_input = {name:test[name] for name in feature_names}

In [11]:
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile("adam", "binary_crossentropy",
              metrics=['binary_crossentropy'], )

history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=20, verbose=2, validation_split=0.2)
pred_ans = model.predict(test_model_input, batch_size=256)
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

Epoch 1/20
3125/3125 - 186s - loss: 0.4933 - binary_crossentropy: 0.4808 - val_loss: 0.4903 - val_binary_crossentropy: 0.4705
Epoch 2/20
3125/3125 - 178s - loss: 0.4768 - binary_crossentropy: 0.4489 - val_loss: 0.5007 - val_binary_crossentropy: 0.4710
Epoch 3/20
3125/3125 - 178s - loss: 0.4622 - binary_crossentropy: 0.4220 - val_loss: 0.5212 - val_binary_crossentropy: 0.4767
Epoch 4/20
3125/3125 - 176s - loss: 0.3904 - binary_crossentropy: 0.3457 - val_loss: 0.5685 - val_binary_crossentropy: 0.5180
Epoch 5/20
3125/3125 - 175s - loss: 0.3425 - binary_crossentropy: 0.2976 - val_loss: 0.6252 - val_binary_crossentropy: 0.5783
Epoch 6/20
3125/3125 - 177s - loss: 0.3315 - binary_crossentropy: 0.2876 - val_loss: 0.6403 - val_binary_crossentropy: 0.5926
Epoch 7/20
3125/3125 - 176s - loss: 0.3283 - binary_crossentropy: 0.2828 - val_loss: 0.6302 - val_binary_crossentropy: 0.5807
Epoch 8/20
3125/3125 - 177s - loss: 0.3143 - binary_crossentropy: 0.2682 - val_loss: 0.6647 - val_binary_crossentropy:

results:
- label encoding, minmax scaler, feature embedding (4): test LogLoss 0.5877, test AUC 0.7205
- ***label encoding, minmax scaler, feature embedding (3): test LogLoss 0.5937, test AUC 0.7256
- label encoding, minmax scaler, feature embedding (5): divide by zero encountered in log loss
- label encoding, minmax scaler, feature embedding (2): test LogLoss 0.5772, test AUC 0.7305

- label encoding, minmax scaler, feature hashing (1000, 4): divide by zero encountered in log loss
- **label encoding, minmax scaler, feature hashing (data[feat].max()+1, 4): test LogLoss 0.6169, test AUC 0.7125
- label encoding, minmax scaler, feature hashing (1e6, 4): test LogLoss 0.6177, test AUC 0.712 (slow)

- *label encoding, minmax scaler, feature hashing (data[feat].max()+1, 4), feature embedding (3): test LogLoss 0.638, test AUC 0.7132
- label encoding, minmax scaler, feature hashing (data[feat].max()+1, 3), feature embedding (3): test LogLoss 0.622, test AUC 0.7172; 20 epochs, test LogLoss 0.7089, test AUC 0.7027

### DCN (Deep & Cross Network)

- https://blog.csdn.net/u012290039/article/details/106943344

In [12]:
model2 = DCN(linear_feature_columns, dnn_feature_columns, task='binary')
model2.compile("adam", "binary_crossentropy",
              metrics=['binary_crossentropy'], )

history = model2.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=20, verbose=2, validation_split=0.2)
pred_ans = model2.predict(test_model_input, batch_size=256)
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

CrossNet parameterization: vector
Epoch 1/20
3125/3125 - 192s - loss: 0.4927 - binary_crossentropy: 0.4808 - val_loss: 0.4884 - val_binary_crossentropy: 0.4700
Epoch 2/20
3125/3125 - 183s - loss: 0.4772 - binary_crossentropy: 0.4513 - val_loss: 0.4954 - val_binary_crossentropy: 0.4685
Epoch 3/20
3125/3125 - 183s - loss: 0.4670 - binary_crossentropy: 0.4297 - val_loss: 0.5151 - val_binary_crossentropy: 0.4745
Epoch 4/20
3125/3125 - 179s - loss: 0.4019 - binary_crossentropy: 0.3592 - val_loss: 0.5591 - val_binary_crossentropy: 0.5103
Epoch 5/20
3125/3125 - 178s - loss: 0.3448 - binary_crossentropy: 0.3024 - val_loss: 0.6163 - val_binary_crossentropy: 0.5727
Epoch 6/20
3125/3125 - 179s - loss: 0.3329 - binary_crossentropy: 0.2922 - val_loss: 0.6234 - val_binary_crossentropy: 0.5791
Epoch 7/20
3125/3125 - 178s - loss: 0.3302 - binary_crossentropy: 0.2879 - val_loss: 0.6224 - val_binary_crossentropy: 0.5759
Epoch 8/20
3125/3125 - 178s - loss: 0.3174 - binary_crossentropy: 0.2740 - val_loss:

  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


- label encoding, minmax scaler, feature hashing (1e6, 4): test LogLoss 0.6057, test AUC 0.7208
- label encoding, minmax scaler, feature hashing (data[feat].max()+1, 4): test LogLoss 0.6228, test AUC 0.7097
- *label encoding, minmax scaler, feature hashing (data[feat].max()+1, 4), feature embedding (3): test LogLoss 0.6279, test AUC 0.7171