In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/criteo-dataset/dac/test.txt
/kaggle/input/criteo-dataset/dac/readme.txt
/kaggle/input/criteo-dataset/dac/train.txt


In [2]:
!pip install deepctr[gpu]



In [3]:
import pandas as pd
import tensorflow as tf
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr.estimator import DeepFMEstimator, DCNEstimator
from deepctr.models import *
from deepctr.estimator.inputs import input_fn_pandas

import time

In [4]:
import warnings
warnings.simplefilter('ignore', FutureWarning)
warnings.simplefilter('ignore', UserWarning)

## Notes
- The first 1,000,000 records of `train.txt` are used for training (`train`).
- The next 250,000 records are held out as testing data (`test`).
- For simplicity of model fitting there is no cross-validation: the model is trained once on the training records and its performance is measured once on the held-out records.

In [5]:
# Column names for train.txt: the label, 13 numeric columns I1..I13
# and 26 categorical columns C1..C26.
col_names_train = (
    ['label']
    + ['I' + str(i) for i in range(1, 14)]
    + ['C' + str(i) for i in range(1, 27)]
)

# Column names / reader for test.txt, kept for reference (not used below):
# col_names_test = col_names_train[1:]
# df_test = pd.read_csv('/kaggle/input/criteo-dataset/dac/test.txt',
#                       sep='\t', names=col_names_test)

# Row budget: the first N_TRAIN_ROWS rows are used for training and the
# following N_TEST_ROWS rows for testing.
N_TRAIN_ROWS = 1000000
N_TEST_ROWS = 250000

# Read exactly the rows needed in one call. `nrows` returns the same leading
# rows that a chunked reader's get_chunk(1250000) would, but the cell is
# trivially safe to re-run and no open file reader is left behind.
df = pd.read_csv('/kaggle/input/criteo-dataset/dac/train.txt',
                 sep='\t', names=col_names_train,
                 nrows=N_TRAIN_ROWS + N_TEST_ROWS)

# Using the first one million records for analysis
# use the next 250,000 as testing data

In [6]:
# For simplicity of model fitting there is no cross-validation process:
# the model is trained once and evaluated once on held-out rows.

# Show the dtype that read_csv inferred for each loaded column.
df.dtypes

label      int64
I1       float64
I2         int64
I3       float64
I4       float64
I5       float64
I6       float64
I7       float64
I8       float64
I9       float64
I10      float64
I11      float64
I12      float64
I13      float64
C1        object
C2        object
C3        object
C4        object
C5        object
C6        object
C7        object
C8        object
C9        object
C10       object
C11       object
C12       object
C13       object
C14       object
C15       object
C16       object
C17       object
C18       object
C19       object
C20       object
C21       object
C22       object
C23       object
C24       object
C25       object
C26       object
dtype: object

## DeepCTR
ref: 
https://zhuanlan.zhihu.com/p/53231955

https://github.com/shenweichen/DeepCTR

In [7]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
data = df.copy(deep=True)

### DeepFM - features 1

- https://deepctr-doc.readthedocs.io/en/latest/_modules/deepctr/estimator/models/deepfm.html#DeepFMEstimator

In [8]:
# Column groups: label, 13 numeric columns (I1..I13), 26 categorical columns (C1..C26).
target = ['label']
dense_features = ['I%d' % idx for idx in range(1, 14)]
sparse_features = ['C%d' % idx for idx in range(1, 27)]

# Fill missing values: 0 for the numeric columns, the string '-1' for the categorical ones.
data[dense_features] = data[dense_features].fillna(0)
data[sparse_features] = data[sparse_features].fillna('-1')

In [9]:
# label encoding
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

# mix-max standardization for dense features 

# # Clip integer features by the 95% quantile of the total values: Above 95%: 1
# # cont_clip = data.iloc[:,1:14].quantile(q=0.95, axis=0)
# cont_clip = [9, 469, 49, 21, 62792.1, 386, 54, 43, 411, 1, 9, 1, 23]
# for i in range(13):
#     cutoff = cont_clip[i]
#     col_name = dense_features[i]
#     data[col_name] = np.where(data[col_name]>cutoff, cutoff, data[col_name])

mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

In [10]:
# features 1: feature embedding

# One identity categorical column per sparse feature; the bucket count is the
# largest encoded id in that column plus one.
sparse_id_columns = [
    tf.feature_column.categorical_column_with_identity(name, data[name].max() + 1)
    for name in sparse_features
]

# One numeric column per dense feature, shared by the linear and DNN parts.
dense_numeric_columns = [
    tf.feature_column.numeric_column(name) for name in dense_features
]

# Linear part: raw categorical columns followed by the numeric columns.
linear_feature_columns = sparse_id_columns + dense_numeric_columns

# DNN part: a 3-dimensional embedding of each categorical column,
# followed by the numeric columns.
dnn_feature_columns = [
    tf.feature_column.embedding_column(id_column, 3)
    for id_column in sparse_id_columns
] + dense_numeric_columns

In [11]:
# Positional split: the leading rows form the training set, the remainder the test set.
n_train = 1000000
train = data.iloc[:n_train]
test = data.iloc[n_train:]

In [12]:
# train.dtypes, int64, float64

In [13]:
# def df_to_dataset(dataframe, shuffle=True, batch_size=256):
#     '''
#     Pandas dataframe to tensorflow dataset.
#     '''
    
#     dataframe = dataframe.copy()
#     labels = dataframe.pop('Label')
#     ds = tf.data.Dataset.from_tensor_slices((dataframe.values, labels.values))
#     if shuffle:
#         ds = ds.shuffle(buffer_size=len(dataframe))
#     ds = ds.batch(batch_size)
    
#     return ds

# train_model_input = df_to_dataset(train)
# test_model_input = df_to_dataset(test, shuffle=False)

# Input functions for the estimator: training input is shuffled and labelled
# with the 'label' column; test input is unshuffled and carries no label.
feature_names = sparse_features + dense_features
train_model_input = input_fn_pandas(
    train, feature_names, 'label', shuffle=True
)
test_model_input = input_fn_pandas(
    test, feature_names, None, shuffle=False
)

In [16]:
# Build the DeepFM estimator for binary classification with a fixed TF random seed.
run_config = tf.estimator.RunConfig(tf_random_seed=123)
model = DeepFMEstimator(
    linear_feature_columns,
    dnn_feature_columns,
    task='binary',
    config=run_config,
)

# Train, then collect the 'pred' entry of every item yielded by predict().
model.train(train_model_input)
pred_ans_iter = model.predict(test_model_input)
pred_ans = [prediction['pred'] for prediction in pred_ans_iter]

# Score the predictions against the held-out labels, rounded to 4 decimals.
y_true = test[target].values
test_logloss = round(log_loss(y_true, pred_ans), 4)
test_auc = round(roc_auc_score(y_true, pred_ans), 4)
print("test LogLoss", test_logloss)
print("test AUC", test_auc)

test LogLoss 0.5323
test AUC 0.6985


Results, reported as (test LogLoss, test AUC):
- seed 123, embedding 4, min-max scaler: (0.531, 0.7024)
- seed 123, embedding 6, min-max scaler: (0.5292, 0.7051)
- seed 123, embedding 3, min-max scaler: (0.5335, 0.698)*
- seed 123, embedding 2, min-max scaler: divide by zero encountered in log loss

- seed 123, embedding 3, min-max scaler (95% by train): (0.528, 0.712)
- seed 123, embedding 3, min-max scaler (95% by all): (0.5278, 0.712)


In [17]:
# tf demo
# https://medium.com/ml-book/demonstration-of-tensorflow-feature-columns-tf-feature-column-3bfcca4ca5c4


# from tensorflow.python.feature_column.feature_column import _LazyBuilder

# def test_numeric():
#     price = {'price': [[1.], [2.], [3.], [4.]]}  # 4 sample rows
#     builder = _LazyBuilder(price)

#     def transform_fn(x):
#         return x + 2

#     price_column = tf.feature_column.numeric_column('price', normalizer_fn=transform_fn)
#     price_transformed_tensor = price_column._get_dense_tensor(builder)
#     print(price_transformed_tensor)

# test_numeric()


# def test_categorical_column_with_vocabulary_list():
#     color_data = {'color': [['R', 'R'], ['G', 'R'], ['B', 'G'], ['A', 'A']]}  # 4 sample rows
#     builder = _LazyBuilder(color_data)
#     color_column = tf.feature_column.categorical_column_with_vocabulary_list(
#         'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1
#     )
    
#     # sparse: named tuple
#     color_column_tensor = color_column._get_sparse_tensors(builder)
#     print('_' * 60)
#     print(color_column_tensor.id_tensor)

#     # Convert the sparse column to a dense one, i.e. one-hot form (here it is actually multi-hot)
#     color_column_identy = tf.feature_column.indicator_column(color_column)
#     feature_layer = tf.keras.layers.DenseFeatures(color_column_identy)
#     print('_' * 60)
#     print(feature_layer(color_data))
    
#     # embedding column, can change dim
#     color_column_embedding = tf.feature_column.embedding_column(color_column,
#                                                       dimension=4) 
#     feature_layer = tf.keras.layers.DenseFeatures(color_column_embedding)
#     print('_' * 60)
#     print(feature_layer(color_data))

# test_categorical_column_with_vocabulary_list()