In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# (e.g. the local `utils` package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score)
from tensorflow.python.client import device_lib
from utils.data import *
from utils.measuring_performance import *
from utils.misc import *
sys.path.append('../Fork/DeepCTR')
from deepctr.inputs import DenseFeat, SparseFeat, get_feature_names
from deepctr.layers import custom_objects
from deepctr.models import DeepFM

In [3]:
def get_available_gpus():
    """Return the names of all local devices whose ``device_type`` is ``'GPU'``.

    Devices are enumerated with ``device_lib.list_local_devices()``; the result
    is a (possibly empty) list of device name strings in enumeration order.
    """
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

In [4]:
def get_n_rows_of_dataset(dataset):
    """Count the elements of ``dataset`` by iterating over ``dataset.take(-1)``.

    This consumes one full pass over the dataset, so the cost grows with its size.

    Args:
        dataset: object exposing ``take(count)`` that returns an iterable.

    Returns:
        int: number of elements yielded.
    """
    return sum(1 for _ in dataset.take(-1))

In [5]:
def set_feature_names(num_feature_names, cat_feature_names, target_name=None):
    """Build a parser for serialized ``tf.train.Example`` records.

    Args:
        num_feature_names: names of features parsed as scalar ``tf.float32``.
        cat_feature_names: names of features parsed as scalar ``tf.int64``.
        target_name: name of the label feature, parsed as scalar ``tf.int64``.
            When ``None`` the records are parsed without a label.

    Returns:
        A function mapping one serialized record to ``(features, label)`` when
        ``target_name`` is given, or to the ``features`` dict alone otherwise.
    """
    features = dict()
    # Only register the label when one was requested; registering it
    # unconditionally would add a ``None`` key to the parsing spec.
    if target_name is not None:
        features[target_name] = tf.io.FixedLenFeature([], tf.int64)
    for feature in num_feature_names:
        features[feature] = tf.io.FixedLenFeature([], tf.float32)
    for feature in cat_feature_names:
        features[feature] = tf.io.FixedLenFeature([], tf.int64)

    def _from_tfrecord(serialized):
        example = tf.io.parse_single_example(serialized=serialized, features=features)
        if target_name is None:
            return example
        # Split the label off so the result matches Keras' (inputs, targets) convention.
        label = example.pop(target_name)
        return example, label
    return _from_tfrecord

In [6]:
# Directory holding the preprocessed datasets and metadata read by the cells below.
DATA_DIR = os.path.abspath('../../Data/display_advertising_challenge/processed')
# Directory where the trained model, training history, metrics and scores are written.
MODEL_DIR = os.path.abspath('models')
# True: read GZIP-compressed TFRecord files; False: read pickled pandas DataFrames.
USE_TFRECORD = True

In [7]:
# Create the output directory if it is missing. `exist_ok=True` replaces the
# separate existence check and avoids the check-then-create race.
os.makedirs(MODEL_DIR, exist_ok=True)

In [8]:
# Names of the splits used for fitting and evaluation, and the model tag used in file names.
train_dataset_type = 'train+valid'
test_dataset_type = 'test'
model_type = 'deepfm'
# Path prefix (no extension) shared by the model artifacts written under MODEL_DIR.
model_path = os.path.join(MODEL_DIR, model_type + '_model_' + train_dataset_type)

In [9]:
# The metadata pickle unpacks into the label name, the numerical and categorical
# feature names, and `n_categories` (indexed by categorical feature name below).
(target_name,
 num_feature_names,
 cat_feature_names,
 n_categories) = load_pickle(os.path.join(DATA_DIR, train_dataset_type + '_metadata.pkl'))

In [10]:
if not USE_TFRECORD:
    # Pickled DataFrames: labels (y) and features (X) live in separate files per split.
    df_y_train = pd.read_pickle(os.path.join(DATA_DIR, 'df_y_' + train_dataset_type + '.pkl'))
    df_X_train = pd.read_pickle(os.path.join(DATA_DIR, 'df_X_' + train_dataset_type + '.pkl'))
    df_y_test = pd.read_pickle(os.path.join(DATA_DIR, 'df_y_' + test_dataset_type + '.pkl'))
    df_X_test = pd.read_pickle(os.path.join(DATA_DIR, 'df_X_' + test_dataset_type + '.pkl'))

    # Model input: one NumPy array per feature column, keyed by column name.
    train_model_input = {name: df_X_train[name].values for name in df_X_train.columns}
    test_model_input = {name: df_X_test[name].values for name in df_X_test.columns}

    # n / m: number of rows in the training / evaluation split.
    n = df_y_train.shape[0]
    m = df_y_test.shape[0]

else:
    train_dataset_path = os.path.join(DATA_DIR, 'dataset_' + train_dataset_type + '.tfrecord')
    test_dataset_path = os.path.join(DATA_DIR, 'dataset_' + test_dataset_type + '.tfrecord')

    # Records stay serialized here; they are parsed later when the input pipelines are built.
    train_dataset = tf.data.TFRecordDataset(filenames=train_dataset_path, compression_type='GZIP')
    test_dataset = tf.data.TFRecordDataset(filenames=test_dataset_path, compression_type='GZIP')

    # Row counts are hard-coded; the commented calls recompute them by iterating each dataset.
    n = 36672494
    m = 9168123
    # n = get_n_rows_of_dataset(train_dataset)
    # m = get_n_rows_of_dataset(test_dataset)

In [12]:
# One dense column of dimension 1 per numerical feature.
num_features = [DenseFeat(name, 1) for name in num_feature_names]
# One sparse column per categorical feature, with vocabulary size taken from
# `n_categories` and a 4-dimensional embedding.
cat_features = [
    SparseFeat(name, vocabulary_size=n_categories[name], embedding_dim=4, use_hash=False)
    for name in cat_feature_names
]
# The linear and DNN parts of the model receive the same columns (as separate lists).
linear_features = [*num_features, *cat_features]
dnn_features = [*num_features, *cat_features]
all_feature_names = get_feature_names(num_features + cat_features)

In [13]:
# Build the DeepFM binary classifier from the feature columns defined above.
model = DeepFM(linear_features, dnn_features, task='binary')
# Count the GPUs once and reuse the value: the multi-GPU branch previously
# referenced `n_gpus` without ever defining it (NameError with more than one GPU).
n_gpus = len(get_available_gpus())
if n_gpus > 1:
    model = tf.keras.utils.multi_gpu_model(model, gpus=n_gpus)
# Name the metric explicitly so the logged key stays 'auc' / 'val_auc' (the key
# the callbacks monitor) even if this cell is executed more than once per kernel.
model.compile(optimizer='rmsprop', loss='binary_crossentropy',
              metrics=[tf.keras.metrics.AUC(name='auc')])

In [15]:
# AUC is a higher-is-better metric, so state mode='max' explicitly instead of
# relying on the callbacks' 'auto' inference from the monitored name.
callbacks = [
    # Stop as soon as validation AUC fails to improve for one epoch.
    tf.keras.callbacks.EarlyStopping(monitor='val_auc', mode='max', patience=1),
    # Keep a checkpoint only when validation AUC improves; the epoch number and
    # the score are embedded in the file name.
    tf.keras.callbacks.ModelCheckpoint(
        filepath=model_path + '_{epoch:02d}_{val_auc:.4f}.h5', monitor='val_auc', mode='max',
        save_best_only=True),
]

In [16]:
# Upper bound on the number of training epochs passed to the fit call.
n_epochs = 10
# Number of examples per batch (2 ** 17 = 131072).
batch_size = 2 ** 17

In [17]:
if USE_TFRECORD:
    shuffle_buffer_size = 2 ** 20
    # Integer division: a trailing partial batch is not counted towards an epoch.
    steps_per_epoch = n // batch_size
    validation_steps = m // batch_size

    # Parser turning one serialized record into (features, label).
    from_tfrecord = set_feature_names(num_feature_names, cat_feature_names, target_name)

    # shuffle -> parse -> batch -> prefetch -> repeat; the datasets repeat
    # indefinitely, so the epoch length is set by the step counts above.
    train_generator = (
        train_dataset
        .shuffle(shuffle_buffer_size, seed=42)
        .map(from_tfrecord, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .batch(batch_size)
        .prefetch(tf.data.experimental.AUTOTUNE)
        .repeat())
    valid_generator = (
        test_dataset
        .shuffle(shuffle_buffer_size, seed=42)
        .map(from_tfrecord, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .batch(batch_size)
        .prefetch(tf.data.experimental.AUTOTUNE)
        .repeat())

    # `Model.fit` accepts tf.data datasets directly; this replaces the deprecated
    # `fit_generator` and matches the call used in the pandas branch.
    history = model.fit(train_generator, steps_per_epoch=steps_per_epoch, epochs=n_epochs, verbose=True,
                        validation_data=valid_generator, validation_steps=validation_steps,
                        callbacks=callbacks)

else:
    # In-memory path: dict of NumPy arrays as inputs, label array as targets.
    history = model.fit(train_model_input, df_y_train.values, batch_size=batch_size, epochs=n_epochs,
                        verbose=True, validation_data=(test_model_input, df_y_test.values), callbacks=callbacks)

Epoch 1/10

KeyboardInterrupt: 

In [None]:
# Write the trained model to '<model_path>.h5' and the per-epoch history dict
# to '<model_path>_history.pkl'.
model.save('{}.h5'.format(model_path))
dump_pickle('{}_history.pkl'.format(model_path), history.history)

In [None]:
# Reload the model from the file written by `model.save(model_path + '.h5')`.
# The extension was previously missing here, so the path did not match the saved file.
# `custom_objects` is passed so the DeepCTR layers can be deserialized.
model = tf.keras.models.load_model(model_path + '.h5', custom_objects=custom_objects)

In [None]:
if USE_TFRECORD:
    # Parse into a new name instead of overwriting `test_dataset`, so the cell
    # can be re-run without mapping an already-parsed dataset a second time.
    # Batch with the training batch size rather than the whole split at once.
    test_dataset_parsed = test_dataset.map(
        from_tfrecord, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(batch_size)

    # Collect the labels one batch at a time (same order as the predictions below).
    y_true = np.concatenate([labels.numpy() for _, labels in test_dataset_parsed])

    # Drop the labels so the model only receives the feature dict.
    test_generator = test_dataset_parsed.map(
        lambda x, y: x, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # `Model.predict` accepts a dataset directly (replaces deprecated `predict_generator`).
    y_score = model.predict(test_generator).ravel()

else:
    y_true = df_y_test.values
    y_score = model.predict(test_model_input).ravel()

In [None]:
# Mean of the true labels (the positive rate if labels are 0/1 - TODO confirm);
# it is also used as the decision threshold for turning scores into predictions.
ctr = y_true.mean()
y_pred = get_y_pred(y_score, threshold=ctr)

# Score-based metrics.
norm_entropy = get_norm_entropy(y_true, y_score)
# Ratio of the mean predicted score to the mean label.
calibration = y_score.mean() / ctr

# Threshold-based metrics computed on the hard predictions.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Plot helpers; the first three also return the values stored with the metrics.
confusion_matrix = plot_confusion_matrix(y_true, y_pred)
auroc = plot_roc_curve(y_true, y_score)
auprc = plot_pr_curve(y_true, y_score)
_ = plot_lift_curve(y_true, y_score)
_ = plot_class_density(y_true, y_score, threshold=ctr)

In [None]:
# Persist all evaluation results as one tuple under
# '<MODEL_DIR>/<model_type>_metric_<train split>.pkl'.
dump_pickle(
    os.path.join(MODEL_DIR, model_type + '_metric_' + train_dataset_type + '.pkl'),
    (norm_entropy, calibration, accuracy, precision, recall, f1, confusion_matrix, auroc, auprc),
)

In [None]:
# Switch the evaluation split to 'quiz'; note this rebinds `test_dataset_type`
# for every cell executed afterwards.
test_dataset_type = 'quiz'
# Output file for the predicted scores of that split.
score_path = os.path.join(MODEL_DIR, model_type + '_score_' + test_dataset_type + '.pkl')

In [None]:
# Load the split selected by `test_dataset_type`, rebinding the evaluation inputs.
if USE_TFRECORD:
    # NOTE(review): this file name is prefixed with `model_type`
    # ('<model_type>_dataset_<split>.tfrecord'), whereas the train/test TFRecords
    # loaded earlier are named 'dataset_<split>.tfrecord' - confirm which name
    # exists on disk.
    test_dataset_path = os.path.join(DATA_DIR, '_'.join([model_type, 'dataset', test_dataset_type]) + '.tfrecord')
    test_dataset = tf.data.TFRecordDataset(filenames=test_dataset_path, compression_type='GZIP')
    
else:
    # Only a feature frame is read for this split; no label frame is loaded.
    df_X_test = pd.read_pickle(os.path.join(DATA_DIR, '_'.join(['df', 'X', test_dataset_type]) + '.pkl'))
    test_model_input = {column: df_X_test[column].values for column in df_X_test.columns}

In [None]:
if USE_TFRECORD:
    # Hard-coded row count; the commented call recomputes it by iterating the dataset.
    m = 6042135
    # m = get_n_rows_of_dataset(test_dataset)

    # Parse features only. The pandas branch loads no label frame for this split,
    # so the parser must not require the target feature; any extra keys present
    # in a record are simply ignored by the parsing spec.
    quiz_features = {feature: tf.io.FixedLenFeature([], tf.float32) for feature in num_feature_names}
    quiz_features.update(
        {feature: tf.io.FixedLenFeature([], tf.int64) for feature in cat_feature_names})

    # Batch with the training batch size instead of a single batch of all `m` rows.
    test_generator = test_dataset.map(
        lambda serialized: tf.io.parse_single_example(serialized=serialized, features=quiz_features),
        num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(batch_size)
    # `Model.predict` accepts a dataset directly (replaces deprecated `predict_generator`).
    y_score = model.predict(test_generator).ravel()

else:
    y_score = model.predict(test_model_input).ravel()

# Persist the flat score vector for this split.
dump_pickle(score_path, y_score)