In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so that edits to imported code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score)
from tensorflow.python.client import device_lib
from utils.data import *
from utils.measuring_performance import *
from utils.misc import *
sys.path.append('../Fork/DeepCTR')
from deepctr.inputs import DenseFeat, SparseFeat, get_feature_names
from deepctr.layers import custom_objects
from deepctr.models import DeepFM

In [None]:
def get_available_gpus():
    """Return the names of all local devices whose device type is 'GPU'.

    Returns:
        list: Device names, in the order reported by
        `device_lib.list_local_devices()`.
    """
    gpu_names = []
    for device in device_lib.list_local_devices():
        # Only exact 'GPU' entries count; every other device type is skipped.
        if device.device_type != 'GPU':
            continue
        gpu_names.append(device.name)
    return gpu_names

In [None]:
def get_n_rows_of_dataset(dataset):
    """Count the elements of a dataset by iterating over it once.

    Args:
        dataset: Object exposing `take(count)`; it is called with -1 and the
            result is iterated to exhaustion.

    Returns:
        int: Number of elements yielded by `dataset.take(-1)`.
    """
    return sum(1 for _ in dataset.take(-1))

In [None]:
def set_feature_names(num_feature_names, cat_feature_names, target_name=None):
    """Build a function that parses one serialized TFRecord example.

    Args:
        num_feature_names: Iterable of feature names parsed as scalar float32.
        cat_feature_names: Iterable of feature names parsed as scalar int64.
        target_name: Optional name of the label feature, parsed as scalar
            int64. When None, no label is expected in the records.

    Returns:
        A function taking a serialized example. It returns
        `(features_dict, label)` when `target_name` is given, otherwise just
        `features_dict`.
    """
    features = dict()
    # Register the label only when one was requested; an unconditional entry
    # would put a `None` key into the parsing spec for unlabeled data.
    if target_name is not None:
        features[target_name] = tf.io.FixedLenFeature([], tf.int64)
    for feature in num_feature_names:
        features[feature] = tf.io.FixedLenFeature([], tf.float32)
    for feature in cat_feature_names:
        features[feature] = tf.io.FixedLenFeature([], tf.int64)

    def _from_tfrecord(serialized):
        """Parse one serialized example according to the captured spec."""
        example = tf.io.parse_single_example(serialized=serialized, features=features)
        if target_name is None:
            return example
        # Split the label off so the remaining dict holds model inputs only.
        label = example.pop(target_name)
        return example, label
    return _from_tfrecord

In [None]:
# Absolute paths of the processed input data, the saved-model directory and
# the log directory, resolved against the current working directory.
DATA_DIR, MODEL_DIR, LOG_DIR = map(
    os.path.abspath,
    ('../../Data/display_advertising_challenge/processed', 'models', 'logs'),
)

In [None]:
# Create the output directories up front. `exist_ok=True` keeps the cell
# idempotent on re-runs and avoids the race between a separate existence
# check and the directory creation.
for output_dir in (MODEL_DIR, LOG_DIR):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Which dataset splits and which model this run works with; these tags are
# embedded in every file name built below.
train_dataset_type = 'train+valid'
test_dataset_type = 'test'
model_type = 'deepfm'

# File names are assembled first, then anchored in their directories.
train_dataset_file = '_'.join(['dataset', train_dataset_type]) + '.tfrecord'
test_dataset_file = '_'.join(['dataset', test_dataset_type]) + '.tfrecord'
model_file = '_'.join([model_type, 'model', train_dataset_type]) + '.h5'

train_dataset_path = os.path.join(DATA_DIR, train_dataset_file)
test_dataset_path = os.path.join(DATA_DIR, test_dataset_file)
model_path = os.path.join(MODEL_DIR, model_file)

In [None]:
# Load the metadata stored next to the training split: the target name, the
# numerical and categorical feature names, and `n_categories` (indexed by
# categorical feature name further down when sizing the embeddings).
metadata_path = os.path.join(DATA_DIR, '_'.join([train_dataset_type, 'metadata.pkl']))
metadata = load_pickle(metadata_path)
target_name, num_feature_names, cat_feature_names, n_categories = metadata

In [None]:
# Parser that turns one serialized record into (features, label).
from_tfrecord = set_feature_names(num_feature_names, cat_feature_names, target_name)

# Both splits are stored as GZIP-compressed TFRecord files and are parsed
# record by record with the same parser.
train_records = tf.data.TFRecordDataset(filenames=train_dataset_path, compression_type='GZIP')
train_dataset = train_records.map(from_tfrecord)

test_records = tf.data.TFRecordDataset(filenames=test_dataset_path, compression_type='GZIP')
test_dataset = test_records.map(from_tfrecord)

In [None]:
# Hard-coded number of training rows; the commented-out call below recomputes
# it with a full pass over the dataset.
n = 36672494
# n = get_n_rows_of_dataset(train_dataset)

# Ground-truth labels of the test split are read from a pickled pandas
# object; the commented-out line rebuilds them from the TFRecord instead.
y_true_path = os.path.join(DATA_DIR, '_'.join(['df', 'y', test_dataset_type]) + '.pkl')
y_true = pd.read_pickle(y_true_path).values
# y_true = np.array([y.numpy() for x, y in test_dataset.take(-1)])

# Number of test rows, taken from the first axis of the label array.
m = y_true.shape[0]

In [None]:
# One DeepCTR feature column per input: numerical features become
# 1-dimensional dense columns, categorical features become sparse columns
# with a 4-dimensional embedding sized by their category count.
num_features = [DenseFeat(name, 1) for name in num_feature_names]
cat_features = [
    SparseFeat(name, vocabulary_size=n_categories[name], embedding_dim=4, use_hash=False)
    for name in cat_feature_names
]

# The linear and the DNN parts both see every feature; each gets its own
# list object.
all_features = num_features + cat_features
linear_features = list(all_features)
dnn_features = list(all_features)
all_feature_names = get_feature_names(all_features)

In [None]:
# Build the DeepFM binary classifier over the feature columns.
model = DeepFM(linear_features, dnn_features, task='binary')

# Replicate the model across GPUs when at least two are present. The GPU
# count is computed once here and reused as the `gpus` argument.
n_gpus = len(get_available_gpus())
if n_gpus >= 2:
    model = tf.keras.utils.multi_gpu_model(model, gpus=n_gpus)

# Track AUC alongside the binary cross-entropy loss.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])

In [None]:
# Stop as soon as the validation loss fails to improve for one epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)

# Keep only the model with the best validation loss on disk.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_path, monitor='val_loss', save_best_only=True)

# Write histograms and embeddings to the log directory every epoch.
tensorboard = tf.keras.callbacks.TensorBoard(
    log_dir=LOG_DIR, histogram_freq=1, embeddings_freq=1)

callbacks = [early_stopping, checkpoint, tensorboard]

In [None]:
# Training hyper-parameters.
n_epochs = 10
batch_size = 128
shuffle_buffer_size = 12800

# Records are batched first and shuffled afterwards, so the shuffle buffer
# holds whole batches (batch order is shuffled, not individual rows).
# `repeat()` makes both streams endless; the step counts below bound an epoch.
generator = train_dataset.batch(batch_size).shuffle(shuffle_buffer_size, seed=42).repeat()
validation_data = test_dataset.batch(batch_size).shuffle(shuffle_buffer_size, seed=42).repeat()

# Floor division: a trailing partial batch is not counted as a step.
steps_per_epoch = n // batch_size
validation_steps = m // batch_size

history = model.fit_generator(generator, steps_per_epoch=steps_per_epoch, epochs=n_epochs, verbose=True,
                              validation_data=validation_data, validation_steps=validation_steps,
                              callbacks=callbacks)

# The model is deliberately NOT saved again here: the ModelCheckpoint callback
# (save_best_only=True, same `model_path`) has already written the weights
# with the best validation loss. Saving the in-memory model after fitting
# would overwrite that file with the last-epoch weights, which early stopping
# leaves at a worse validation loss.

In [None]:
# Restore the model saved at `model_path`; DeepCTR's custom layers have to be
# supplied so Keras can deserialize them.
model = tf.keras.models.load_model(
    filepath=model_path,
    custom_objects=custom_objects,
)

In [None]:
# Drop the labels so the model only receives the feature dicts.
test_features = test_dataset.map(lambda features, _label: features)

# All `m` test rows are fed as one single batch; the predictions are
# flattened to a 1-D score vector.
generator = test_features.batch(m)
y_score = model.predict_generator(generator).ravel()

In [None]:
# Mean of the true labels (the positive rate for 0/1 labels); it doubles as
# the decision threshold for turning scores into class predictions.
ctr = y_true.mean()
y_pred = get_y_pred(y_score, threshold=ctr)

# Score-based metrics.
norm_entropy = get_norm_entropy(y_true, y_score)
# Ratio of the mean predicted score to the mean label.
calibration = y_score.mean() / ctr

# Threshold-based classification metrics.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Plots; the first three also return the value that is stored with the metrics.
confusion_matrix = plot_confusion_matrix(y_true, y_pred)
auroc = plot_roc_curve(y_true, y_score)
auprc = plot_pr_curve(y_true, y_score)
_ = plot_lift_curve(y_true, y_score)
_ = plot_class_density(y_true, y_score, threshold=ctr)

In [None]:
# Persist all evaluation results as one tuple next to the saved model.
metric_path = os.path.join(MODEL_DIR, '_'.join([model_type, 'metric', train_dataset_type]) + '.pkl')
metrics = (norm_entropy, calibration, accuracy, precision, recall, f1, confusion_matrix, auroc, auprc)
dump_pickle(metric_path, metrics)

In [None]:
# Switch the "test" split to the quiz data and derive its input/output paths.
test_dataset_type = 'quiz'

# NOTE(review): this file name carries a `model_type` prefix, unlike the
# train/test dataset file names built earlier - confirm this is intended.
quiz_dataset_file = '_'.join([model_type, 'dataset', test_dataset_type]) + '.tfrecord'
quiz_score_file = '_'.join([model_type, 'score', test_dataset_type]) + '.pkl'

test_dataset_path = os.path.join(DATA_DIR, quiz_dataset_file)
score_path = os.path.join(MODEL_DIR, quiz_score_file)

In [None]:
# No target name is passed for the quiz data, so the parser is requested
# without a label.
# NOTE(review): this relies on set_feature_names handling target_name=None.
from_tfrecord = set_feature_names(num_feature_names, cat_feature_names)

quiz_records = tf.data.TFRecordDataset(filenames=test_dataset_path, compression_type='GZIP')
test_dataset = quiz_records.map(from_tfrecord)

In [None]:
# Hard-coded number of quiz rows; the commented-out call recomputes it with a
# full pass over the dataset.
m = 6042135
# m = get_n_rows_of_dataset(test_dataset)

# Score all quiz rows as one single batch and flatten to a 1-D vector.
generator = test_dataset.batch(m)
quiz_predictions = model.predict_generator(generator)
y_score = quiz_predictions.ravel()

# Persist the scores for later use.
dump_pickle(score_path, y_score)