In [1]:
import tensorflow as tf
import tensorflow_ranking as tfr
from dataset_builder import ZlibDatasetBuilder

In [2]:
# Key of the relevance-label feature; used as the name half of the label spec.
_LABEL_FEATURE: str = 'label'
# Feature name handed to both the model builder and the dataset builder as
# their `mask_feature_name`.
_MASK: str = '_mask'
# Default value given to the label feature's FixedLenFeature spec.
_PADDING_LABEL: float = -1.0

In [3]:
def train_and_validate(config):
    """Assemble a TF-Ranking Keras pipeline from ``config`` and train it.

    The function wires together a ``DNNScorer``, a ``ModelBuilder``, a
    ``ZlibDatasetBuilder`` and a ``SimplePipeline`` and then calls
    ``SimplePipeline.train_and_validate(verbose=1)``.

    Args:
        config: Mapping of settings. Required keys:

            * Dataset: ``train_input``, ``vali_input``, ``train_batch_size``,
              ``valid_batch_size``, ``list_size``.
            * Pipeline: ``model_dir``, ``num_epochs``, ``steps_per_epoch``,
              ``validation_steps``, ``loss``, ``optimizer``,
              ``learning_rate``, ``strategy``.
            * Model: ``hidden_layer_dims``, ``dropout``, ``name``.

            Optional key:

            * ``num_features``: number of per-example float features, named
              ``feature1`` .. ``feature<num_features>``. Defaults to 220.

    Returns:
        None. The work happens inside the pipeline's ``train_and_validate``.

    Raises:
        KeyError: If a required key is missing from ``config``.
    """
    # Previously hard-coded as range(220); kept as the default so existing
    # configs behave exactly as before.
    num_features = config.get('num_features', 220)

    # One scalar float feature per column, named feature1..featureN,
    # defaulting to 0.0 when absent from a record.
    example_feature_spec = {
        f'feature{index}': tf.io.FixedLenFeature(
            shape=(1,), dtype=tf.float32, default_value=0.0)
        for index in range(1, num_features + 1)
    }

    # No context (per-query) features are declared.
    context_feature_spec = {}

    # (name, spec) pair for the label; missing labels take _PADDING_LABEL.
    label_spec = (
        _LABEL_FEATURE,
        tf.io.FixedLenFeature(
            shape=(1,), dtype=tf.float32, default_value=_PADDING_LABEL),
    )

    dataset_hparams = tfr.keras.pipeline.DatasetHparams(
        train_input_pattern=config['train_input'],
        valid_input_pattern=config['vali_input'],
        train_batch_size=config['train_batch_size'],
        valid_batch_size=config['valid_batch_size'],
        list_size=config['list_size'],
    )

    pipeline_hparams = tfr.keras.pipeline.PipelineHparams(
        model_dir=config['model_dir'],
        num_epochs=config['num_epochs'],
        steps_per_epoch=config['steps_per_epoch'],
        validation_steps=config['validation_steps'],
        loss=config['loss'],
        optimizer=config['optimizer'],
        learning_rate=config['learning_rate'],
        strategy=config['strategy'],
    )

    # Feed-forward scorer emitting a single score per example.
    dnn_scorer = tfr.keras.model.DNNScorer(
        hidden_layer_dims=config['hidden_layer_dims'],
        output_units=1,
        activation=tf.nn.relu,
        input_batch_norm=True,
        dropout=config['dropout'],
    )

    model_builder = tfr.keras.model.ModelBuilder(
        input_creator=tfr.keras.model.FeatureSpecInputCreator(
            context_feature_spec, example_feature_spec),
        preprocessor=tfr.keras.model.PreprocessorWithSpec(),
        scorer=dnn_scorer,
        mask_feature_name=_MASK,
        name=config['name'],
    )

    # The dataset builder gets the same feature specs and mask name as the
    # model builder above.
    dataset_builder = ZlibDatasetBuilder(
        context_feature_spec=context_feature_spec,
        example_feature_spec=example_feature_spec,
        mask_feature_name=_MASK,
        label_spec=label_spec,
        hparams=dataset_hparams,
    )

    ranking_pipeline = tfr.keras.pipeline.SimplePipeline(
        model_builder=model_builder,
        dataset_builder=dataset_builder,
        hparams=pipeline_hparams,
    )

    ranking_pipeline.train_and_validate(verbose=1)

In [4]:
# Baseline model; hyperparameters have not been tuned.
config = dict(
    # --- dataset settings ---
    # NOTE: the validation input points at the test.tfrecord file.
    train_input='/home/guhangsong/Data/istella-letor/train.tfrecord',
    vali_input='/home/guhangsong/Data/istella-letor/test.tfrecord',
    train_batch_size=32,
    valid_batch_size=32,
    list_size=None,
    # --- pipeline / optimisation settings ---
    model_dir='istella_tfr/base_model',
    num_epochs=2,
    steps_per_epoch=1000,
    validation_steps=100,
    learning_rate=0.05,
    optimizer='adam',
    loss='approx_ndcg_loss',
    strategy='MirroredStrategy',
    # --- scorer / model settings ---
    hidden_layer_dims=[64, 32, 16],
    dropout=0.5,
    name='istella_base_model',
)
train_and_validate(config)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


2022-03-20 18:10:11.177993: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-03-20 18:10:11.197884: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-03-20 18:10:11.198299: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-03-20 18:10:11.198919: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow wi

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


Epoch 1/2


2022-03-20 18:10:30.648131: I tensorflow/stream_executor/cuda/cuda_blas.cc:1774] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 2/2


2022-03-20 18:13:43.500963: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: istella_tfr/base_model/export/latest_model/assets


INFO:tensorflow:Assets written to: istella_tfr/base_model/export/latest_model/assets
