In [1]:
# Copyright 2020 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Benchmark NVTabular data loader
We want to benchmark the NVTabular data loader and compare its performance to the TensorFlow "native" data loader based on tf.records. In [benchmark-preprocess.ipynb](./benchmark-preprocess.ipynb), we preprocess the dataset so that it is ready to use with the NVTabular data loader (parquet) and the TensorFlow native data loader (tf.records). In this notebook, we train a neural network in TensorFlow using either data loader and measure the performance.

First, we install the TensorFlow and NumPy versions used for this benchmark.

In [2]:
!pip install tensorflow-gpu==2.4.0

Collecting tensorflow-gpu==2.4.0
  Downloading tensorflow_gpu-2.4.0-cp38-cp38-manylinux2010_x86_64.whl (394.8 MB)
[K     |████████████████████████████████| 394.8 MB 43 kB/s s eta 0:00:01     |████████████████████████████    | 346.1 MB 84.6 MB/s eta 0:00:01
Collecting gast==0.3.3
  Downloading gast-0.3.3-py2.py3-none-any.whl (9.7 kB)
Collecting keras-preprocessing~=1.1.2
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 4.3 MB/s  eta 0:00:01
[?25hCollecting numpy~=1.19.2
  Downloading numpy-1.19.5-cp38-cp38-manylinux2010_x86_64.whl (14.9 MB)
[K     |████████████████████████████████| 14.9 MB 64.4 MB/s eta 0:00:01
[?25hCollecting termcolor~=1.1.0
  Downloading termcolor-1.1.0.tar.gz (3.9 kB)
Collecting flatbuffers~=1.12.0
  Downloading flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
Collecting grpcio~=1.32.0
  Downloading grpcio-1.32.0-cp38-cp38-manylinux2014_x86_64.whl (3.8 MB)
[K     |████████████████████████████████

In [3]:
!pip install numpy==1.20.1

Collecting numpy==1.20.1
  Downloading numpy-1.20.1-cp38-cp38-manylinux2010_x86_64.whl (15.4 MB)
[K     |████████████████████████████████| 15.4 MB 5.7 MB/s eta 0:00:01
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.19.5
    Uninstalling numpy-1.19.5:
      Successfully uninstalled numpy-1.19.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.2.7 requires torchvision<0.9,>=0.8, which is not installed.
cudf 0.18.0 requires cython, which is not installed.
tensorflow-gpu 2.4.0 requires numpy~=1.19.2, but you have numpy 1.20.1 which is incompatible.
fastai 2.2.7 requires spacy<3, but you have spacy 3.0.3 which is incompatible.
fastai 2.2.7 requires torch<1.8,>=1.7.0, but you have torch 1.8.0 which is incompatible.[0m
Successfully installed numpy-1.20.1


We run the single-GPU version, so we make only one GPU device visible.

In [1]:
# NOTE: the `time` module imported here is rebound to the `time()` function by
# `from time import time` in the next cell and only becomes the module again
# when `import time` is executed in the TensorFlow import cell further down.
import os, time
# Expose only GPU 0 to this process (single-GPU benchmark).
os.environ["CUDA_VISIBLE_DEVICES"]="0"

We import the required libraries.

In [2]:
import glob
import nvtabular as nvt

from time import time
from tqdm.notebook import trange

import pickle

We define multiple helper functions.<br><br>
*get_dataloader* returns the NVTabular data loader or TensorFlow native data loader, depending on dl_type<br>
*get_model* returns a standard TensorFlow model<br>
*make_tf_dataset* is a helper function to initialize the TensorFlow data loader<br>
*log_textfile* prints a message and also writes it to a log file

In [3]:
### Helper Function

def get_dataloader(dl_type='NVTabular', columns=None, HASH=False):
    """Build the training and validation data loaders for the chosen backend.

    Parameters
    ----------
    dl_type : str
        'NVTabular' builds two ``KerasSequenceLoader`` objects over the parquet
        files listed in ``output_train_dir`` / ``output_valid_dir``.
        'TensorFlow' builds two datasets with ``make_tf_dataset`` from
        ``TFRECORDS_TRAIN`` / ``TFRECORDS_VALID``.
    columns : list, optional
        Feature columns. They are passed to ``make_tf_dataset`` for
        'TensorFlow'; for 'NVTabular' they are only used when ``HASH`` is true.
        Defaults to a fresh empty list on every call.
    HASH : bool
        NVTabular only. When true, ``make_feature_column_workflow`` is called
        on ``columns`` and the resulting workflow is mapped onto both loaders.

    Returns
    -------
    tuple
        ``(train_dataset_tf, valid_dataset_tf, columns)``. ``columns`` is the
        list that was passed in, or the one returned by
        ``make_feature_column_workflow`` when ``HASH`` is true.

    Raises
    ------
    ValueError
        If ``dl_type`` is neither 'NVTabular' nor 'TensorFlow'. (Previously
        this surfaced as an UnboundLocalError at the return statement.)
    """
    supported = ['NVTabular', 'TensorFlow']
    if dl_type not in supported:
        raise ValueError(str(dl_type) + ' is not supported. Choose from ' + str(supported))
    # Avoid sharing one mutable default list between calls: the list is
    # returned to the caller, who may modify it.
    if columns is None:
        columns = []

    if dl_type == 'TensorFlow':
        # NOTE(review): TFRECORDS_TRAIN / TFRECORDS_VALID are not defined in the
        # visible part of this notebook; they must exist before this branch runs.
        train_dataset_tf = make_tf_dataset(TFRECORDS_TRAIN, columns)
        valid_dataset_tf = make_tf_dataset(TFRECORDS_VALID, columns)
        return (train_dataset_tf, valid_dataset_tf, columns)

    # ---- NVTabular ----
    if HASH:
        workflow, columns = make_feature_column_workflow(columns, LABEL_COLUMNS[0])

    def _nvt_loader(paths, shuffle):
        # Both loaders share every setting except the file list and shuffling.
        return KerasSequenceLoader(
            paths, # you could also use a glob pattern
            batch_size=BATCH_SIZE,
            label_names=LABEL_COLUMNS,
            cat_names=CATEGORICAL_COLUMNS,
            cont_names=CONTINUOUS_COLUMNS,
            engine='parquet',
            shuffle=shuffle,
            buffer_size=0.06, # how many batches to load at once
            parts_per_chunk=PARTS_PER_CHUNK
        )

    train_dataset_tf = _nvt_loader(output_train_dir, True)
    valid_dataset_tf = _nvt_loader(output_valid_dir, False)
    if HASH:
        # The return value is ignored, as in the original code; presumably
        # map() registers the workflow on the loader in place - TODO confirm.
        train_dataset_tf.map(workflow)
        valid_dataset_tf.map(workflow)
    return (train_dataset_tf, valid_dataset_tf, columns)

def get_model(hidden_dims, inputs, features, dl_type):
    """Build and compile a feed-forward binary classifier on top of feature columns.

    Parameters
    ----------
    hidden_dims : iterable of int
        Units of each hidden ``Dense`` layer (ReLU); one layer per entry.
    inputs : dict
        Keras input tensors, fed to the ``DenseFeatures`` layer and used as the
        model inputs.
    features : list
        Feature columns handed to the ``DenseFeatures`` layer.
    dl_type : str
        'NVTabular' uses ``layers.DenseFeatures`` (the ``layers`` module imported
        from nvtabular); 'TensorFlow' uses ``tf.keras.layers.DenseFeatures``.

    Returns
    -------
    tf.keras.Model
        Model with a single sigmoid output, compiled with the 'sgd' optimizer,
        'binary_crossentropy' loss and an ROC-AUC metric named "auroc".

    Raises
    ------
    ValueError
        If ``dl_type`` is neither 'NVTabular' nor 'TensorFlow'. (Previously
        this surfaced as an UnboundLocalError on ``dense_layer``.)
    """
    if dl_type == 'NVTabular':
        dense_layer = layers.DenseFeatures(features)
    elif dl_type == 'TensorFlow':
        dense_layer = tf.keras.layers.DenseFeatures(features)
    else:
        raise ValueError(
            str(dl_type) + ' is not supported. Choose from ' + str(['NVTabular', 'TensorFlow'])
        )
    x = dense_layer(inputs)

    for hidden in hidden_dims:
        x = tf.keras.layers.Dense(hidden, activation='relu')(x)

    x = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    model = tf.keras.Model(inputs=inputs, outputs=x)
    metrics = [tf.keras.metrics.AUC(curve="ROC", name="auroc")]
    model.compile('sgd', 'binary_crossentropy', metrics=metrics)
    return model

def make_tf_dataset(file_pattern, columns):
    """Create a batched tf.data dataset that parses tfrecords for ``columns``.

    Embedding columns are unwrapped to their underlying ``categorical_column``
    (when they have one) so that the raw feature is parsed. Every feature and
    the label ``LABEL_COLUMNS[0]`` are declared as fixed-length features of
    shape ``(1,)``, which avoids the sparse machinery but would not extend to
    multi-hot categoricals.
    """
    # unwrap embeddings to get the "raw" columns
    raw_columns = [getattr(wrapped, "categorical_column", wrapped) for wrapped in columns]

    # the feature spec tells TensorFlow how to parse the tfrecords;
    # columns without a dtype attribute are parsed as int64
    feature_spec = {}
    for raw in raw_columns:
        feature_spec[raw.name] = tf.io.FixedLenFeature((1,), getattr(raw, "dtype", tf.int64))
    label_name = LABEL_COLUMNS[0]
    feature_spec[label_name] = tf.io.FixedLenFeature((1,), tf.int64)

    return tf.data.experimental.make_batched_features_dataset(
        file_pattern,
        BATCH_SIZE,
        feature_spec,
        label_key=label_name,
        num_epochs=EPOCHS,
        shuffle=True,
        shuffle_buffer_size=BATCH_SIZE,
    )

def log_textfile(filename, text, mode):
    """Print ``text`` and write it, followed by a newline, to ``filename``.

    Parameters
    ----------
    filename : str
        Path of the log file.
    text : object
        Message to log; converted with ``str()`` before writing.
    mode : str
        Mode passed to ``open`` (e.g. 'w' to start a new log, 'a' to append).
    """
    print(text)
    # The context manager closes the file even if the write fails.
    with open(filename, mode) as log_file:
        log_file.write(str(text) + '\n')

In addition, we define functions to measure the performance.<br><br>
*time_only_dl* measures the time for just iterating through the dataset for 1 epoch WITHOUT training a model<br>
*time_training* measures the time for training a model for 1 epoch<br><br>
Note that 1 epoch is defined by a number of steps: tf.records does not allow partial batches, so we approximate one epoch by a fixed number of steps.

In [4]:
def time_only_dl(dl, num_steps):
    """Time how long it takes to pull ``num_steps`` batches from a data loader.

    The loader is iterated repeatedly (restarting from the beginning whenever
    it is exhausted) until ``num_steps`` batches have been consumed. No model
    is trained.

    Parameters
    ----------
    dl : iterable
        Data loader; it must be re-iterable if it yields fewer than
        ``num_steps`` batches per pass.
    num_steps : int
        Number of batches to consume.

    Returns
    -------
    tuple
        ``(elapsed_seconds, steps_done, passes)`` where ``passes`` is the
        number of (possibly partial) iterations over ``dl`` that were started.
    """
    # Imported locally: the notebook rebinds the global name `time` with
    # `from time import time`, so relying on the global is fragile.
    from time import perf_counter

    start = perf_counter()
    steps_done = 0
    passes = 0
    while steps_done < num_steps:
        steps_before = steps_done
        for _batch in dl:
            steps_done += 1
            # Stop right after the last requested batch, so that no extra
            # batch is loaded inside the timed region.
            if steps_done == num_steps:
                break
        passes += 1
        if steps_done == steps_before:
            # The loader yielded nothing; without this guard the loop would
            # never terminate.
            break
    elapsed = perf_counter() - start
    return (elapsed, steps_done, passes)

def time_training(model, train_dataset_tf, steps):
    """Time one call of ``model.fit`` over ``train_dataset_tf`` with ``epochs=1``.

    Parameters
    ----------
    model : object
        Model exposing ``fit(dataset, epochs=...)``.
    train_dataset_tf : object
        Training data handed to ``model.fit`` unchanged.
    steps : int
        Only echoed back in the result; it is NOT forwarded to ``model.fit``,
        so the length of the epoch is determined by the dataset itself.

    Returns
    -------
    tuple
        ``(elapsed_seconds, steps, 1)``; the trailing 1 is the number of
        epochs/loops run.
    """
    # Imported locally: the notebook rebinds the global name `time` with
    # `from time import time`, so relying on the global is fragile.
    from time import perf_counter

    start = perf_counter()
    model.fit(train_dataset_tf, epochs=1)
    elapsed = perf_counter() - start
    return (elapsed, steps, 1)

We define which benchmark we want to run.

In [5]:
# Values the two selectors below may take
DL_TYPES = ['NVTabular', 'TensorFlow']
BENCHMARK_TYPES = ['time_only_dl', 'time_training', 'convergence_training_loss', 'convergence_val_loss']

# Which data loader and which benchmark to run
DL_TYPE = 'NVTabular'
BENCHMARK_TYPE = 'time_training'

# Feature switches
AMP = False
HASH = False
CPU = False

# Fail fast when a selector is not one of the supported values
if DL_TYPE not in DL_TYPES:
    raise ValueError('{} is not supported.  Choose from {}'.format(DL_TYPE, DL_TYPES))

if BENCHMARK_TYPE not in BENCHMARK_TYPES:
    raise ValueError('{} is not supported. Choose from {}'.format(BENCHMARK_TYPE, BENCHMARK_TYPES))

We define the input directories for the parquet and tf.records files.

In [6]:
# Where the preprocessed data lives
OUTPUT_DIR = '/raid/data/criteo/output/'
# Folder the processed data was saved to; the OUTPUT_DATA_DIR environment
# variable overrides the default location below OUTPUT_DIR
_default_data_dir = OUTPUT_DIR + 'output'
OUTPUT_DATA_DIR = os.environ.get('OUTPUT_DATA_DIR', _default_data_dir)

# Lists of parquet files of the training and validation splits
_train_pattern = os.path.join(OUTPUT_DATA_DIR, 'train/*.parquet')
_valid_pattern = os.path.join(OUTPUT_DATA_DIR, 'valid/*.parquet')
output_train_dir = glob.glob(_train_pattern)
output_valid_dir = glob.glob(_valid_pattern)

output_train_dir, output_valid_dir

(['/raid/data/criteo/output/output/train/12.353c8050ed1f4702bde63bd697c9cc36.parquet',
  '/raid/data/criteo/output/output/train/18.6e5ac8d8f49f46089cecceb6d044adda.parquet',
  '/raid/data/criteo/output/output/train/14.98377a29380145999e7e1e0d4c75eb81.parquet',
  '/raid/data/criteo/output/output/train/4.1907f580308044b6a32da25e9d3f1540.parquet',
  '/raid/data/criteo/output/output/train/8.68a9f5f1feb64fe59fe160211b54afd4.parquet',
  '/raid/data/criteo/output/output/train/1.8772a8717fae46df8b521cdd891df07a.parquet',
  '/raid/data/criteo/output/output/train/16.7a1f69dd7d834844bbfb68e659bb362c.parquet',
  '/raid/data/criteo/output/output/train/6.46cd6180dc42484199d895a2de02a1a6.parquet',
  '/raid/data/criteo/output/output/train/2.1063c71d33e646fe8e6218bc2457c3cb.parquet',
  '/raid/data/criteo/output/output/train/7.1a249ca9dbf44b8e9ca6b0ce740e3a88.parquet',
  '/raid/data/criteo/output/output/train/0.cb895dbeefa14e65aa2e715b295ac9e7.parquet',
  '/raid/data/criteo/output/output/train/5.c311b59

We define some hyperparameters and network architecture.

In [7]:
# --- values that can be overridden through environment variables ---
# Batch size for training the deep learning model
BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 64 * 1024))
# Number of parts used in the shuffling of the NVTabular data loader
PARTS_PER_CHUNK = int(os.environ.get('PARTS_PER_CHUNK', 1))

# --- fixed settings ---
# Number of epochs (only for convergence_val_loss); also passed as num_epochs
# by make_tf_dataset
EPOCHS = 1
# Number of training steps after which train_loss is collected
# (only for convergence_training_loss)
TRAIN_STEPS = 20
# Max. number of steps per epoch (tf.records allows only full batches)
STEPS = int(150000000 / BATCH_SIZE)
# One entry per hidden layer; each entry is the number of units of that layer
HIDDEN_DIMS = [1024] * 4

We define the data schema and the embedding table shapes. The values are hard-coded here; they correspond to the statistics collected by the NVTabular workflow during preprocessing.

In [8]:
# Per categorical column: (input dimension / vocabulary size, embedding output dimension)
EMBEDDING_TABLE_SHAPES = dict(
    C1=(7599500, 16),
    C10=(5345303, 16),
    C11=(561810, 16),
    C12=(242827, 16),
    C13=(11, 6),
    C14=(2209, 16),
    C15=(10616, 16),
    C16=(100, 16),
    C17=(4, 3),
    C18=(968, 16),
    C19=(15, 7),
    C2=(33521, 16),
    C20=(7838519, 16),
    C21=(2580502, 16),
    C22=(6878028, 16),
    C23=(298771, 16),
    C24=(11951, 16),
    C25=(97, 16),
    C26=(35, 12),
    C3=(17022, 16),
    C4=(7339, 16),
    C5=(20046, 16),
    C6=(4, 3),
    C7=(7068, 16),
    C8=(1377, 16),
    C9=(63, 16),
)

# Column names of the dataset: I1..I13 continuous, C1..C26 categorical, one label
CONTINUOUS_COLUMNS = ['I{}'.format(idx) for idx in range(1, 14)]
CATEGORICAL_COLUMNS = ['C{}'.format(idx) for idx in range(1, 27)]
LABEL_COLUMNS = ['label']

We import TensorFlow and set *TF_MEMORY_ALLOCATION* so that TensorFlow does not reserve the full GPU memory.

In [9]:
import time

import tensorflow as tf

from tensorflow.python.feature_column import feature_column_v2 as fc

# We can control how much GPU memory TensorFlow is given with this environment
# variable (presumably read by the nvtabular imports below - TODO confirm).
# IMPORTANT: make sure you do this before you initialize TF's runtime, otherwise
# TF will have claimed all free GPU memory
os.environ['TF_MEMORY_ALLOCATION'] = "0.5" # fraction of free memory
from nvtabular.loader.tensorflow import KerasSequenceLoader, KerasSequenceValidater
from nvtabular.framework_utils.tensorflow import layers
from tensorflow.python.feature_column import feature_column_v2 as fc

from nvtabular.framework_utils.tensorflow import make_feature_column_workflow

We define the `tf.keras.Input` tensors and the `tf.feature_column`s. A common technique is to use hashing with `tf.feature_column.categorical_column_with_hash_bucket` to reduce the dimensionality of the embedding tables. Optionally, we can add hash columns to our workflow.

In [10]:
# Keras input tensors keyed by column name, and the matching feature columns
inputs = {}
features = []

# Categorical columns: int32 input + identity column wrapped in an embedding
for cat_name in CATEGORICAL_COLUMNS:
    # (input dimension / vocab size, embedding output dimension)
    vocab_size, embedding_dim = EMBEDDING_TABLE_SHAPES[cat_name]
    inputs[cat_name] = tf.keras.Input(name=cat_name, dtype=tf.int32, shape=(1,))
    identity_column = tf.feature_column.categorical_column_with_identity(cat_name, vocab_size)
    features.append(tf.feature_column.embedding_column(identity_column, embedding_dim))

# Continuous columns: float32 input + numeric column of shape (1,)
for cont_name in CONTINUOUS_COLUMNS:
    inputs[cont_name] = tf.keras.Input(name=cont_name, dtype=tf.float32, shape=(1,))
    features.append(tf.feature_column.numeric_column(cont_name, (1,)))

# No hash columns are added in this cell
hash_postfix = 'nohash'

We initialize the data loader, depending on the data loader type DL_TYPE.

In [11]:
# Build both loaders; `features` is rebound to the column list returned by get_dataloader
train_dataset_tf, valid_dataset_tf, features = get_dataloader(
    dl_type=DL_TYPE,
    columns=features,
    HASH=HASH,
)

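We record whether mixed precision (AMP) is requested for the calculation. Note that this cell only sets the `amp_postfix` label; it does not itself enable mixed precision.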

In [12]:
# Label describing whether the AMP flag is set
amp_postfix = 'amp' if AMP else 'noamp'

We run the benchmark.

In [13]:
# File the results are written to by the logging cell
logfilename = 'testtf.log'
if BENCHMARK_TYPE == 'time_training':
    # Train for one epoch and measure the wall time
    model = get_model(HIDDEN_DIMS, inputs, features, DL_TYPE)
    run_time, num_steps_done, num_loops = time_training(model, train_dataset_tf, STEPS)
else:
    # The other entries of BENCHMARK_TYPES pass the validation above but have no
    # code path here; fail now instead of with a NameError on `run_time` later.
    raise NotImplementedError(
        "BENCHMARK_TYPE '" + str(BENCHMARK_TYPE) + "' is not implemented in this notebook; "
        "only 'time_training' is run here."
    )



In [14]:
import pandas as pd
# Only the number of training rows is needed (for the throughput figure), so
# read just the label column of each file instead of materialising every
# column of the whole training set in host memory.
df = pd.concat([pd.read_parquet(x, columns=LABEL_COLUMNS) for x in output_train_dir])

In [15]:
# Rows of the training set processed per second of training time
rows_per_second = df.shape[0] / run_time
# The first entry starts a fresh log file ('w'); the others are appended ('a')
for log_text, log_mode in (
    ('Training', 'w'),
    ('Time: ' + str(run_time), 'a'),
    ('Throughput: ' + str(rows_per_second), 'a'),
):
    log_textfile(logfilename, log_text, log_mode)

Training
Time: 445.9770152568817
Throughput: 439130.21590853843
