In [1]:
# Copyright 2020 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Benchmark NVTabular data loader
We want to benchmark the NVTabular data loader and compare its performance with the "native" PyTorch data loader and the FastAI tabular data loader. In `benchmark-preprocess.ipynb`, we preprocess the dataset into parquet files and tf.records; this notebook reads the parquet files. In this notebook, we train a neural network in PyTorch using one of the data loaders and measure the performance.

First, we install gpustat and tqdm.

In [2]:
!pip install gpustat tqdm

Collecting gpustat
  Downloading gpustat-0.6.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 9.9 MB/s  eta 0:00:01
Collecting nvidia-ml-py3>=7.352.0
  Downloading nvidia-ml-py3-7.352.0.tar.gz (19 kB)
Collecting blessings>=1.6
  Downloading blessings-1.7-py3-none-any.whl (18 kB)
Building wheels for collected packages: gpustat, nvidia-ml-py3
  Building wheel for gpustat (setup.py) ... [?25ldone
[?25h  Created wheel for gpustat: filename=gpustat-0.6.0-py3-none-any.whl size=12617 sha256=d2a49287f83e5b8c20cd41c6cafad1675d2b7f629071d2fd0c21797d8e5a5dd2
  Stored in directory: /root/.cache/pip/wheels/0d/d9/80/b6cbcdc9946c7b50ce35441cc9e7d8c5a9d066469ba99bae44
  Building wheel for nvidia-ml-py3 (setup.py) ... [?25ldone
[?25h  Created wheel for nvidia-ml-py3: filename=nvidia_ml_py3-7.352.0-py3-none-any.whl size=19191 sha256=816cd59c1f25b6cb792b6331198ac66502b0048c0d2748985ce9a25a53b6edba
  Stored in directory: /root/.cache/pip/wheels/b9/b1/68/cb4feab29709d4155310d29a4213896

We run the single-GPU version and make only one device visible.

In [1]:
import os, time
# Make only GPU 0 visible to this process; the benchmark runs on a single GPU.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

We import the required libraries.

In [2]:
import glob
import gc
import pickle
from time import time


import nvtabular as nvt

import pandas as pd
import numpy as np

from fastai.tabular.data import TabularDataLoaders

from torch.utils.data import DataLoader
from torch.utils.data import Dataset

We define the `CustomDataset`, which is used for the native PyTorch data loader.

In [3]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import numpy as np

class CustomDataset(Dataset):
  """Dataset that serves rows of a DataFrame as NumPy arrays.

  The continuous, categorical and label columns are extracted from the
  frame once, at construction time, and kept as three separate arrays.
  """

  def __init__(self, X, cont_names, cat_names, label_name):
    """Pull the three column groups out of ``X`` as arrays."""
    self.X_cont, self.X_cat, self.y = (
        X[columns].values for columns in (cont_names, cat_names, label_name)
    )

  def __len__(self):
    """Number of rows, taken from the categorical array."""
    return self.X_cat.shape[0]

  def __getitem__(self, idx):
    """Return ``(categorical, continuous, label)`` for ``idx``.

    The parts are cast to int64, float32 and float32 respectively.
    """
    categorical = self.X_cat[idx].astype(np.int64)
    continuous = self.X_cont[idx].astype(np.float32)
    target = self.y[idx].astype(np.float32)
    return categorical, continuous, target

We define a simple feed-forward neural network with embedding layers for the categorical features, and a `process_epoch` function that runs one pass over a data loader.

In [4]:
import torch
import torch.nn as nn

from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader

from nvtabular.framework_utils.torch.utils import process_epoch

class EmbeddingBlock(nn.Module):
    """One ``nn.Embedding`` table per categorical feature.

    ``cat_stats[name]`` is read as ``(num_embeddings, embedding_dim)``.
    ``out_size`` is the sum of the embedding dimensions of all tables.
    """

    def __init__(self, cat_names, cat_stats):
        super().__init__()
        # Tables are created in the order of cat_names; forward() relies on
        # that order to pick the matching input column.
        tables = [
            nn.Embedding(cat_stats[name][0], cat_stats[name][1])
            for name in cat_names
        ]
        self.emb = nn.ModuleList(tables)
        self.out_size = sum(table.embedding_dim for table in tables)

    def forward(self, X):
        """Look up column ``i`` of ``X`` in table ``i``; return the list of results."""
        return [table(X[:, col]) for col, table in enumerate(self.emb)]

class MLPTower(nn.Module):
    """Stack of ``Linear`` + ``ReLU`` layers followed by a final ``Linear`` layer.

    ``hidden_layers`` lists the width of each hidden layer; the final layer
    maps the last width (or ``in_size`` when there are no hidden layers) to
    ``out_size`` and has no activation.
    """

    def __init__(self, in_size, out_size, hidden_layers):
        super().__init__()
        widths = [in_size, *hidden_layers]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        stack.append(nn.Linear(widths[-1], out_size))
        self.hidden = nn.Sequential(*stack)

    def forward(self, X):
        """Apply the layer stack to ``X``."""
        return self.hidden(X)

class SimpleNN(nn.Module):
    """Embedding block for categorical inputs feeding an MLP tower.

    The embedded categorical features are concatenated with the continuous
    features along dimension 1 and passed through the MLP; the result is
    squeezed before it is returned.
    """

    def __init__(self, cont_names, cat_names, cat_stats, hidden_layers, out_size=1):
        super().__init__()
        self.embblock = EmbeddingBlock(cat_names, cat_stats)

        # MLP input width: all embedding widths plus one input per continuous column.
        n_inputs = self.embblock.out_size + len(cont_names)
        self.topmlp = MLPTower(n_inputs, out_size=out_size, hidden_layers=hidden_layers)

    def forward(self, X_cat, X_num):
        """Return the squeezed MLP output for one batch."""
        embedded = self.embblock(X_cat)
        features = torch.cat([*embedded, X_num], axis=1)
        return torch.squeeze(self.topmlp(features))
    

def process_epoch(
    dataloader,
    model,
    train=False,
    optimizer=None,
    loss_func=torch.nn.MSELoss(),
    transform=None,
    amp=True,
    device=None,
    steps=None
):
    """
    Run one pass over ``dataloader`` through ``model``, optionally training it.

    NOTE: this definition shadows the ``process_epoch`` imported from
    ``nvtabular.framework_utils.torch.utils`` earlier in this cell.

    Parameters
    -----------
    dataloader : iterable
        Yields batches. Without ``transform`` each batch is unpacked as
        ``(x_cat, x_cont, y)``. The sample count is taken from
        ``batch[0].shape[0]``.
    model : torch.nn.Module
        Model called as ``model(x_cat, x_cont)``.
    train : bool
        If True, the model is put in training mode, gradients are enabled and
        an optimizer step is taken after every batch.
    optimizer : object
        Optimizer used when ``train`` is True (required in that case).
    loss_func : callable
        Loss called as ``loss_func(y_pred, y)``, default is MSELoss.
    transform : callable, optional
        If given, applied to each batch; must return ``(x_cat, x_cont, y)``.
    amp : bool
        If True, the forward pass and the loss are computed inside
        ``torch.cuda.amp.autocast()``.
    device : optional
        If given, the three batch tensors are moved with ``.to(device)``.
    steps : int, optional
        If given, stop after this many batches.

    Returns
    -------
    tuple
        ``(epoch_loss, 0, 0, n)`` where ``epoch_loss`` is the loss of the last
        processed batch (``nan`` if the dataloader yielded no batch) and ``n``
        is the number of processed samples. The two zeros are placeholders.

    Raises
    ------
    ValueError
        If ``train`` is True and no optimizer is given.
    """
    if train and optimizer is None:
        raise ValueError("optimizer must be provided when train=True")

    n = 0
    loss = None  # stays None when the dataloader yields no batch
    model.train(mode=train)
    with torch.set_grad_enabled(train):
        for idx, batch in enumerate(dataloader):
            n += batch[0].shape[0]
            if transform:
                x_cat, x_cont, y = transform(batch)
            else:
                x_cat, x_cont, y = batch
            if device:
                x_cat = x_cat.to(device)
                x_cont = x_cont.to(device)
                y = y.to(device)
            y = torch.squeeze(y).float()
            if amp:
                # NOTE(review): no gradient scaler is used together with
                # autocast -- confirm that this is intended.
                with torch.cuda.amp.autocast():
                    y_pred = model(x_cat, x_cont)
                    loss = loss_func(y_pred, y)
            else:
                y_pred = model(x_cat, x_cont)
                loss = loss_func(y_pred, y)
            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            if steps and (idx + 1) >= steps:
                break
    # Report the loss already computed for the last batch instead of
    # evaluating the loss function a second time outside the autocast context.
    epoch_loss = loss.item() if loss is not None else float("nan")
    return epoch_loss, 0, 0, n

We define multiple helper functions.<br><br>
*get_dataloader* returns the NVTabular data loader, PyTorch or FastAI data loader, depending on dl_type<br>
*log_textfile* stores output to a textfile

In [5]:
### Helper Function
def get_dataloader(dl_type='NVTabular'):
    """Build the (train, valid) data loaders for the requested loader type.

    Reads the module-level settings ``train_files``, ``valid_files``,
    ``BATCH_SIZE``, ``CATEGORICAL_COLUMNS``, ``CONTINUOUS_COLUMNS``,
    ``LABEL_COLUMNS`` and, for FastAI, ``todevice``.

    Parameters
    ----------
    dl_type : str
        One of ``'NVTabular'``, ``'PyTorch'`` or ``'FastAI'``.

    Returns
    -------
    tuple
        ``(train_loader, valid_loader)``. For ``'PyTorch'`` no validation
        loader is built and ``valid_loader`` is ``None``. For ``'FastAI'``
        only the first validation file is used.

    Raises
    ------
    ValueError
        If ``dl_type`` is not one of the supported values.
    """
    if dl_type == 'NVTabular':
        # NOTE(review): the categorical columns are passed unsorted here while
        # the other branches sort them -- presumably the NVTabular loader
        # orders the columns itself; confirm that this matches the sorted
        # order the model is built with.
        def _nvt_loader(files):
            # Batching is done by TorchAsyncItr, hence batch_size=None below.
            dataset = TorchAsyncItr(nvt.Dataset(files),
                                    batch_size=BATCH_SIZE,
                                    cats=CATEGORICAL_COLUMNS,
                                    conts=CONTINUOUS_COLUMNS,
                                    labels=LABEL_COLUMNS)
            return DLDataLoader(dataset,
                                batch_size=None,
                                pin_memory=False,
                                num_workers=0)

        train_loader = _nvt_loader(train_files)
        valid_loader = _nvt_loader(valid_files)
    elif dl_type == 'PyTorch':
        # All training files are loaded into a single pandas DataFrame.
        pd_train = pd.concat([pd.read_parquet(x) for x in train_files])
        train_dataset = CustomDataset(pd_train,
                                      cat_names=sorted(CATEGORICAL_COLUMNS),
                                      cont_names=CONTINUOUS_COLUMNS,
                                      label_name=LABEL_COLUMNS)
        train_loader = DataLoader(train_dataset,
                                  batch_size=BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=16)
        valid_loader = None
    elif dl_type == 'FastAI':
        df_valid = pd.concat([pd.read_parquet(x) for x in valid_files[0:1]])
        df_train = pd.concat([pd.read_parquet(x) for x in train_files])
        # Training rows come first; everything from idx_train on is validation.
        idx_train = df_train.shape[0]
        df = pd.concat([df_train, df_valid])
        del df_train, df_valid
        gc.collect()
        dl = TabularDataLoaders.from_df(df,
                                        cat_names=sorted(CATEGORICAL_COLUMNS),
                                        cont_names=CONTINUOUS_COLUMNS,
                                        y_names=LABEL_COLUMNS[0],
                                        valid_idx=list(range(idx_train, df.shape[0])),
                                        bs=BATCH_SIZE,
                                        val_bs=BATCH_SIZE,
                                        device=todevice)
        train_loader, valid_loader = dl.loaders
    else:
        raise ValueError(
            "{!r} is not supported. Choose from 'NVTabular', 'PyTorch' or 'FastAI'".format(dl_type)
        )
    return train_loader, valid_loader

def log_textfile(filename, text, mode):
    """Print ``text`` and write it, followed by a newline, to ``filename``.

    ``mode`` is passed to ``open`` (e.g. ``'w'`` to overwrite, ``'a'`` to
    append). The file is closed even if writing fails.
    """
    print(text)
    with open(filename, mode) as f:
        f.write(str(text) + '\n')

In addition, we define functions to measure the performance.<br><br>
*time_only_dl* measures the time for just iterating through the dataset for 1 epoch WITHOUT training a model<br>
*time_training* measures the time for training a model for 1 epoch

In [6]:
def time_only_dl(dl):
    """Iterate once over ``dl`` without running a model.

    Returns ``(elapsed_seconds, number_of_batches, number_of_samples)``; the
    sample count sums ``batch[0].shape[0]`` over all batches.
    """
    t0 = time()
    batches_seen = 0
    samples_seen = 0
    for batch in dl:
        batches_seen += 1
        samples_seen += batch[0].shape[0]
    elapsed = time() - t0
    return elapsed, batches_seen, samples_seen

def time_training(model, train_dataset, optimizer, amp=True, device=None):
    """Time one training pass of ``model`` over ``train_dataset``.

    The pass is run by ``process_epoch`` with ``BCEWithLogitsLoss``.
    Returns ``(elapsed_seconds, number_of_samples)``.
    """
    t0 = time()
    # Only the sample count of the epoch result is needed for the benchmark.
    _, _, _, n_samples = process_epoch(
        train_dataset,
        model,
        train=True,
        optimizer=optimizer,
        loss_func=torch.nn.BCEWithLogitsLoss(),
        amp=amp,
        device=device,
    )
    elapsed = time() - t0
    return elapsed, n_samples

We define which benchmark we want to run.

In [7]:
# Supported values for the two selectors below.
DL_TYPES = ['NVTabular', 'PyTorch', 'FastAI']
BENCHMARK_TYPES = ['time_only_dl', 'time_training', 'convergence_training_loss', 'convergence_val_loss']

# Benchmark switches.
AMP = False
CPU = False
DL_TYPE = 'NVTabular'
BENCHMARK_TYPE = 'time_training'

# Fail early on a selector that is not in the supported lists.
if DL_TYPE not in DL_TYPES:
    raise ValueError(DL_TYPE + ' is not supported.  Choose from ' + str(DL_TYPES))
if BENCHMARK_TYPE not in BENCHMARK_TYPES:
    raise ValueError(BENCHMARK_TYPE + ' is not supported. Choose from ' + str(BENCHMARK_TYPES))

# Device the model (and, where needed, the batches) is moved to.
todevice = "cpu" if CPU else "cuda:0"

We define the input directories for the parquet files.

In [8]:
# define some information about where to get our data
# Location of the preprocessed data; the data directory can be overridden
# with the OUTPUT_DATA_DIR environment variable.
OUTPUT_DIR = '/raid/data/criteo/output/'
OUTPUT_DATA_DIR = os.environ.get('OUTPUT_DATA_DIR', OUTPUT_DIR + 'output')

# Training and validation parquet files live in separate sub-directories.
output_train_dir = os.path.join(OUTPUT_DATA_DIR, 'train/')
output_valid_dir = os.path.join(OUTPUT_DATA_DIR, 'valid/')
train_files = glob.glob(output_train_dir + '*.parquet')
valid_files = glob.glob(output_valid_dir + '*.parquet')

train_files, valid_files

(['/raid/data/criteo/output/output/train/12.353c8050ed1f4702bde63bd697c9cc36.parquet',
  '/raid/data/criteo/output/output/train/18.6e5ac8d8f49f46089cecceb6d044adda.parquet',
  '/raid/data/criteo/output/output/train/14.98377a29380145999e7e1e0d4c75eb81.parquet',
  '/raid/data/criteo/output/output/train/4.1907f580308044b6a32da25e9d3f1540.parquet',
  '/raid/data/criteo/output/output/train/8.68a9f5f1feb64fe59fe160211b54afd4.parquet',
  '/raid/data/criteo/output/output/train/1.8772a8717fae46df8b521cdd891df07a.parquet',
  '/raid/data/criteo/output/output/train/16.7a1f69dd7d834844bbfb68e659bb362c.parquet',
  '/raid/data/criteo/output/output/train/6.46cd6180dc42484199d895a2de02a1a6.parquet',
  '/raid/data/criteo/output/output/train/2.1063c71d33e646fe8e6218bc2457c3cb.parquet',
  '/raid/data/criteo/output/output/train/7.1a249ca9dbf44b8e9ca6b0ce740e3a88.parquet',
  '/raid/data/criteo/output/output/train/0.cb895dbeefa14e65aa2e715b295ac9e7.parquet',
  '/raid/data/criteo/output/output/train/5.c311b59

We define some hyperparameters and network architecture.

In [9]:
# Batch size for training the deep learning model
# Batch size for training the deep learning model (overridable via BATCH_SIZE)
BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 32 * 1024))
# Number of units in hidden layer - length is number of hidden layers
HIDDEN_DIMS = [1024, 1024, 1024, 1024]
# Number of epochs (only for convergence_val_loss)
EPOCHS = 1
# Number of steps in training to collect train_loss (only for convergence_training_loss)
TRAIN_STEPS = 20
# Max. number of steps per epoch (tf.records allows only full batches)
STEPS = int(150_000_000 / BATCH_SIZE)
# Number of parts used in shuffling of NVTabular data loader (overridable via PARTS_PER_CHUNK)
PARTS_PER_CHUNK = int(os.environ.get('PARTS_PER_CHUNK', 1))

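We define the data schema and the embedding table shapes (cardinality, embedding dimension) for each categorical column; the values are hard-coded here rather than loaded from the saved NVTabular workflow.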

In [10]:
# Per categorical column: (number of embeddings, embedding dimension).
EMBEDDING_TABLE_SHAPES = {
    'C1': (7599500, 16),
    'C10': (5345303, 16),
    'C11': (561810, 16),
    'C12': (242827, 16),
    'C13': (11, 6),
    'C14': (2209, 16),
    'C15': (10616, 16),
    'C16': (100, 16),
    'C17': (4, 3),
    'C18': (968, 16),
    'C19': (15, 7),
    'C2': (33521, 16),
    'C20': (7838519, 16),
    'C21': (2580502, 16),
    'C22': (6878028, 16),
    'C23': (298771, 16),
    'C24': (11951, 16),
    'C25': (97, 16),
    'C26': (35, 12),
    'C3': (17022, 16),
    'C4': (7339, 16),
    'C5': (20046, 16),
    'C6': (4, 3),
    'C7': (7068, 16),
    'C8': (1377, 16),
    'C9': (63, 16),
}

# Column names: I1..I13 are continuous, C1..C26 are categorical.
CONTINUOUS_COLUMNS = [f'I{idx}' for idx in range(1, 14)]
CATEGORICAL_COLUMNS = [f'C{idx}' for idx in range(1, 27)]
LABEL_COLUMNS = ['label']

We initialize our model.

In [11]:
# Build the network; the categorical names are passed in sorted order, which
# fixes the order of the embedding tables.
model = SimpleNN(
    cont_names=CONTINUOUS_COLUMNS,
    cat_names=sorted(CATEGORICAL_COLUMNS),
    cat_stats=EMBEDDING_TABLE_SHAPES,
    hidden_layers=HIDDEN_DIMS,
)
# Move the parameters to the configured device.
model = model.to(todevice)

In [12]:
# Display the module tree of the model.
model

SimpleNN(
  (embblock): EmbeddingBlock(
    (emb): ModuleList(
      (0): Embedding(7599500, 16)
      (1): Embedding(5345303, 16)
      (2): Embedding(561810, 16)
      (3): Embedding(242827, 16)
      (4): Embedding(11, 6)
      (5): Embedding(2209, 16)
      (6): Embedding(10616, 16)
      (7): Embedding(100, 16)
      (8): Embedding(4, 3)
      (9): Embedding(968, 16)
      (10): Embedding(15, 7)
      (11): Embedding(33521, 16)
      (12): Embedding(7838519, 16)
      (13): Embedding(2580502, 16)
      (14): Embedding(6878028, 16)
      (15): Embedding(298771, 16)
      (16): Embedding(11951, 16)
      (17): Embedding(97, 16)
      (18): Embedding(35, 12)
      (19): Embedding(17022, 16)
      (20): Embedding(7339, 16)
      (21): Embedding(20046, 16)
      (22): Embedding(4, 3)
      (23): Embedding(7068, 16)
      (24): Embedding(1377, 16)
      (25): Embedding(63, 16)
    )
  )
  (topmlp): MLPTower(
    (hidden): Sequential(
      (0): Linear(in_features=380, out_features=1024,

We initialize the optimizer.

In [13]:
# SGD with momentum over all model parameters.
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.1,
    momentum=0.9,
)

We get our dataloader.

In [14]:
# NOTE: despite their names, these hold the (train, valid) data loaders
# returned by get_dataloader.
train_dataset, valid_dataset = get_dataloader(DL_TYPE)

We derive a postfix that records whether AMP is enabled.

In [15]:
# Postfix that records whether mixed precision (AMP) is enabled.
amp_postfix = 'amp' if AMP else 'noamp'

For certain combinations of data loader and device, the data is already placed on the right device by default, so we do not move it explicitly and set `todevice` to `None`.

In [16]:
# (data loader, device) combinations for which the batches are not moved
# explicitly; for these, todevice is cleared.
if (DL_TYPE, todevice) in {
    ('NVTabular', 'cuda:0'),
    ('FastAI', 'cuda:0'),
    ('FastAI', 'cpu'),
    ('PyTorch', 'cpu'),
}:
    todevice = None

We run the benchmark.

In [17]:
logfilename = 'testpy.log'

# Time one training epoch, then write the results to the log file
# (the first call overwrites the file, the others append).
run_time, n_samples = time_training(model, train_dataset, optimizer, amp=AMP, device=todevice)
log_textfile(logfilename, 'Training', 'w')
log_textfile(logfilename, f'Len: {len(train_dataset)}', 'a')
log_textfile(logfilename, f'Samples: {n_samples}', 'a')
log_textfile(logfilename, f'Time: {run_time}', 'a')
log_textfile(logfilename, f'Throughput: {n_samples / run_time}', 'a')

Training
Len: 5977
Samples: 195841983
Time: 573.9222145080566
Throughput: 341234.36599830515
