In [1]:
import os, time
# Expose only GPU 0 to CUDA. This is set here, before the GPU libraries
# (torch, rmm, nvtabular, tensorflow) are imported in the following cells.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
import os
from time import time
import re
import glob
import warnings

# tools for data preproc/loading
import torch
import rmm
import nvtabular as nvt
from nvtabular.ops import Normalize,  Categorify,  LogOp, FillMissing, Clip, get_embedding_sizes
from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader
from nvtabular.utils import device_mem_size

# tools for training
from fastai.basics import Learner
from fastai.tabular.model import TabularModel
from fastai.tabular.data import TabularDataLoaders
from fastai.metrics import accuracy
from fastai.callback.progress import ProgressCallback

import multiprocessing as mp
from itertools import repeat
from tqdm.notebook import trange

In [3]:
### Helper Function

def preproces_criteo():
    """Preprocess the raw Criteo parquet files with an NVTabular workflow.

    Reads ``day_0.parquet`` as the training input and ``day_2.parquet`` as the
    validation input from ``INPUT_DATA_DIR``, then:

    * continuous columns: ``FillMissing`` -> ``Clip(min_value=0)`` -> ``LogOp``,
      followed by ``Normalize``;
    * categorical columns: ``Categorify`` with ``freq_threshold=15``.

    ``OUTPUT_DATA_DIR`` is deleted and recreated, the processed splits are
    written to ``output_train_dir`` / ``output_valid_dir``, and the workflow
    statistics are saved to ``BASE_DIR + 'stats_wnd_workflow'``.

    Relies on notebook-level globals (``INPUT_DATA_DIR``, ``OUTPUT_DATA_DIR``,
    ``BASE_DIR``, the column lists and the output directories). Returns None.
    """
    # Local import so the function does not depend on the notebook's import cell.
    import shutil

    fname = 'day_{}.parquet'
    # NOTE(review): the day selection is hard-coded (day 0 for training, day 2 for
    # validation); the NUM_TRAIN_DAYS config constant is not consulted here.
    train_paths = [os.path.join(INPUT_DATA_DIR, fname.format(day)) for day in range(1)]
    valid_paths = [os.path.join(INPUT_DATA_DIR, fname.format(day)) for day in [2]]

    proc = nvt.Workflow(
        cat_names=CATEGORICAL_COLUMNS,
        cont_names=CONTINUOUS_COLUMNS,
        label_name=LABEL_COLUMNS
    )

    proc.add_cont_feature([FillMissing(), Clip(min_value=0), LogOp()])
    proc.add_cont_preprocess(Normalize())
    proc.add_cat_preprocess(Categorify(freq_threshold=15, out_path=OUTPUT_DATA_DIR))

    train_dataset = nvt.Dataset(train_paths, engine='parquet', part_mem_fraction=0.15)
    valid_dataset = nvt.Dataset(valid_paths, engine='parquet', part_mem_fraction=0.15)

    # Start from a clean output tree. Using shutil/os instead of shell strings
    # keeps this working for paths containing spaces or shell metacharacters.
    # A missing directory is ignored, matching the previous best-effort `rm -r`.
    shutil.rmtree(OUTPUT_DATA_DIR, ignore_errors=True)
    os.makedirs(output_train_dir, exist_ok=True)
    os.makedirs(output_valid_dir, exist_ok=True)

    # Training pass (default statistics recording).
    proc.apply(train_dataset,
               shuffle=nvt.io.Shuffle.PER_PARTITION,
               output_path=output_train_dir,
               out_files_per_proc=20
              )

    # Validation pass is applied with record_stats=False.
    proc.apply(valid_dataset,
               record_stats=False,
               shuffle=nvt.io.Shuffle.PER_PARTITION,
               output_path=output_valid_dir,
               out_files_per_proc=20
              )

    # Persist the statistics so a later cell can reload them via load_stats().
    proc.save_stats(BASE_DIR + 'stats_wnd_workflow')

def get_model(hidden_dims, inputs, features):
    """Build and compile a feed-forward binary classifier over feature columns.

    Args:
        hidden_dims: iterable of layer widths; one ReLU ``Dense`` layer is
            added per entry, in order.
        inputs: Keras inputs passed to the ``DenseFeatures`` layer and used as
            the model inputs.
        features: feature columns handed to ``layers.DenseFeatures``.

    Returns:
        A ``tf.keras.Model`` with a single sigmoid output unit, compiled with
        the 'sgd' optimizer, 'binary_crossentropy' loss and an AUC (ROC)
        metric named "auroc".
    """
    net = layers.DenseFeatures(features)(inputs)

    # Hidden stack: one ReLU layer per requested width.
    for width in hidden_dims:
        net = tf.keras.layers.Dense(width, activation='relu')(net)

    prediction = tf.keras.layers.Dense(1, activation='sigmoid')(net)
    network = tf.keras.Model(inputs=inputs, outputs=prediction)
    network.compile(
        'sgd',
        'binary_crossentropy',
        metrics=[tf.keras.metrics.AUC(curve="ROC", name="auroc")],
    )
    return network

def time_only_dl(dl, num_steps):
    """Time how long it takes to pull ``num_steps`` batches from a data loader.

    The loader is re-iterated from the start as many times as needed until
    ``num_steps`` batches have been consumed.

    Args:
        dl: re-iterable data loader; each full or partial iteration counts as
            one pass.
        num_steps: number of batches to consume in total.

    Returns:
        Tuple ``(elapsed_seconds, batches_consumed, passes)``. ``passes``
        counts every pass that yielded at least one batch, including the final
        partial one. If ``dl`` yields nothing, the function returns
        immediately with ``(elapsed, 0, 0)`` instead of looping forever.
    """
    # Local import: the notebook binds the name `time` both to the module and
    # (via `from time import time`) to the function, depending on cell order.
    import time

    start = time.time()
    i = 0  # batches consumed so far
    j = 0  # passes over the loader
    while i < num_steps:
        consumed_before = i
        for _ in dl:
            if i == num_steps:
                break
            i += 1
        if i == consumed_before:
            # The loader produced no batches; another pass cannot make progress.
            break
        j += 1
    end = time.time()
    return (end - start, i, j)

def time_training(model, train_dataset_tf, steps):
    """Time a single training epoch of ``steps`` steps.

    Args:
        model: object exposing a Keras-style ``fit`` method.
        train_dataset_tf: training data passed straight to ``model.fit``.
        steps: value forwarded as ``steps_per_epoch``.

    Returns:
        Tuple ``(elapsed_seconds, steps, 1)``; the trailing ``1`` is the number
        of epochs run.
    """
    # Local import: the notebook binds the name `time` both to the module and
    # (via `from time import time`) to the function, depending on cell order.
    import time

    start = time.time()
    model.fit(train_dataset_tf, epochs=1, steps_per_epoch=steps)
    end = time.time()
    return (end - start, steps, 1)

In [4]:
# Which data loader to use
DL_TYPE = 'NVTabular'               # 'NVTabular' or 'TensorFlow'
BENCHMARK_TYPE = 'convergence_val_loss'

# define some information about where to get our data
# (each directory can be overridden through the environment variable of the same name)
BASE_DIR = '/raid/data/criteo/'
INPUT_DATA_DIR = os.environ.get('INPUT_DATA_DIR', BASE_DIR + 'input/')
OUTPUT_DATA_DIR = os.environ.get('OUTPUT_DATA_DIR', BASE_DIR + 'output') # where we'll save our processed data to
TFRECORD_DIR = os.environ.get("TFRECORD_DIR", BASE_DIR + 'tfrecords')
TFRECORDS_TRAIN = os.path.join(TFRECORD_DIR, 'train', '*.tfrecords')
# NOTE(review): this previously pointed at 'train' (copy-paste of the line above).
# Assumes validation TFRecords live under <TFRECORD_DIR>/valid -- confirm the layout.
TFRECORDS_VALID = os.path.join(TFRECORD_DIR, 'valid', '*.tfrecords')

BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 1024*64))
EPOCHS = 2
TRAIN_STEPS = 20
_EXAMPLES_PER_RECORD = 20000000
# Number of batches of BATCH_SIZE that fit into 150,000,000 rows (rounded down).
STEPS = int(150000000*1/BATCH_SIZE)

HIDDEN_DIMS = [128, 128, 128, 128]
PREPROCESS = False
PREPROCESS_TF = False


PARTS_PER_CHUNK = int(os.environ.get('PARTS_PER_CHUNK', 2))
NUM_TRAIN_DAYS = 23 # number of days worth of data to use for training, the rest will be used for validation

# define our dataset schema: I1..I13 continuous, C1..C26 categorical, one label column
CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1,14)]
CATEGORICAL_COLUMNS =  ['C' + str(x) for x in range(1,27)]
LABEL_COLUMNS = ['label']
COLUMNS = CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS + LABEL_COLUMNS
STEPS

2288

In [5]:
# Directories that hold the preprocessed train/valid splits, plus glob
# patterns matching the parquet files inside each of them.
output_train_dir, output_valid_dir = (
    os.path.join(OUTPUT_DATA_DIR, split) for split in ('train/', 'valid/')
)
tf_input_train_dir = f'{output_train_dir}*.parquet'
tf_input_valid_dir = f'{output_valid_dir}*.parquet'

In [6]:
# Re-run preprocessing only when PREPROCESS is enabled in the config cell;
# preproces_criteo() deletes and rebuilds OUTPUT_DATA_DIR.
if PREPROCESS:
    preproces_criteo()

In [7]:
# Rebuild a workflow with the same column schema and load the statistics file
# written by preproces_criteo(), so the embedding table sizes can be derived
# without re-running preprocessing.
stats_path = BASE_DIR + 'stats_wnd_workflow'
proc = nvt.Workflow(
    label_name=LABEL_COLUMNS,
    cont_names=CONTINUOUS_COLUMNS,
    cat_names=CATEGORICAL_COLUMNS,
)
proc.load_stats(stats_path)
EMBEDDING_TABLE_SHAPES = get_embedding_sizes(proc)

In [8]:
import time

import tensorflow as tf

from tensorflow.python.feature_column import feature_column_v2 as fc

# we can control how much memory to give tensorflow with this environment variable
# IMPORTANT: make sure you do this before you initialize TF's runtime, otherwise
# TF will have claimed all free GPU memory
os.environ['TF_MEMORY_ALLOCATION'] = "0.5" # fraction of free memory
from nvtabular.loader.tensorflow import KerasSequenceLoader, KerasSequenceValidater
from nvtabular.framework_utils.tensorflow import layers
from tensorflow.python.feature_column import feature_column_v2 as fc

In [9]:
# Keras inputs keyed by column name, and the matching feature columns that
# get_model() feeds to its DenseFeatures layer.
inputs = {}
features = []

# Categorical columns: int32 input -> identity categorical column -> embedding.
for col in CATEGORICAL_COLUMNS:
    inputs[col] = tf.keras.Input(shape=(1,), dtype=tf.int32, name=col)
    table_shape = EMBEDDING_TABLE_SHAPES[col]
    id_column = tf.feature_column.categorical_column_with_identity(
        col,
        table_shape[0],  # input dimension (vocab size)
    )
    features.append(
        tf.feature_column.embedding_column(
            id_column,
            table_shape[1],  # embedding output dimension
        )
    )

# Continuous columns: float32 input -> numeric column of shape (1,).
for col in CONTINUOUS_COLUMNS:
    inputs[col] = tf.keras.Input(shape=(1,), dtype=tf.float32, name=col)
    features.append(tf.feature_column.numeric_column(col, (1,)))

In [10]:
# Settings shared by the training and validation loaders.
# NOTE(review): buffer_size=0.06 is below 1, so it is presumably interpreted as
# a fraction of GPU memory rather than a batch count -- confirm against the
# KerasSequenceLoader documentation.
loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    label_names=LABEL_COLUMNS,
    cat_names=CATEGORICAL_COLUMNS,
    cont_names=CONTINUOUS_COLUMNS,
    engine='parquet',
    buffer_size=0.06,
    parts_per_chunk=1,
)

# Both loaders read every parquet file in their split directory (glob pattern);
# only the training loader shuffles.
train_dataset_tf = KerasSequenceLoader(
    output_train_dir + '*.parquet', shuffle=True, **loader_kwargs
)
valid_dataset_tf = KerasSequenceLoader(
    output_valid_dir + '*.parquet', shuffle=False, **loader_kwargs
)

In [11]:
# Display the data loader type selected in the config cell.
DL_TYPE

'NVTabular'

In [12]:
class KerasSequenceValidater2_ModelPrediction(tf.keras.callbacks.Callback):
    """Benchmark callback that runs model inference over a dataloader at epoch end.

    At the end of every epoch the callback iterates ``dataloader``, calls the
    attached model on each batch, and prints the wall time of every group of
    10 batches followed by the batch count. Iteration stops after 101 batches.

    No metrics are updated or written into ``logs``; the metric code is kept
    commented out below.
    """
    # NOTE(review): Keras-internal flag, presumably signalling that this callback
    # accepts tensor-valued logs -- confirm against the installed TF version.
    _supports_tf_logs = True

    def __init__(self, dataloader):
        """Store the dataloader to iterate at the end of each epoch."""
        super().__init__()
        # The base initializer may reset this flag on the instance, so restore the
        # class-level setting explicitly after calling it.
        self._supports_tf_logs = True
        self.dataloader = dataloader

    def on_epoch_end(self, epoch, logs=None):
        """Iterate the dataloader with model inference and print timings.

        Args:
            epoch: epoch index supplied by Keras (unused).
            logs: optional logs dict; a fresh dict is created when omitted so
                no state is shared between calls.

        Returns:
            The ``logs`` dict, unmodified.
        """
        if logs is None:
            logs = {}

        i = 0
        start = time.time()
        for X, y_true in self.dataloader:
            i += 1
            y_pred = self.model(X)

            # TODO: how do we want to handle the multi-output case?
            #for metric in self.model.metrics:
            #    metric.update_state(y_true, y_pred)
            if (i % 10)==0:
                end = time.time()
                print('Time 10 batches: ' + str(end-start))
                start = time.time()
                print(i)
            if (i>100):
                break

        #for metric in self.model.metrics:
        #    logs["val_" + metric.name] = metric.result().numpy()
        return logs

In [13]:
# Build and compile the network from the Keras inputs and feature columns defined above.
model = get_model(HIDDEN_DIMS, inputs, features)

In [14]:
%%time

# Baseline: iterate the training loader and call the model directly on each
# batch, outside of any Keras callback, timing every group of 10 batches.
i = 0
start = time.time()
for i, (X, y_true) in enumerate(train_dataset_tf, start=1):
    y_pred = model(X)
    if i % 10 == 0:
        end = time.time()
        print('Time 10 batches: ' + str(end-start))
        start = time.time()
        print(i)
    if i > 100:
        break

Time 10 batches: 5.90089750289917
10
Time 10 batches: 0.7925128936767578
20
Time 10 batches: 1.0462443828582764
30
Time 10 batches: 0.9086611270904541
40
Time 10 batches: 1.1971657276153564
50
Time 10 batches: 0.9518229961395264
60
Time 10 batches: 0.799907922744751
70
Time 10 batches: 0.45679450035095215
80
Time 10 batches: 0.9126536846160889
90
Time 10 batches: 0.9586730003356934
100
CPU times: user 17.1 s, sys: 6.55 s, total: 23.6 s
Wall time: 14 s


In [15]:
%%time

# Same measurement over the validation loader: call the model directly on each
# batch, outside of any Keras callback, timing every group of 10 batches.
i = 0
start = time.time()
for i, (X, y_true) in enumerate(valid_dataset_tf, start=1):
    y_pred = model(X)
    if i % 10 == 0:
        end = time.time()
        print('Time 10 batches: ' + str(end-start))
        start = time.time()
        print(i)
    if i > 100:
        break

Time 10 batches: 4.833057165145874
10
Time 10 batches: 3.9489147663116455
20
Time 10 batches: 6.718460559844971
30
Time 10 batches: 7.785595417022705
40
Time 10 batches: 7.827616453170776
50
Time 10 batches: 4.010151147842407
60
Time 10 batches: 6.016873121261597
70
Time 10 batches: 6.478650093078613
80
Time 10 batches: 5.842178106307983
90
Time 10 batches: 5.280597925186157
100
CPU times: user 1min 7s, sys: 6.29 s, total: 1min 14s
Wall time: 59.6 s


In [16]:
%%time

# Validation loader only: the model call is intentionally disabled here, so
# this times batch iteration alone, in groups of 10 batches.
i = 0
start = time.time()
for i, (X, y_true) in enumerate(valid_dataset_tf, start=1):
    #y_pred = model(X)
    if i % 10 == 0:
        end = time.time()
        print('Time 10 batches: ' + str(end-start))
        start = time.time()
        print(i)
    if i > 100:
        break

Time 10 batches: 0.019385337829589844
10
Time 10 batches: 9.036064147949219e-05
20
Time 10 batches: 0.00010704994201660156
30
Time 10 batches: 8.511543273925781e-05
40
Time 10 batches: 0.0628824234008789
50
Time 10 batches: 0.015411138534545898
60
Time 10 batches: 0.0001266002655029297
70
Time 10 batches: 3.266334533691406e-05
80
Time 10 batches: 3.1948089599609375e-05
90
Time 10 batches: 3.1948089599609375e-05
100
CPU times: user 100 ms, sys: 48.6 ms, total: 149 ms
Wall time: 119 ms


In [17]:
# Train for 20 steps of a single epoch; at epoch end the callback iterates the
# validation loader and runs model inference on each batch.
validation_callback = KerasSequenceValidater2_ModelPrediction(dataloader=valid_dataset_tf)
history = model.fit(
    train_dataset_tf,
    steps_per_epoch=20,
    epochs=1,
    callbacks=[validation_callback],
)

10
Time 10 batches: 7.0142738819122314
20
Time 10 batches: 5.858387470245361
30
Time 10 batches: 4.22637677192688
40
Time 10 batches: 7.8126726150512695
50
Time 10 batches: 6.899638652801514
60
Time 10 batches: 7.092344522476196
70
Time 10 batches: 7.813392400741577
80
Time 10 batches: 6.825751304626465
90
Time 10 batches: 8.5209801197052
100


In [18]:
class KerasSequenceValidater2_noModelPrediction(tf.keras.callbacks.Callback):
    """Benchmark callback that only iterates a dataloader at epoch end.

    At the end of every epoch the callback iterates ``dataloader`` without
    calling the model (the inference line is kept commented out), printing the
    wall time of every group of 10 batches followed by the batch count.
    Iteration stops after 101 batches.

    No metrics are updated or written into ``logs``.
    """
    # NOTE(review): Keras-internal flag, presumably signalling that this callback
    # accepts tensor-valued logs -- confirm against the installed TF version.
    _supports_tf_logs = True

    def __init__(self, dataloader):
        """Store the dataloader to iterate at the end of each epoch."""
        super().__init__()
        # The base initializer may reset this flag on the instance, so restore the
        # class-level setting explicitly after calling it.
        self._supports_tf_logs = True
        self.dataloader = dataloader

    def on_epoch_end(self, epoch, logs=None):
        """Iterate the dataloader (no inference) and print timings.

        Args:
            epoch: epoch index supplied by Keras (unused).
            logs: optional logs dict; a fresh dict is created when omitted so
                no state is shared between calls.

        Returns:
            The ``logs`` dict, unmodified.
        """
        if logs is None:
            logs = {}

        i = 0
        start = time.time()
        for X, y_true in self.dataloader:
            i += 1
            #y_pred = self.model(X)

            # TODO: how do we want to handle the multi-output case?
            #for metric in self.model.metrics:
            #    metric.update_state(y_true, y_pred)
            if (i % 10)==0:
                end = time.time()
                print('Time 10 batches: ' + str(end-start))
                start = time.time()
                print(i)
            if (i>100):
                break

        #for metric in self.model.metrics:
        #    logs["val_" + metric.name] = metric.result().numpy()
        return logs

In [19]:
# Train for 20 steps of a single epoch; at epoch end the callback only iterates
# the validation loader, without running the model.
validation_callback = KerasSequenceValidater2_noModelPrediction(dataloader=valid_dataset_tf)
history = model.fit(
    train_dataset_tf,
    steps_per_epoch=20,
    epochs=1,
    callbacks=[validation_callback],
)

10
Time 10 batches: 8.535385131835938e-05
20
Time 10 batches: 8.034706115722656e-05
30
Time 10 batches: 8.249282836914062e-05
40
Time 10 batches: 8.130073547363281e-05
50
Time 10 batches: 8.106231689453125e-05
60
Time 10 batches: 8.153915405273438e-05
70
Time 10 batches: 0.030339717864990234
80
Time 10 batches: 0.010444879531860352
90
Time 10 batches: 0.010492086410522461
100
