In [1]:
# Copyright 2020 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Preprocessing the Criteo Dataset
We want to benchmark the NVTabular data loader and compare its performance with the PyTorch "native" data loader and FastAI. To do so, we first need to preprocess the dataset with NVTabular: the continuous features are normalized and the categorical features are categorified.<br><br>
The input for this notebook is based on [optimize_criteo.ipynb](https://github.com/NVIDIA/NVTabular/blob/main/examples/optimize_criteo.ipynb).

In [2]:
import os
import time

# Restrict the CUDA devices visible to this process to device 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
import glob
import os
import re
import shutil
import warnings
from time import time

# tools for data preproc/loading
import torch
import rmm
import nvtabular as nvt
from nvtabular.ops import Normalize,  Categorify,  LogOp, FillMissing, Clip, get_embedding_sizes
from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader
from nvtabular.utils import device_mem_size

We define a helper function.<br><br>
*preproces_criteo* defines an NVTabular workflow to preprocess the data. For the continuous features, it fills missing values, clips them, and applies the logarithm function. Finally, the continuous features are normalized and the categorical features are categorified. For more details, take a look at the [Criteo example](https://github.com/NVIDIA/NVTabular/blob/main/examples/criteo-example.ipynb).

In [7]:
### Helper Function

def preproces_criteo(train_days=(0,), valid_days=(2,)):
    """Preprocess the Criteo parquet files with an NVTabular workflow.

    The workflow is built over ``CATEGORICAL_COLUMNS``, ``CONTINUOUS_COLUMNS``
    and ``LABEL_COLUMNS``:

    * continuous columns go through ``FillMissing``, ``Clip(min_value=0)`` and
      ``LogOp`` and are then passed to ``Normalize``;
    * categorical columns are passed to ``Categorify`` (``freq_threshold=15``).

    The workflow is applied to the training files first and then to the
    validation files with ``record_stats=False``. Both outputs are shuffled
    per partition and written as 20 files per process. Finally the workflow
    statistics are saved to ``OUTPUT_DATA_DIR + '/stats_and_workflow'``.

    This function relies on globals defined in other cells of the notebook:
    ``INPUT_DATA_DIR``, ``OUTPUT_DATA_DIR``, ``output_train_dir``,
    ``output_valid_dir`` and the three column lists.

    WARNING: anything already present at ``OUTPUT_DATA_DIR`` is deleted
    before the new output is written.

    Parameters
    ----------
    train_days : iterable of int, optional
        Day indices ``d`` of the ``day_<d>.parquet`` files used for training.
        Defaults to ``(0,)``, the original hard-coded selection.
    valid_days : iterable of int, optional
        Day indices of the files used for validation. Defaults to ``(2,)``.

    Raises
    ------
    OSError
        If the existing output directory cannot be removed or the new output
        directories cannot be created.
    """
    fname = 'day_{}.parquet'
    train_paths = [os.path.join(INPUT_DATA_DIR, fname.format(day)) for day in train_days]
    valid_paths = [os.path.join(INPUT_DATA_DIR, fname.format(day)) for day in valid_days]

    proc = nvt.Workflow(
        cat_names=CATEGORICAL_COLUMNS,
        cont_names=CONTINUOUS_COLUMNS,
        label_name=LABEL_COLUMNS
    )

    proc.add_cont_feature([FillMissing(), Clip(min_value=0), LogOp()])
    proc.add_cont_preprocess(Normalize())
    proc.add_cat_preprocess(Categorify(freq_threshold=15, out_path=OUTPUT_DATA_DIR))

    train_dataset = nvt.Dataset(train_paths, engine='parquet', part_mem_fraction=0.15)
    valid_dataset = nvt.Dataset(valid_paths, engine='parquet', part_mem_fraction=0.15)

    # Start from a clean output tree. This uses the standard library instead
    # of shelling out, so paths containing spaces or shell metacharacters are
    # handled correctly and real failures raise instead of being ignored.
    if os.path.islink(OUTPUT_DATA_DIR) or os.path.isfile(OUTPUT_DATA_DIR):
        # shutil.rmtree refuses symlinks and plain files; remove them directly.
        os.remove(OUTPUT_DATA_DIR)
    elif os.path.isdir(OUTPUT_DATA_DIR):
        shutil.rmtree(OUTPUT_DATA_DIR)
    os.makedirs(output_train_dir, exist_ok=True)
    os.makedirs(output_valid_dir, exist_ok=True)

    proc.apply(
        train_dataset,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train_dir,
        out_files_per_proc=20
    )

    # record_stats=False is passed for the validation data; presumably this
    # reuses the statistics recorded on the training data -- confirm against
    # the NVTabular version in use.
    proc.apply(
        valid_dataset,
        record_stats=False,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_valid_dir,
        out_files_per_proc=20
    )

    proc.save_stats(OUTPUT_DATA_DIR + '/stats_and_workflow')

First, we define the directory structure: the base directory and the input and output directories for the .parquet files.

In [8]:
# define some information about where to get our data
# Both data directories can be overridden with an environment variable of the
# same name; the hard-coded defaults below are used otherwise.
INPUT_DIR = '/raid/data/criteo/input/'
OUTPUT_DIR = '/raid/data/criteo/'
INPUT_DATA_DIR = os.environ.get('INPUT_DATA_DIR', INPUT_DIR) # where the input parquet files are read from
OUTPUT_DATA_DIR = os.environ.get('OUTPUT_DATA_DIR', os.path.join(OUTPUT_DIR, 'output')) # where we'll save our processed data to

# sub-directories for the preprocessed training and validation sets
output_train_dir = os.path.join(OUTPUT_DATA_DIR, 'train/')
output_valid_dir = os.path.join(OUTPUT_DATA_DIR, 'valid/')

We need to define the data schema: which columns are continuous, which are categorical, and which are labels.

In [9]:
# Dataset schema: continuous columns I1..I13, categorical columns C1..C26,
# and a single label column.
CONTINUOUS_COLUMNS = ['I{}'.format(idx) for idx in range(1, 14)]
CATEGORICAL_COLUMNS = ['C{}'.format(idx) for idx in range(1, 27)]
LABEL_COLUMNS = ['label']
# All columns, in continuous -> categorical -> label order.
COLUMNS = [*CONTINUOUS_COLUMNS, *CATEGORICAL_COLUMNS, *LABEL_COLUMNS]

We execute the NVTabular workflow to preprocess the dataset.

In [11]:
%%time

# Run the preprocessing helper; the %%time cell magic reports how long the cell takes.
preproces_criteo()

CPU times: user 1min 28s, sys: 1min 4s, total: 2min 32s
Wall time: 3min 11s
