# TOC of Ray End-to-End NLP Example


First we import some libraries we need for the example.

In [1]:
import argparse
import logging
import json
import os
import time
from filelock import FileLock
from dataclasses import dataclass, field
from typing import Optional
import random

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, 
                              SequentialSampler, TensorDataset)
from tqdm import trange
import torch.distributed as dist

from transformers import (AdamW,
                          AutoConfig, AutoModelForSequenceClassification,
                          AutoTokenizer, get_linear_schedule_with_warmup,
                          HfArgumentParser, TrainingArguments)
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors

import ray
from ray.util.sgd.torch import TrainingOperator
from ray.util.sgd import TorchTrainer

try:
    from apex import amp
except ImportError:
    amp = None

Then we set some configuration arguments.

In [8]:
# Training arguments (from hugging face)
training_arguments = TrainingArguments(
    output_dir = "output_dir/",
    learning_rate = "2e-5",
    num_train_epochs = 3,
    per_gpu_train_batch_size = 8,
    fp16 = True,
    do_train = True,
    do_eval = True
)
args = argparse.Namespace(**vars(training_arguments))

# Model arguments
args.model_name_or_path = "bert-base-cased"
args.model_type = "bert"
args.config_name = None
args.tokenizer_name = None
args.cache_dir = None

# Data processing arguments
args.task_name = "MRPC"
args.data_dir = "glue_data/MNLI/"
args.max_seq_length = 128
args.overwrite_cache = False

# Ray arguments
args.num_workers = 8
args.address = "auto"

use_gpu = torch.cuda.is_available() and not args.no_cuda

# GLUE task preparation
args.task_name = args.task_name.lower()
assert args.task_name in processors
args.output_mode = output_modes[args.task_name]

args

Namespace(adam_epsilon=1e-08, address=None, cache_dir=None, config_name=None, data_dir='glue_data/MNLI/', do_eval=True, do_predict=False, do_train=True, evaluate_during_training=False, fp16=True, fp16_opt_level='O1', gradient_accumulation_steps=1, learning_rate='2e-5', local_rank=-1, logging_dir=None, logging_first_step=False, logging_steps=500, max_grad_norm=1.0, max_seq_length=128, max_steps=-1, model_name_or_path='bert-base-cased', model_type='bert', no_cuda=False, num_train_epochs=3, num_workers=8, output_dir='output_dir/', output_mode='classification', overwrite_cache=False, overwrite_output_dir=False, per_gpu_eval_batch_size=8, per_gpu_train_batch_size=8, save_steps=500, save_total_limit=None, seed=42, task_name='mrpc', tokenizer_name=None, tpu_metrics_debug=False, tpu_num_cores=None, warmup_steps=0, weight_decay=0.0)

Then we connect to the ray server.

In [9]:
ray.init(address=args.address)

2020-05-19 23:03:58,663	INFO resource_spec.py:212 -- Starting Ray with 156.69 GiB memory available for workers and up to 71.15 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-05-19 23:03:59,208	INFO services.py:1166 -- View the Ray dashboard at [1m[32mlocalhost:8266[39m[22m


{'node_ip_address': '172.31.23.151',
 'raylet_ip_address': '172.31.23.151',
 'redis_address': '172.31.23.151:52223',
 'object_store_address': '/tmp/ray/session_2020-05-19_23-03-58_659980_70439/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-05-19_23-03-58_659980_70439/sockets/raylet',
 'webui_url': 'localhost:8266',
 'session_dir': '/tmp/ray/session_2020-05-19_23-03-58_659980_70439'}

[2m[33m(pid=raylet)[0m I0519 23:04:00.258425 70600 global_state_accessor.cc:25] Redis server address = 172.31.23.151:52223, is test flag = 0
[2m[33m(pid=raylet)[0m I0519 23:04:00.272583 70601 global_state_accessor.cc:25] Redis server address = 172.31.23.151:52223, is test flag = 0
[2m[33m(pid=raylet)[0m I0519 23:04:00.277974 70600 redis_client.cc:141] RedisClient connected.
[2m[33m(pid=raylet)[0m I0519 23:04:00.284288 70612 global_state_accessor.cc:25] Redis server address = 172.31.23.151:52223, is test flag = 0
[2m[33m(pid=raylet)[0m I0519 23:04:00.284420 70604 global_state_accessor.cc:25] Redis server address = 172.31.23.151:52223, is test flag = 0
[2m[33m(pid=raylet)[0m I0519 23:04:00.285459 70620 global_state_accessor.cc:25] Redis server address = 172.31.23.151:52223, is test flag = 0
[2m[33m(pid=raylet)[0m I0519 23:04:00.285903 70601 redis_client.cc:141] RedisClient connected.
[2m[33m(pid=raylet)[0m I0519 23:04:00.287313 70620 redis_client.cc:141] RedisClie

Now we define several helper functions for Torch trainer. First we define the data creator.

In [14]:
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
            str(task),
        ),
    )

    with FileLock("/tmp/load_and_cache_examples.lock"):
        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            print("Loading features from cached file %s",
                  cached_features_file)
            features = torch.load(cached_features_file)
        else:
            print("Creating features from dataset file at %s",
                  args.data_dir)
            label_list = processor.get_labels()
            if task in ["mnli", "mnli-mm"
                        ] and args.model_type in ["roberta", "xlmroberta"]:
                # HACK(label indices are swapped in RoBERTa pretrained model)
                label_list[1], label_list[2] = label_list[2], label_list[1]
            examples = (processor.get_dev_examples(args.data_dir) if evaluate
                        else processor.get_train_examples(args.data_dir))
            features = convert_examples_to_features(
                examples,
                tokenizer,
                label_list=label_list,
                max_length=args.max_seq_length,
                output_mode=output_mode,
            )
            if not os.path.exists(cached_features_file):
                print("Saving features into cached file %s",
                      cached_features_file)
                torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor(
        [f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor(
        [f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor(
        [f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor(
            [f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor(
            [f.label for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_attention_mask,
                            all_token_type_ids, all_labels)
    return dataset

def data_creator(config):
    args = config["args"]
    start = time.time()
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    print("tokenizer instantiation time: {}".format(time.time() - start))

    train_dataset = load_and_cache_examples(
        args, args.task_name, tokenizer, evaluate=False)
    train_sampler = RandomSampler(
        train_dataset) if not dist.is_initialized() else None
    return DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=args.per_gpu_train_batch_size)

Then we define the model creator for the Torch trainer.

In [12]:
def model_creator(config):
    with FileLock(os.path.expanduser("~/.download.lock")):
        args = config["args"]
        processor = processors[args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
        config = AutoConfig.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=args.task_name,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )
        model = AutoModelForSequenceClassification.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )
    return model

Define the optimizer creator.

In [None]:
def optimizer_creator(model, config):
    args = config["args"]
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        },
    ]

    return AdamW(
        optimizer_grouped_parameters,
        lr=args.learning_rate,
        eps=args.adam_epsilon)
