# TOC of Ray End-to-End NLP Example


In [6]:
!git clone https://github.com/huggingface/transformers
!cd transformers && pip install . && pip install -r ./examples/requirements.txt
!pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp36-cp36m-manylinux1_x86_64.whl
!pip install ray[tune]
!git clone https://github.com/NVIDIA/apex; cd apex && pip install -v --no-cache-dir  ./

fatal: destination path 'transformers' already exists and is not an empty directory.
Processing /home/ubuntu/ray-e2e-nlp-example/transformers
Collecting tokenizers==0.7.0 (from transformers==2.9.1)
  Using cached https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl
Collecting regex!=2019.12.17 (from transformers==2.9.1)
[?25l  Downloading https://files.pythonhosted.org/packages/1d/07/fb11080a1324bc8d7b68deb009a4c08bd675e0789a213028c58323c4aaab/regex-2020.5.14-cp36-cp36m-manylinux1_x86_64.whl (675kB)
[K    100% |████████████████████████████████| 686kB 13.3MB/s ta 0:00:01
[?25hCollecting sentencepiece (from transformers==2.9.1)
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K    100% |████████████████████████████████| 1.1MB 14.0MB/s ta 0:00:

In [None]:
# Fetch the GLUE download helper script from a public gist and run it right away.
# NOTE(review): the configuration cell later reads from "glue_data/RTE/", so the
# script presumably writes its data under ./glue_data -- confirm.
! wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py && python download_glue_data.py

In [1]:
# List the GPUs visible on this machine before configuring the workers.
!nvidia-smi

Thu May 21 17:45:46 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.87.01    Driver Version: 418.87.01    CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  On   | 00000000:00:1B.0 Off |                    0 |
| N/A   44C    P0    39W / 300W |      0MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:00:1C.0 Off |                    0 |
| N/A   42C    P0    39W / 300W |      0MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM2...  On   | 00000000:00:1D.0 Off |                    0 |
| N/A   

First, we import the libraries needed for the example.

In [2]:
import argparse
import logging
import json
import os
import time
from filelock import FileLock
from dataclasses import dataclass, field
from typing import Optional
import random

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, 
                              SequentialSampler, TensorDataset)
from tqdm import trange
import torch.distributed as dist

from transformers import (AdamW,
                          AutoConfig, AutoModelForSequenceClassification,
                          AutoTokenizer, get_linear_schedule_with_warmup,
                          HfArgumentParser, TrainingArguments)
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors
from transformers import (glue_convert_examples_to_features as
                          convert_examples_to_features)

import ray
from ray.util.sgd.torch import TrainingOperator
from ray.util.sgd import TorchTrainer

try:
    from apex import amp
except ImportError:
    amp = None

Then we set some configuration arguments.

In [3]:
# Hugging Face training hyper-parameters; they are copied into a plain
# Namespace below so that the extra settings can live next to them.
training_arguments = TrainingArguments(
    output_dir="output_dir/",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_gpu_train_batch_size=8,
    fp16=True,
    do_train=True,
    do_eval=True,
)
args = argparse.Namespace(**vars(training_arguments))

# Pretrained checkpoint to fine-tune
vars(args).update(
    model_name_or_path="bert-base-cased",
    model_type="bert",
    config_name=None,
    tokenizer_name=None,
    cache_dir=None,
)

# GLUE task and feature-extraction settings
vars(args).update(
    task_name="RTE",
    data_dir="glue_data/RTE/",
    max_seq_length=128,
    overwrite_cache=False,
)

# Ray settings: number of training workers and the cluster address
vars(args).update(num_workers=8, address="auto")

# GPUs are used only when CUDA is present and not explicitly disabled
use_gpu = torch.cuda.is_available() and not args.no_cuda

# The GLUE lookup tables are keyed by lower-case task names
args.task_name = args.task_name.lower()
assert args.task_name in processors
args.output_mode = output_modes[args.task_name]

args

Namespace(adam_epsilon=1e-08, address='auto', cache_dir=None, config_name=None, data_dir='glue_data/RTE/', do_eval=True, do_predict=False, do_train=True, evaluate_during_training=False, fp16=True, fp16_opt_level='O1', gradient_accumulation_steps=1, learning_rate=2e-05, local_rank=-1, logging_dir=None, logging_first_step=False, logging_steps=500, max_grad_norm=1.0, max_seq_length=128, max_steps=-1, model_name_or_path='bert-base-cased', model_type='bert', no_cuda=False, num_train_epochs=3, num_workers=8, output_dir='output_dir/', output_mode='classification', overwrite_cache=False, overwrite_output_dir=False, per_gpu_eval_batch_size=8, per_gpu_train_batch_size=8, save_steps=500, save_total_limit=None, seed=42, task_name='rte', tokenizer_name=None, tpu_metrics_debug=False, tpu_num_cores=None, warmup_steps=0, weight_decay=0.0)

Then we connect to the running Ray cluster.

In [4]:
# Connect this notebook to Ray using the address from the configuration cell
# (args.address, set to "auto" there).
ray.init(address=args.address)

{'node_ip_address': '172.31.17.18',
 'raylet_ip_address': '172.31.17.18',
 'redis_address': '172.31.17.18:6379',
 'object_store_address': '/tmp/ray/session_2020-05-21_17-45-08_202298_97701/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-05-21_17-45-08_202298_97701/sockets/raylet',
 'webui_url': 'localhost:8265',
 'session_dir': '/tmp/ray/session_2020-05-21_17-45-08_202298_97701'}

Now we define several helper functions for the `TorchTrainer`. First, we define the data creator.

In [5]:
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    """Build the ``TensorDataset`` for a GLUE task, using an on-disk feature cache.

    Features are loaded from a cache file inside ``args.data_dir`` when it
    exists and ``args.overwrite_cache`` is false; otherwise they are computed
    from the raw examples with ``convert_examples_to_features``. The whole
    load-or-create step runs while holding a file lock, so callers sharing
    the lock file do not read a half-written cache.

    Args:
        args: Namespace providing ``data_dir``, ``model_name_or_path``,
            ``max_seq_length``, ``overwrite_cache`` and ``model_type``.
        task: Lower-case GLUE task name, a key of ``processors`` and
            ``output_modes``.
        tokenizer: Tokenizer handed to ``convert_examples_to_features``.
        evaluate: Use the dev split when true, the train split otherwise.

    Returns:
        A ``TensorDataset`` of (input_ids, attention_mask, token_type_ids,
        labels). Labels are ``torch.long`` for classification tasks and
        ``torch.float`` for regression tasks.

    Raises:
        ValueError: If the task's output mode is neither "classification"
            nor "regression".
    """
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}".format(
            "dev" if evaluate else "train",
            # last non-empty path component of the model name
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
            str(task),
        ),
    )

    with FileLock("/tmp/load_and_cache_examples.lock"):
        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            # print() does not interpolate extra arguments the way logging
            # does, so the message is formatted explicitly.
            print("Loading features from cached file %s" %
                  cached_features_file)
            # NOTE: torch.load unpickles the file; only use caches written
            # by this notebook.
            features = torch.load(cached_features_file)
        else:
            print("Creating features from dataset file at %s" %
                  args.data_dir)
            label_list = processor.get_labels()
            if task in ["mnli", "mnli-mm"
                        ] and args.model_type in ["roberta", "xlmroberta"]:
                # HACK(label indices are swapped in RoBERTa pretrained model)
                label_list[1], label_list[2] = label_list[2], label_list[1]
            examples = (processor.get_dev_examples(args.data_dir) if evaluate
                        else processor.get_train_examples(args.data_dir))
            features = convert_examples_to_features(
                examples,
                tokenizer,
                label_list=label_list,
                max_length=args.max_seq_length,
                output_mode=output_mode,
            )
            # NOTE(review): an existing cache file is never rewritten, even
            # when overwrite_cache is set -- confirm this is intended.
            if not os.path.exists(cached_features_file):
                print("Saving features into cached file %s" %
                      cached_features_file)
                torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor(
        [f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor(
        [f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor(
        [f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor(
            [f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor(
            [f.label for f in features], dtype=torch.float)
    else:
        # Previously this fell through and failed later on an unbound name.
        raise ValueError(
            "Unsupported output mode: {!r}".format(output_mode))

    dataset = TensorDataset(all_input_ids, all_attention_mask,
                            all_token_type_ids, all_labels)
    return dataset

def data_creator(config):
    """Create the training ``DataLoader`` from the trainer config.

    Reads the argument namespace from ``config["args"]``, instantiates the
    tokenizer (printing how long that took), builds the training dataset via
    ``load_and_cache_examples`` and wraps it in a ``DataLoader`` with batch
    size ``args.per_gpu_train_batch_size``.

    A ``RandomSampler`` is attached only while ``torch.distributed`` is not
    initialized; otherwise the loader is created without a sampler.
    """
    args = config["args"]

    started_at = time.time()
    tokenizer_source = args.tokenizer_name or args.model_name_or_path
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_source,
        cache_dir=args.cache_dir or None,
    )
    elapsed = time.time() - started_at
    print("tokenizer instantiation time: {}".format(elapsed))

    train_dataset = load_and_cache_examples(
        args, args.task_name, tokenizer, evaluate=False)

    if dist.is_initialized():
        # presumably a distributed sampler is attached elsewhere -- TODO confirm
        train_sampler = None
    else:
        train_sampler = RandomSampler(train_dataset)

    return DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=args.per_gpu_train_batch_size)

Then we define the model creator for the Torch trainer.

In [6]:
def model_creator(config):
    """Instantiate the pretrained sequence-classification model.

    Everything runs while holding the ``~/.download.lock`` file lock. The
    number of labels comes from the GLUE processor of ``args.task_name``, and
    both the model config and the weights are loaded with ``from_pretrained``.
    """
    lock_path = os.path.expanduser("~/.download.lock")
    with FileLock(lock_path):
        args = config["args"]
        labels = processors[args.task_name]().get_labels()
        cache_dir = args.cache_dir or None
        # kept under its own name so the `config` argument is not shadowed
        model_config = AutoConfig.from_pretrained(
            args.config_name or args.model_name_or_path,
            num_labels=len(labels),
            finetuning_task=args.task_name,
            cache_dir=cache_dir,
        )
        model = AutoModelForSequenceClassification.from_pretrained(
            args.model_name_or_path,
            # names containing ".ckpt" are loaded as TensorFlow checkpoints
            from_tf=".ckpt" in args.model_name_or_path,
            config=model_config,
            cache_dir=cache_dir,
        )
    return model

Define the optimizer creator.

In [7]:
def optimizer_creator(model, config):
    """Create an ``AdamW`` optimizer with two parameter groups.

    Parameters whose name contains "bias" or "LayerNorm.weight" get a weight
    decay of 0.0; all other parameters use ``args.weight_decay``. The learning
    rate and epsilon are read from ``config["args"]``.
    """
    args = config["args"]
    no_decay_markers = ("bias", "LayerNorm.weight")

    decayed, undecayed = [], []
    for param_name, param in model.named_parameters():
        exempt = any(marker in param_name for marker in no_decay_markers)
        (undecayed if exempt else decayed).append(param)

    param_groups = [
        {"params": decayed, "weight_decay": args.weight_decay},
        {"params": undecayed, "weight_decay": 0.0},
    ]
    return AdamW(
        param_groups,
        lr=args.learning_rate,
        eps=args.adam_epsilon)

Define the training operator.

In [8]:
def announce_training(args, dataset_len, t_total):
    """Print a summary of the upcoming training run.

    Args:
        args: Namespace providing ``num_train_epochs``,
            ``per_gpu_train_batch_size``, ``gradient_accumulation_steps``
            and ``num_workers``.
        dataset_len: Value reported as "Num examples".
        t_total: Value reported as "Total optimization steps".
    """
    # Effective batch size across workers and accumulation steps.
    total_batch_size = (args.per_gpu_train_batch_size *
                        args.gradient_accumulation_steps * args.num_workers)

    # print() does not interpolate extra arguments the way logging does, so
    # every message is formatted explicitly with the % operator.
    print("***** Running training *****")
    print("  Num examples = %d" % dataset_len)
    print("  Num Epochs = %d" % args.num_train_epochs)
    print("  Instantaneous batch size per GPU = %d" %
          args.per_gpu_train_batch_size)
    print(
        "  Total train batch size (w. parallel, distributed & accum) = %d" %
        total_batch_size)
    print("  Gradient Accumulation steps = %d" %
          args.gradient_accumulation_steps)
    print("  Total optimization steps = %d" % t_total)


class TransformerOperator(TrainingOperator):
    """Training operator that fine-tunes the transformer one batch at a time.

    Each call to ``train_batch`` runs a forward/backward pass. An optimizer
    step (with gradient clipping and a learning-rate schedule update) is taken
    every ``args.gradient_accumulation_steps`` batches. When ``args.fp16`` is
    set, the loss is scaled through apex ``amp``.
    """

    def setup(self, config):
        """Prepare the tokenizer, the LR schedule and step counters.

        Args:
            config: Trainer config; ``config["args"]`` is the argument
                namespace used throughout the operator.

        Raises:
            ImportError: If fp16 training is requested but apex is missing.
        """
        self.args = args = config["args"]
        if args.fp16 and amp is None:
            # Fail here with a clear message instead of an AttributeError on
            # None inside the first training batch.
            raise ImportError(
                "fp16 training was requested but NVIDIA apex is not "
                "installed.")

        self.tokenizer = AutoTokenizer.from_pretrained(
            args.tokenizer_name
            if args.tokenizer_name else args.model_name_or_path,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )

        # Number of batches this worker sees per epoch.
        self.train_data_len = len(self.train_loader)
        # Computed once and shared by the scheduler and the summary.
        t_total = self.calculate_t_total()
        self._warmup_scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=t_total)
        self._global_step = 0

        # Report the dataset size rather than the batch count as
        # "Num examples".
        announce_training(args, len(self.train_loader.dataset), t_total)

    def train_batch(self, batch, batch_info=None):
        """Run one forward/backward pass and, when due, an optimizer step.

        Args:
            batch: Tuple of tensors ordered as (input_ids, attention_mask,
                token_type_ids, labels).
            batch_info: Dict whose ``"batch_idx"`` entry is the index of the
                batch within the epoch.

        Returns:
            Dict with the current ``learning_rate`` and the batch ``loss``
            (already divided by the accumulation steps when those exceed 1).
        """
        args = self.args
        model = self.model
        optimizer = self.optimizer
        step = batch_info["batch_idx"]

        model.train()
        batch = tuple(t.to(self.device) for t in batch)
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "labels": batch[3]
        }
        if args.model_type != "distilbert":
            # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
            inputs["token_type_ids"] = (batch[2] if args.model_type in [
                "bert", "xlnet", "albert"
            ] else None)
        outputs = model(**inputs)

        # model outputs are always tuple in transformers (see doc)
        loss = outputs[0]

        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        batch_loss = loss.item()

        # last step in epoch but step is always smaller
        # than gradient_accumulation_steps
        ending = (self.train_data_len <= args.gradient_accumulation_steps
                  and (step + 1) == self.train_data_len)
        if (step + 1) % args.gradient_accumulation_steps == 0 or ending:
            if args.fp16:
                torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            optimizer.step()
            self._warmup_scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            self._global_step += 1

        learning_rate_scalar = self._warmup_scheduler.get_lr()[0]
        return {"learning_rate": learning_rate_scalar, "loss": batch_loss}

    def calculate_t_total(self):
        """Return the total number of optimizer steps for the whole run.

        With ``args.max_steps > 0`` that value is returned and
        ``args.num_train_epochs`` is overwritten with the number of epochs
        needed to reach it. Otherwise the result is the number of optimizer
        steps per epoch times ``args.num_train_epochs``.
        """
        args = self.args
        grad_accum_steps = args.gradient_accumulation_steps
        train_data_len = len(self.train_loader)
        # train_batch still takes one optimizer step per epoch when the
        # loader has fewer batches than the accumulation steps, so the
        # per-epoch count is never below 1. This also avoids a division by
        # zero in the max_steps branch.
        steps_per_epoch = max(1, train_data_len // grad_accum_steps)
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // steps_per_epoch + 1
        else:
            t_total = steps_per_epoch * args.num_train_epochs
        return t_total

Define the trainer.

In [9]:
# Assemble the trainer from the creator functions and the operator class
# defined in the cells above.
# NOTE(review): the recorded output of this cell is an AttributeError about
# torch.cuda.is_initialized, and the next cell reports torch 1.3.1 --
# presumably this Ray build expects a newer PyTorch; confirm.
trainer = TorchTrainer(
    model_creator=model_creator,
    data_creator=data_creator,
    optimizer_creator=optimizer_creator,
    training_operator_cls=TransformerOperator,
    num_workers=args.num_workers,
    use_gpu=use_gpu,
    use_fp16=args.fp16,
    apex_args=dict(opt_level=args.fp16_opt_level),
    use_tqdm=True,
    config=dict(args=args),
)

AttributeError: module 'torch.cuda' has no attribute 'is_initialized'

In [10]:
# Display the installed PyTorch version (presumably to diagnose the
# AttributeError raised by the trainer cell above -- confirm).
torch.__version__

'1.3.1'

Traceback: Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/urllib3/connection.py", line 171, in _new_conn
    (self._dns_host, self.port), self.timeout, **extra_kw)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/urllib3/util/connection.py", line 79, in create_connection
    raise err
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/urllib3/util/connection.py", line 69, in create_connection
    sock.connect(sa)
TimeoutError: [Errno 110] Connection timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/urllib3/connectionpool.py", line 600, in urlopen
    chunked=chunked)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/urllib3/connectionpool.py", line 354, in _make_request
    conn.request(method, url, **httpli