In [1]:
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM
from transformers import GPTJForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, default_data_collator
from datasets import load_dataset
import time
import torch
import os
import numpy as np
import evaluate
import sklearn
import pandas as pd
import ray
import ray.data
from ray.data.preprocessors import BatchMapper, Chain
import os
#os.environ["RAY_ML_DEV"] = "1"



# Connect to Ray and export NCCL_SOCKET_IFNAME=ens5 to worker processes through
# the runtime environment.
# NOTE(review): "ens5" is a host-specific network interface name — confirm it
# matches the nodes of the cluster this notebook is run against.
ray.init(runtime_env={"env_vars": {"NCCL_SOCKET_IFNAME": "ens5"}})
# Wall-clock timestamp taken at notebook start (not read by any visible cell).
start = time.time()
# Shared label: passed as the first argument to TrainingArguments and used as
# the MLflow experiment name further down.
name = "gpt-j-6B"

comet_ml is installed but `COMET_API_KEY` is not set.
  from pandas import MultiIndex, Int64Index
2023-02-08 19:29:47,979	INFO worker.py:1364 -- Connecting to existing Ray cluster at address: 10.0.11.234:6379...
2023-02-08 19:29:47,989	INFO worker.py:1544 -- Connected to Ray cluster. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m
2023-02-08 19:29:47,993	INFO packaging.py:330 -- Pushing file package 'gcs://_ray_pkg_f2e93942ab6ad3d2e347b209b7e2bae2.zip' (0.37MiB) to Ray cluster...
2023-02-08 19:29:47,997	INFO packaging.py:343 -- Successfully pushed file package 'gcs://_ray_pkg_f2e93942ab6ad3d2e347b209b7e2bae2.zip'.


In [2]:
print("Loading dataset")
# Alternative corpus (disabled):
# current_dataset = load_dataset("wikitext", 'wikitext-103-v1', split="train")
# Per the recorded output, tiny_shakespeare loads as a DatasetDict with
# train/validation/test splits, each holding a single row with a "text" column.
current_dataset = load_dataset("tiny_shakespeare")
# Bare expression so the notebook displays the dataset summary.
current_dataset

Loading dataset


Found cached dataset tiny_shakespeare (/home/ray/.cache/huggingface/datasets/tiny_shakespeare/default/1.0.0/b5b13969f09fe8707337f6cb296314fbe06960bd9a868dca39e713e163d27b5e)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1
    })
})

In [3]:
# Convert the Hugging Face dataset(s) into Ray Datasets keyed by split name.
if isinstance(current_dataset, dict):
    # Several named splits were loaded: convert them as-is. Later cells index the
    # result with "train" and "validation".
    ray_datasets = ray.data.from_huggingface(current_dataset)
else:
    # A single split was loaded: shuffle it deterministically and carve out a
    # 90% / 10% train / validation split ourselves.
    ray_dataset: ray.data.Dataset = ray.data.from_huggingface(current_dataset)
    # One proportion yields exactly two datasets (the 90% part and the remainder),
    # so unpack two names; unpacking three raised ValueError.
    train, validation = ray_dataset.random_shuffle(seed=1).split_proportionately([0.9])
    ray_datasets = {"train": train.repartition(16), "validation": validation.repartition(4)}

In [4]:
def split_column_with_one_string(df):
    """Split the ``text`` column of a batch into one row per paragraph.

    Every string in ``df["text"]`` is split on blank lines (``"\\n\\n"``); each
    piece is stripped of surrounding whitespace and empty pieces are dropped.
    All rows of the batch are processed, so a batch holding more than one
    document no longer loses everything after its first row, and an empty batch
    yields an empty frame instead of raising ``IndexError``.

    Args:
        df: pandas DataFrame with a ``text`` column of strings.

    Returns:
        A new DataFrame with a single ``text`` column, one paragraph per row,
        in document order.
    """
    paragraphs = [
        piece.strip()
        for document in df["text"]
        for piece in document.split("\n\n")
        if piece.strip()
    ]
    return pd.DataFrame({"text": paragraphs})

# Wrap the paragraph splitter as a preprocessor that is fed pandas batches.
string_splitter = BatchMapper(
    split_column_with_one_string,
    batch_format="pandas",
)

In [5]:
from ray.data.preprocessor import Preprocessor

class Tokenizer:
    """Callable that tokenizes one text column of a pandas batch.

    An instance holds a Hugging Face tokenizer and the name of the column to
    read. Calling the instance with a batch returns the tokenizer's output plus
    a ``labels`` entry that is a copy of ``input_ids``.
    """

    def __init__(self, pretrained_model_name_or_path, caption_column, revision=None) -> None:
        """Load the tokenizer and remember which column holds the text.

        Args:
            pretrained_model_name_or_path: Forwarded to
                ``AutoTokenizer.from_pretrained``.
            caption_column: Name of the batch column containing the text.
            revision: Optional revision forwarded to ``from_pretrained``.
        """
        # Importing here to work around a memory leak with Ray Data in 2.2
        # Should be fixed in 2.3 or 2.4
        from transformers import AutoTokenizer

        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, revision=revision)
        # Pad with the end-of-sequence token; tokenize_captions pads every
        # sequence to max_length.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.caption_column = caption_column

    def tokenize_captions(self, txt_list, is_train=True):
        """Tokenize the configured text column of ``txt_list``.

        Args:
            txt_list: Batch indexable by column name (a pandas DataFrame when
                invoked through ``__call__``).
            is_train: Unused; kept so existing callers keep working.

        Returns:
            A plain ``dict`` with the tokenizer's outputs (requested as NumPy
            arrays, truncated and padded to ``model_max_length``) and a
            ``labels`` entry holding an independent copy of ``input_ids``.
        """
        texts = list(txt_list[self.caption_column])
        tokens = self.tokenizer(
            texts,
            truncation=True,
            max_length=self.tokenizer.model_max_length,
            padding="max_length",
            return_tensors="np",
        )
        # Convert to a plain dict first so the tokenizer's own return object is
        # left untouched, then add the labels.
        batch = dict(tokens)
        batch["labels"] = batch["input_ids"].copy()
        return batch

    def __call__(self, df: "pd.DataFrame") -> "Dict[str, np.ndarray]":
        """Tokenize a pandas batch; see :meth:`tokenize_captions`."""
        return self.tokenize_captions(df)


class TokenizerPreprocessor(Preprocessor):
    """Preprocessor that applies :class:`Tokenizer` to pandas batches.

    The ``Tokenizer`` class itself (not an instance) is registered as the pandas
    transform; the constructor arguments it needs are supplied through
    ``_get_transform_config``.
    """

    # Nothing is learned from data, so no fitting step is needed.
    _is_fittable = False
    _transform_pandas = Tokenizer

    def __init__(self, pretrained_model_name_or_path, caption_column, revision=None) -> None:
        """Store the settings later handed to ``Tokenizer``'s constructor."""
        self.caption_column = caption_column
        self.pretrained_model_name_or_path = pretrained_model_name_or_path
        self.revision = revision

    def _get_transform_config(self):
        """Returns kwargs to be passed to :meth:`ray.data.Dataset.map_batches`.

        The transform runs under an actor pool strategy, and the stored
        settings are passed as constructor kwargs for the transform class.
        """
        tokenizer_kwargs = {
            "pretrained_model_name_or_path": self.pretrained_model_name_or_path,
            "revision": self.revision,
            "caption_column": self.caption_column,
        }
        return {
            "compute": ray.data.ActorPoolStrategy(),
            "fn_constructor_kwargs": tokenizer_kwargs,
        }


In [6]:
from transformers.utils.logging import disable_progress_bar, enable_progress_bar
from ray.air import session
import torch
import os

# CPUs reserved per training worker; also used for OMP_NUM_THREADS inside the worker.
num_cpus = 8


def trainer_init_per_worker(train_dataset, eval_dataset=None, **config):
    """Build the Hugging Face ``Trainer`` used by each training worker.

    Sets per-process environment/torch flags, assembles the DeepSpeed ZeRO
    stage 3 configuration, creates ``TrainingArguments``, loads the GPT-J model
    and tokenizer, and returns a ``Trainer`` wired to the given datasets.

    Args:
        train_dataset: Dataset handed to ``Trainer`` as ``train_dataset``.
        eval_dataset: Optional dataset handed to ``Trainer`` as ``eval_dataset``.
        **config: Optional hyperparameter overrides. Recognised keys (defaults
            reproduce the previously hard-coded values): ``epochs`` (2),
            ``batch_size`` (4), ``learning_rate`` (0.001), ``weight_decay`` (0.01).

    Returns:
        The configured ``transformers.Trainer``.
    """
    os.environ["OMP_NUM_THREADS"] = str(num_cpus)
    torch.backends.cuda.matmul.allow_tf32 = True

    # Hyperparameters follow the same config-with-default pattern as "epochs".
    batch_size = config.get("batch_size", 4)
    learning_rate = config.get("learning_rate", 0.001)
    weight_decay = config.get("weight_decay", 0.01)
    epochs = config.get("epochs", 2)

    pretrained_name = "EleutherAI/gpt-j-6B"

    # DeepSpeed configuration. Values set to "auto" are left for the Hugging Face
    # integration to fill in from TrainingArguments.
    deepspeed = {
        "fp16": {
            "enabled": "auto",
            "initial_scale_power": 32,
        },
        "bf16": {
            "enabled": "auto"
        },
        "optimizer": {
            "type": "AdamW",
            "params": {
                "lr": "auto",
                "betas": "auto",
                "eps": "auto",
            }
        },
        "zero_optimization": {
            "stage": 3,
            # Optimizer state is offloaded to CPU memory.
            "offload_optimizer": {
                "device": "cpu",
                "pin_memory": False,
            },
            # Parameter offload is intentionally left disabled:
            # "offload_param": {
            #     "device": "cpu",
            #     "pin_memory": False,
            # },
            "overlap_comm": True,
            "contiguous_gradients": True,
            "reduce_bucket_size": "auto",
            "stage3_prefetch_bucket_size": "auto",
            "stage3_param_persistence_threshold": "auto",
            "gather_16bit_weights_on_model_save": True,
            "round_robin_gradients": True,
        },
        "gradient_accumulation_steps": "auto",
        "gradient_clipping": "auto",
        "steps_per_print": 1,
        "train_batch_size": "auto",
        "train_micro_batch_size_per_gpu": "auto",
        "wall_clock_breakdown": False
    }

    print("Preparing training arguments")
    # NOTE(review): bf16=True here while the model below is loaded with
    # torch_dtype=torch.float16 from the "float16" revision — confirm the
    # precision mix is intended.
    training_args = TrainingArguments(
        name,
        per_device_train_batch_size=batch_size,
        logging_steps=100, save_strategy="steps",
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        # warmup_steps=20,
        label_names=['input_ids', 'attention_mask'],  # 'logits', 'past_key_values'
        num_train_epochs=epochs,
        push_to_hub=False,
        disable_tqdm=True,  # declutter the output a little
        bf16=True,
        gradient_checkpointing=True,
        deepspeed=deepspeed
    )

    # Silence download/loading progress bars while the model and tokenizer load,
    # and always restore them, even if loading fails.
    disable_progress_bar()
    try:
        print("Loading model")
        model = GPTJForCausalLM.from_pretrained(
            pretrained_name, use_cache=False, revision="float16", torch_dtype=torch.float16
        )

        tokenizer = AutoTokenizer.from_pretrained(pretrained_name)
        tokenizer.pad_token = tokenizer.eos_token
        model.resize_token_embeddings(len(tokenizer))
    finally:
        enable_progress_bar()

    metric = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        """Accuracy of the arg-max over the last logits axis against the labels."""
        # NOTE(review): label_names lists two columns, so `labels` may arrive as
        # a pair of arrays rather than a single array, and predictions are not
        # flattened — confirm the accuracy metric accepts these shapes before
        # relying on evaluation.
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
    )
    return trainer

In [7]:
from ray.train.huggingface import HuggingFaceTrainer
from ray.air.config import RunConfig, ScalingConfig, CheckpointConfig
from ray.air.integrations.mlflow import MLflowLoggerCallback
from ray.tune import SyncConfig

# Eight GPU workers, each reserving one GPU and `num_cpus` CPUs.
scaling_config = ScalingConfig(
    num_workers=8,
    use_gpu=True,
    resources_per_worker={"GPU": 1, "CPU": num_cpus},
)

# Results go to cluster storage with syncing disabled; metrics are logged to
# MLflow under the `name` experiment; one checkpoint is kept, ranked by the
# lowest "eval_loss".
run_config = RunConfig(
    local_dir="/mnt/cluster_storage/",
    sync_config=SyncConfig(syncer=None),
    callbacks=[MLflowLoggerCallback(experiment_name=name)],
    checkpoint_config=CheckpointConfig(
        num_to_keep=1,
        checkpoint_score_attribute="eval_loss",
        checkpoint_score_order="min",
    ),
)

# Split the raw text into paragraphs first, then tokenize them.
preprocessing_chain = Chain(
    string_splitter,
    TokenizerPreprocessor("EleutherAI/gpt-j-6B", "text"),
)

trainer = HuggingFaceTrainer(
    trainer_init_per_worker=trainer_init_per_worker,
    scaling_config=scaling_config,
    datasets={"train": ray_datasets["train"], "evaluation": ray_datasets["validation"]},
    run_config=run_config,
    preprocessor=preprocessing_chain,
)

In [None]:
# Run the fine-tuning job; the returned result object is inspected in the next cell.
results = trainer.fit()

In [None]:
# Display the checkpoint attached to the training result.
results.checkpoint

In [8]:
from ray.air import Checkpoint
# Directory of a checkpoint written by an earlier training run.
# NOTE(review): this is a hard-coded absolute path, independent of the `results`
# object from the training cell — update it when pointing at a different run.
checkpoint_dir = "/mnt/cluster_storage/HuggingFaceTrainer_2023-02-08_14-55-52/HuggingFaceTrainer_bca80_00000_0_2023-02-08_14-55-53/rank_0/gpt-j-6B/checkpoint-394/"
checkpoint = Checkpoint.from_directory(checkpoint_dir)

In [9]:
# Display the checkpoint object to confirm the path it resolved to.
checkpoint

Checkpoint(local_path=/efs/workspaces/expwrk_pz38qgkkv829gd4lz5k5zisa9g/cluster_storage/HuggingFaceTrainer_2023-02-08_14-55-52/HuggingFaceTrainer_bca80_00000_0_2023-02-08_14-55-53/rank_0/gpt-j-6B/checkpoint-394)

In [10]:
from ray.train.huggingface import HuggingFaceCheckpoint
from ray.air._internal.checkpointing import (
    load_preprocessor_from_dir,
    save_preprocessor_to_dir,
)

class HuggingFaceCheckpointPatched(HuggingFaceCheckpoint):
    """``HuggingFaceCheckpoint`` that always reads its preprocessor from disk."""

    def get_preprocessor(self):
        """Return the preprocessor loaded from this checkpoint's directory.

        The checkpoint is exposed as a directory via ``as_directory()`` and the
        result of ``load_preprocessor_from_dir`` on that path is returned.
        """
        with self.as_directory() as checkpoint_dir:
            return load_preprocessor_from_dir(checkpoint_dir)

In [32]:
from ray.train.huggingface import HuggingFacePredictor
from transformers import set_seed

@ray.remote(num_gpus=1)
def predict(uri, seed=None, prompt="Romeo:"):
    """Generate text continuations from a fine-tuned checkpoint on one GPU.

    Args:
        uri: Checkpoint URI passed to ``HuggingFaceCheckpointPatched.from_uri``.
        seed: Seed given to ``transformers.set_seed``. When ``None`` a random
            value in ``[0, 2**16)`` is drawn so each task samples differently.
        prompt: Text the generation starts from. Defaults to ``"Romeo:"``.

    Returns:
        The result of ``predictor.predict`` for the single-row prompt frame,
        sampling three sequences of up to 256 new tokens each.
    """
    if seed is None:
        # Generator.integers returns a NumPy integer; convert it to a builtin
        # int because newer Python versions reject other types in random.seed.
        seed = int(np.random.default_rng().integers(0, 2**16))
    print(f"seed: {seed}")
    set_seed(seed)
    checkpoint = HuggingFaceCheckpointPatched.from_uri(uri)
    print("creating predictor")
    predictor = HuggingFacePredictor.from_checkpoint(
        checkpoint, task="text-generation", device=0, torch_dtype=torch.bfloat16
    )
    # No need to use AIR preprocessor, and it looks like the one I coded has
    # issues with being loaded, so we just get rid of it
    predictor._preprocessor = None
    print("predicting")
    return predictor.predict(
        pd.DataFrame([[prompt]]),
        do_sample=True,
        max_new_tokens=256,
        top_k=50,
        top_p=0.95,
        num_return_sequences=3,
    )

In [34]:
# Submit eight independent generation tasks (each requests one GPU), then block
# until all of them have returned.
prediction_tasks = [predict.remote(checkpoint.uri) for _ in range(8)]
predictions = ray.get(prediction_tasks)

(pid=729, ip=10.0.61.192)   from pandas import MultiIndex, Int64Index
(pid=3174, ip=10.0.16.95)   from pandas import MultiIndex, Int64Index
(pid=736, ip=10.0.2.170)   from pandas import MultiIndex, Int64Index
(pid=729, ip=10.0.60.56)   from pandas import MultiIndex, Int64Index
(pid=728, ip=10.0.42.1)   from pandas import MultiIndex, Int64Index
(pid=3174, ip=10.0.16.95) comet_ml is installed but `COMET_API_KEY` is not set.
(pid=729, ip=10.0.61.192) comet_ml is installed but `COMET_API_KEY` is not set.
(pid=485, ip=10.0.59.166)   from pandas import MultiIndex, Int64Index
(pid=736, ip=10.0.2.170) comet_ml is installed but `COMET_API_KEY` is not set.
(pid=729, ip=10.0.60.56) comet_ml is installed but `COMET_API_KEY` is not set.


(predict pid=3174, ip=10.0.16.95) seed: 36085
(predict pid=3174, ip=10.0.16.95) creating predictor
(predict pid=729, ip=10.0.61.192) seed: 5197
(predict pid=729, ip=10.0.61.192) creating predictor
(predict pid=736, ip=10.0.2.170) seed: 40401


(pid=485, ip=10.0.12.116)   from pandas import MultiIndex, Int64Index


(predict pid=736, ip=10.0.2.170) creating predictor
(predict pid=729, ip=10.0.60.56) seed: 11975
(predict pid=729, ip=10.0.60.56) creating predictor


(pid=728, ip=10.0.42.1) comet_ml is installed but `COMET_API_KEY` is not set.
(pid=486, ip=10.0.11.114)   from pandas import MultiIndex, Int64Index


(predict pid=728, ip=10.0.42.1) seed: 22916
(predict pid=728, ip=10.0.42.1) creating predictor


(pid=485, ip=10.0.59.166) comet_ml is installed but `COMET_API_KEY` is not set.


(predict pid=485, ip=10.0.59.166) seed: 63324
(predict pid=485, ip=10.0.59.166) creating predictor


(pid=485, ip=10.0.12.116) comet_ml is installed but `COMET_API_KEY` is not set.
(pid=486, ip=10.0.11.114) comet_ml is installed but `COMET_API_KEY` is not set.


(predict pid=485, ip=10.0.12.116) seed: 29835
(predict pid=485, ip=10.0.12.116) creating predictor
(predict pid=486, ip=10.0.11.114) seed: 25711
(predict pid=486, ip=10.0.11.114) creating predictor


(predict pid=729, ip=10.0.61.192) Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


(predict pid=729, ip=10.0.61.192) predicting
(predict pid=736, ip=10.0.2.170) predicting




(predict pid=3174, ip=10.0.16.95) predicting


(predict pid=3174, ip=10.0.16.95) Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
(predict pid=736, ip=10.0.2.170) Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


(predict pid=729, ip=10.0.60.56) predicting


(predict pid=729, ip=10.0.60.56) Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


(predict pid=728, ip=10.0.42.1) predicting


(predict pid=728, ip=10.0.42.1) Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


(predict pid=485, ip=10.0.59.166) predicting


(predict pid=485, ip=10.0.59.166) Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


(predict pid=486, ip=10.0.11.114) predicting


(predict pid=486, ip=10.0.11.114) Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


(predict pid=485, ip=10.0.12.116) predicting


(predict pid=485, ip=10.0.12.116) Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [53]:
# Display the results: one DataFrame per prediction task, each with a
# `generated_text` column holding that task's three sampled continuations.
predictions

[                                      generated_text
 0                             Romeo:\nI, it;, be at.
 1  Romeo:\nAnd,?\nNow\nWhat thee be one all a lor...
 2                                        Romeo:\nI:.,
                                       generated_text
 0  Romeo:\nAnd the thy son and a I of no father o...
 1  Romeo:\nThe me,, of my love's not and my a not...
 2                             Romeo:\nHI, thou love.,
                                       generated_text
 0                         Romeo:\nThat do you's you!
 1  Romeo:\nTo my other her of this her shall but ...
 2                        Romeo:\nIishness in a king.,
          generated_text
 0   Romeo:\nHow of me,.
 1      Romeo:\nTo I it.
 2  Romeo:\nBut have of.,
                                       generated_text
 0     Romeo:\nAs will in the man and my heart\nNow,.
 1                     Romeo:\nI's a-ay, it, thy man.
 2  Romeo:\nThe more dto the love,\nIly my a more ...,
                               