From 68ae1e8586b1303a53c6047f022a80efe669092c Mon Sep 17 00:00:00 2001
From: minimario
Date: Tue, 6 Dec 2022 18:46:49 +0000
Subject: [PATCH 1/3] input length experiments

---
 Makefile                  | 7 +++++++
 run.sh                    | 0
 run_input_length.sh       | 8 ++++++++
 src/main.py               | 9 ++++++---
 src/pipelines/pipeline.py | 4 +++-
 src/utils/__init__.py     | 2 +-
 src/utils/arguments.py    | 1 +
 src/utils/dummy.py        | 9 ++++++++-
 8 files changed, 34 insertions(+), 6 deletions(-)
 mode change 100644 => 100755 run.sh
 create mode 100755 run_input_length.sh

diff --git a/Makefile b/Makefile
index a47032a..2a833e0 100644
--- a/Makefile
+++ b/Makefile
@@ -59,3 +59,10 @@ hf-1b-GPT2-mqa1-int8:
 
 ds-inference-1b-GPT2-mqa1-fp16:
 	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --batch_size ${batch_size}
+
+# Input length experiments
+hf-1b-GPT2-mqa1-int8-input-length:
+	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}
+
+hf-1b-GPT2-mha-int8-input-length:
+	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}
\ No newline at end of file
diff --git a/run.sh b/run.sh
old mode 100644
new mode 100755
diff --git a/run_input_length.sh b/run_input_length.sh
new file mode 100755
index 0000000..740f102
--- /dev/null
+++ b/run_input_length.sh
@@ -0,0 +1,8 @@
+export CUDA_VISIBLE_DEVICES=0
+
+rm -rf ./tmp
+
+for max_input_length in {1900,1024,512,256,128,64,32,16,8,4}
+do
+    make $1 batch_size=4 max_input_length=$max_input_length
+done
\ No newline at end of file
diff --git a/src/main.py b/src/main.py
index 2046829..0e3f29b 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,13 +1,16 @@
 import pipelines
-from utils import benchmark_end_to_end, get_arg_parser, get_args, get_dummy_batch
-
+from utils import benchmark_end_to_end, get_arg_parser, get_args, get_dummy_batch, get_dummy_batch_tokenizer
+from transformers import AutoTokenizer
 
 
 def main() -> None:
     # deepspeed.init_distributed("nccl")
     args = get_args(get_arg_parser())
 
-    inputs = get_dummy_batch(args.batch_size)
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+    inputs = get_dummy_batch_tokenizer(args.batch_size, tokenizer, args.max_input_length)
+
     generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=False)
 
     pipeline_class = getattr(pipelines, args.pipeline_class)
diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py
index 20a0988..150ac48 100644
--- a/src/pipelines/pipeline.py
+++ b/src/pipelines/pipeline.py
@@ -11,9 +11,10 @@ def __init__(self, args: Namespace) -> None:
         self.config, self.tokenizer, self.model_class = get_config_tokenizer_model_class(args)
         self.model = None
         self.input_device = None
+        self.max_input_length = args.max_input_length
 
     def __call__(self, text: List[str], **generate_kwargs) -> Tuple[List[str], List[int]]:
-        input_tokens = self.tokenizer(text, return_tensors="pt", padding=True)
+        input_tokens = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=self.max_input_length)
 
         for t in input_tokens:
             if torch.is_tensor(input_tokens[t]):
@@ -25,6 +26,7 @@ def __call__(self, text: List[str], **generate_kwargs) -> Tuple[List[str], List[
         output_tokens = output.sequences
 
         input_token_lengths = [x.shape[0] for x in input_tokens.input_ids]
+        print("Input token lengths: ", input_token_lengths)
         output_token_lengths = [x.shape[0] for x in output_tokens]
         num_generated_tokens = [o - i for i, o in zip(input_token_lengths, output_token_lengths)]
 
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
index a947456..a611fc9 100644
--- a/src/utils/__init__.py
+++ b/src/utils/__init__.py
@@ -1,3 +1,3 @@
 from .arguments import get_arg_parser, get_args
 from .benchmark import benchmark_end_to_end
-from .dummy import get_dummy_batch
+from .dummy import get_dummy_batch, get_dummy_batch_tokenizer
diff --git a/src/utils/arguments.py b/src/utils/arguments.py
index 79f2497..bd99193 100644
--- a/src/utils/arguments.py
+++ b/src/utils/arguments.py
@@ -9,6 +9,7 @@ def get_arg_parser() -> ArgumentParser:
     parser.add_argument("--model_class", default="GPT2", type=str)
     parser.add_argument("--batch_size", default=1, type=int)
    parser.add_argument("--dtype", default="bfloat16", type=str)
+    parser.add_argument("--max_input_length", default=100, type=int)
     parser.add_argument("--max_new_tokens", default=100, type=int)
     parser.add_argument("--local_rank", type=int)
     parser.add_argument("--hidden_size", type=int)
diff --git a/src/utils/dummy.py b/src/utils/dummy.py
index ed06cdb..6e30c7d 100644
--- a/src/utils/dummy.py
+++ b/src/utils/dummy.py
@@ -1,7 +1,7 @@
 import copy
 import math
 from typing import List
-
+import random
 
 dummy_input_sentences = [
     "DeepSpeed is a machine learning framework",
@@ -24,3 +24,10 @@ def get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> List[
         input_sentences = input_sentences[:batch_size]
 
     return input_sentences
+
+def get_dummy_batch_tokenizer(batch_size: int, tokenizer, max_input_length: int) -> List[str]:
+    input_sentences = []
+    for i in range(batch_size):
+        sentence = [random.randint(0, tokenizer.vocab_size - 1) for _ in range(max_input_length)]
+        input_sentences.append(tokenizer.decode(sentence))
+    return input_sentences
\ No newline at end of file
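
Note (patch 1): get_dummy_batch_tokenizer builds each prompt by decoding uniformly random token IDs, but decoding and then re-encoding a string does not, in general, round-trip to exactly max_input_length tokens (BPE can merge or split the text differently). That is why the pipeline change also passes truncation=True with max_length. A minimal sketch of the drift, assuming only the transformers package and the same "gpt2" tokenizer (128 is an arbitrary example length):

    import random

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    ids = [random.randint(0, tokenizer.vocab_size - 1) for _ in range(128)]
    text = tokenizer.decode(ids)
    # Re-encoding the decoded string usually yields a token count close to,
    # but not exactly, the 128 IDs we started from.
    print(len(tokenizer(text).input_ids))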
From 6c1ab2398dac12885e1e7f6f27d95bbe40dcfa81 Mon Sep 17 00:00:00 2001
From: minimario
Date: Tue, 6 Dec 2022 21:11:27 +0000
Subject: [PATCH 2/3] sort input lengths in ascending order

---
 run_input_length.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/run_input_length.sh b/run_input_length.sh
index 740f102..9830c6a 100755
--- a/run_input_length.sh
+++ b/run_input_length.sh
@@ -2,7 +2,7 @@ export CUDA_VISIBLE_DEVICES=0
 
 rm -rf ./tmp
 
-for max_input_length in {1900,1024,512,256,128,64,32,16,8,4}
+for max_input_length in {4,8,16,32,64,128,256,512,1024,1536,1900}
 do
-    make $1 batch_size=4 max_input_length=$max_input_length
+    make $1 batch_size=32 max_input_length=$max_input_length
 done
\ No newline at end of file
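
Note (patch 2): run_input_length.sh takes a Makefile target as its first argument and sweeps max_input_length over the brace-expansion list. The {...} list is a bashism and the script has no shebang, so invoking it explicitly with bash is safest. A usage sketch with the two targets added in patch 1:

    bash run_input_length.sh hf-1b-GPT2-mqa1-int8-input-length
    bash run_input_length.sh hf-1b-GPT2-mha-int8-input-length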
From 4789384a3616f414e196ca4338191a491b3c23be Mon Sep 17 00:00:00 2001
From: minimario
Date: Tue, 6 Dec 2022 21:32:21 +0000
Subject: [PATCH 3/3] make default max input length -1

---
 src/main.py               | 9 ++++++---
 src/pipelines/pipeline.py | 5 ++++-
 src/utils/arguments.py    | 2 +-
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/main.py b/src/main.py
index 0e3f29b..5f2204f 100644
--- a/src/main.py
+++ b/src/main.py
@@ -7,9 +7,12 @@ def main() -> None:
 
     args = get_args(get_arg_parser())
 
-    tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
-    inputs = get_dummy_batch_tokenizer(args.batch_size, tokenizer, args.max_input_length)
+    if args.max_input_length == -1:
+        inputs = get_dummy_batch(args.batch_size)
+    else:
+        tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+        inputs = get_dummy_batch_tokenizer(args.batch_size, tokenizer, args.max_input_length)
 
     generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=False)
 
diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py
index 150ac48..c3e9baf 100644
--- a/src/pipelines/pipeline.py
+++ b/src/pipelines/pipeline.py
@@ -14,7 +14,10 @@ def __init__(self, args: Namespace) -> None:
         self.max_input_length = args.max_input_length
 
     def __call__(self, text: List[str], **generate_kwargs) -> Tuple[List[str], List[int]]:
-        input_tokens = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=self.max_input_length)
+        if self.max_input_length == -1:
+            input_tokens = self.tokenizer(text, return_tensors="pt", padding=True)
+        else:
+            input_tokens = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=self.max_input_length)
 
         for t in input_tokens:
             if torch.is_tensor(input_tokens[t]):
diff --git a/src/utils/arguments.py b/src/utils/arguments.py
index bd99193..158fbe3 100644
--- a/src/utils/arguments.py
+++ b/src/utils/arguments.py
@@ -9,7 +9,7 @@ def get_arg_parser() -> ArgumentParser:
     parser.add_argument("--model_class", default="GPT2", type=str)
     parser.add_argument("--batch_size", default=1, type=int)
     parser.add_argument("--dtype", default="bfloat16", type=str)
-    parser.add_argument("--max_input_length", default=100, type=int)
+    parser.add_argument("--max_input_length", default=-1, type=int)
     parser.add_argument("--max_new_tokens", default=100, type=int)
     parser.add_argument("--local_rank", type=int)
     parser.add_argument("--hidden_size", type=int)
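
Note (patch 3): with the new -1 default, the benchmark keeps its original behavior (canned dummy sentences, padding but no truncation) unless --max_input_length is passed explicitly. A sketch of both paths, with flags copied from the hf-1b-GPT2-mqa1-int8-input-length Makefile target (the batch size and the 512 length are arbitrary examples):

    # default (-1): get_dummy_batch sentences, no truncation
    python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size 32

    # explicit length: random-token prompts, truncated to 512 tokens at tokenization
    python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size 32 --max_input_length 512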