From 812a72c0220aab136432f4fc7dc344753d5a2dac Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 17 Oct 2022 17:17:09 +0530 Subject: [PATCH 01/43] refactor --- src/main.py | 18 +++ src/pipelines/__init__.py | 3 + src/{ => pipelines}/ds_inference.py | 21 +--- src/{ => pipelines}/hf.py | 25 ++-- src/{model.py => pipelines/pipeline.py} | 4 +- src/utils.py | 160 ------------------------ src/utils/__init__.py | 3 + src/utils/arguments.py | 23 ++++ src/utils/benchmark.py | 73 +++++++++++ src/utils/dummy.py | 26 ++++ src/utils/utils.py | 45 +++++++ 11 files changed, 205 insertions(+), 196 deletions(-) create mode 100644 src/main.py create mode 100644 src/pipelines/__init__.py rename src/{ => pipelines}/ds_inference.py (73%) rename src/{ => pipelines}/hf.py (50%) rename src/{model.py => pipelines/pipeline.py} (96%) delete mode 100644 src/utils.py create mode 100644 src/utils/__init__.py create mode 100644 src/utils/arguments.py create mode 100644 src/utils/benchmark.py create mode 100644 src/utils/dummy.py create mode 100644 src/utils/utils.py diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..2046829 --- /dev/null +++ b/src/main.py @@ -0,0 +1,18 @@ +import pipelines +from utils import benchmark_end_to_end, get_arg_parser, get_args, get_dummy_batch + + +def main() -> None: + # deepspeed.init_distributed("nccl") + + args = get_args(get_arg_parser()) + + inputs = get_dummy_batch(args.batch_size) + generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=False) + + pipeline_class = getattr(pipelines, args.pipeline_class) + benchmark_end_to_end(args, pipeline_class, inputs, generate_kwargs) + + +if __name__ == "__main__": + main() diff --git a/src/pipelines/__init__.py b/src/pipelines/__init__.py new file mode 100644 index 0000000..2581940 --- /dev/null +++ b/src/pipelines/__init__.py @@ -0,0 +1,3 @@ +from .ds_inference import DS_Inference_Pipeline +from .hf import HF_CPU_Pipeline, HF_GPU_Pipeline +from .pipeline import Pipeline diff --git a/src/ds_inference.py b/src/pipelines/ds_inference.py similarity index 73% rename from src/ds_inference.py rename to src/pipelines/ds_inference.py index 28997a3..fca1cb2 100644 --- a/src/ds_inference.py +++ b/src/pipelines/ds_inference.py @@ -5,12 +5,10 @@ import torch from transformers import BloomForCausalLM -import utils -from model import Model -from utils import benchmark_end_to_end, get_dummy_batch +from .pipeline import Pipeline -class HFAccelerateModel(Model): +class DS_Inference_Pipeline(Pipeline): def __init__(self, args: Namespace) -> None: super().__init__(args) @@ -40,18 +38,3 @@ def __init__(self, args: Namespace) -> None: ) self.input_device = torch.cuda.current_device() - - -def main() -> None: - deepspeed.init_distributed("nccl") - - args = utils.get_args(utils.get_arg_parser()) - - inputs = get_dummy_batch(args.batch_size) - generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=False) - - benchmark_end_to_end(args, HFAccelerateModel, inputs, generate_kwargs) - - -if __name__ == "__main__": - main() diff --git a/src/hf.py b/src/pipelines/hf.py similarity index 50% rename from src/hf.py rename to src/pipelines/hf.py index 1d411eb..e662c1e 100644 --- a/src/hf.py +++ b/src/pipelines/hf.py @@ -3,13 +3,11 @@ import torch from transformers import BloomForCausalLM -import utils -from model import Model -from utils import benchmark_end_to_end, get_dummy_batch +from .pipeline import Pipeline -class HFAccelerateModel(Model): - def __init__(self, args: Namespace) -> None: +class HF_Pipeline(Pipeline): + def 
__init__(self, args: Namespace, device: str = "cpu") -> None: super().__init__(args) model_kwargs = {} @@ -18,19 +16,16 @@ def __init__(self, args: Namespace) -> None: else: model_kwargs["torch_dtype"] = args.dtype - self.input_device = "cuda:0" + self.input_device = device self.model = BloomForCausalLM._from_config(self.config, **model_kwargs).to(self.input_device) self.model.eval() -def main() -> None: - args = utils.get_args(utils.get_arg_parser()) - - inputs = get_dummy_batch(args.batch_size) - generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=False) - - benchmark_end_to_end(args, HFAccelerateModel, inputs, generate_kwargs) +class HF_CPU_Pipeline(HF_Pipeline): + def __init__(self, args: Namespace) -> None: + super().__init__(args, "cpu") -if __name__ == "__main__": - main() +class HF_GPU_Pipeline(HF_CPU_Pipeline): + def __init__(self, args: Namespace) -> None: + super().__init__(args, "cuda:0") diff --git a/src/model.py b/src/pipelines/pipeline.py similarity index 96% rename from src/model.py rename to src/pipelines/pipeline.py index 35d0804..9d20cbf 100644 --- a/src/model.py +++ b/src/pipelines/pipeline.py @@ -5,7 +5,7 @@ from transformers import AutoTokenizer, BloomConfig -class Model(torch.nn.Module): +class Pipeline: def __init__(self, args: Namespace) -> None: super().__init__() @@ -45,7 +45,7 @@ def __init__(self, args: Namespace) -> None: self.model = None self.input_device = None - def generate(self, text: List[str], **generate_kwargs) -> Tuple[List[str], List[int]]: + def __call__(self, text: List[str], **generate_kwargs) -> Tuple[List[str], List[int]]: input_tokens = self.tokenizer(text, return_tensors="pt", padding=True) for t in input_tokens: diff --git a/src/utils.py b/src/utils.py deleted file mode 100644 index 5465ce1..0000000 --- a/src/utils.py +++ /dev/null @@ -1,160 +0,0 @@ -import copy -import gc -import math -import time -from argparse import ArgumentParser, Namespace -from functools import partial -from typing import Any, List, Tuple, Union - -import torch -import torch.distributed as dist - -from model import Model - - -# used for benchmarks -dummy_input_sentences = [ - "DeepSpeed is a machine learning framework", - "He is working on", - "He has a", - "He got all", - "Everyone is happy and I can", - "The new movie that got Oscar this year", - "In the far far distance from our galaxy,", - "Peace is the only way", -] - - -def get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> List[str]: - if input_sentences is None: - input_sentences = copy.deepcopy(dummy_input_sentences) - - if batch_size > len(input_sentences): - input_sentences *= math.ceil(batch_size / len(input_sentences)) - input_sentences = input_sentences[:batch_size] - - return input_sentences - - -def get_arg_parser() -> ArgumentParser: - parser = ArgumentParser() - parser.add_argument("--batch_size", default=1, type=int) - parser.add_argument("--dtype", default="bfloat16", type=str) - parser.add_argument("--max_new_tokens", default=100, type=int) - parser.add_argument("--local_rank", type=int) - parser.add_argument("--hidden_size", type=int) - parser.add_argument("--n_head", type=int) - parser.add_argument("--n_layer", type=int) - parser.add_argument("--benchmark_cycles", type=int, default=5) - return parser - - -def get_args(parser: ArgumentParser) -> Namespace: - args = parser.parse_args() - args.dtype = getattr(torch, args.dtype) - return args - - -def run_rank_n(func: partial, barrier: bool = False, rank: int = 0, other_rank_output: Any = None) -> Any: - # runs 
function on only process with specified rank - if dist.is_initialized(): - if dist.get_rank() == rank: - output = func() - if barrier: - dist.barrier() - return output - else: - if barrier: - dist.barrier() - return other_rank_output - else: - return func() - - -def print_rank_n(*values, rank: int = 0) -> None: - # print on only process with specified rank - if dist.is_initialized(): - if dist.get_rank() == rank: - print(*values) - else: - print(*values) - - -def run_and_log_time(execs: Union[List[partial], partial]) -> Tuple[Union[List[Any], Any], float]: - # runs a function / list of functions and times them - start_time = time.time() - - if type(execs) == list: - results = [] - for f in execs: - results.append(f()) - else: - results = execs() - - time_elapsed = time.time() - start_time - return results, time_elapsed - - -def benchmark_generation(model: Model, text: List[str], generate_kwargs: dict, cycles: int = 5) -> int: - # run benchmarks for number of cycles - total_new_tokens_generated = 0 - for _ in range(cycles): - _, num_generated_tokens = model.generate(text, **generate_kwargs) - total_new_tokens_generated += sum(new_tokens for new_tokens in num_generated_tokens) - return total_new_tokens_generated - - -def get_benchmark_results( - benchmark_time: float, initialization_time: float, total_new_tokens_generated: int, batch_size: int, cycles: int -) -> str: - throughput = total_new_tokens_generated / benchmark_time - latency = benchmark_time / cycles - return f""" -*** Performance stats: -Throughput (including tokenization) = {throughput:.2f} tokens/sec -Throughput (including tokenization) = {1000 / throughput:.2f} msecs/token -Model loading time = {initialization_time:.2f} secs -Total tokens generated = {total_new_tokens_generated} with batch size = {batch_size} -Latency = {latency:.2f} secs -Model loading time + generation time per batch = {initialization_time + latency:.2f} secs -""" - - -def benchmark_end_to_end(args: Namespace, model_class: Model, text: List[str], generate_kwargs: dict) -> None: - model, initialization_time = run_and_log_time(partial(model_class, args=args)) - - print_rank_n("num params =", model.get_num_parameters()) - - print_rank_n(f"generate_kwargs = {generate_kwargs}") - print_rank_n(f"batch_size = {args.batch_size}") - - # warmup is a must if measuring speed as it's when all the optimizations are performed - # e.g. 
on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs - generated_text, _ = model.generate(text, **generate_kwargs) - - for i, o in zip(text, generated_text): - print_rank_n(f"{'-' * 60}\nINPUT = {i}\nOUTPUT = {o}\n") - - if args.benchmark_cycles > 0: - print_rank_n(f"*** Running benchmark") - - torch.cuda.empty_cache() - gc.collect() - torch.cuda.synchronize() - - # benchmark - total_new_tokens_generated, benchmark_time = run_and_log_time( - partial( - benchmark_generation, - model=model, - text=text, - generate_kwargs=generate_kwargs, - cycles=args.benchmark_cycles, - ) - ) - - print_rank_n( - get_benchmark_results( - benchmark_time, initialization_time, total_new_tokens_generated, args.batch_size, args.benchmark_cycles - ) - ) diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000..a947456 --- /dev/null +++ b/src/utils/__init__.py @@ -0,0 +1,3 @@ +from .arguments import get_arg_parser, get_args +from .benchmark import benchmark_end_to_end +from .dummy import get_dummy_batch diff --git a/src/utils/arguments.py b/src/utils/arguments.py new file mode 100644 index 0000000..4078d17 --- /dev/null +++ b/src/utils/arguments.py @@ -0,0 +1,23 @@ +from argparse import ArgumentParser, Namespace + +import torch + + +def get_arg_parser() -> ArgumentParser: + parser = ArgumentParser() + parser.add_argument("--pipeline_class", default="HF_GPU_Pipeline", type=str) + parser.add_argument("--batch_size", default=1, type=int) + parser.add_argument("--dtype", default="bfloat16", type=str) + parser.add_argument("--max_new_tokens", default=100, type=int) + parser.add_argument("--local_rank", type=int) + parser.add_argument("--hidden_size", type=int) + parser.add_argument("--n_head", type=int) + parser.add_argument("--n_layer", type=int) + parser.add_argument("--benchmark_cycles", type=int, default=5) + return parser + + +def get_args(parser: ArgumentParser) -> Namespace: + args = parser.parse_args() + args.dtype = getattr(torch, args.dtype) + return args diff --git a/src/utils/benchmark.py b/src/utils/benchmark.py new file mode 100644 index 0000000..0d71933 --- /dev/null +++ b/src/utils/benchmark.py @@ -0,0 +1,73 @@ +import gc +from argparse import Namespace +from functools import partial +from typing import List + +import torch + +from pipelines import Pipeline + + +def benchmark_generation(pipeline: Pipeline, text: List[str], generate_kwargs: dict, cycles: int = 5) -> int: + # run benchmarks for number of cycles + total_new_tokens_generated = 0 + for _ in range(cycles): + _, num_generated_tokens = pipeline(text, **generate_kwargs) + total_new_tokens_generated += sum(new_tokens for new_tokens in num_generated_tokens) + return total_new_tokens_generated + + +def get_benchmark_results( + benchmark_time: float, initialization_time: float, total_new_tokens_generated: int, batch_size: int, cycles: int +) -> str: + throughput = total_new_tokens_generated / benchmark_time + latency = benchmark_time / cycles + return f""" +*** Performance stats: +Throughput (including tokenization) = {throughput:.2f} tokens/sec +Throughput (including tokenization) = {1000 / throughput:.2f} msecs/token +Model loading time = {initialization_time:.2f} secs +Total tokens generated = {total_new_tokens_generated} with batch size = {batch_size} +Latency = {latency:.2f} secs +Model loading time + generation time per batch = {initialization_time + latency:.2f} secs +""" + + +def benchmark_end_to_end(args: Namespace, pipeline_class: Pipeline, text: List[str], generate_kwargs: 
dict) -> None: + pipeline, initialization_time = run_and_log_time(partial(pipeline_class, args=args)) + + print_rank_n("num params =", pipeline.get_num_parameters()) + + print_rank_n(f"generate_kwargs = {generate_kwargs}") + print_rank_n(f"batch_size = {args.batch_size}") + + # warmup is a must if measuring speed as it's when all the optimizations are performed + # e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs + generated_text, _ = pipeline(text, **generate_kwargs) + + for i, o in zip(text, generated_text): + print_rank_n(f"{'-' * 60}\nINPUT = {i}\nOUTPUT = {o}\n") + + if args.benchmark_cycles > 0: + print_rank_n(f"*** Running benchmark") + + torch.cuda.empty_cache() + gc.collect() + torch.cuda.synchronize() + + # benchmark + total_new_tokens_generated, benchmark_time = run_and_log_time( + partial( + benchmark_generation, + pipeline=pipeline, + text=text, + generate_kwargs=generate_kwargs, + cycles=args.benchmark_cycles, + ) + ) + + print_rank_n( + get_benchmark_results( + benchmark_time, initialization_time, total_new_tokens_generated, args.batch_size, args.benchmark_cycles + ) + ) diff --git a/src/utils/dummy.py b/src/utils/dummy.py new file mode 100644 index 0000000..ed06cdb --- /dev/null +++ b/src/utils/dummy.py @@ -0,0 +1,26 @@ +import copy +import math +from typing import List + + +dummy_input_sentences = [ + "DeepSpeed is a machine learning framework", + "He is working on", + "He has a", + "He got all", + "Everyone is happy and I can", + "The new movie that got Oscar this year", + "In the far far distance from our galaxy,", + "Peace is the only way", +] + + +def get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> List[str]: + if input_sentences is None: + input_sentences = copy.deepcopy(dummy_input_sentences) + + if batch_size > len(input_sentences): + input_sentences *= math.ceil(batch_size / len(input_sentences)) + input_sentences = input_sentences[:batch_size] + + return input_sentences diff --git a/src/utils/utils.py b/src/utils/utils.py new file mode 100644 index 0000000..2ae7b7f --- /dev/null +++ b/src/utils/utils.py @@ -0,0 +1,45 @@ +import time +from functools import partial +from typing import Any, List, Tuple, Union + +import torch.distributed as dist + + +def run_and_log_time(execs: Union[List[partial], partial]) -> Tuple[Union[List[Any], Any], float]: + # runs a function / list of functions and times them + start_time = time.time() + + if type(execs) == list: + results = [] + for f in execs: + results.append(f()) + else: + results = execs() + + time_elapsed = time.time() - start_time + return results, time_elapsed + + +def run_rank_n(func: partial, barrier: bool = False, rank: int = 0, other_rank_output: Any = None) -> Any: + # runs function on only process with specified rank + if dist.is_initialized(): + if dist.get_rank() == rank: + output = func() + if barrier: + dist.barrier() + return output + else: + if barrier: + dist.barrier() + return other_rank_output + else: + return func() + + +def print_rank_n(*values, rank: int = 0) -> None: + # print on only process with specified rank + if dist.is_initialized(): + if dist.get_rank() == rank: + print(*values) + else: + print(*values) From 21713aca58dd1099d5e2121cab2a68a1a73aee08 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 17 Oct 2022 17:41:11 +0530 Subject: [PATCH 02/43] refactor --- src/pipelines/pipeline.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py index 9d20cbf..7c6dfb8 
100644 --- a/src/pipelines/pipeline.py +++ b/src/pipelines/pipeline.py @@ -7,8 +7,6 @@ class Pipeline: def __init__(self, args: Namespace) -> None: - super().__init__() - self.config = BloomConfig.from_dict( { "apply_residual_connection_post_layernorm": False, @@ -40,6 +38,7 @@ def __init__(self, args: Namespace) -> None: } ) + # hardcoded for now to bigscience/bloom self.tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom") self.model = None From 0ea738aac299035ca37a075c97d2564849052efb Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 17 Oct 2022 17:44:40 +0530 Subject: [PATCH 03/43] refactor --- benchmark.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark.sh b/benchmark.sh index 001d543..25ec7e8 100644 --- a/benchmark.sh +++ b/benchmark.sh @@ -1,5 +1,5 @@ # HF -python src/hf.py --hidden_size 6144 --n_head 32 --n_layer 30 +python src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class HF_GPU_Pipeline # DS-inference -deepspeed --num_gpus 1 src/ds_inference.py --hidden_size 6144 --n_head 32 --n_layer 30 +deepspeed --num_gpus 1 src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class DS_Inference_Pipeline From 6239fc6658b82ee8d48077df1af1d04516b4d612 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 17 Oct 2022 17:46:19 +0530 Subject: [PATCH 04/43] refactor --- src/utils/benchmark.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/utils/benchmark.py b/src/utils/benchmark.py index 0d71933..709c2fd 100644 --- a/src/utils/benchmark.py +++ b/src/utils/benchmark.py @@ -7,6 +7,8 @@ from pipelines import Pipeline +from .utils import print_rank_n, run_and_log_time + def benchmark_generation(pipeline: Pipeline, text: List[str], generate_kwargs: dict, cycles: int = 5) -> int: # run benchmarks for number of cycles From 01e9515c611be1ed7e0181729ec8bf16afb90ff3 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 17 Oct 2022 17:49:23 +0530 Subject: [PATCH 05/43] refactor --- src/pipelines/hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipelines/hf.py b/src/pipelines/hf.py index e662c1e..22da2d2 100644 --- a/src/pipelines/hf.py +++ b/src/pipelines/hf.py @@ -26,6 +26,6 @@ def __init__(self, args: Namespace) -> None: super().__init__(args, "cpu") -class HF_GPU_Pipeline(HF_CPU_Pipeline): +class HF_GPU_Pipeline(HF_Pipeline): def __init__(self, args: Namespace) -> None: super().__init__(args, "cuda:0") From b5a29b889d72eaeb77867787bcefda3fe75d7c4f Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Thu, 1 Dec 2022 11:46:54 +0530 Subject: [PATCH 06/43] test --- Makefile | 5 +++ benchmark.sh | 5 --- src/pipelines/pipeline.py | 68 ++++++++++++++++++--------------------- src/utils/arguments.py | 1 + 4 files changed, 38 insertions(+), 41 deletions(-) create mode 100644 Makefile delete mode 100644 benchmark.sh diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..cf4a539 --- /dev/null +++ b/Makefile @@ -0,0 +1,5 @@ +hf: + python src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class HF_CPU_Pipeline --model_class GPT2 --n_positions 2048 + +ds-inference: + deepspeed --num_gpus 1 src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class DS_Inference_Pipeline --model_class BLOOM diff --git a/benchmark.sh b/benchmark.sh deleted file mode 100644 index 25ec7e8..0000000 --- a/benchmark.sh +++ /dev/null @@ -1,5 +0,0 @@ -# HF -python src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class HF_GPU_Pipeline - -# DS-inference -deepspeed --num_gpus 1 
src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class DS_Inference_Pipeline diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py index 7c6dfb8..f371cb5 100644 --- a/src/pipelines/pipeline.py +++ b/src/pipelines/pipeline.py @@ -1,46 +1,13 @@ from argparse import Namespace -from typing import List, Tuple +from typing import List, Tuple, Union import torch -from transformers import AutoTokenizer, BloomConfig +from transformers import AutoTokenizer, BloomConfig, GPT2Config class Pipeline: def __init__(self, args: Namespace) -> None: - self.config = BloomConfig.from_dict( - { - "apply_residual_connection_post_layernorm": False, - "architectures": ["BloomModel"], - "attention_dropout": 0.0, - "attention_softmax_in_fp32": True, - "bias_dropout_fusion": True, - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_dropout": 0.0, - "hidden_size": args.hidden_size, - "initializer_range": 0.02, - "layer_norm_epsilon": 1e-05, - "masked_softmax_fusion": True, - "model_type": "bloom", - "n_head": args.n_head, - "n_inner": None, - "n_layer": args.n_layer, - "offset_alibi": 100, - "pad_token_id": 3, - "pretraining_tp": 1, - "skip_bias_add": True, - "skip_bias_add_qkv": False, - "slow_but_exact": False, - "transformers_version": "4.22.2", - "unk_token_id": 0, - "use_cache": True, - "vocab_size": 250880, - } - ) - - # hardcoded for now to bigscience/bloom - self.tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom") - + self.config, self.tokenizer = get_config_tokenizer(args) self.model = None self.input_device = None @@ -69,3 +36,32 @@ def get_num_parameters(self) -> int: for i in self.model.parameters(): param_count += i.numel() return param_count + + +def get_config_tokenizer(args: Namespace) -> Union[BloomConfig, GPT2Config]: + tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom") + + if args.model_class.lower() == "bloom": + config = BloomConfig( + attention_softmax_in_fp32=True, + hidden_size=args.hidden_size, + n_head=args.n_head, + n_layer=args.n_layer, + vocab_size=len(tokenizer), + bos_token_id=tokenizer.bos_token_id, + eos_token_id=tokenizer.eos_token_id, + ) + elif args.model_class.lower() == "gpt2": + config = GPT2Config( + n_embd=args.hidden_size, + n_head=args.n_head, + n_layer=args.n_layer, + n_positions=args.n_positions, + bos_token_id=tokenizer.bos_token_id, + eos_token_id=tokenizer.eos_token_id, + attention_type=args.attention_type, + print_details=False, + vocab_size=len(tokenizer), + ) + + return config, tokenizer diff --git a/src/utils/arguments.py b/src/utils/arguments.py index 4078d17..9a46002 100644 --- a/src/utils/arguments.py +++ b/src/utils/arguments.py @@ -6,6 +6,7 @@ def get_arg_parser() -> ArgumentParser: parser = ArgumentParser() parser.add_argument("--pipeline_class", default="HF_GPU_Pipeline", type=str) + parser.add_argument("--model_class", default="GPT2", type=str) parser.add_argument("--batch_size", default=1, type=int) parser.add_argument("--dtype", default="bfloat16", type=str) parser.add_argument("--max_new_tokens", default=100, type=int) From e4a29b5e99588402c2a2693fac779e8594e31fcf Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Thu, 1 Dec 2022 13:26:58 +0530 Subject: [PATCH 07/43] test --- src/utils/arguments.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/utils/arguments.py b/src/utils/arguments.py index 9a46002..bcc3c38 100644 --- a/src/utils/arguments.py +++ b/src/utils/arguments.py @@ -12,6 +12,7 @@ def get_arg_parser() -> ArgumentParser: parser.add_argument("--max_new_tokens", default=100, type=int) 
parser.add_argument("--local_rank", type=int) parser.add_argument("--hidden_size", type=int) + parser.add_argument("--n_positions", type=int) parser.add_argument("--n_head", type=int) parser.add_argument("--n_layer", type=int) parser.add_argument("--benchmark_cycles", type=int, default=5) From 48f0aa094bb91df0b43e2afcd7c80de16685bbc7 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Thu, 1 Dec 2022 13:28:08 +0530 Subject: [PATCH 08/43] test --- Makefile | 2 +- src/utils/arguments.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index cf4a539..4c814de 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ hf: - python src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class HF_CPU_Pipeline --model_class GPT2 --n_positions 2048 + python src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class HF_CPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 ds-inference: deepspeed --num_gpus 1 src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class DS_Inference_Pipeline --model_class BLOOM diff --git a/src/utils/arguments.py b/src/utils/arguments.py index bcc3c38..a2322e0 100644 --- a/src/utils/arguments.py +++ b/src/utils/arguments.py @@ -12,6 +12,7 @@ def get_arg_parser() -> ArgumentParser: parser.add_argument("--max_new_tokens", default=100, type=int) parser.add_argument("--local_rank", type=int) parser.add_argument("--hidden_size", type=int) + parser.add_argument("--attention_type", type=int) parser.add_argument("--n_positions", type=int) parser.add_argument("--n_head", type=int) parser.add_argument("--n_layer", type=int) From 646b63b6a6fc8b1584bd5010c77af38968e7ffc1 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Thu, 1 Dec 2022 13:34:53 +0530 Subject: [PATCH 09/43] test --- src/pipelines/ds_inference.py | 3 +-- src/pipelines/hf.py | 3 +-- src/pipelines/pipeline.py | 10 ++++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/pipelines/ds_inference.py b/src/pipelines/ds_inference.py index fca1cb2..31d4d8e 100644 --- a/src/pipelines/ds_inference.py +++ b/src/pipelines/ds_inference.py @@ -3,7 +3,6 @@ import deepspeed import torch -from transformers import BloomForCausalLM from .pipeline import Pipeline @@ -16,7 +15,7 @@ def __init__(self, args: Namespace) -> None: # with deepspeed.OnDevice(dtype=torch.bfloat16, device="meta"): # model = BloomForCausalLM._from_config(config, torch_dtype=torch.bfloat16) - self.model = BloomForCausalLM._from_config(self.config, torch_dtype=torch.bfloat16) + self.model = self.model_class._from_config(self.config, torch_dtype=torch.bfloat16) self.model.eval() # checkpoints_json = os.path.join(args.model_name, "checkpoints.json") diff --git a/src/pipelines/hf.py b/src/pipelines/hf.py index 22da2d2..60b7a77 100644 --- a/src/pipelines/hf.py +++ b/src/pipelines/hf.py @@ -1,7 +1,6 @@ from argparse import Namespace import torch -from transformers import BloomForCausalLM from .pipeline import Pipeline @@ -17,7 +16,7 @@ def __init__(self, args: Namespace, device: str = "cpu") -> None: model_kwargs["torch_dtype"] = args.dtype self.input_device = device - self.model = BloomForCausalLM._from_config(self.config, **model_kwargs).to(self.input_device) + self.model = self.model_class._from_config(self.config, **model_kwargs).to(self.input_device) self.model.eval() diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py index f371cb5..551dc23 100644 --- a/src/pipelines/pipeline.py +++ b/src/pipelines/pipeline.py @@ -2,12 +2,12 @@ from typing import 
List, Tuple, Union import torch -from transformers import AutoTokenizer, BloomConfig, GPT2Config +from transformers import AutoTokenizer, BloomConfig, BloomForCausalLM, GPT2Config, GPT2LMHeadModel class Pipeline: def __init__(self, args: Namespace) -> None: - self.config, self.tokenizer = get_config_tokenizer(args) + self.config, self.tokenizer, self.model_class = get_config_tokenizer_model_class(args) self.model = None self.input_device = None @@ -38,7 +38,7 @@ def get_num_parameters(self) -> int: return param_count -def get_config_tokenizer(args: Namespace) -> Union[BloomConfig, GPT2Config]: +def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2Config]: tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom") if args.model_class.lower() == "bloom": @@ -51,6 +51,7 @@ def get_config_tokenizer(args: Namespace) -> Union[BloomConfig, GPT2Config]: bos_token_id=tokenizer.bos_token_id, eos_token_id=tokenizer.eos_token_id, ) + model_class = BloomForCausalLM elif args.model_class.lower() == "gpt2": config = GPT2Config( n_embd=args.hidden_size, @@ -63,5 +64,6 @@ def get_config_tokenizer(args: Namespace) -> Union[BloomConfig, GPT2Config]: print_details=False, vocab_size=len(tokenizer), ) + model_class = GPT2LMHeadModel - return config, tokenizer + return config, tokenizer, model_class From 3281d16a986ff29436a78560094954a62a1d6c74 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Thu, 1 Dec 2022 14:07:19 +0530 Subject: [PATCH 10/43] test --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4c814de..a6bddae 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ hf: - python src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class HF_CPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 + python src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 ds-inference: deepspeed --num_gpus 1 src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class DS_Inference_Pipeline --model_class BLOOM From 2fbb6c3cbd75f15710c48f4a3e5c37221d4679cf Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Thu, 1 Dec 2022 14:11:18 +0530 Subject: [PATCH 11/43] test --- Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Makefile b/Makefile index a6bddae..b34dcf8 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,11 @@ +install-mqa-transformers: + git clone https://github.com/bigcode-project/transformers.git; \ + cd transformers; \ + git checkout multi_query; \ + pip install .; \ + cd ..; \ + rm -rf transformers; + hf: python src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 From 1090704ebccd555be42c5bcc36673d7d8e438ae0 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Thu, 1 Dec 2022 15:39:51 +0530 Subject: [PATCH 12/43] test --- src/pipelines/pipeline.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py index 551dc23..3596c6b 100644 --- a/src/pipelines/pipeline.py +++ b/src/pipelines/pipeline.py @@ -60,10 +60,21 @@ def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2 n_positions=args.n_positions, bos_token_id=tokenizer.bos_token_id, eos_token_id=tokenizer.eos_token_id, - attention_type=args.attention_type, + attention_type=get_attention_type(args.attention_type), print_details=False, 
vocab_size=len(tokenizer), ) model_class = GPT2LMHeadModel return config, tokenizer, model_class + + +def get_attention_type(attention_type: int): + from transformers.models.gpt2.modeling_gpt2 import AttentionType + + if attention_type == 1: + return AttentionType.MULTI_HEAD + elif attention_type == 2: + return AttentionType.MULTI_QUERY + elif attention_type == 3: + return AttentionType.MULTI_QUERY_1 From b94ea81b776791c1fe23e83bd5e1b27d75f2db45 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 01:06:28 +0530 Subject: [PATCH 13/43] fp32, bf16, int8 --- Makefile | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- run.sh | 10 ++++++++++ 2 files changed, 62 insertions(+), 4 deletions(-) create mode 100644 run.sh diff --git a/Makefile b/Makefile index b34dcf8..2e75294 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,5 @@ +batch_size := 1 + install-mqa-transformers: git clone https://github.com/bigcode-project/transformers.git; \ cd transformers; \ @@ -6,8 +8,54 @@ install-mqa-transformers: cd ..; \ rm -rf transformers; -hf: - python src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 +# BLOOM AliBi +hf-1b-bloom-fp32: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype float32 --batch_size ${batch_size} + +hf-1b-bloom-bf16: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype bfloat16 --batch_size ${batch_size} + +hf-1b-bloom-int8: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype int8 --batch_size ${batch_size} + +ds-inference-1b-bloom: + deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class BLOOM --batch_size ${batch_size} + +# GPT2 MHA +hf-1b-GPT2-mha-fp32: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype float32 --batch_size ${batch_size} + +hf-1b-GPT2-mha-bf16: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype bfloat16 --batch_size ${batch_size} + +hf-1b-GPT2-mha-int8: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} + +ds-inference-1b-GPT2-mha: + deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --batch_size ${batch_size} + +# GPT2 MQA +hf-1b-GPT2-mqa-fp32: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype float32 --batch_size ${batch_size} + +hf-1b-GPT2-mqa-bf16: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype bfloat16 --batch_size ${batch_size} + +hf-1b-GPT2-mqa-int8: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype int8 --batch_size ${batch_size} + 
+ds-inference-1b-GPT2-mqa: + deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --batch_size ${batch_size} + +# GPT2 MQA1 +hf-1b-GPT2-mqa1-fp32: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype float32 --batch_size ${batch_size} + +hf-1b-GPT2-mqa1-bf16: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype bfloat16 --batch_size ${batch_size} + +hf-1b-GPT2-mqa1-int8: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size ${batch_size} -ds-inference: - deepspeed --num_gpus 1 src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class DS_Inference_Pipeline --model_class BLOOM +ds-inference-1b-GPT2-mqa1: + deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --batch_size ${batch_size} diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..dc05add --- /dev/null +++ b/run.sh @@ -0,0 +1,10 @@ +for bs in {1,2,4,8,16,32,64} +do + make $1 batch_size=$bs +done + +for i in {1..20} +do + bs=$(($i*128)) + make $1 batch_size=$bs +done From 17534fd97953297b78b19f41cabce61f624879bb Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 01:18:14 +0530 Subject: [PATCH 14/43] fp32, bf16, int8 --- src/pipelines/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py index 3596c6b..d812474 100644 --- a/src/pipelines/pipeline.py +++ b/src/pipelines/pipeline.py @@ -39,7 +39,7 @@ def get_num_parameters(self) -> int: def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2Config]: - tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom") + tokenizer = AutoTokenizer.from_pretrained("gpt2") if args.model_class.lower() == "bloom": config = BloomConfig( From e7230b5d2ee8b39005b305740c4522d0bf284194 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 01:20:46 +0530 Subject: [PATCH 15/43] fp32, bf16, int8 --- src/pipelines/pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py index d812474..7e6bca3 100644 --- a/src/pipelines/pipeline.py +++ b/src/pipelines/pipeline.py @@ -40,6 +40,7 @@ def get_num_parameters(self) -> int: def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2Config]: tokenizer = AutoTokenizer.from_pretrained("gpt2") + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) if args.model_class.lower() == "bloom": config = BloomConfig( From 38c616b09fb5eb9a9fdcaa1ed34c2376104beca6 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 01:35:48 +0530 Subject: [PATCH 16/43] use_cache --- src/pipelines/pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py index 7e6bca3..7fd15ad 100644 --- a/src/pipelines/pipeline.py +++ b/src/pipelines/pipeline.py @@ -51,6 +51,7 @@ def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2 vocab_size=len(tokenizer), bos_token_id=tokenizer.bos_token_id, 
eos_token_id=tokenizer.eos_token_id, + use_cache=True, ) model_class = BloomForCausalLM elif args.model_class.lower() == "gpt2": @@ -64,6 +65,7 @@ def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2 attention_type=get_attention_type(args.attention_type), print_details=False, vocab_size=len(tokenizer), + use_cache=True, ) model_class = GPT2LMHeadModel From 15a2c80fe5f683d5a5b245087f2507cc25168649 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 02:23:32 +0530 Subject: [PATCH 17/43] use_cache --- README.md | 3 ++- run.sh | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cbb2100..6c21a4e 100644 --- a/README.md +++ b/README.md @@ -1 +1,2 @@ -# bigcode-inference-benchmark \ No newline at end of file +# bigcode-inference-benchmark + diff --git a/run.sh b/run.sh index dc05add..3a8e1d1 100644 --- a/run.sh +++ b/run.sh @@ -1,3 +1,5 @@ +export CUDA_VISIBLE_DEVICES=0 + for bs in {1,2,4,8,16,32,64} do make $1 batch_size=$bs From 80ba9bbe93086c6c6c8b16746ececaad869c937f Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 03:51:40 +0530 Subject: [PATCH 18/43] gc --- README.md | 39 +++++++++++++++++++++++++++++++++++++++ src/utils/arguments.py | 1 + src/utils/benchmark.py | 7 ++++--- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6c21a4e..a87d23a 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,41 @@ # bigcode-inference-benchmark +BLOOM\ +HF-accelerate\ +A100 80GB + +tokens/sec | msec/token +| batch_size | fp32 | bf16 | int8
LLM.int8() | +|:----------:|:---------------:|:----:|:------------------:| +| 1 | 45.04 \| 22.20 | | | +| 2 | 84.39 \| 11.85 | | | +| 4 | 167.85 \| 5.96 | | | +| 8 | 326.72 \| 3.06 | | | +| 16 | 682.63 \| 1.46 | | | +| 32 | 1374.97 \| 0.73 | | | +| 64 | 2380.43 \| 0.42 | | | +| 128 | 2684.96 \| 0.37 | | | +| 256 | 3322.43 \| 0.30 | | | +| 384 | 2585.01 \| 0.39 | | | +| 512 | 2618.64 \| 0.38 | | | +| 640 | 2672.61 \| 0.37 | | | +| 768 | 2630.32 \| 0.38 | | | +| 896 | 2558.04 \| 0.39 | | | + +sec +| batch_size | fp32 | bf16 | int8
LLM.int8() | +|:----------:|:-----:|:----:|:------------------:| +| 1 | 2.22 | | | +| 2 | 2.37 | | | +| 4 | 2.38 | | | +| 8 | 2.45 | | | +| 16 | 2.34 | | | +| 32 | 2.33 | | | +| 64 | 2.69 | | | +| 128 | 4.77 | | | +| 256 | 7.71 | | | +| 384 | 14.85 | | | +| 512 | 19.55 | | | +| 640 | 23.95 | | | +| 768 | 29.20 | | | +| 896 | 35.03 | | | \ No newline at end of file diff --git a/src/utils/arguments.py b/src/utils/arguments.py index a2322e0..79f2497 100644 --- a/src/utils/arguments.py +++ b/src/utils/arguments.py @@ -17,6 +17,7 @@ def get_arg_parser() -> ArgumentParser: parser.add_argument("--n_head", type=int) parser.add_argument("--n_layer", type=int) parser.add_argument("--benchmark_cycles", type=int, default=5) + parser.add_argument("--clear_every_run", action="store_true") return parser diff --git a/src/utils/benchmark.py b/src/utils/benchmark.py index 709c2fd..aa14961 100644 --- a/src/utils/benchmark.py +++ b/src/utils/benchmark.py @@ -53,9 +53,10 @@ def benchmark_end_to_end(args: Namespace, pipeline_class: Pipeline, text: List[s if args.benchmark_cycles > 0: print_rank_n(f"*** Running benchmark") - torch.cuda.empty_cache() - gc.collect() - torch.cuda.synchronize() + if args.clear_every_run: + torch.cuda.empty_cache() + gc.collect() + torch.cuda.synchronize() # benchmark total_new_tokens_generated, benchmark_time = run_and_log_time( From f28f8ac15425065f75682768264d5d7f539b4531 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 04:37:57 +0530 Subject: [PATCH 19/43] benchmark --- README.md | 56 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index a87d23a..bc98092 100644 --- a/README.md +++ b/README.md @@ -7,35 +7,37 @@ A100 80GB tokens/sec | msec/token | batch_size | fp32 | bf16 | int8
LLM.int8() | |:----------:|:---------------:|:----:|:------------------:| -| 1 | 45.04 \| 22.20 | | | -| 2 | 84.39 \| 11.85 | | | -| 4 | 167.85 \| 5.96 | | | -| 8 | 326.72 \| 3.06 | | | -| 16 | 682.63 \| 1.46 | | | -| 32 | 1374.97 \| 0.73 | | | -| 64 | 2380.43 \| 0.42 | | | -| 128 | 2684.96 \| 0.37 | | | -| 256 | 3322.43 \| 0.30 | | | -| 384 | 2585.01 \| 0.39 | | | -| 512 | 2618.64 \| 0.38 | | | -| 640 | 2672.61 \| 0.37 | | | -| 768 | 2630.32 \| 0.38 | | | -| 896 | 2558.04 \| 0.39 | | | +| 1 | 45.31 \| 22.07 | | | +| 2 | 86.60 \| 11.55 | | | +| 4 | 171.38 \| 5.83 | | | +| 8 | 325.98 \| 3.07 | | | +| 16 | 655.23 \| 1.53 | | | +| 32 | 1356.57 \| 0.74 | | | +| 64 | 2373.14 \| 0.42 | | | +| 128 | 2688.91 \| 0.37 | | | +| 256 | 3325.01 \| 0.30 | | | +| 384 | 3261.28 \| 0.31 | | | +| 512 | 3369.69 \| 0.30 | | | +| 640 | 3506.41 \| 0.29 | | | +| 768 | 3461.95 \| 0.29 | | | +| 896 | 3346.01 \| 0.30 | | | +| 1024 | oom | | | sec | batch_size | fp32 | bf16 | int8
LLM.int8() | |:----------:|:-----:|:----:|:------------------:| -| 1 | 2.22 | | | -| 2 | 2.37 | | | -| 4 | 2.38 | | | +| 1 | 2.21 | | | +| 2 | 2.31 | | | +| 4 | 2.33 | | | | 8 | 2.45 | | | -| 16 | 2.34 | | | -| 32 | 2.33 | | | -| 64 | 2.69 | | | -| 128 | 4.77 | | | -| 256 | 7.71 | | | -| 384 | 14.85 | | | -| 512 | 19.55 | | | -| 640 | 23.95 | | | -| 768 | 29.20 | | | -| 896 | 35.03 | | | \ No newline at end of file +| 16 | 2.44 | | | +| 32 | 2.36 | | | +| 64 | 2.70 | | | +| 128 | 4.76 | | | +| 256 | 7.70 | | | +| 384 | 11.77 | | | +| 512 | 15.19 | | | +| 640 | 18.25 | | | +| 768 | 22.18 | | | +| 896 | 26.78 | | | +| 1024 | oom | | | \ No newline at end of file From d04dc1496f683a39dcb04dc7857bae7845bc33ff Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 07:06:23 +0530 Subject: [PATCH 20/43] benchmark --- README.md | 82 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index bc98092..320931b 100644 --- a/README.md +++ b/README.md @@ -5,39 +5,53 @@ HF-accelerate\ A100 80GB tokens/sec | msec/token -| batch_size | fp32 | bf16 | int8
LLM.int8() | -|:----------:|:---------------:|:----:|:------------------:| -| 1 | 45.31 \| 22.07 | | | -| 2 | 86.60 \| 11.55 | | | -| 4 | 171.38 \| 5.83 | | | -| 8 | 325.98 \| 3.07 | | | -| 16 | 655.23 \| 1.53 | | | -| 32 | 1356.57 \| 0.74 | | | -| 64 | 2373.14 \| 0.42 | | | -| 128 | 2688.91 \| 0.37 | | | -| 256 | 3325.01 \| 0.30 | | | -| 384 | 3261.28 \| 0.31 | | | -| 512 | 3369.69 \| 0.30 | | | -| 640 | 3506.41 \| 0.29 | | | -| 768 | 3461.95 \| 0.29 | | | -| 896 | 3346.01 \| 0.30 | | | -| 1024 | oom | | | +| batch_size | fp32 | bf16 | int8
LLM.int8() | +|:----------:|:---------------:|:---------------:|:------------------:| +| 1 | 45.31 \| 22.07 | 42.23 \| 23.68 | | +| 2 | 86.60 \| 11.55 | 83.79 \| 11.93 | | +| 4 | 171.38 \| 5.83 | 168.91 \| 5.92 | | +| 8 | 325.98 \| 3.07 | 328.11 \| 3.05 | | +| 16 | 655.23 \| 1.53 | 669.15 \| 1.49 | | +| 32 | 1356.57 \| 0.74 | 1277.78 \| 0.78 | | +| 64 | 2373.14 \| 0.42 | 2605.26 \| 0.38 | | +| 128 | 2688.91 \| 0.37 | 4780.32 \| 0.21 | | +| 256 | 3325.01 \| 0.30 | 6549.67 \| 0.15 | | +| 384 | 3261.28 \| 0.31 | 7319.86 \| 0.14 | | +| 512 | 3369.69 \| 0.30 | 7425.47 \| 0.13 | | +| 640 | 3506.41 \| 0.29 | 7553.05 \| 0.13 | | +| 768 | 3461.95 \| 0.29 | 7681.78 \| 0.13 | | +| 896 | 3346.01 \| 0.30 | 7544.19 \| 0.13 | | +| 1024 | oom | 7703.84 \| 0.13 | | +| 1152 | oom | 7728.71 \| 0.13 | | +| 1280 | oom | 7799.99 \| 0.13 | | +| 1408 | oom | 7776.64 \| 0.13 | | +| 1536 | oom | 7802.61 \| 0.13 | | +| 1664 | oom | 7783.20 \| 0.13 | | +| 1792 | oom | 7738.55 \| 0.13 | | +| 1920 | oom | oom | | sec -| batch_size | fp32 | bf16 | int8
LLM.int8() | -|:----------:|:-----:|:----:|:------------------:| -| 1 | 2.21 | | | -| 2 | 2.31 | | | -| 4 | 2.33 | | | -| 8 | 2.45 | | | -| 16 | 2.44 | | | -| 32 | 2.36 | | | -| 64 | 2.70 | | | -| 128 | 4.76 | | | -| 256 | 7.70 | | | -| 384 | 11.77 | | | -| 512 | 15.19 | | | -| 640 | 18.25 | | | -| 768 | 22.18 | | | -| 896 | 26.78 | | | -| 1024 | oom | | | \ No newline at end of file +| batch_size | fp32 | bf16 | int8
LLM.int8() | +|:----------:|:-----:|:-----:|:------------------:| +| 1 | 2.21 | 2.37 | | +| 2 | 2.31 | 2.39 | | +| 4 | 2.33 | 2.37 | | +| 8 | 2.45 | 2.44 | | +| 16 | 2.44 | 2.39 | | +| 32 | 2.36 | 2.50 | | +| 64 | 2.70 | 2.46 | | +| 128 | 4.76 | 2.68 | | +| 256 | 7.70 | 3.91 | | +| 384 | 11.77 | 5.25 | | +| 512 | 15.19 | 6.90 | | +| 640 | 18.25 | 8.47 | | +| 768 | 22.18 | 10.00 | | +| 896 | 26.78 | 11.88 | | +| 1024 | oom | 13.29 | | +| 1152 | oom | 14.91 | | +| 1280 | oom | 16.41 | | +| 1408 | oom | 18.11 | | +| 1536 | oom | 19.69 | | +| 1664 | oom | 21.38 | | +| 1792 | oom | 23.16 | | +| 1920 | oom | oom | | \ No newline at end of file From 9dc52684b0bbf293b51907e191973e9f473b4fda Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 07:16:01 +0530 Subject: [PATCH 21/43] benchmark --- README.md | 97 +++++++++++++++++++++++++++---------------------------- 1 file changed, 48 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index 320931b..fad6142 100644 --- a/README.md +++ b/README.md @@ -1,57 +1,56 @@ # bigcode-inference-benchmark BLOOM\ -HF-accelerate\ A100 80GB tokens/sec | msec/token -| batch_size | fp32 | bf16 | int8
LLM.int8() | -|:----------:|:---------------:|:---------------:|:------------------:| -| 1 | 45.31 \| 22.07 | 42.23 \| 23.68 | | -| 2 | 86.60 \| 11.55 | 83.79 \| 11.93 | | -| 4 | 171.38 \| 5.83 | 168.91 \| 5.92 | | -| 8 | 325.98 \| 3.07 | 328.11 \| 3.05 | | -| 16 | 655.23 \| 1.53 | 669.15 \| 1.49 | | -| 32 | 1356.57 \| 0.74 | 1277.78 \| 0.78 | | -| 64 | 2373.14 \| 0.42 | 2605.26 \| 0.38 | | -| 128 | 2688.91 \| 0.37 | 4780.32 \| 0.21 | | -| 256 | 3325.01 \| 0.30 | 6549.67 \| 0.15 | | -| 384 | 3261.28 \| 0.31 | 7319.86 \| 0.14 | | -| 512 | 3369.69 \| 0.30 | 7425.47 \| 0.13 | | -| 640 | 3506.41 \| 0.29 | 7553.05 \| 0.13 | | -| 768 | 3461.95 \| 0.29 | 7681.78 \| 0.13 | | -| 896 | 3346.01 \| 0.30 | 7544.19 \| 0.13 | | -| 1024 | oom | 7703.84 \| 0.13 | | -| 1152 | oom | 7728.71 \| 0.13 | | -| 1280 | oom | 7799.99 \| 0.13 | | -| 1408 | oom | 7776.64 \| 0.13 | | -| 1536 | oom | 7802.61 \| 0.13 | | -| 1664 | oom | 7783.20 \| 0.13 | | -| 1792 | oom | 7738.55 \| 0.13 | | -| 1920 | oom | oom | | +| batch_size | HF (fp32) | HF (bf16) | HF (int8) | +|:----------:|:---------------:|:---------------:|:---------------:| +| 1 | 45.31 \| 22.07 | 42.23 \| 23.68 | | +| 2 | 86.60 \| 11.55 | 83.79 \| 11.93 | | +| 4 | 171.38 \| 5.83 | 168.91 \| 5.92 | | +| 8 | 325.98 \| 3.07 | 328.11 \| 3.05 | | +| 16 | 655.23 \| 1.53 | 669.15 \| 1.49 | | +| 32 | 1356.57 \| 0.74 | 1277.78 \| 0.78 | | +| 64 | 2373.14 \| 0.42 | 2605.26 \| 0.38 | | +| 128 | 2688.91 \| 0.37 | 4780.32 \| 0.21 | | +| 256 | 3325.01 \| 0.30 | 6549.67 \| 0.15 | | +| 384 | 3261.28 \| 0.31 | 7319.86 \| 0.14 | | +| 512 | 3369.69 \| 0.30 | 7425.47 \| 0.13 | | +| 640 | 3506.41 \| 0.29 | 7553.05 \| 0.13 | | +| 768 | 3461.95 \| 0.29 | 7681.78 \| 0.13 | | +| 896 | 3346.01 \| 0.30 | 7544.19 \| 0.13 | | +| 1024 | oom | 7703.84 \| 0.13 | | +| 1152 | oom | 7728.71 \| 0.13 | | +| 1280 | oom | 7799.99 \| 0.13 | | +| 1408 | oom | 7776.64 \| 0.13 | | +| 1536 | oom | 7802.61 \| 0.13 | | +| 1664 | oom | 7783.20 \| 0.13 | | +| 1792 | oom | 7738.55 \| 0.13 | | +| 1920 | oom | oom | | sec -| batch_size | fp32 | bf16 | int8
LLM.int8() | -|:----------:|:-----:|:-----:|:------------------:| -| 1 | 2.21 | 2.37 | | -| 2 | 2.31 | 2.39 | | -| 4 | 2.33 | 2.37 | | -| 8 | 2.45 | 2.44 | | -| 16 | 2.44 | 2.39 | | -| 32 | 2.36 | 2.50 | | -| 64 | 2.70 | 2.46 | | -| 128 | 4.76 | 2.68 | | -| 256 | 7.70 | 3.91 | | -| 384 | 11.77 | 5.25 | | -| 512 | 15.19 | 6.90 | | -| 640 | 18.25 | 8.47 | | -| 768 | 22.18 | 10.00 | | -| 896 | 26.78 | 11.88 | | -| 1024 | oom | 13.29 | | -| 1152 | oom | 14.91 | | -| 1280 | oom | 16.41 | | -| 1408 | oom | 18.11 | | -| 1536 | oom | 19.69 | | -| 1664 | oom | 21.38 | | -| 1792 | oom | 23.16 | | -| 1920 | oom | oom | | \ No newline at end of file +| batch_size | HF (fp32) | HF (bf16) | HF (int8) | +|:----------:|:---------:|:---------:|:---------:| +| 1 | 2.21 | 2.37 | | +| 2 | 2.31 | 2.39 | | +| 4 | 2.33 | 2.37 | | +| 8 | 2.45 | 2.44 | | +| 16 | 2.44 | 2.39 | | +| 32 | 2.36 | 2.50 | | +| 64 | 2.70 | 2.46 | | +| 128 | 4.76 | 2.68 | | +| 256 | 7.70 | 3.91 | | +| 384 | 11.77 | 5.25 | | +| 512 | 15.19 | 6.90 | | +| 640 | 18.25 | 8.47 | | +| 768 | 22.18 | 10.00 | | +| 896 | 26.78 | 11.88 | | +| 1024 | oom | 13.29 | | +| 1152 | oom | 14.91 | | +| 1280 | oom | 16.41 | | +| 1408 | oom | 18.11 | | +| 1536 | oom | 19.69 | | +| 1664 | oom | 21.38 | | +| 1792 | oom | 23.16 | | +| 1920 | oom | oom | | \ No newline at end of file From 23a5eb1a6064571c0acffef6c6896462ce7aa453 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 18:56:19 +0530 Subject: [PATCH 22/43] fix --- run.sh | 2 ++ src/pipelines/ds_inference.py | 2 +- src/pipelines/hf.py | 6 +++++- src/pipelines/pipeline.py | 4 ++++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/run.sh b/run.sh index 3a8e1d1..342b4ff 100644 --- a/run.sh +++ b/run.sh @@ -1,5 +1,7 @@ export CUDA_VISIBLE_DEVICES=0 +rm -rf ./tmp + for bs in {1,2,4,8,16,32,64} do make $1 batch_size=$bs diff --git a/src/pipelines/ds_inference.py b/src/pipelines/ds_inference.py index 31d4d8e..96a27d5 100644 --- a/src/pipelines/ds_inference.py +++ b/src/pipelines/ds_inference.py @@ -15,7 +15,7 @@ def __init__(self, args: Namespace) -> None: # with deepspeed.OnDevice(dtype=torch.bfloat16, device="meta"): # model = BloomForCausalLM._from_config(config, torch_dtype=torch.bfloat16) - self.model = self.model_class._from_config(self.config, torch_dtype=torch.bfloat16) + self.model = self.model_class.from_pretrained("tmp", torch_dtype=torch.bfloat16) self.model.eval() # checkpoints_json = os.path.join(args.model_name, "checkpoints.json") diff --git a/src/pipelines/hf.py b/src/pipelines/hf.py index 60b7a77..48aeb56 100644 --- a/src/pipelines/hf.py +++ b/src/pipelines/hf.py @@ -10,13 +10,17 @@ def __init__(self, args: Namespace, device: str = "cpu") -> None: super().__init__(args) model_kwargs = {} + + if device.startswith("cuda"): + model_kwargs["device_map"] = "balanced" + if args.dtype == torch.int8: model_kwargs["load_in_8bit"] = True else: model_kwargs["torch_dtype"] = args.dtype self.input_device = device - self.model = self.model_class._from_config(self.config, **model_kwargs).to(self.input_device) + self.model = self.model_class.from_pretrained("tmp", **model_kwargs) self.model.eval() diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py index 7fd15ad..e7d0830 100644 --- a/src/pipelines/pipeline.py +++ b/src/pipelines/pipeline.py @@ -1,3 +1,4 @@ +import os from argparse import Namespace from typing import List, Tuple, Union @@ -69,6 +70,9 @@ def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2 ) model_class = GPT2LMHeadModel + if 
not os.path.exists("tmp"): + model_class._from_config(config).save_pretrained("tmp") + return config, tokenizer, model_class From 391e05597beb94787a61eaf674a47c3a0104b209 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 20:04:43 +0530 Subject: [PATCH 23/43] fix --- src/pipelines/hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipelines/hf.py b/src/pipelines/hf.py index 48aeb56..41405af 100644 --- a/src/pipelines/hf.py +++ b/src/pipelines/hf.py @@ -12,7 +12,7 @@ def __init__(self, args: Namespace, device: str = "cpu") -> None: model_kwargs = {} if device.startswith("cuda"): - model_kwargs["device_map"] = "balanced" + model_kwargs["device_map"] = {0: "80GIB"} if args.dtype == torch.int8: model_kwargs["load_in_8bit"] = True From 856c77b285176a01ca7dfd4e35e82d1f8a197d41 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 20:07:32 +0530 Subject: [PATCH 24/43] fix --- src/pipelines/hf.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/pipelines/hf.py b/src/pipelines/hf.py index 41405af..a3dca81 100644 --- a/src/pipelines/hf.py +++ b/src/pipelines/hf.py @@ -11,16 +11,13 @@ def __init__(self, args: Namespace, device: str = "cpu") -> None: model_kwargs = {} - if device.startswith("cuda"): - model_kwargs["device_map"] = {0: "80GIB"} - if args.dtype == torch.int8: model_kwargs["load_in_8bit"] = True else: model_kwargs["torch_dtype"] = args.dtype self.input_device = device - self.model = self.model_class.from_pretrained("tmp", **model_kwargs) + self.model = self.model_class.from_pretrained("tmp", **model_kwargs).to(self.input_device) self.model.eval() From 9d99f463271b1d4b74166966ccb496899cc085c8 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 22:14:12 +0530 Subject: [PATCH 25/43] fp32 --- README.md | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index fad6142..462050f 100644 --- a/README.md +++ b/README.md @@ -6,20 +6,20 @@ A100 80GB tokens/sec | msec/token | batch_size | HF (fp32) | HF (bf16) | HF (int8) | |:----------:|:---------------:|:---------------:|:---------------:| -| 1 | 45.31 \| 22.07 | 42.23 \| 23.68 | | -| 2 | 86.60 \| 11.55 | 83.79 \| 11.93 | | -| 4 | 171.38 \| 5.83 | 168.91 \| 5.92 | | -| 8 | 325.98 \| 3.07 | 328.11 \| 3.05 | | -| 16 | 655.23 \| 1.53 | 669.15 \| 1.49 | | -| 32 | 1356.57 \| 0.74 | 1277.78 \| 0.78 | | -| 64 | 2373.14 \| 0.42 | 2605.26 \| 0.38 | | -| 128 | 2688.91 \| 0.37 | 4780.32 \| 0.21 | | -| 256 | 3325.01 \| 0.30 | 6549.67 \| 0.15 | | -| 384 | 3261.28 \| 0.31 | 7319.86 \| 0.14 | | -| 512 | 3369.69 \| 0.30 | 7425.47 \| 0.13 | | -| 640 | 3506.41 \| 0.29 | 7553.05 \| 0.13 | | -| 768 | 3461.95 \| 0.29 | 7681.78 \| 0.13 | | -| 896 | 3346.01 \| 0.30 | 7544.19 \| 0.13 | | +| 1 | 44.38 \| 22.53 | 42.23 \| 23.68 | | +| 2 | 85.82 \| 11.65 | 83.79 \| 11.93 | | +| 4 | 171.77 \| 5.82 | 168.91 \| 5.92 | | +| 8 | 334.21 \| 2.99 | 328.11 \| 3.05 | | +| 16 | 658.77 \| 1.52 | 669.15 \| 1.49 | | +| 32 | 1312.31 \| 0.76 | 1277.78 \| 0.78 | | +| 64 | 2312.48 \| 0.43 | 2605.26 \| 0.38 | | +| 128 | 2686.37 \| 0.37 | 4780.32 \| 0.21 | | +| 256 | 3313.04 \| 0.30 | 6549.67 \| 0.15 | | +| 384 | 3253.52 \| 0.31 | 7319.86 \| 0.14 | | +| 512 | 3361.34 \| 0.30 | 7425.47 \| 0.13 | | +| 640 | 3497.55 \| 0.29 | 7553.05 \| 0.13 | | +| 768 | 3460.71 \| 0.29 | 7681.78 \| 0.13 | | +| 896 | 3339.99 \| 0.30 | 7544.19 \| 0.13 | | | 1024 | oom | 7703.84 \| 0.13 | | | 1152 | oom | 7728.71 \| 0.13 | | | 1280 | oom | 
7799.99 \| 0.13 | | @@ -32,20 +32,20 @@ tokens/sec | msec/token sec | batch_size | HF (fp32) | HF (bf16) | HF (int8) | |:----------:|:---------:|:---------:|:---------:| -| 1 | 2.21 | 2.37 | | -| 2 | 2.31 | 2.39 | | +| 1 | 2.25 | 2.37 | | +| 2 | 2.33 | 2.39 | | | 4 | 2.33 | 2.37 | | -| 8 | 2.45 | 2.44 | | -| 16 | 2.44 | 2.39 | | -| 32 | 2.36 | 2.50 | | -| 64 | 2.70 | 2.46 | | +| 8 | 2.39 | 2.44 | | +| 16 | 2.43 | 2.39 | | +| 32 | 2.44 | 2.50 | | +| 64 | 2.77 | 2.46 | | | 128 | 4.76 | 2.68 | | -| 256 | 7.70 | 3.91 | | -| 384 | 11.77 | 5.25 | | -| 512 | 15.19 | 6.90 | | -| 640 | 18.25 | 8.47 | | -| 768 | 22.18 | 10.00 | | -| 896 | 26.78 | 11.88 | | +| 256 | 7.73 | 3.91 | | +| 384 | 11.80 | 5.25 | | +| 512 | 15.23 | 6.90 | | +| 640 | 18.30 | 8.47 | | +| 768 | 22.19 | 10.00 | | +| 896 | 26.83 | 11.88 | | | 1024 | oom | 13.29 | | | 1152 | oom | 14.91 | | | 1280 | oom | 16.41 | | From dfe8cb32b44ce07597d9cc57b16ccc173c40cf83 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 23:03:31 +0530 Subject: [PATCH 26/43] bf16 --- README.md | 76 +++++++++++++++++++++++++++---------------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 462050f..26724ab 100644 --- a/README.md +++ b/README.md @@ -6,51 +6,51 @@ A100 80GB tokens/sec | msec/token | batch_size | HF (fp32) | HF (bf16) | HF (int8) | |:----------:|:---------------:|:---------------:|:---------------:| -| 1 | 44.38 \| 22.53 | 42.23 \| 23.68 | | -| 2 | 85.82 \| 11.65 | 83.79 \| 11.93 | | -| 4 | 171.77 \| 5.82 | 168.91 \| 5.92 | | -| 8 | 334.21 \| 2.99 | 328.11 \| 3.05 | | -| 16 | 658.77 \| 1.52 | 669.15 \| 1.49 | | -| 32 | 1312.31 \| 0.76 | 1277.78 \| 0.78 | | -| 64 | 2312.48 \| 0.43 | 2605.26 \| 0.38 | | -| 128 | 2686.37 \| 0.37 | 4780.32 \| 0.21 | | -| 256 | 3313.04 \| 0.30 | 6549.67 \| 0.15 | | -| 384 | 3253.52 \| 0.31 | 7319.86 \| 0.14 | | -| 512 | 3361.34 \| 0.30 | 7425.47 \| 0.13 | | -| 640 | 3497.55 \| 0.29 | 7553.05 \| 0.13 | | -| 768 | 3460.71 \| 0.29 | 7681.78 \| 0.13 | | -| 896 | 3339.99 \| 0.30 | 7544.19 \| 0.13 | | -| 1024 | oom | 7703.84 \| 0.13 | | -| 1152 | oom | 7728.71 \| 0.13 | | -| 1280 | oom | 7799.99 \| 0.13 | | -| 1408 | oom | 7776.64 \| 0.13 | | -| 1536 | oom | 7802.61 \| 0.13 | | -| 1664 | oom | 7783.20 \| 0.13 | | -| 1792 | oom | 7738.55 \| 0.13 | | +| 1 | 44.38 \| 22.53 | 41.00 \| 24.39 | | +| 2 | 85.82 \| 11.65 | 79.20 \| 12.63 | | +| 4 | 171.77 \| 5.82 | 160.72 \| 6.22 | | +| 8 | 334.21 \| 2.99 | 317.56 \| 3.15 | | +| 16 | 658.77 \| 1.52 | 644.14 \| 1.55 | | +| 32 | 1312.31 \| 0.76 | 1277.62 \| 0.78 | | +| 64 | 2312.48 \| 0.43 | 2683.15 \| 0.37 | | +| 128 | 2686.37 \| 0.37 | 4766.97 \| 0.21 | | +| 256 | 3313.04 \| 0.30 | 6578.06 \| 0.15 | | +| 384 | 3253.52 \| 0.31 | 7285.37 \| 0.14 | | +| 512 | 3361.34 \| 0.30 | 7436.71 \| 0.13 | | +| 640 | 3497.55 \| 0.29 | 7554.05 \| 0.13 | | +| 768 | 3460.71 \| 0.29 | 7678.89 \| 0.13 | | +| 896 | 3339.99 \| 0.30 | 7542.81 \| 0.13 | | +| 1024 | oom | 7702.06 \| 0.13 | | +| 1152 | oom | 7719.68 \| 0.13 | | +| 1280 | oom | 7786.51 \| 0.13 | | +| 1408 | oom | 7770.26 \| 0.13 | | +| 1536 | oom | 7783.86 \| 0.13 | | +| 1664 | oom | 7772.43 \| 0.13 | | +| 1792 | oom | 7747.92 \| 0.13 | | | 1920 | oom | oom | | sec | batch_size | HF (fp32) | HF (bf16) | HF (int8) | |:----------:|:---------:|:---------:|:---------:| -| 1 | 2.25 | 2.37 | | -| 2 | 2.33 | 2.39 | | -| 4 | 2.33 | 2.37 | | -| 8 | 2.39 | 2.44 | | -| 16 | 2.43 | 2.39 | | +| 1 | 2.25 | 2.44 | | +| 2 | 2.33 | 2.53 | | +| 4 | 2.33 | 2.49 | | +| 8 | 2.39 | 2.52 | | +| 
16 | 2.43 | 2.48 | | | 32 | 2.44 | 2.50 | | -| 64 | 2.77 | 2.46 | | -| 128 | 4.76 | 2.68 | | -| 256 | 7.73 | 3.91 | | -| 384 | 11.80 | 5.25 | | -| 512 | 15.23 | 6.90 | | +| 64 | 2.77 | 2.39 | | +| 128 | 4.76 | 2.69 | | +| 256 | 7.73 | 3.89 | | +| 384 | 11.80 | 5.27 | | +| 512 | 15.23 | 6.88 | | | 640 | 18.30 | 8.47 | | | 768 | 22.19 | 10.00 | | | 896 | 26.83 | 11.88 | | -| 1024 | oom | 13.29 | | -| 1152 | oom | 14.91 | | -| 1280 | oom | 16.41 | | -| 1408 | oom | 18.11 | | -| 1536 | oom | 19.69 | | -| 1664 | oom | 21.38 | | -| 1792 | oom | 23.16 | | +| 1024 | oom | 13.30 | | +| 1152 | oom | 14.92 | | +| 1280 | oom | 16.44 | | +| 1408 | oom | 18.12 | | +| 1536 | oom | 19.73 | | +| 1664 | oom | 21.41 | | +| 1792 | oom | 23.13 | | | 1920 | oom | oom | | \ No newline at end of file From 7344ae0c1de3891561afb70c4a66e4c44f1b7e7a Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 23:09:00 +0530 Subject: [PATCH 27/43] bf16 --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 2e75294..2ea810d 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ hf-1b-bloom-bf16: hf-1b-bloom-int8: python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype int8 --batch_size ${batch_size} -ds-inference-1b-bloom: +ds-inference-1b-bloom-fp16: deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class BLOOM --batch_size ${batch_size} # GPT2 MHA @@ -31,7 +31,7 @@ hf-1b-GPT2-mha-bf16: hf-1b-GPT2-mha-int8: python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} -ds-inference-1b-GPT2-mha: +ds-inference-1b-GPT2-mha-fp16: deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --batch_size ${batch_size} # GPT2 MQA @@ -44,7 +44,7 @@ hf-1b-GPT2-mqa-bf16: hf-1b-GPT2-mqa-int8: python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype int8 --batch_size ${batch_size} -ds-inference-1b-GPT2-mqa: +ds-inference-1b-GPT2-mqa-fp16: deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --batch_size ${batch_size} # GPT2 MQA1 @@ -57,5 +57,5 @@ hf-1b-GPT2-mqa1-bf16: hf-1b-GPT2-mqa1-int8: python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size ${batch_size} -ds-inference-1b-GPT2-mqa1: +ds-inference-1b-GPT2-mqa1-fp16: deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --batch_size ${batch_size} From a4c3b81cf0f295422e313edd77e2edf9384fcbcf Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 5 Dec 2022 00:30:16 +0530 Subject: [PATCH 28/43] ds-inference --- README.md | 107 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 56 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index 26724ab..6552df0 100644 --- a/README.md +++ b/README.md @@ -1,56 +1,61 @@ # bigcode-inference-benchmark +A100 
80GB BLOOM\ -A100 80GB +```python +hidden_size = 2048 +n_head = 16 +n_layer = 24 +``` -tokens/sec | msec/token -| batch_size | HF (fp32) | HF (bf16) | HF (int8) | -|:----------:|:---------------:|:---------------:|:---------------:| -| 1 | 44.38 \| 22.53 | 41.00 \| 24.39 | | -| 2 | 85.82 \| 11.65 | 79.20 \| 12.63 | | -| 4 | 171.77 \| 5.82 | 160.72 \| 6.22 | | -| 8 | 334.21 \| 2.99 | 317.56 \| 3.15 | | -| 16 | 658.77 \| 1.52 | 644.14 \| 1.55 | | -| 32 | 1312.31 \| 0.76 | 1277.62 \| 0.78 | | -| 64 | 2312.48 \| 0.43 | 2683.15 \| 0.37 | | -| 128 | 2686.37 \| 0.37 | 4766.97 \| 0.21 | | -| 256 | 3313.04 \| 0.30 | 6578.06 \| 0.15 | | -| 384 | 3253.52 \| 0.31 | 7285.37 \| 0.14 | | -| 512 | 3361.34 \| 0.30 | 7436.71 \| 0.13 | | -| 640 | 3497.55 \| 0.29 | 7554.05 \| 0.13 | | -| 768 | 3460.71 \| 0.29 | 7678.89 \| 0.13 | | -| 896 | 3339.99 \| 0.30 | 7542.81 \| 0.13 | | -| 1024 | oom | 7702.06 \| 0.13 | | -| 1152 | oom | 7719.68 \| 0.13 | | -| 1280 | oom | 7786.51 \| 0.13 | | -| 1408 | oom | 7770.26 \| 0.13 | | -| 1536 | oom | 7783.86 \| 0.13 | | -| 1664 | oom | 7772.43 \| 0.13 | | -| 1792 | oom | 7747.92 \| 0.13 | | -| 1920 | oom | oom | | +Throughput (tokens/sec | msec/token) +| batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | +|:----------:|:---------------:|:---------------:|:---------------:|:-------------------:| +| 1 | 44.38 \| 22.53 | 41.00 \| 24.39 | | 61.61 | 16.23 | +| 2 | 85.82 \| 11.65 | 79.20 \| 12.63 | | 121.55 | 8.23 | +| 4 | 171.77 \| 5.82 | 160.72 \| 6.22 | | 240.06 | 4.17 | +| 8 | 334.21 \| 2.99 | 317.56 \| 3.15 | | 492.42 | 2.03 | +| 16 | 658.77 \| 1.52 | 644.14 \| 1.55 | | 970.59 | 1.03 | +| 32 | 1312.31 \| 0.76 | 1277.62 \| 0.78 | | 1999.04 | 0.50 | +| 64 | 2312.48 \| 0.43 | 2683.15 \| 0.37 | | 3971.09 | 0.25 | +| 128 | 2686.37 \| 0.37 | 4766.97 \| 0.21 | | 7514.59 | 0.13 | +| 256 | 3313.04 \| 0.30 | 6578.06 \| 0.15 | | 10226.50 | 0.10 | +| 384 | 3253.52 \| 0.31 | 7285.37 \| 0.14 | | 11094.27 | 0.09 | +| 512 | 3361.34 \| 0.30 | 7436.71 \| 0.13 | | 11390.85 | 0.09 | +| 640 | 3497.55 \| 0.29 | 7554.05 \| 0.13 | | 11625.71 | 0.09 | +| 768 | 3460.71 \| 0.29 | 7678.89 \| 0.13 | | 11814.31 | 0.08 | +| 896 | 3339.99 \| 0.30 | 7542.81 \| 0.13 | | 11744.38 | 0.09 | +| 1024 | oom | 7702.06 \| 0.13 | | 11534.95 | 0.09 | +| 1152 | oom | 7719.68 \| 0.13 | | oom | +| 1280 | oom | 7786.51 \| 0.13 | | oom | +| 1408 | oom | 7770.26 \| 0.13 | | oom | +| 1536 | oom | 7783.86 \| 0.13 | | oom | +| 1664 | oom | 7772.43 \| 0.13 | | oom | +| 1792 | oom | 7747.92 \| 0.13 | | oom | +| 1920 | oom | oom | | oom | -sec -| batch_size | HF (fp32) | HF (bf16) | HF (int8) | -|:----------:|:---------:|:---------:|:---------:| -| 1 | 2.25 | 2.44 | | -| 2 | 2.33 | 2.53 | | -| 4 | 2.33 | 2.49 | | -| 8 | 2.39 | 2.52 | | -| 16 | 2.43 | 2.48 | | -| 32 | 2.44 | 2.50 | | -| 64 | 2.77 | 2.39 | | -| 128 | 4.76 | 2.69 | | -| 256 | 7.73 | 3.89 | | -| 384 | 11.80 | 5.27 | | -| 512 | 15.23 | 6.88 | | -| 640 | 18.30 | 8.47 | | -| 768 | 22.19 | 10.00 | | -| 896 | 26.83 | 11.88 | | -| 1024 | oom | 13.30 | | -| 1152 | oom | 14.92 | | -| 1280 | oom | 16.44 | | -| 1408 | oom | 18.12 | | -| 1536 | oom | 19.73 | | -| 1664 | oom | 21.41 | | -| 1792 | oom | 23.13 | | -| 1920 | oom | oom | | \ No newline at end of file +Latency (sec) +| batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | +|:----------:|:---------:|:---------:|:---------:|:-------------------:| +| 1 | 2.25 | 2.44 | | 1.62 | +| 2 | 2.33 | 2.53 | | 1.65 | +| 4 | 2.33 | 2.49 | | 1.67 | +| 8 | 2.39 | 2.52 | | 1.62 | +| 16 | 2.43 | 2.48 
| | 1.65 | +| 32 | 2.44 | 2.50 | | 1.60 | +| 64 | 2.77 | 2.39 | | 1.61 | +| 128 | 4.76 | 2.69 | | 1.70 | +| 256 | 7.73 | 3.89 | | 2.50 | +| 384 | 11.80 | 5.27 | | 3.46 | +| 512 | 15.23 | 6.88 | | 4.49 | +| 640 | 18.30 | 8.47 | | 5.51 | +| 768 | 22.19 | 10.00 | | 6.50 | +| 896 | 26.83 | 11.88 | | 7.63 | +| 1024 | oom | 13.30 | | 8.88 | +| 1152 | oom | 14.92 | | oom | +| 1280 | oom | 16.44 | | oom | +| 1408 | oom | 18.12 | | oom | +| 1536 | oom | 19.73 | | oom | +| 1664 | oom | 21.41 | | oom | +| 1792 | oom | 23.13 | | oom | +| 1920 | oom | oom | | oom | \ No newline at end of file From a0f308db975d271fd9fcf0cba27a576e59ae60dd Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 5 Dec 2022 03:23:15 +0530 Subject: [PATCH 29/43] device map --- src/pipelines/hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipelines/hf.py b/src/pipelines/hf.py index a3dca81..e7d2ac0 100644 --- a/src/pipelines/hf.py +++ b/src/pipelines/hf.py @@ -9,7 +9,7 @@ class HF_Pipeline(Pipeline): def __init__(self, args: Namespace, device: str = "cpu") -> None: super().__init__(args) - model_kwargs = {} + model_kwargs = {"device_map": "auto"} if args.dtype == torch.int8: model_kwargs["load_in_8bit"] = True From 0947688aca02e3ccc859178de7aa69b24187b2a8 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 5 Dec 2022 03:24:41 +0530 Subject: [PATCH 30/43] device map --- src/pipelines/hf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pipelines/hf.py b/src/pipelines/hf.py index e7d2ac0..1050074 100644 --- a/src/pipelines/hf.py +++ b/src/pipelines/hf.py @@ -9,10 +9,11 @@ class HF_Pipeline(Pipeline): def __init__(self, args: Namespace, device: str = "cpu") -> None: super().__init__(args) - model_kwargs = {"device_map": "auto"} + model_kwargs = {} if args.dtype == torch.int8: model_kwargs["load_in_8bit"] = True + model_kwargs["device_map"] = "auto" else: model_kwargs["torch_dtype"] = args.dtype From 379bfd9739a94161aece6757c02694e5d545b216 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 5 Dec 2022 03:38:36 +0530 Subject: [PATCH 31/43] fix --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2ea810d..a47032a 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ batch_size := 1 install-mqa-transformers: git clone https://github.com/bigcode-project/transformers.git; \ cd transformers; \ - git checkout multi_query; \ + git checkout mayank/multi_query; \ pip install .; \ cd ..; \ rm -rf transformers; From 6dc0c0786ec40eec6fadc11f1a282be0d5360f62 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 5 Dec 2022 13:29:08 +0530 Subject: [PATCH 32/43] fp32 --- README.md | 56 +++++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 6552df0..b285956 100644 --- a/README.md +++ b/README.md @@ -11,20 +11,20 @@ n_layer = 24 Throughput (tokens/sec | msec/token) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------------:|:---------------:|:---------------:|:-------------------:| -| 1 | 44.38 \| 22.53 | 41.00 \| 24.39 | | 61.61 | 16.23 | -| 2 | 85.82 \| 11.65 | 79.20 \| 12.63 | | 121.55 | 8.23 | -| 4 | 171.77 \| 5.82 | 160.72 \| 6.22 | | 240.06 | 4.17 | -| 8 | 334.21 \| 2.99 | 317.56 \| 3.15 | | 492.42 | 2.03 | -| 16 | 658.77 \| 1.52 | 644.14 \| 1.55 | | 970.59 | 1.03 | -| 32 | 1312.31 \| 0.76 | 1277.62 \| 0.78 | | 1999.04 | 0.50 | -| 64 | 2312.48 \| 0.43 | 2683.15 \| 0.37 | | 3971.09 | 0.25 | -| 128 | 
2686.37 \| 0.37 | 4766.97 \| 0.21 | | 7514.59 | 0.13 | -| 256 | 3313.04 \| 0.30 | 6578.06 \| 0.15 | | 10226.50 | 0.10 | -| 384 | 3253.52 \| 0.31 | 7285.37 \| 0.14 | | 11094.27 | 0.09 | -| 512 | 3361.34 \| 0.30 | 7436.71 \| 0.13 | | 11390.85 | 0.09 | -| 640 | 3497.55 \| 0.29 | 7554.05 \| 0.13 | | 11625.71 | 0.09 | -| 768 | 3460.71 \| 0.29 | 7678.89 \| 0.13 | | 11814.31 | 0.08 | -| 896 | 3339.99 \| 0.30 | 7542.81 \| 0.13 | | 11744.38 | 0.09 | +| 1 | 51.59 \| 19.38 | 41.00 \| 24.39 | | 61.61 | 16.23 | +| 2 | 103.92 \| 9.62 | 79.20 \| 12.63 | | 121.55 | 8.23 | +| 4 | 211.96 \| 4.72 | 160.72 \| 6.22 | | 240.06 | 4.17 | +| 8 | 411.79 \| 2.43 | 317.56 \| 3.15 | | 492.42 | 2.03 | +| 16 | 804.55 \| 1.24 | 644.14 \| 1.55 | | 970.59 | 1.03 | +| 32 | 1574.68 \| 0.64 | 1277.62 \| 0.78 | | 1999.04 | 0.50 | +| 64 | 2712.46 \| 0.37 | 2683.15 \| 0.37 | | 3971.09 | 0.25 | +| 128 | 2974.36 \| 0.34 | 4766.97 \| 0.21 | | 7514.59 | 0.13 | +| 256 | 3695.44 \| 0.27 | 6578.06 \| 0.15 | | 10226.50 | 0.10 | +| 384 | 3591.13 \| 0.28 | 7285.37 \| 0.14 | | 11094.27 | 0.09 | +| 512 | 3708.54 \| 0.27 | 7436.71 \| 0.13 | | 11390.85 | 0.09 | +| 640 | 3859.43 \| 0.26 | 7554.05 \| 0.13 | | 11625.71 | 0.09 | +| 768 | 3804.82 \| 0.26 | 7678.89 \| 0.13 | | 11814.31 | 0.08 | +| 896 | 3652.42 \| 0.27 | 7542.81 \| 0.13 | | 11744.38 | 0.09 | | 1024 | oom | 7702.06 \| 0.13 | | 11534.95 | 0.09 | | 1152 | oom | 7719.68 \| 0.13 | | oom | | 1280 | oom | 7786.51 \| 0.13 | | oom | @@ -37,20 +37,20 @@ Throughput (tokens/sec | msec/token) Latency (sec) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------:|:---------:|:---------:|:-------------------:| -| 1 | 2.25 | 2.44 | | 1.62 | -| 2 | 2.33 | 2.53 | | 1.65 | -| 4 | 2.33 | 2.49 | | 1.67 | -| 8 | 2.39 | 2.52 | | 1.62 | -| 16 | 2.43 | 2.48 | | 1.65 | -| 32 | 2.44 | 2.50 | | 1.60 | -| 64 | 2.77 | 2.39 | | 1.61 | -| 128 | 4.76 | 2.69 | | 1.70 | -| 256 | 7.73 | 3.89 | | 2.50 | -| 384 | 11.80 | 5.27 | | 3.46 | -| 512 | 15.23 | 6.88 | | 4.49 | -| 640 | 18.30 | 8.47 | | 5.51 | -| 768 | 22.19 | 10.00 | | 6.50 | -| 896 | 26.83 | 11.88 | | 7.63 | +| 1 | 1.94 | 2.44 | | 1.62 | +| 2 | 1.92 | 2.53 | | 1.65 | +| 4 | 1.89 | 2.49 | | 1.67 | +| 8 | 1.94 | 2.52 | | 1.62 | +| 16 | 1.99 | 2.48 | | 1.65 | +| 32 | 2.03 | 2.50 | | 1.60 | +| 64 | 2.36 | 2.39 | | 1.61 | +| 128 | 4.30 | 2.69 | | 1.70 | +| 256 | 6.93 | 3.89 | | 2.50 | +| 384 | 10.69 | 5.27 | | 3.46 | +| 512 | 14.82 | 6.88 | | 4.49 | +| 640 | 19.85 | 8.47 | | 5.51 | +| 768 | 20.18 | 10.00 | | 6.50 | +| 896 | 24.53 | 11.88 | | 7.63 | | 1024 | oom | 13.30 | | 8.88 | | 1152 | oom | 14.92 | | oom | | 1280 | oom | 16.44 | | oom | From 7dc67ea0a3b28b27048cd96b18ac32911e9d9d61 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 5 Dec 2022 14:13:10 +0530 Subject: [PATCH 33/43] bf16 --- README.md | 84 +++++++++++++++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index b285956..ff22562 100644 --- a/README.md +++ b/README.md @@ -11,51 +11,51 @@ n_layer = 24 Throughput (tokens/sec | msec/token) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------------:|:---------------:|:---------------:|:-------------------:| -| 1 | 51.59 \| 19.38 | 41.00 \| 24.39 | | 61.61 | 16.23 | -| 2 | 103.92 \| 9.62 | 79.20 \| 12.63 | | 121.55 | 8.23 | -| 4 | 211.96 \| 4.72 | 160.72 \| 6.22 | | 240.06 | 4.17 | -| 8 | 411.79 \| 2.43 | 317.56 \| 3.15 | | 492.42 | 2.03 | -| 16 | 804.55 \| 1.24 | 644.14 \| 1.55 | | 970.59 | 
1.03 | -| 32 | 1574.68 \| 0.64 | 1277.62 \| 0.78 | | 1999.04 | 0.50 | -| 64 | 2712.46 \| 0.37 | 2683.15 \| 0.37 | | 3971.09 | 0.25 | -| 128 | 2974.36 \| 0.34 | 4766.97 \| 0.21 | | 7514.59 | 0.13 | -| 256 | 3695.44 \| 0.27 | 6578.06 \| 0.15 | | 10226.50 | 0.10 | -| 384 | 3591.13 \| 0.28 | 7285.37 \| 0.14 | | 11094.27 | 0.09 | -| 512 | 3708.54 \| 0.27 | 7436.71 \| 0.13 | | 11390.85 | 0.09 | -| 640 | 3859.43 \| 0.26 | 7554.05 \| 0.13 | | 11625.71 | 0.09 | -| 768 | 3804.82 \| 0.26 | 7678.89 \| 0.13 | | 11814.31 | 0.08 | -| 896 | 3652.42 \| 0.27 | 7542.81 \| 0.13 | | 11744.38 | 0.09 | -| 1024 | oom | 7702.06 \| 0.13 | | 11534.95 | 0.09 | -| 1152 | oom | 7719.68 \| 0.13 | | oom | -| 1280 | oom | 7786.51 \| 0.13 | | oom | -| 1408 | oom | 7770.26 \| 0.13 | | oom | -| 1536 | oom | 7783.86 \| 0.13 | | oom | -| 1664 | oom | 7772.43 \| 0.13 | | oom | -| 1792 | oom | 7747.92 \| 0.13 | | oom | +| 1 | 51.59 \| 19.38 | 47.46 \| 21.07 | | 61.61 | 16.23 | +| 2 | 103.92 \| 9.62 | 96.88 \| 10.32 | | 121.55 | 8.23 | +| 4 | 211.96 \| 4.72 | 193.72 \| 5.16 | | 240.06 | 4.17 | +| 8 | 411.79 \| 2.43 | 370.67 \| 2.70 | | 492.42 | 2.03 | +| 16 | 804.55 \| 1.24 | 781.29 \| 1.28 | | 970.59 | 1.03 | +| 32 | 1574.68 \| 0.64 | 1539.19 \| 0.65 | | 1999.04 | 0.50 | +| 64 | 2712.46 \| 0.37 | 3038.01 \| 0.33 | | 3971.09 | 0.25 | +| 128 | 2974.36 \| 0.34 | 5795.97 \| 0.17 | | 7514.59 | 0.13 | +| 256 | 3695.44 \| 0.27 | 8216.27 \| 0.12 | | 10226.50 | 0.10 | +| 384 | 3591.13 \| 0.28 | 9328.18 \| 0.11 | | 11094.27 | 0.09 | +| 512 | 3708.54 \| 0.27 | 9446.34 \| 0.11 | | 11390.85 | 0.09 | +| 640 | 3859.43 \| 0.26 | 9572.53 \| 0.10 | | 11625.71 | 0.09 | +| 768 | 3804.82 \| 0.26 | 9464.75 \| 0.11 | | 11814.31 | 0.08 | +| 896 | 3652.42 \| 0.27 | 9482.11 \| 0.11 | | 11744.38 | 0.09 | +| 1024 | oom | 9710.46 \| 0.10 | | 11534.95 | 0.09 | +| 1152 | oom | 9712.39 \| 0.10 | | oom | +| 1280 | oom | 9667.19 \| 0.10 | | oom | +| 1408 | oom | 9771.91 \| 0.10 | | oom | +| 1536 | oom | 9744.56 \| 0.10 | | oom | +| 1664 | oom | 9719.82 \| 0.10 | | oom | +| 1792 | oom | 9690.61 \| 0.10 | | oom | | 1920 | oom | oom | | oom | Latency (sec) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------:|:---------:|:---------:|:-------------------:| -| 1 | 1.94 | 2.44 | | 1.62 | -| 2 | 1.92 | 2.53 | | 1.65 | -| 4 | 1.89 | 2.49 | | 1.67 | -| 8 | 1.94 | 2.52 | | 1.62 | -| 16 | 1.99 | 2.48 | | 1.65 | -| 32 | 2.03 | 2.50 | | 1.60 | -| 64 | 2.36 | 2.39 | | 1.61 | -| 128 | 4.30 | 2.69 | | 1.70 | -| 256 | 6.93 | 3.89 | | 2.50 | -| 384 | 10.69 | 5.27 | | 3.46 | -| 512 | 14.82 | 6.88 | | 4.49 | -| 640 | 19.85 | 8.47 | | 5.51 | -| 768 | 20.18 | 10.00 | | 6.50 | -| 896 | 24.53 | 11.88 | | 7.63 | -| 1024 | oom | 13.30 | | 8.88 | -| 1152 | oom | 14.92 | | oom | -| 1280 | oom | 16.44 | | oom | -| 1408 | oom | 18.12 | | oom | -| 1536 | oom | 19.73 | | oom | -| 1664 | oom | 21.41 | | oom | -| 1792 | oom | 23.13 | | oom | +| 1 | 1.94 | 2.11 | | 1.62 | +| 2 | 1.92 | 2.06 | | 1.65 | +| 4 | 1.89 | 2.06 | | 1.67 | +| 8 | 1.94 | 2.16 | | 1.62 | +| 16 | 1.99 | 2.05 | | 1.65 | +| 32 | 2.03 | 2.08 | | 1.60 | +| 64 | 2.36 | 2.11 | | 1.61 | +| 128 | 4.30 | 2.21 | | 1.70 | +| 256 | 6.93 | 3.12 | | 2.50 | +| 384 | 10.69 | 4.12 | | 3.46 | +| 512 | 14.82 | 5.42 | | 4.49 | +| 640 | 19.85 | 6.69 | | 5.51 | +| 768 | 20.18 | 8.11 | | 6.50 | +| 896 | 24.53 | 9.45 | | 7.63 | +| 1024 | oom | 10.55 | | 8.88 | +| 1152 | oom | 11.86 | | oom | +| 1280 | oom | 13.24 | | oom | +| 1408 | oom | 14.41 | | oom | +| 1536 | oom | 15.76 | | oom | +| 1664 | oom | 
17.12 | | oom | +| 1792 | oom | 18.49 | | oom | | 1920 | oom | oom | | oom | \ No newline at end of file From 2ac761d640702292abccfd6cc810c9630096bc8e Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 5 Dec 2022 17:45:13 +0530 Subject: [PATCH 34/43] int8 --- README.md | 88 +++++++++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index ff22562..f68f6cb 100644 --- a/README.md +++ b/README.md @@ -11,51 +11,51 @@ n_layer = 24 Throughput (tokens/sec | msec/token) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------------:|:---------------:|:---------------:|:-------------------:| -| 1 | 51.59 \| 19.38 | 47.46 \| 21.07 | | 61.61 | 16.23 | -| 2 | 103.92 \| 9.62 | 96.88 \| 10.32 | | 121.55 | 8.23 | -| 4 | 211.96 \| 4.72 | 193.72 \| 5.16 | | 240.06 | 4.17 | -| 8 | 411.79 \| 2.43 | 370.67 \| 2.70 | | 492.42 | 2.03 | -| 16 | 804.55 \| 1.24 | 781.29 \| 1.28 | | 970.59 | 1.03 | -| 32 | 1574.68 \| 0.64 | 1539.19 \| 0.65 | | 1999.04 | 0.50 | -| 64 | 2712.46 \| 0.37 | 3038.01 \| 0.33 | | 3971.09 | 0.25 | -| 128 | 2974.36 \| 0.34 | 5795.97 \| 0.17 | | 7514.59 | 0.13 | -| 256 | 3695.44 \| 0.27 | 8216.27 \| 0.12 | | 10226.50 | 0.10 | -| 384 | 3591.13 \| 0.28 | 9328.18 \| 0.11 | | 11094.27 | 0.09 | -| 512 | 3708.54 \| 0.27 | 9446.34 \| 0.11 | | 11390.85 | 0.09 | -| 640 | 3859.43 \| 0.26 | 9572.53 \| 0.10 | | 11625.71 | 0.09 | -| 768 | 3804.82 \| 0.26 | 9464.75 \| 0.11 | | 11814.31 | 0.08 | -| 896 | 3652.42 \| 0.27 | 9482.11 \| 0.11 | | 11744.38 | 0.09 | -| 1024 | oom | 9710.46 \| 0.10 | | 11534.95 | 0.09 | -| 1152 | oom | 9712.39 \| 0.10 | | oom | -| 1280 | oom | 9667.19 \| 0.10 | | oom | -| 1408 | oom | 9771.91 \| 0.10 | | oom | -| 1536 | oom | 9744.56 \| 0.10 | | oom | -| 1664 | oom | 9719.82 \| 0.10 | | oom | -| 1792 | oom | 9690.61 \| 0.10 | | oom | -| 1920 | oom | oom | | oom | +| 1 | 51.59 \| 19.38 | 47.46 \| 21.07 | 16.53 \| 60.49 | 61.61 \| 16.23 | +| 2 | 103.92 \| 9.62 | 96.88 \| 10.32 | 33.79 \| 29.60 | 121.55 \| 8.23 | +| 4 | 211.96 \| 4.72 | 193.72 \| 5.16 | 67.38 \| 14.84 | 240.06 \| 4.17 | +| 8 | 411.79 \| 2.43 | 370.67 \| 2.70 | 134.34 \| 7.44 | 492.42 \| 2.03 | +| 16 | 804.55 \| 1.24 | 781.29 \| 1.28 | 275.69 \| 3.63 | 970.59 \| 1.03 | +| 32 | 1574.68 \| 0.64 | 1539.19 \| 0.65 | 537.14 \| 1.86 | 1999.04 \| 0.50 | +| 64 | 2712.46 \| 0.37 | 3038.01 \| 0.33 | 1070.50 \| 0.93 | 3971.09 \| 0.25 | +| 128 | 2974.36 \| 0.34 | 5795.97 \| 0.17 | 2055.34 \| 0.49 | 7514.59 \| 0.13 | +| 256 | 3695.44 \| 0.27 | 8216.27 \| 0.12 | 3523.77 \| 0.28 | 10226.50 \| 0.10 | +| 384 | 3591.13 \| 0.28 | 9328.18 \| 0.11 | 4585.33 \| 0.22 | 11094.27 \| 0.09 | +| 512 | 3708.54 \| 0.27 | 9446.34 \| 0.11 | 5416.48 \| 0.18 | 11390.85 \| 0.09 | +| 640 | 3859.43 \| 0.26 | 9572.53 \| 0.10 | 6113.65 \| 0.16 | 11625.71 \| 0.09 | +| 768 | 3804.82 \| 0.26 | 9464.75 \| 0.11 | 6582.52 \| 0.15 | 11814.31 \| 0.08 | +| 896 | 3652.42 \| 0.27 | 9482.11 \| 0.11 | 7111.08 \| 0.14 | 11744.38 \| 0.09 | +| 1024 | oom | 9710.46 \| 0.10 | 7486.36 \| 0.13 | 11534.95 \| 0.09 | +| 1152 | oom | 9712.39 \| 0.10 | 7544.99 \| 0.13 | oom | +| 1280 | oom | 9667.19 \| 0.10 | 7858.91 \| 0.13 | oom | +| 1408 | oom | 9771.91 \| 0.10 | 8116.30 \| 0.12 | oom | +| 1536 | oom | 9744.56 \| 0.10 | 8201.28 \| 0.12 | oom | +| 1664 | oom | 9719.82 \| 0.10 | 8227.56 \| 0.12 | oom | +| 1792 | oom | 9690.61 \| 0.10 | 8344.36 \| 0.12 | oom | +| 1920 | oom | oom | oom | oom | Latency (sec) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | 
DS-inference (fp16) | |:----------:|:---------:|:---------:|:---------:|:-------------------:| -| 1 | 1.94 | 2.11 | | 1.62 | -| 2 | 1.92 | 2.06 | | 1.65 | -| 4 | 1.89 | 2.06 | | 1.67 | -| 8 | 1.94 | 2.16 | | 1.62 | -| 16 | 1.99 | 2.05 | | 1.65 | -| 32 | 2.03 | 2.08 | | 1.60 | -| 64 | 2.36 | 2.11 | | 1.61 | -| 128 | 4.30 | 2.21 | | 1.70 | -| 256 | 6.93 | 3.12 | | 2.50 | -| 384 | 10.69 | 4.12 | | 3.46 | -| 512 | 14.82 | 5.42 | | 4.49 | -| 640 | 19.85 | 6.69 | | 5.51 | -| 768 | 20.18 | 8.11 | | 6.50 | -| 896 | 24.53 | 9.45 | | 7.63 | -| 1024 | oom | 10.55 | | 8.88 | -| 1152 | oom | 11.86 | | oom | -| 1280 | oom | 13.24 | | oom | -| 1408 | oom | 14.41 | | oom | -| 1536 | oom | 15.76 | | oom | -| 1664 | oom | 17.12 | | oom | -| 1792 | oom | 18.49 | | oom | -| 1920 | oom | oom | | oom | \ No newline at end of file +| 1 | 1.94 | 2.11 | 6.05 | 1.62 | +| 2 | 1.92 | 2.06 | 5.92 | 1.65 | +| 4 | 1.89 | 2.06 | 5.94 | 1.67 | +| 8 | 1.94 | 2.16 | 5.96 | 1.62 | +| 16 | 1.99 | 2.05 | 5.80 | 1.65 | +| 32 | 2.03 | 2.08 | 5.96 | 1.60 | +| 64 | 2.36 | 2.11 | 5.98 | 1.61 | +| 128 | 4.30 | 2.21 | 6.23 | 1.70 | +| 256 | 6.93 | 3.12 | 7.26 | 2.50 | +| 384 | 10.69 | 4.12 | 8.37 | 3.46 | +| 512 | 14.82 | 5.42 | 9.45 | 4.49 | +| 640 | 19.85 | 6.69 | 10.47 | 5.51 | +| 768 | 20.18 | 8.11 | 11.67 | 6.50 | +| 896 | 24.53 | 9.45 | 12.60 | 7.63 | +| 1024 | oom | 10.55 | 13.68 | 8.88 | +| 1152 | oom | 11.86 | 15.27 | oom | +| 1280 | oom | 13.24 | 16.29 | oom | +| 1408 | oom | 14.41 | 17.35 | oom | +| 1536 | oom | 15.76 | 18.73 | oom | +| 1664 | oom | 17.12 | 20.22 | oom | +| 1792 | oom | 18.49 | 21.48 | oom | +| 1920 | oom | oom | oom | oom | \ No newline at end of file From 28e1e715df9835759389839d497b80991a700747 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 5 Dec 2022 19:24:05 +0530 Subject: [PATCH 35/43] attention_type --- README.md | 11 +++++++++-- src/pipelines/pipeline.py | 13 +------------ 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index f68f6cb..8c9ffd8 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # bigcode-inference-benchmark A100 80GB -BLOOM\ +BLOOM ```python hidden_size = 2048 n_head = 16 @@ -58,4 +58,11 @@ Latency (sec) | 1536 | oom | 15.76 | 18.73 | oom | | 1664 | oom | 17.12 | 20.22 | oom | | 1792 | oom | 18.49 | 21.48 | oom | -| 1920 | oom | oom | oom | oom | \ No newline at end of file +| 1920 | oom | oom | oom | oom | + +GPT2 MHA +```python +hidden_size = 2048 +n_head = 16 +n_layer = 24 +``` diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py index e7d0830..20a0988 100644 --- a/src/pipelines/pipeline.py +++ b/src/pipelines/pipeline.py @@ -63,7 +63,7 @@ def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2 n_positions=args.n_positions, bos_token_id=tokenizer.bos_token_id, eos_token_id=tokenizer.eos_token_id, - attention_type=get_attention_type(args.attention_type), + attention_type=args.attention_type, print_details=False, vocab_size=len(tokenizer), use_cache=True, @@ -74,14 +74,3 @@ def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2 model_class._from_config(config).save_pretrained("tmp") return config, tokenizer, model_class - - -def get_attention_type(attention_type: int): - from transformers.models.gpt2.modeling_gpt2 import AttentionType - - if attention_type == 1: - return AttentionType.MULTI_HEAD - elif attention_type == 2: - return AttentionType.MULTI_QUERY - elif attention_type == 3: - return AttentionType.MULTI_QUERY_1 From b2c7de7c9c7fa61bc30fdc7f2c8c7f4c2e1237d3 Mon 
Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 5 Dec 2022 21:05:36 +0530 Subject: [PATCH 36/43] fp32 --- README.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/README.md b/README.md index 8c9ffd8..fdb7e1b 100644 --- a/README.md +++ b/README.md @@ -66,3 +66,29 @@ hidden_size = 2048 n_head = 16 n_layer = 24 ``` + +Throughput (tokens/sec | msec/token) +| batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | +|:----------:|:---------------:|:---------------:|:---------------:|:-------------------:| +| 1 | 41.34 \| 24.19 | 47.46 \| 21.07 | 16.53 \| 60.49 | 61.61 \| 16.23 | +| 2 | 79.21 \| 12.62 | 96.88 \| 10.32 | 33.79 \| 29.60 | 121.55 \| 8.23 | +| 4 | 160.78 \| 6.22 | 193.72 \| 5.16 | 67.38 \| 14.84 | 240.06 \| 4.17 | +| 8 | 324.26 \| 3.08 | 370.67 \| 2.70 | 134.34 \| 7.44 | 492.42 \| 2.03 | +| 16 | 637.18 \| 1.57 | 781.29 \| 1.28 | 275.69 \| 3.63 | 970.59 \| 1.03 | +| 32 | 1310.62 \| 0.76 | 1539.19 \| 0.65 | 537.14 \| 1.86 | 1999.04 \| 0.50 | +| 64 | 2092.72 \| 0.48 | 3038.01 \| 0.33 | 1070.50 \| 0.93 | 3971.09 \| 0.25 | +| 128 | 2854.47 \| 0.35 | 5795.97 \| 0.17 | 2055.34 \| 0.49 | 7514.59 \| 0.13 | +| 256 | 3504.34 \| 0.29 | 8216.27 \| 0.12 | 3523.77 \| 0.28 | 10226.50 \| 0.10 | +| 384 | 3811.93 \| 0.26 | 9328.18 \| 0.11 | 4585.33 \| 0.22 | 11094.27 \| 0.09 | +| 512 | 3794.15 \| 0.26 | 9446.34 \| 0.11 | 5416.48 \| 0.18 | 11390.85 \| 0.09 | +| 640 | 4120.75 \| 0.24 | 9572.53 \| 0.10 | 6113.65 \| 0.16 | 11625.71 \| 0.09 | +| 768 | 3946.79 \| 0.25 | 9464.75 \| 0.11 | 6582.52 \| 0.15 | 11814.31 \| 0.08 | +| 896 | 3925.22 \| 0.25 | 9482.11 \| 0.11 | 7111.08 \| 0.14 | 11744.38 \| 0.09 | +| 1024 | oom | 9710.46 \| 0.10 | 7486.36 \| 0.13 | 11534.95 \| 0.09 | +| 1152 | oom | 9712.39 \| 0.10 | 7544.99 \| 0.13 | oom | +| 1280 | oom | 9667.19 \| 0.10 | 7858.91 \| 0.13 | oom | +| 1408 | oom | 9771.91 \| 0.10 | 8116.30 \| 0.12 | oom | +| 1536 | oom | 9744.56 \| 0.10 | 8201.28 \| 0.12 | oom | +| 1664 | oom | 9719.82 \| 0.10 | 8227.56 \| 0.12 | oom | +| 1792 | oom | 9690.61 \| 0.10 | 8344.36 \| 0.12 | oom | +| 1920 | oom | oom | oom | oom | From 76b3b8df3fadca7d133fa8492389c707e31674d9 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Tue, 6 Dec 2022 10:02:43 +0530 Subject: [PATCH 37/43] bf16 --- README.md | 76 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index fdb7e1b..c6e2b8b 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ Latency (sec) | 1792 | oom | 18.49 | 21.48 | oom | | 1920 | oom | oom | oom | oom | -GPT2 MHA +GPT2 Multi-Head Attention ```python hidden_size = 2048 n_head = 16 @@ -68,27 +68,53 @@ n_layer = 24 ``` Throughput (tokens/sec | msec/token) -| batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | -|:----------:|:---------------:|:---------------:|:---------------:|:-------------------:| -| 1 | 41.34 \| 24.19 | 47.46 \| 21.07 | 16.53 \| 60.49 | 61.61 \| 16.23 | -| 2 | 79.21 \| 12.62 | 96.88 \| 10.32 | 33.79 \| 29.60 | 121.55 \| 8.23 | -| 4 | 160.78 \| 6.22 | 193.72 \| 5.16 | 67.38 \| 14.84 | 240.06 \| 4.17 | -| 8 | 324.26 \| 3.08 | 370.67 \| 2.70 | 134.34 \| 7.44 | 492.42 \| 2.03 | -| 16 | 637.18 \| 1.57 | 781.29 \| 1.28 | 275.69 \| 3.63 | 970.59 \| 1.03 | -| 32 | 1310.62 \| 0.76 | 1539.19 \| 0.65 | 537.14 \| 1.86 | 1999.04 \| 0.50 | -| 64 | 2092.72 \| 0.48 | 3038.01 \| 0.33 | 1070.50 \| 0.93 | 3971.09 \| 0.25 | -| 128 | 2854.47 \| 0.35 | 5795.97 \| 0.17 | 2055.34 \| 0.49 | 7514.59 \| 0.13 | -| 256 | 3504.34 \| 0.29 | 
8216.27 \| 0.12 | 3523.77 \| 0.28 | 10226.50 \| 0.10 | -| 384 | 3811.93 \| 0.26 | 9328.18 \| 0.11 | 4585.33 \| 0.22 | 11094.27 \| 0.09 | -| 512 | 3794.15 \| 0.26 | 9446.34 \| 0.11 | 5416.48 \| 0.18 | 11390.85 \| 0.09 | -| 640 | 4120.75 \| 0.24 | 9572.53 \| 0.10 | 6113.65 \| 0.16 | 11625.71 \| 0.09 | -| 768 | 3946.79 \| 0.25 | 9464.75 \| 0.11 | 6582.52 \| 0.15 | 11814.31 \| 0.08 | -| 896 | 3925.22 \| 0.25 | 9482.11 \| 0.11 | 7111.08 \| 0.14 | 11744.38 \| 0.09 | -| 1024 | oom | 9710.46 \| 0.10 | 7486.36 \| 0.13 | 11534.95 \| 0.09 | -| 1152 | oom | 9712.39 \| 0.10 | 7544.99 \| 0.13 | oom | -| 1280 | oom | 9667.19 \| 0.10 | 7858.91 \| 0.13 | oom | -| 1408 | oom | 9771.91 \| 0.10 | 8116.30 \| 0.12 | oom | -| 1536 | oom | 9744.56 \| 0.10 | 8201.28 \| 0.12 | oom | -| 1664 | oom | 9719.82 \| 0.10 | 8227.56 \| 0.12 | oom | -| 1792 | oom | 9690.61 \| 0.10 | 8344.36 \| 0.12 | oom | -| 1920 | oom | oom | oom | oom | +| batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | +|:----------:|:---------------:|:----------------:|:---------------:|:-------------------:| +| 1 | 41.34 \| 24.19 | 40.69 \| 24.57 | 16.53 \| 60.49 | 61.61 \| 16.23 | +| 2 | 79.21 \| 12.62 | 80.87 \| 12.37 | 33.79 \| 29.60 | 121.55 \| 8.23 | +| 4 | 160.78 \| 6.22 | 154.98 \| 6.45 | 67.38 \| 14.84 | 240.06 \| 4.17 | +| 8 | 324.26 \| 3.08 | 332.90 \| 3.00 | 134.34 \| 7.44 | 492.42 \| 2.03 | +| 16 | 637.18 \| 1.57 | 669.27 \| 1.49 | 275.69 \| 3.63 | 970.59 \| 1.03 | +| 32 | 1310.62 \| 0.76 | 1287.95 \| 0.78 | 537.14 \| 1.86 | 1999.04 \| 0.50 | +| 64 | 2092.72 \| 0.48 | 2487.35 \| 0.40 | 1070.50 \| 0.93 | 3971.09 \| 0.25 | +| 128 | 2854.47 \| 0.35 | 4268.99 \| 0.23 | 2055.34 \| 0.49 | 7514.59 \| 0.13 | +| 256 | 3504.34 \| 0.29 | 6917.01 \| 0.14 | 3523.77 \| 0.28 | 10226.50 \| 0.10 | +| 384 | 3811.93 \| 0.26 | 8821.31 \| 0.11 | 4585.33 \| 0.22 | 11094.27 \| 0.09 | +| 512 | 3794.15 \| 0.26 | 10068.51 \| 0.10 | 5416.48 \| 0.18 | 11390.85 \| 0.09 | +| 640 | 4120.75 \| 0.24 | 10547.88 \| 0.09 | 6113.65 \| 0.16 | 11625.71 \| 0.09 | +| 768 | 3946.79 \| 0.25 | 10675.09 \| 0.09 | 6582.52 \| 0.15 | 11814.31 \| 0.08 | +| 896 | 3925.22 \| 0.25 | 10780.82 \| 0.09 | 7111.08 \| 0.14 | 11744.38 \| 0.09 | +| 1024 | oom | 11192.55 \| 0.09 | 7486.36 \| 0.13 | 11534.95 \| 0.09 | +| 1152 | oom | 11178.30 \| 0.09 | 7544.99 \| 0.13 | oom | +| 1280 | oom | 11383.98 \| 0.09 | 7858.91 \| 0.13 | oom | +| 1408 | oom | 11477.66 \| 0.09 | 8116.30 \| 0.12 | oom | +| 1536 | oom | 11382.66 \| 0.09 | 8201.28 \| 0.12 | oom | +| 1664 | oom | 11571.52 \| 0.09 | 8227.56 \| 0.12 | oom | +| 1792 | oom | 11394.20 \| 0.09 | 8344.36 \| 0.12 | oom | +| 1920 | oom | oom | oom | oom | + +Latency (sec) +| batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | +|:----------:|:---------:|:---------:|:---------:|:-------------------:| +| 1 | 1.94 | 2.46 | 6.05 | 1.62 | +| 2 | 1.92 | 2.47 | 5.92 | 1.65 | +| 4 | 1.89 | 2.58 | 5.94 | 1.67 | +| 8 | 1.94 | 2.40 | 5.96 | 1.62 | +| 16 | 1.99 | 2.39 | 5.80 | 1.65 | +| 32 | 2.03 | 2.48 | 5.96 | 1.60 | +| 64 | 2.36 | 2.57 | 5.98 | 1.61 | +| 128 | 4.30 | 3.00 | 6.23 | 1.70 | +| 256 | 6.93 | 3.70 | 7.26 | 2.50 | +| 384 | 10.69 | 4.35 | 8.37 | 3.46 | +| 512 | 14.82 | 5.09 | 9.45 | 4.49 | +| 640 | 19.85 | 6.07 | 10.47 | 5.51 | +| 768 | 20.18 | 7.19 | 11.67 | 6.50 | +| 896 | 24.53 | 8.31 | 12.60 | 7.63 | +| 1024 | oom | 9.15 | 13.68 | 8.88 | +| 1152 | oom | 10.31 | 15.27 | oom | +| 1280 | oom | 11.24 | 16.29 | oom | +| 1408 | oom | 12.27 | 17.35 | oom | +| 1536 | oom | 13.49 | 18.73 | oom | +| 1664 | oom | 14.38 | 
20.22 | oom | +| 1792 | oom | 15.73 | 21.48 | oom | +| 1920 | oom | oom | oom | oom | From c149ee91fa6454534018aa827e95427d645076c3 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Tue, 6 Dec 2022 15:41:42 +0530 Subject: [PATCH 38/43] fp32 --- README.md | 56 +++++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index c6e2b8b..8844d5c 100644 --- a/README.md +++ b/README.md @@ -70,20 +70,20 @@ n_layer = 24 Throughput (tokens/sec | msec/token) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------------:|:----------------:|:---------------:|:-------------------:| -| 1 | 41.34 \| 24.19 | 40.69 \| 24.57 | 16.53 \| 60.49 | 61.61 \| 16.23 | -| 2 | 79.21 \| 12.62 | 80.87 \| 12.37 | 33.79 \| 29.60 | 121.55 \| 8.23 | -| 4 | 160.78 \| 6.22 | 154.98 \| 6.45 | 67.38 \| 14.84 | 240.06 \| 4.17 | -| 8 | 324.26 \| 3.08 | 332.90 \| 3.00 | 134.34 \| 7.44 | 492.42 \| 2.03 | -| 16 | 637.18 \| 1.57 | 669.27 \| 1.49 | 275.69 \| 3.63 | 970.59 \| 1.03 | -| 32 | 1310.62 \| 0.76 | 1287.95 \| 0.78 | 537.14 \| 1.86 | 1999.04 \| 0.50 | -| 64 | 2092.72 \| 0.48 | 2487.35 \| 0.40 | 1070.50 \| 0.93 | 3971.09 \| 0.25 | -| 128 | 2854.47 \| 0.35 | 4268.99 \| 0.23 | 2055.34 \| 0.49 | 7514.59 \| 0.13 | -| 256 | 3504.34 \| 0.29 | 6917.01 \| 0.14 | 3523.77 \| 0.28 | 10226.50 \| 0.10 | -| 384 | 3811.93 \| 0.26 | 8821.31 \| 0.11 | 4585.33 \| 0.22 | 11094.27 \| 0.09 | -| 512 | 3794.15 \| 0.26 | 10068.51 \| 0.10 | 5416.48 \| 0.18 | 11390.85 \| 0.09 | -| 640 | 4120.75 \| 0.24 | 10547.88 \| 0.09 | 6113.65 \| 0.16 | 11625.71 \| 0.09 | -| 768 | 3946.79 \| 0.25 | 10675.09 \| 0.09 | 6582.52 \| 0.15 | 11814.31 \| 0.08 | -| 896 | 3925.22 \| 0.25 | 10780.82 \| 0.09 | 7111.08 \| 0.14 | 11744.38 \| 0.09 | +| 1 | 43.11 \| 23.20 | 40.69 \| 24.57 | 16.53 \| 60.49 | 61.61 \| 16.23 | +| 2 | 80.76 \| 12.38 | 80.87 \| 12.37 | 33.79 \| 29.60 | 121.55 \| 8.23 | +| 4 | 160.38 \| 6.24 | 154.98 \| 6.45 | 67.38 \| 14.84 | 240.06 \| 4.17 | +| 8 | 328.62 \| 3.04 | 332.90 \| 3.00 | 134.34 \| 7.44 | 492.42 \| 2.03 | +| 16 | 662.08 \| 1.51 | 669.27 \| 1.49 | 275.69 \| 3.63 | 970.59 \| 1.03 | +| 32 | 1314.92 \| 0.76 | 1287.95 \| 0.78 | 537.14 \| 1.86 | 1999.04 \| 0.50 | +| 64 | 2118.17 \| 0.47 | 2487.35 \| 0.40 | 1070.50 \| 0.93 | 3971.09 \| 0.25 | +| 128 | 2860.26 \| 0.35 | 4268.99 \| 0.23 | 2055.34 \| 0.49 | 7514.59 \| 0.13 | +| 256 | 3487.86 \| 0.29 | 6917.01 \| 0.14 | 3523.77 \| 0.28 | 10226.50 \| 0.10 | +| 384 | 3794.16 \| 0.26 | 8821.31 \| 0.11 | 4585.33 \| 0.22 | 11094.27 \| 0.09 | +| 512 | 3804.37 \| 0.26 | 10068.51 \| 0.10 | 5416.48 \| 0.18 | 11390.85 \| 0.09 | +| 640 | 4124.01 \| 0.24 | 10547.88 \| 0.09 | 6113.65 \| 0.16 | 11625.71 \| 0.09 | +| 768 | 3950.39 \| 0.25 | 10675.09 \| 0.09 | 6582.52 \| 0.15 | 11814.31 \| 0.08 | +| 896 | 3937.28 \| 0.25 | 10780.82 \| 0.09 | 7111.08 \| 0.14 | 11744.38 \| 0.09 | | 1024 | oom | 11192.55 \| 0.09 | 7486.36 \| 0.13 | 11534.95 \| 0.09 | | 1152 | oom | 11178.30 \| 0.09 | 7544.99 \| 0.13 | oom | | 1280 | oom | 11383.98 \| 0.09 | 7858.91 \| 0.13 | oom | @@ -96,20 +96,20 @@ Throughput (tokens/sec | msec/token) Latency (sec) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------:|:---------:|:---------:|:-------------------:| -| 1 | 1.94 | 2.46 | 6.05 | 1.62 | -| 2 | 1.92 | 2.47 | 5.92 | 1.65 | -| 4 | 1.89 | 2.58 | 5.94 | 1.67 | -| 8 | 1.94 | 2.40 | 5.96 | 1.62 | -| 16 | 1.99 | 2.39 | 5.80 | 1.65 | -| 32 | 2.03 | 2.48 | 5.96 | 1.60 | -| 64 | 2.36 | 
2.57 | 5.98 | 1.61 | -| 128 | 4.30 | 3.00 | 6.23 | 1.70 | -| 256 | 6.93 | 3.70 | 7.26 | 2.50 | -| 384 | 10.69 | 4.35 | 8.37 | 3.46 | -| 512 | 14.82 | 5.09 | 9.45 | 4.49 | -| 640 | 19.85 | 6.07 | 10.47 | 5.51 | -| 768 | 20.18 | 7.19 | 11.67 | 6.50 | -| 896 | 24.53 | 8.31 | 12.60 | 7.63 | +| 1 | 2.32 | 2.46 | 6.05 | 1.62 | +| 2 | 2.48 | 2.47 | 5.92 | 1.65 | +| 4 | 2.49 | 2.58 | 5.94 | 1.67 | +| 8 | 2.43 | 2.40 | 5.96 | 1.62 | +| 16 | 2.42 | 2.39 | 5.80 | 1.65 | +| 32 | 2.43 | 2.48 | 5.96 | 1.60 | +| 64 | 3.02 | 2.57 | 5.98 | 1.61 | +| 128 | 4.48 | 3.00 | 6.23 | 1.70 | +| 256 | 7.34 | 3.70 | 7.26 | 2.50 | +| 384 | 10.12 | 4.35 | 8.37 | 3.46 | +| 512 | 13.46 | 5.09 | 9.45 | 4.49 | +| 640 | 15.52 | 6.07 | 10.47 | 5.51 | +| 768 | 19.44 | 7.19 | 11.67 | 6.50 | +| 896 | 22.76 | 8.31 | 12.60 | 7.63 | | 1024 | oom | 9.15 | 13.68 | 8.88 | | 1152 | oom | 10.31 | 15.27 | oom | | 1280 | oom | 11.24 | 16.29 | oom | From 8427b9434bc78815d3689e8a90bf97863c0ef796 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Tue, 6 Dec 2022 23:43:00 +0530 Subject: [PATCH 39/43] int8 --- README.md | 90 +++++++++++++++++++++++++++---------------------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index 8844d5c..b5f264e 100644 --- a/README.md +++ b/README.md @@ -68,53 +68,53 @@ n_layer = 24 ``` Throughput (tokens/sec | msec/token) -| batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | -|:----------:|:---------------:|:----------------:|:---------------:|:-------------------:| -| 1 | 43.11 \| 23.20 | 40.69 \| 24.57 | 16.53 \| 60.49 | 61.61 \| 16.23 | -| 2 | 80.76 \| 12.38 | 80.87 \| 12.37 | 33.79 \| 29.60 | 121.55 \| 8.23 | -| 4 | 160.38 \| 6.24 | 154.98 \| 6.45 | 67.38 \| 14.84 | 240.06 \| 4.17 | -| 8 | 328.62 \| 3.04 | 332.90 \| 3.00 | 134.34 \| 7.44 | 492.42 \| 2.03 | -| 16 | 662.08 \| 1.51 | 669.27 \| 1.49 | 275.69 \| 3.63 | 970.59 \| 1.03 | -| 32 | 1314.92 \| 0.76 | 1287.95 \| 0.78 | 537.14 \| 1.86 | 1999.04 \| 0.50 | -| 64 | 2118.17 \| 0.47 | 2487.35 \| 0.40 | 1070.50 \| 0.93 | 3971.09 \| 0.25 | -| 128 | 2860.26 \| 0.35 | 4268.99 \| 0.23 | 2055.34 \| 0.49 | 7514.59 \| 0.13 | -| 256 | 3487.86 \| 0.29 | 6917.01 \| 0.14 | 3523.77 \| 0.28 | 10226.50 \| 0.10 | -| 384 | 3794.16 \| 0.26 | 8821.31 \| 0.11 | 4585.33 \| 0.22 | 11094.27 \| 0.09 | -| 512 | 3804.37 \| 0.26 | 10068.51 \| 0.10 | 5416.48 \| 0.18 | 11390.85 \| 0.09 | -| 640 | 4124.01 \| 0.24 | 10547.88 \| 0.09 | 6113.65 \| 0.16 | 11625.71 \| 0.09 | -| 768 | 3950.39 \| 0.25 | 10675.09 \| 0.09 | 6582.52 \| 0.15 | 11814.31 \| 0.08 | -| 896 | 3937.28 \| 0.25 | 10780.82 \| 0.09 | 7111.08 \| 0.14 | 11744.38 \| 0.09 | -| 1024 | oom | 11192.55 \| 0.09 | 7486.36 \| 0.13 | 11534.95 \| 0.09 | -| 1152 | oom | 11178.30 \| 0.09 | 7544.99 \| 0.13 | oom | -| 1280 | oom | 11383.98 \| 0.09 | 7858.91 \| 0.13 | oom | -| 1408 | oom | 11477.66 \| 0.09 | 8116.30 \| 0.12 | oom | -| 1536 | oom | 11382.66 \| 0.09 | 8201.28 \| 0.12 | oom | -| 1664 | oom | 11571.52 \| 0.09 | 8227.56 \| 0.12 | oom | -| 1792 | oom | 11394.20 \| 0.09 | 8344.36 \| 0.12 | oom | -| 1920 | oom | oom | oom | oom | +| batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | +|:----------:|:---------------:|:----------------:|:----------------:|:-------------------:| +| 1 | 43.11 \| 23.20 | 40.69 \| 24.57 | 32.29 \| 30.97 | 61.61 \| 16.23 | +| 2 | 80.76 \| 12.38 | 80.87 \| 12.37 | 63.54 \| 15.74 | 121.55 \| 8.23 | +| 4 | 160.38 \| 6.24 | 154.98 \| 6.45 | 131.00 \| 7.63 | 240.06 \| 4.17 | +| 8 | 328.62 \| 3.04 | 332.90 \| 3.00 | 260.16 \| 3.84 | 
492.42 \| 2.03 | +| 16 | 662.08 \| 1.51 | 669.27 \| 1.49 | 523.29 \| 1.91 | 970.59 \| 1.03 | +| 32 | 1314.92 \| 0.76 | 1287.95 \| 0.78 | 1055.57 \| 0.95 | 1999.04 \| 0.50 | +| 64 | 2118.17 \| 0.47 | 2487.35 \| 0.40 | 1969.26 \| 0.51 | 3971.09 \| 0.25 | +| 128 | 2860.26 \| 0.35 | 4268.99 \| 0.23 | 3581.49 \| 0.28 | 7514.59 \| 0.13 | +| 256 | 3487.86 \| 0.29 | 6917.01 \| 0.14 | 6132.47 \| 0.16 | 10226.50 \| 0.10 | +| 384 | 3794.16 \| 0.26 | 8821.31 \| 0.11 | 7774.37 \| 0.13 | 11094.27 \| 0.09 | +| 512 | 3804.37 \| 0.26 | 10068.51 \| 0.10 | 8872.88 \| 0.11 | 11390.85 \| 0.09 | +| 640 | 4124.01 \| 0.24 | 10547.88 \| 0.09 | 9956.58 \| 0.10 | 11625.71 \| 0.09 | +| 768 | 3950.39 \| 0.25 | 10675.09 \| 0.09 | 10584.21 \| 0.09 | 11814.31 \| 0.08 | +| 896 | 3937.28 \| 0.25 | 10780.82 \| 0.09 | 10994.00 \| 0.09 | 11744.38 \| 0.09 | +| 1024 | oom | 11192.55 \| 0.09 | 11306.37 \| 0.09 | 11534.95 \| 0.09 | +| 1152 | oom | 11178.30 \| 0.09 | 11290.51 \| 0.09 | oom | +| 1280 | oom | 11383.98 \| 0.09 | 11459.89 \| 0.09 | oom | +| 1408 | oom | 11477.66 \| 0.09 | 11565.90 \| 0.09 | oom | +| 1536 | oom | 11382.66 \| 0.09 | 11491.99 \| 0.09 | oom | +| 1664 | oom | 11571.52 \| 0.09 | 11603.73 \| 0.09 | oom | +| 1792 | oom | 11394.20 \| 0.09 | 11412.46 \| 0.09 | oom | +| 1920 | oom | oom | oom | oom | Latency (sec) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------:|:---------:|:---------:|:-------------------:| -| 1 | 2.32 | 2.46 | 6.05 | 1.62 | -| 2 | 2.48 | 2.47 | 5.92 | 1.65 | -| 4 | 2.49 | 2.58 | 5.94 | 1.67 | -| 8 | 2.43 | 2.40 | 5.96 | 1.62 | -| 16 | 2.42 | 2.39 | 5.80 | 1.65 | -| 32 | 2.43 | 2.48 | 5.96 | 1.60 | -| 64 | 3.02 | 2.57 | 5.98 | 1.61 | -| 128 | 4.48 | 3.00 | 6.23 | 1.70 | -| 256 | 7.34 | 3.70 | 7.26 | 2.50 | -| 384 | 10.12 | 4.35 | 8.37 | 3.46 | -| 512 | 13.46 | 5.09 | 9.45 | 4.49 | -| 640 | 15.52 | 6.07 | 10.47 | 5.51 | -| 768 | 19.44 | 7.19 | 11.67 | 6.50 | -| 896 | 22.76 | 8.31 | 12.60 | 7.63 | -| 1024 | oom | 9.15 | 13.68 | 8.88 | -| 1152 | oom | 10.31 | 15.27 | oom | -| 1280 | oom | 11.24 | 16.29 | oom | -| 1408 | oom | 12.27 | 17.35 | oom | -| 1536 | oom | 13.49 | 18.73 | oom | -| 1664 | oom | 14.38 | 20.22 | oom | -| 1792 | oom | 15.73 | 21.48 | oom | +| 1 | 2.32 | 2.46 | 3.10 | 1.62 | +| 2 | 2.48 | 2.47 | 3.15 | 1.65 | +| 4 | 2.49 | 2.58 | 3.05 | 1.67 | +| 8 | 2.43 | 2.40 | 3.07 | 1.62 | +| 16 | 2.42 | 2.39 | 3.06 | 1.65 | +| 32 | 2.43 | 2.48 | 3.03 | 1.60 | +| 64 | 3.02 | 2.57 | 3.25 | 1.61 | +| 128 | 4.48 | 3.00 | 3.57 | 1.70 | +| 256 | 7.34 | 3.70 | 4.17 | 2.50 | +| 384 | 10.12 | 4.35 | 4.94 | 3.46 | +| 512 | 13.46 | 5.09 | 5.77 | 4.49 | +| 640 | 15.52 | 6.07 | 6.43 | 5.51 | +| 768 | 19.44 | 7.19 | 7.26 | 6.50 | +| 896 | 22.76 | 8.31 | 8.15 | 7.63 | +| 1024 | oom | 9.15 | 9.06 | 8.88 | +| 1152 | oom | 10.31 | 10.20 | oom | +| 1280 | oom | 11.24 | 11.17 | oom | +| 1408 | oom | 12.27 | 12.17 | oom | +| 1536 | oom | 13.49 | 13.37 | oom | +| 1664 | oom | 14.38 | 14.34 | oom | +| 1792 | oom | 15.73 | 15.70 | oom | | 1920 | oom | oom | oom | oom | From 487954f3ad43fcd7224954e889b6efb36adec24b Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Wed, 7 Dec 2022 02:12:34 +0530 Subject: [PATCH 40/43] fp16 --- README.md | 61 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index b5f264e..e083012 100644 --- a/README.md +++ b/README.md @@ -65,26 +65,27 @@ GPT2 Multi-Head Attention hidden_size = 2048 n_head = 16 n_layer = 24 +total_params = 1315725312 ``` 
Throughput (tokens/sec | msec/token) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------------:|:----------------:|:----------------:|:-------------------:| -| 1 | 43.11 \| 23.20 | 40.69 \| 24.57 | 32.29 \| 30.97 | 61.61 \| 16.23 | -| 2 | 80.76 \| 12.38 | 80.87 \| 12.37 | 63.54 \| 15.74 | 121.55 \| 8.23 | -| 4 | 160.38 \| 6.24 | 154.98 \| 6.45 | 131.00 \| 7.63 | 240.06 \| 4.17 | -| 8 | 328.62 \| 3.04 | 332.90 \| 3.00 | 260.16 \| 3.84 | 492.42 \| 2.03 | -| 16 | 662.08 \| 1.51 | 669.27 \| 1.49 | 523.29 \| 1.91 | 970.59 \| 1.03 | -| 32 | 1314.92 \| 0.76 | 1287.95 \| 0.78 | 1055.57 \| 0.95 | 1999.04 \| 0.50 | -| 64 | 2118.17 \| 0.47 | 2487.35 \| 0.40 | 1969.26 \| 0.51 | 3971.09 \| 0.25 | -| 128 | 2860.26 \| 0.35 | 4268.99 \| 0.23 | 3581.49 \| 0.28 | 7514.59 \| 0.13 | -| 256 | 3487.86 \| 0.29 | 6917.01 \| 0.14 | 6132.47 \| 0.16 | 10226.50 \| 0.10 | -| 384 | 3794.16 \| 0.26 | 8821.31 \| 0.11 | 7774.37 \| 0.13 | 11094.27 \| 0.09 | -| 512 | 3804.37 \| 0.26 | 10068.51 \| 0.10 | 8872.88 \| 0.11 | 11390.85 \| 0.09 | -| 640 | 4124.01 \| 0.24 | 10547.88 \| 0.09 | 9956.58 \| 0.10 | 11625.71 \| 0.09 | -| 768 | 3950.39 \| 0.25 | 10675.09 \| 0.09 | 10584.21 \| 0.09 | 11814.31 \| 0.08 | -| 896 | 3937.28 \| 0.25 | 10780.82 \| 0.09 | 10994.00 \| 0.09 | 11744.38 \| 0.09 | -| 1024 | oom | 11192.55 \| 0.09 | 11306.37 \| 0.09 | 11534.95 \| 0.09 | +| 1 | 43.11 \| 23.20 | 40.69 \| 24.57 | 32.29 \| 30.97 | 122.76 \| 8.15 | +| 2 | 80.76 \| 12.38 | 80.87 \| 12.37 | 63.54 \| 15.74 | 247.85 \| 4.03 | +| 4 | 160.38 \| 6.24 | 154.98 \| 6.45 | 131.00 \| 7.63 | 503.52 \| 1.99 | +| 8 | 328.62 \| 3.04 | 332.90 \| 3.00 | 260.16 \| 3.84 | 1022.20 \| 0.98 | +| 16 | 662.08 \| 1.51 | 669.27 \| 1.49 | 523.29 \| 1.91 | 2027.35 \| 0.49 | +| 32 | 1314.92 \| 0.76 | 1287.95 \| 0.78 | 1055.57 \| 0.95 | 4231.82 \| 0.24 | +| 64 | 2118.17 \| 0.47 | 2487.35 \| 0.40 | 1969.26 \| 0.51 | 8311.39 \| 0.12 | +| 128 | 2860.26 \| 0.35 | 4268.99 \| 0.23 | 3581.49 \| 0.28 | 15879.15 \| 0.06 | +| 256 | 3487.86 \| 0.29 | 6917.01 \| 0.14 | 6132.47 \| 0.16 | 21635.49 \| 0.05 | +| 384 | 3794.16 \| 0.26 | 8821.31 \| 0.11 | 7774.37 \| 0.13 | 23872.25 \| 0.04 | +| 512 | 3804.37 \| 0.26 | 10068.51 \| 0.10 | 8872.88 \| 0.11 | 25009.06 \| 0.04 | +| 640 | 4124.01 \| 0.24 | 10547.88 \| 0.09 | 9956.58 \| 0.10 | oom | +| 768 | 3950.39 \| 0.25 | 10675.09 \| 0.09 | 10584.21 \| 0.09 | oom | +| 896 | 3937.28 \| 0.25 | 10780.82 \| 0.09 | 10994.00 \| 0.09 | oom | +| 1024 | oom | 11192.55 \| 0.09 | 11306.37 \| 0.09 | oom | | 1152 | oom | 11178.30 \| 0.09 | 11290.51 \| 0.09 | oom | | 1280 | oom | 11383.98 \| 0.09 | 11459.89 \| 0.09 | oom | | 1408 | oom | 11477.66 \| 0.09 | 11565.90 \| 0.09 | oom | @@ -96,21 +97,21 @@ Throughput (tokens/sec | msec/token) Latency (sec) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------:|:---------:|:---------:|:-------------------:| -| 1 | 2.32 | 2.46 | 3.10 | 1.62 | -| 2 | 2.48 | 2.47 | 3.15 | 1.65 | -| 4 | 2.49 | 2.58 | 3.05 | 1.67 | -| 8 | 2.43 | 2.40 | 3.07 | 1.62 | -| 16 | 2.42 | 2.39 | 3.06 | 1.65 | -| 32 | 2.43 | 2.48 | 3.03 | 1.60 | -| 64 | 3.02 | 2.57 | 3.25 | 1.61 | -| 128 | 4.48 | 3.00 | 3.57 | 1.70 | -| 256 | 7.34 | 3.70 | 4.17 | 2.50 | -| 384 | 10.12 | 4.35 | 4.94 | 3.46 | -| 512 | 13.46 | 5.09 | 5.77 | 4.49 | -| 640 | 15.52 | 6.07 | 6.43 | 5.51 | -| 768 | 19.44 | 7.19 | 7.26 | 6.50 | -| 896 | 22.76 | 8.31 | 8.15 | 7.63 | -| 1024 | oom | 9.15 | 9.06 | 8.88 | +| 1 | 2.32 | 2.46 | 3.10 | 0.81 | +| 2 | 2.48 | 2.47 | 3.15 | 0.81 | +| 4 | 2.49 
| 2.58 | 3.05 | 0.79 | +| 8 | 2.43 | 2.40 | 3.07 | 0.78 | +| 16 | 2.42 | 2.39 | 3.06 | 0.79 | +| 32 | 2.43 | 2.48 | 3.03 | 0.76 | +| 64 | 3.02 | 2.57 | 3.25 | 0.77 | +| 128 | 4.48 | 3.00 | 3.57 | 0.81 | +| 256 | 7.34 | 3.70 | 4.17 | 1.18 | +| 384 | 10.12 | 4.35 | 4.94 | 1.61 | +| 512 | 13.46 | 5.09 | 5.77 | 2.05 | +| 640 | 15.52 | 6.07 | 6.43 | oom | +| 768 | 19.44 | 7.19 | 7.26 | oom | +| 896 | 22.76 | 8.31 | 8.15 | oom | +| 1024 | oom | 9.15 | 9.06 | oom | | 1152 | oom | 10.31 | 10.20 | oom | | 1280 | oom | 11.24 | 11.17 | oom | | 1408 | oom | 12.27 | 12.17 | oom | From 0253839c6785646fc1b7563c6fcd19c369f660ef Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Wed, 7 Dec 2022 02:19:28 +0530 Subject: [PATCH 41/43] total params --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e083012..3610f5a 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ BLOOM hidden_size = 2048 n_head = 16 n_layer = 24 +total_params = 1311535104 ``` Throughput (tokens/sec | msec/token) From 893c5217387b16f1c2311efba57ded4607e1d008 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Wed, 7 Dec 2022 02:39:38 +0530 Subject: [PATCH 42/43] models --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3610f5a..9d4ec25 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # bigcode-inference-benchmark A100 80GB -BLOOM +## BLOOM ```python hidden_size = 2048 n_head = 16 @@ -61,7 +61,7 @@ Latency (sec) | 1792 | oom | 18.49 | 21.48 | oom | | 1920 | oom | oom | oom | oom | -GPT2 Multi-Head Attention +## GPT2 Multi-Head Attention ```python hidden_size = 2048 n_head = 16 From daea92dad31daf066bb275d005a1aa3c6c5ec4f0 Mon Sep 17 00:00:00 2001 From: Alex Gu Date: Tue, 6 Dec 2022 15:39:14 -0600 Subject: [PATCH 43/43] Add code to vary input length (#5) * input length experiments * sort input lengths in ascending order * make default max input length -1 make some updates to Alex's code --- Makefile | 7 +++++++ run.sh => run_batch_size.sh | 0 run_input_length.sh | 8 ++++++++ src/main.py | 3 ++- src/utils/arguments.py | 1 + src/utils/dummy.py | 6 ++++-- 6 files changed, 22 insertions(+), 3 deletions(-) rename run.sh => run_batch_size.sh (100%) mode change 100644 => 100755 create mode 100755 run_input_length.sh diff --git a/Makefile b/Makefile index a47032a..6f0cbde 100644 --- a/Makefile +++ b/Makefile @@ -59,3 +59,10 @@ hf-1b-GPT2-mqa1-int8: ds-inference-1b-GPT2-mqa1-fp16: deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --batch_size ${batch_size} + +# Input length experiments +hf-1b-GPT2-mqa1-int8-input-length: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length} + +hf-1b-GPT2-mha-int8-input-length: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length} diff --git a/run.sh b/run_batch_size.sh old mode 100644 new mode 100755 similarity index 100% rename from run.sh rename to run_batch_size.sh diff --git a/run_input_length.sh b/run_input_length.sh new file mode 100755 index 0000000..0bda0d5 --- /dev/null +++ b/run_input_length.sh @@ -0,0 +1,8 @@ +export 
CUDA_VISIBLE_DEVICES=0 + +rm -rf ./tmp + +for max_input_length in {4,8,16,32,64,128,256,512,1024,1536,1900} +do + make $1 batch_size=32 max_input_length=$max_input_length +done diff --git a/src/main.py b/src/main.py index 2046829..30ec6a1 100644 --- a/src/main.py +++ b/src/main.py @@ -7,7 +7,8 @@ def main() -> None: args = get_args(get_arg_parser()) - inputs = get_dummy_batch(args.batch_size) + inputs = get_dummy_batch(args.batch_size, args.max_input_length) + generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=False) pipeline_class = getattr(pipelines, args.pipeline_class) diff --git a/src/utils/arguments.py b/src/utils/arguments.py index 79f2497..158fbe3 100644 --- a/src/utils/arguments.py +++ b/src/utils/arguments.py @@ -9,6 +9,7 @@ def get_arg_parser() -> ArgumentParser: parser.add_argument("--model_class", default="GPT2", type=str) parser.add_argument("--batch_size", default=1, type=int) parser.add_argument("--dtype", default="bfloat16", type=str) + parser.add_argument("--max_input_length", default=-1, type=int) parser.add_argument("--max_new_tokens", default=100, type=int) parser.add_argument("--local_rank", type=int) parser.add_argument("--hidden_size", type=int) diff --git a/src/utils/dummy.py b/src/utils/dummy.py index ed06cdb..e1055a0 100644 --- a/src/utils/dummy.py +++ b/src/utils/dummy.py @@ -15,9 +15,11 @@ ] -def get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> List[str]: - if input_sentences is None: +def get_dummy_batch(batch_size: int, max_input_length: int = -1) -> List[str]: + if max_input_length == -1: input_sentences = copy.deepcopy(dummy_input_sentences) + else: + input_sentences = batch_size * ["Hello " * max_input_length] if batch_size > len(input_sentences): input_sentences *= math.ceil(batch_size / len(input_sentences))
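
Note on the input-length experiments in the final patch: the hunks above show only the changed lines of `src/utils/dummy.py`, so the following is a minimal, self-contained sketch of the resulting batch construction. The sentence list and the final truncation to `batch_size` are assumptions (the repository ships its own `dummy_input_sentences`, and the tail of `get_dummy_batch` is not visible in the diff); when `max_input_length` is set, each prompt is `"Hello "` repeated, which tokenizes to roughly one token per repetition.

```python
import copy
import math
from typing import List

# Illustrative stand-in for the repository's dummy_input_sentences.
dummy_input_sentences = [
    "DeepSpeed is a machine learning framework",
    "He is working on",
    "He has a",
    "He got all",
]


def get_dummy_batch(batch_size: int, max_input_length: int = -1) -> List[str]:
    if max_input_length == -1:
        # Default path: reuse the fixed prompt list.
        input_sentences = copy.deepcopy(dummy_input_sentences)
    else:
        # Input-length experiments: every prompt is "Hello " repeated
        # max_input_length times, giving roughly max_input_length tokens each.
        input_sentences = batch_size * ["Hello " * max_input_length]

    # Tile the list until it covers the requested batch size.
    if batch_size > len(input_sentences):
        input_sentences *= math.ceil(batch_size / len(input_sentences))

    # Assumed final step: trim to exactly batch_size prompts.
    return input_sentences[:batch_size]


# Mirrors one step of run_input_length.sh, e.g.
# `make hf-1b-GPT2-mha-int8-input-length batch_size=32 max_input_length=512`.
batch = get_dummy_batch(batch_size=32, max_input_length=512)
assert len(batch) == 32 and batch[0] == "Hello " * 512
```

This keeps the prompt length fixed within a run, so sweeping `max_input_length` in `run_input_length.sh` isolates the cost of the input context from the 100 generated tokens set by the `--max_new_tokens` default.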