From 812a72c0220aab136432f4fc7dc344753d5a2dac Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 17 Oct 2022 17:17:09 +0530 Subject: [PATCH 01/43] refactor --- src/main.py | 18 +++ src/pipelines/__init__.py | 3 + src/{ => pipelines}/ds_inference.py | 21 +--- src/{ => pipelines}/hf.py | 25 ++-- src/{model.py => pipelines/pipeline.py} | 4 +- src/utils.py | 160 ------------------------ src/utils/__init__.py | 3 + src/utils/arguments.py | 23 ++++ src/utils/benchmark.py | 73 +++++++++++ src/utils/dummy.py | 26 ++++ src/utils/utils.py | 45 +++++++ 11 files changed, 205 insertions(+), 196 deletions(-) create mode 100644 src/main.py create mode 100644 src/pipelines/__init__.py rename src/{ => pipelines}/ds_inference.py (73%) rename src/{ => pipelines}/hf.py (50%) rename src/{model.py => pipelines/pipeline.py} (96%) delete mode 100644 src/utils.py create mode 100644 src/utils/__init__.py create mode 100644 src/utils/arguments.py create mode 100644 src/utils/benchmark.py create mode 100644 src/utils/dummy.py create mode 100644 src/utils/utils.py diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..2046829 --- /dev/null +++ b/src/main.py @@ -0,0 +1,18 @@ +import pipelines +from utils import benchmark_end_to_end, get_arg_parser, get_args, get_dummy_batch + + +def main() -> None: + # deepspeed.init_distributed("nccl") + + args = get_args(get_arg_parser()) + + inputs = get_dummy_batch(args.batch_size) + generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=False) + + pipeline_class = getattr(pipelines, args.pipeline_class) + benchmark_end_to_end(args, pipeline_class, inputs, generate_kwargs) + + +if __name__ == "__main__": + main() diff --git a/src/pipelines/__init__.py b/src/pipelines/__init__.py new file mode 100644 index 0000000..2581940 --- /dev/null +++ b/src/pipelines/__init__.py @@ -0,0 +1,3 @@ +from .ds_inference import DS_Inference_Pipeline +from .hf import HF_CPU_Pipeline, HF_GPU_Pipeline +from .pipeline import Pipeline diff --git a/src/ds_inference.py b/src/pipelines/ds_inference.py similarity index 73% rename from src/ds_inference.py rename to src/pipelines/ds_inference.py index 28997a3..fca1cb2 100644 --- a/src/ds_inference.py +++ b/src/pipelines/ds_inference.py @@ -5,12 +5,10 @@ import torch from transformers import BloomForCausalLM -import utils -from model import Model -from utils import benchmark_end_to_end, get_dummy_batch +from .pipeline import Pipeline -class HFAccelerateModel(Model): +class DS_Inference_Pipeline(Pipeline): def __init__(self, args: Namespace) -> None: super().__init__(args) @@ -40,18 +38,3 @@ def __init__(self, args: Namespace) -> None: ) self.input_device = torch.cuda.current_device() - - -def main() -> None: - deepspeed.init_distributed("nccl") - - args = utils.get_args(utils.get_arg_parser()) - - inputs = get_dummy_batch(args.batch_size) - generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=False) - - benchmark_end_to_end(args, HFAccelerateModel, inputs, generate_kwargs) - - -if __name__ == "__main__": - main() diff --git a/src/hf.py b/src/pipelines/hf.py similarity index 50% rename from src/hf.py rename to src/pipelines/hf.py index 1d411eb..e662c1e 100644 --- a/src/hf.py +++ b/src/pipelines/hf.py @@ -3,13 +3,11 @@ import torch from transformers import BloomForCausalLM -import utils -from model import Model -from utils import benchmark_end_to_end, get_dummy_batch +from .pipeline import Pipeline -class HFAccelerateModel(Model): - def __init__(self, args: Namespace) -> None: +class HF_Pipeline(Pipeline): + def 
__init__(self, args: Namespace, device: str = "cpu") -> None: super().__init__(args) model_kwargs = {} @@ -18,19 +16,16 @@ def __init__(self, args: Namespace) -> None: else: model_kwargs["torch_dtype"] = args.dtype - self.input_device = "cuda:0" + self.input_device = device self.model = BloomForCausalLM._from_config(self.config, **model_kwargs).to(self.input_device) self.model.eval() -def main() -> None: - args = utils.get_args(utils.get_arg_parser()) - - inputs = get_dummy_batch(args.batch_size) - generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=False) - - benchmark_end_to_end(args, HFAccelerateModel, inputs, generate_kwargs) +class HF_CPU_Pipeline(HF_Pipeline): + def __init__(self, args: Namespace) -> None: + super().__init__(args, "cpu") -if __name__ == "__main__": - main() +class HF_GPU_Pipeline(HF_CPU_Pipeline): + def __init__(self, args: Namespace) -> None: + super().__init__(args, "cuda:0") diff --git a/src/model.py b/src/pipelines/pipeline.py similarity index 96% rename from src/model.py rename to src/pipelines/pipeline.py index 35d0804..9d20cbf 100644 --- a/src/model.py +++ b/src/pipelines/pipeline.py @@ -5,7 +5,7 @@ from transformers import AutoTokenizer, BloomConfig -class Model(torch.nn.Module): +class Pipeline: def __init__(self, args: Namespace) -> None: super().__init__() @@ -45,7 +45,7 @@ def __init__(self, args: Namespace) -> None: self.model = None self.input_device = None - def generate(self, text: List[str], **generate_kwargs) -> Tuple[List[str], List[int]]: + def __call__(self, text: List[str], **generate_kwargs) -> Tuple[List[str], List[int]]: input_tokens = self.tokenizer(text, return_tensors="pt", padding=True) for t in input_tokens: diff --git a/src/utils.py b/src/utils.py deleted file mode 100644 index 5465ce1..0000000 --- a/src/utils.py +++ /dev/null @@ -1,160 +0,0 @@ -import copy -import gc -import math -import time -from argparse import ArgumentParser, Namespace -from functools import partial -from typing import Any, List, Tuple, Union - -import torch -import torch.distributed as dist - -from model import Model - - -# used for benchmarks -dummy_input_sentences = [ - "DeepSpeed is a machine learning framework", - "He is working on", - "He has a", - "He got all", - "Everyone is happy and I can", - "The new movie that got Oscar this year", - "In the far far distance from our galaxy,", - "Peace is the only way", -] - - -def get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> List[str]: - if input_sentences is None: - input_sentences = copy.deepcopy(dummy_input_sentences) - - if batch_size > len(input_sentences): - input_sentences *= math.ceil(batch_size / len(input_sentences)) - input_sentences = input_sentences[:batch_size] - - return input_sentences - - -def get_arg_parser() -> ArgumentParser: - parser = ArgumentParser() - parser.add_argument("--batch_size", default=1, type=int) - parser.add_argument("--dtype", default="bfloat16", type=str) - parser.add_argument("--max_new_tokens", default=100, type=int) - parser.add_argument("--local_rank", type=int) - parser.add_argument("--hidden_size", type=int) - parser.add_argument("--n_head", type=int) - parser.add_argument("--n_layer", type=int) - parser.add_argument("--benchmark_cycles", type=int, default=5) - return parser - - -def get_args(parser: ArgumentParser) -> Namespace: - args = parser.parse_args() - args.dtype = getattr(torch, args.dtype) - return args - - -def run_rank_n(func: partial, barrier: bool = False, rank: int = 0, other_rank_output: Any = None) -> Any: - # runs 
function on only process with specified rank - if dist.is_initialized(): - if dist.get_rank() == rank: - output = func() - if barrier: - dist.barrier() - return output - else: - if barrier: - dist.barrier() - return other_rank_output - else: - return func() - - -def print_rank_n(*values, rank: int = 0) -> None: - # print on only process with specified rank - if dist.is_initialized(): - if dist.get_rank() == rank: - print(*values) - else: - print(*values) - - -def run_and_log_time(execs: Union[List[partial], partial]) -> Tuple[Union[List[Any], Any], float]: - # runs a function / list of functions and times them - start_time = time.time() - - if type(execs) == list: - results = [] - for f in execs: - results.append(f()) - else: - results = execs() - - time_elapsed = time.time() - start_time - return results, time_elapsed - - -def benchmark_generation(model: Model, text: List[str], generate_kwargs: dict, cycles: int = 5) -> int: - # run benchmarks for number of cycles - total_new_tokens_generated = 0 - for _ in range(cycles): - _, num_generated_tokens = model.generate(text, **generate_kwargs) - total_new_tokens_generated += sum(new_tokens for new_tokens in num_generated_tokens) - return total_new_tokens_generated - - -def get_benchmark_results( - benchmark_time: float, initialization_time: float, total_new_tokens_generated: int, batch_size: int, cycles: int -) -> str: - throughput = total_new_tokens_generated / benchmark_time - latency = benchmark_time / cycles - return f""" -*** Performance stats: -Throughput (including tokenization) = {throughput:.2f} tokens/sec -Throughput (including tokenization) = {1000 / throughput:.2f} msecs/token -Model loading time = {initialization_time:.2f} secs -Total tokens generated = {total_new_tokens_generated} with batch size = {batch_size} -Latency = {latency:.2f} secs -Model loading time + generation time per batch = {initialization_time + latency:.2f} secs -""" - - -def benchmark_end_to_end(args: Namespace, model_class: Model, text: List[str], generate_kwargs: dict) -> None: - model, initialization_time = run_and_log_time(partial(model_class, args=args)) - - print_rank_n("num params =", model.get_num_parameters()) - - print_rank_n(f"generate_kwargs = {generate_kwargs}") - print_rank_n(f"batch_size = {args.batch_size}") - - # warmup is a must if measuring speed as it's when all the optimizations are performed - # e.g. 
on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs - generated_text, _ = model.generate(text, **generate_kwargs) - - for i, o in zip(text, generated_text): - print_rank_n(f"{'-' * 60}\nINPUT = {i}\nOUTPUT = {o}\n") - - if args.benchmark_cycles > 0: - print_rank_n(f"*** Running benchmark") - - torch.cuda.empty_cache() - gc.collect() - torch.cuda.synchronize() - - # benchmark - total_new_tokens_generated, benchmark_time = run_and_log_time( - partial( - benchmark_generation, - model=model, - text=text, - generate_kwargs=generate_kwargs, - cycles=args.benchmark_cycles, - ) - ) - - print_rank_n( - get_benchmark_results( - benchmark_time, initialization_time, total_new_tokens_generated, args.batch_size, args.benchmark_cycles - ) - ) diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000..a947456 --- /dev/null +++ b/src/utils/__init__.py @@ -0,0 +1,3 @@ +from .arguments import get_arg_parser, get_args +from .benchmark import benchmark_end_to_end +from .dummy import get_dummy_batch diff --git a/src/utils/arguments.py b/src/utils/arguments.py new file mode 100644 index 0000000..4078d17 --- /dev/null +++ b/src/utils/arguments.py @@ -0,0 +1,23 @@ +from argparse import ArgumentParser, Namespace + +import torch + + +def get_arg_parser() -> ArgumentParser: + parser = ArgumentParser() + parser.add_argument("--pipeline_class", default="HF_GPU_Pipeline", type=str) + parser.add_argument("--batch_size", default=1, type=int) + parser.add_argument("--dtype", default="bfloat16", type=str) + parser.add_argument("--max_new_tokens", default=100, type=int) + parser.add_argument("--local_rank", type=int) + parser.add_argument("--hidden_size", type=int) + parser.add_argument("--n_head", type=int) + parser.add_argument("--n_layer", type=int) + parser.add_argument("--benchmark_cycles", type=int, default=5) + return parser + + +def get_args(parser: ArgumentParser) -> Namespace: + args = parser.parse_args() + args.dtype = getattr(torch, args.dtype) + return args diff --git a/src/utils/benchmark.py b/src/utils/benchmark.py new file mode 100644 index 0000000..0d71933 --- /dev/null +++ b/src/utils/benchmark.py @@ -0,0 +1,73 @@ +import gc +from argparse import Namespace +from functools import partial +from typing import List + +import torch + +from pipelines import Pipeline + + +def benchmark_generation(pipeline: Pipeline, text: List[str], generate_kwargs: dict, cycles: int = 5) -> int: + # run benchmarks for number of cycles + total_new_tokens_generated = 0 + for _ in range(cycles): + _, num_generated_tokens = pipeline(text, **generate_kwargs) + total_new_tokens_generated += sum(new_tokens for new_tokens in num_generated_tokens) + return total_new_tokens_generated + + +def get_benchmark_results( + benchmark_time: float, initialization_time: float, total_new_tokens_generated: int, batch_size: int, cycles: int +) -> str: + throughput = total_new_tokens_generated / benchmark_time + latency = benchmark_time / cycles + return f""" +*** Performance stats: +Throughput (including tokenization) = {throughput:.2f} tokens/sec +Throughput (including tokenization) = {1000 / throughput:.2f} msecs/token +Model loading time = {initialization_time:.2f} secs +Total tokens generated = {total_new_tokens_generated} with batch size = {batch_size} +Latency = {latency:.2f} secs +Model loading time + generation time per batch = {initialization_time + latency:.2f} secs +""" + + +def benchmark_end_to_end(args: Namespace, pipeline_class: Pipeline, text: List[str], generate_kwargs: 
dict) -> None: + pipeline, initialization_time = run_and_log_time(partial(pipeline_class, args=args)) + + print_rank_n("num params =", pipeline.get_num_parameters()) + + print_rank_n(f"generate_kwargs = {generate_kwargs}") + print_rank_n(f"batch_size = {args.batch_size}") + + # warmup is a must if measuring speed as it's when all the optimizations are performed + # e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs + generated_text, _ = pipeline(text, **generate_kwargs) + + for i, o in zip(text, generated_text): + print_rank_n(f"{'-' * 60}\nINPUT = {i}\nOUTPUT = {o}\n") + + if args.benchmark_cycles > 0: + print_rank_n(f"*** Running benchmark") + + torch.cuda.empty_cache() + gc.collect() + torch.cuda.synchronize() + + # benchmark + total_new_tokens_generated, benchmark_time = run_and_log_time( + partial( + benchmark_generation, + pipeline=pipeline, + text=text, + generate_kwargs=generate_kwargs, + cycles=args.benchmark_cycles, + ) + ) + + print_rank_n( + get_benchmark_results( + benchmark_time, initialization_time, total_new_tokens_generated, args.batch_size, args.benchmark_cycles + ) + ) diff --git a/src/utils/dummy.py b/src/utils/dummy.py new file mode 100644 index 0000000..ed06cdb --- /dev/null +++ b/src/utils/dummy.py @@ -0,0 +1,26 @@ +import copy +import math +from typing import List + + +dummy_input_sentences = [ + "DeepSpeed is a machine learning framework", + "He is working on", + "He has a", + "He got all", + "Everyone is happy and I can", + "The new movie that got Oscar this year", + "In the far far distance from our galaxy,", + "Peace is the only way", +] + + +def get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> List[str]: + if input_sentences is None: + input_sentences = copy.deepcopy(dummy_input_sentences) + + if batch_size > len(input_sentences): + input_sentences *= math.ceil(batch_size / len(input_sentences)) + input_sentences = input_sentences[:batch_size] + + return input_sentences diff --git a/src/utils/utils.py b/src/utils/utils.py new file mode 100644 index 0000000..2ae7b7f --- /dev/null +++ b/src/utils/utils.py @@ -0,0 +1,45 @@ +import time +from functools import partial +from typing import Any, List, Tuple, Union + +import torch.distributed as dist + + +def run_and_log_time(execs: Union[List[partial], partial]) -> Tuple[Union[List[Any], Any], float]: + # runs a function / list of functions and times them + start_time = time.time() + + if type(execs) == list: + results = [] + for f in execs: + results.append(f()) + else: + results = execs() + + time_elapsed = time.time() - start_time + return results, time_elapsed + + +def run_rank_n(func: partial, barrier: bool = False, rank: int = 0, other_rank_output: Any = None) -> Any: + # runs function on only process with specified rank + if dist.is_initialized(): + if dist.get_rank() == rank: + output = func() + if barrier: + dist.barrier() + return output + else: + if barrier: + dist.barrier() + return other_rank_output + else: + return func() + + +def print_rank_n(*values, rank: int = 0) -> None: + # print on only process with specified rank + if dist.is_initialized(): + if dist.get_rank() == rank: + print(*values) + else: + print(*values) From 21713aca58dd1099d5e2121cab2a68a1a73aee08 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 17 Oct 2022 17:41:11 +0530 Subject: [PATCH 02/43] refactor --- src/pipelines/pipeline.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py index 9d20cbf..7c6dfb8 
100644 --- a/src/pipelines/pipeline.py +++ b/src/pipelines/pipeline.py @@ -7,8 +7,6 @@ class Pipeline: def __init__(self, args: Namespace) -> None: - super().__init__() - self.config = BloomConfig.from_dict( { "apply_residual_connection_post_layernorm": False, @@ -40,6 +38,7 @@ def __init__(self, args: Namespace) -> None: } ) + # hardcoded for now to bigscience/bloom self.tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom") self.model = None From 0ea738aac299035ca37a075c97d2564849052efb Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 17 Oct 2022 17:44:40 +0530 Subject: [PATCH 03/43] refactor --- benchmark.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark.sh b/benchmark.sh index 001d543..25ec7e8 100644 --- a/benchmark.sh +++ b/benchmark.sh @@ -1,5 +1,5 @@ # HF -python src/hf.py --hidden_size 6144 --n_head 32 --n_layer 30 +python src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class HF_GPU_Pipeline # DS-inference -deepspeed --num_gpus 1 src/ds_inference.py --hidden_size 6144 --n_head 32 --n_layer 30 +deepspeed --num_gpus 1 src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class DS_Inference_Pipeline From 6239fc6658b82ee8d48077df1af1d04516b4d612 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 17 Oct 2022 17:46:19 +0530 Subject: [PATCH 04/43] refactor --- src/utils/benchmark.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/utils/benchmark.py b/src/utils/benchmark.py index 0d71933..709c2fd 100644 --- a/src/utils/benchmark.py +++ b/src/utils/benchmark.py @@ -7,6 +7,8 @@ from pipelines import Pipeline +from .utils import print_rank_n, run_and_log_time + def benchmark_generation(pipeline: Pipeline, text: List[str], generate_kwargs: dict, cycles: int = 5) -> int: # run benchmarks for number of cycles From 01e9515c611be1ed7e0181729ec8bf16afb90ff3 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 17 Oct 2022 17:49:23 +0530 Subject: [PATCH 05/43] refactor --- src/pipelines/hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipelines/hf.py b/src/pipelines/hf.py index e662c1e..22da2d2 100644 --- a/src/pipelines/hf.py +++ b/src/pipelines/hf.py @@ -26,6 +26,6 @@ def __init__(self, args: Namespace) -> None: super().__init__(args, "cpu") -class HF_GPU_Pipeline(HF_CPU_Pipeline): +class HF_GPU_Pipeline(HF_Pipeline): def __init__(self, args: Namespace) -> None: super().__init__(args, "cuda:0") From b5a29b889d72eaeb77867787bcefda3fe75d7c4f Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Thu, 1 Dec 2022 11:46:54 +0530 Subject: [PATCH 06/43] test --- Makefile | 5 +++ benchmark.sh | 5 --- src/pipelines/pipeline.py | 68 ++++++++++++++++++--------------------- src/utils/arguments.py | 1 + 4 files changed, 38 insertions(+), 41 deletions(-) create mode 100644 Makefile delete mode 100644 benchmark.sh diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..cf4a539 --- /dev/null +++ b/Makefile @@ -0,0 +1,5 @@ +hf: + python src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class HF_CPU_Pipeline --model_class GPT2 --n_positions 2048 + +ds-inference: + deepspeed --num_gpus 1 src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class DS_Inference_Pipeline --model_class BLOOM diff --git a/benchmark.sh b/benchmark.sh deleted file mode 100644 index 25ec7e8..0000000 --- a/benchmark.sh +++ /dev/null @@ -1,5 +0,0 @@ -# HF -python src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class HF_GPU_Pipeline - -# DS-inference -deepspeed --num_gpus 1 
src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class DS_Inference_Pipeline diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py index 7c6dfb8..f371cb5 100644 --- a/src/pipelines/pipeline.py +++ b/src/pipelines/pipeline.py @@ -1,46 +1,13 @@ from argparse import Namespace -from typing import List, Tuple +from typing import List, Tuple, Union import torch -from transformers import AutoTokenizer, BloomConfig +from transformers import AutoTokenizer, BloomConfig, GPT2Config class Pipeline: def __init__(self, args: Namespace) -> None: - self.config = BloomConfig.from_dict( - { - "apply_residual_connection_post_layernorm": False, - "architectures": ["BloomModel"], - "attention_dropout": 0.0, - "attention_softmax_in_fp32": True, - "bias_dropout_fusion": True, - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_dropout": 0.0, - "hidden_size": args.hidden_size, - "initializer_range": 0.02, - "layer_norm_epsilon": 1e-05, - "masked_softmax_fusion": True, - "model_type": "bloom", - "n_head": args.n_head, - "n_inner": None, - "n_layer": args.n_layer, - "offset_alibi": 100, - "pad_token_id": 3, - "pretraining_tp": 1, - "skip_bias_add": True, - "skip_bias_add_qkv": False, - "slow_but_exact": False, - "transformers_version": "4.22.2", - "unk_token_id": 0, - "use_cache": True, - "vocab_size": 250880, - } - ) - - # hardcoded for now to bigscience/bloom - self.tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom") - + self.config, self.tokenizer = get_config_tokenizer(args) self.model = None self.input_device = None @@ -69,3 +36,32 @@ def get_num_parameters(self) -> int: for i in self.model.parameters(): param_count += i.numel() return param_count + + +def get_config_tokenizer(args: Namespace) -> Union[BloomConfig, GPT2Config]: + tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom") + + if args.model_class.lower() == "bloom": + config = BloomConfig( + attention_softmax_in_fp32=True, + hidden_size=args.hidden_size, + n_head=args.n_head, + n_layer=args.n_layer, + vocab_size=len(tokenizer), + bos_token_id=tokenizer.bos_token_id, + eos_token_id=tokenizer.eos_token_id, + ) + elif args.model_class.lower() == "gpt2": + config = GPT2Config( + n_embd=args.hidden_size, + n_head=args.n_head, + n_layer=args.n_layer, + n_positions=args.n_positions, + bos_token_id=tokenizer.bos_token_id, + eos_token_id=tokenizer.eos_token_id, + attention_type=args.attention_type, + print_details=False, + vocab_size=len(tokenizer), + ) + + return config, tokenizer diff --git a/src/utils/arguments.py b/src/utils/arguments.py index 4078d17..9a46002 100644 --- a/src/utils/arguments.py +++ b/src/utils/arguments.py @@ -6,6 +6,7 @@ def get_arg_parser() -> ArgumentParser: parser = ArgumentParser() parser.add_argument("--pipeline_class", default="HF_GPU_Pipeline", type=str) + parser.add_argument("--model_class", default="GPT2", type=str) parser.add_argument("--batch_size", default=1, type=int) parser.add_argument("--dtype", default="bfloat16", type=str) parser.add_argument("--max_new_tokens", default=100, type=int) From e4a29b5e99588402c2a2693fac779e8594e31fcf Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Thu, 1 Dec 2022 13:26:58 +0530 Subject: [PATCH 07/43] test --- src/utils/arguments.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/utils/arguments.py b/src/utils/arguments.py index 9a46002..bcc3c38 100644 --- a/src/utils/arguments.py +++ b/src/utils/arguments.py @@ -12,6 +12,7 @@ def get_arg_parser() -> ArgumentParser: parser.add_argument("--max_new_tokens", default=100, type=int) 
parser.add_argument("--local_rank", type=int) parser.add_argument("--hidden_size", type=int) + parser.add_argument("--n_positions", type=int) parser.add_argument("--n_head", type=int) parser.add_argument("--n_layer", type=int) parser.add_argument("--benchmark_cycles", type=int, default=5) From 48f0aa094bb91df0b43e2afcd7c80de16685bbc7 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Thu, 1 Dec 2022 13:28:08 +0530 Subject: [PATCH 08/43] test --- Makefile | 2 +- src/utils/arguments.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index cf4a539..4c814de 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ hf: - python src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class HF_CPU_Pipeline --model_class GPT2 --n_positions 2048 + python src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class HF_CPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 ds-inference: deepspeed --num_gpus 1 src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class DS_Inference_Pipeline --model_class BLOOM diff --git a/src/utils/arguments.py b/src/utils/arguments.py index bcc3c38..a2322e0 100644 --- a/src/utils/arguments.py +++ b/src/utils/arguments.py @@ -12,6 +12,7 @@ def get_arg_parser() -> ArgumentParser: parser.add_argument("--max_new_tokens", default=100, type=int) parser.add_argument("--local_rank", type=int) parser.add_argument("--hidden_size", type=int) + parser.add_argument("--attention_type", type=int) parser.add_argument("--n_positions", type=int) parser.add_argument("--n_head", type=int) parser.add_argument("--n_layer", type=int) From 646b63b6a6fc8b1584bd5010c77af38968e7ffc1 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Thu, 1 Dec 2022 13:34:53 +0530 Subject: [PATCH 09/43] test --- src/pipelines/ds_inference.py | 3 +-- src/pipelines/hf.py | 3 +-- src/pipelines/pipeline.py | 10 ++++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/pipelines/ds_inference.py b/src/pipelines/ds_inference.py index fca1cb2..31d4d8e 100644 --- a/src/pipelines/ds_inference.py +++ b/src/pipelines/ds_inference.py @@ -3,7 +3,6 @@ import deepspeed import torch -from transformers import BloomForCausalLM from .pipeline import Pipeline @@ -16,7 +15,7 @@ def __init__(self, args: Namespace) -> None: # with deepspeed.OnDevice(dtype=torch.bfloat16, device="meta"): # model = BloomForCausalLM._from_config(config, torch_dtype=torch.bfloat16) - self.model = BloomForCausalLM._from_config(self.config, torch_dtype=torch.bfloat16) + self.model = self.model_class._from_config(self.config, torch_dtype=torch.bfloat16) self.model.eval() # checkpoints_json = os.path.join(args.model_name, "checkpoints.json") diff --git a/src/pipelines/hf.py b/src/pipelines/hf.py index 22da2d2..60b7a77 100644 --- a/src/pipelines/hf.py +++ b/src/pipelines/hf.py @@ -1,7 +1,6 @@ from argparse import Namespace import torch -from transformers import BloomForCausalLM from .pipeline import Pipeline @@ -17,7 +16,7 @@ def __init__(self, args: Namespace, device: str = "cpu") -> None: model_kwargs["torch_dtype"] = args.dtype self.input_device = device - self.model = BloomForCausalLM._from_config(self.config, **model_kwargs).to(self.input_device) + self.model = self.model_class._from_config(self.config, **model_kwargs).to(self.input_device) self.model.eval() diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py index f371cb5..551dc23 100644 --- a/src/pipelines/pipeline.py +++ b/src/pipelines/pipeline.py @@ -2,12 +2,12 @@ from typing import 
List, Tuple, Union import torch -from transformers import AutoTokenizer, BloomConfig, GPT2Config +from transformers import AutoTokenizer, BloomConfig, BloomForCausalLM, GPT2Config, GPT2LMHeadModel class Pipeline: def __init__(self, args: Namespace) -> None: - self.config, self.tokenizer = get_config_tokenizer(args) + self.config, self.tokenizer, self.model_class = get_config_tokenizer_model_class(args) self.model = None self.input_device = None @@ -38,7 +38,7 @@ def get_num_parameters(self) -> int: return param_count -def get_config_tokenizer(args: Namespace) -> Union[BloomConfig, GPT2Config]: +def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2Config]: tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom") if args.model_class.lower() == "bloom": @@ -51,6 +51,7 @@ def get_config_tokenizer(args: Namespace) -> Union[BloomConfig, GPT2Config]: bos_token_id=tokenizer.bos_token_id, eos_token_id=tokenizer.eos_token_id, ) + model_class = BloomForCausalLM elif args.model_class.lower() == "gpt2": config = GPT2Config( n_embd=args.hidden_size, @@ -63,5 +64,6 @@ def get_config_tokenizer(args: Namespace) -> Union[BloomConfig, GPT2Config]: print_details=False, vocab_size=len(tokenizer), ) + model_class = GPT2LMHeadModel - return config, tokenizer + return config, tokenizer, model_class From 3281d16a986ff29436a78560094954a62a1d6c74 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Thu, 1 Dec 2022 14:07:19 +0530 Subject: [PATCH 10/43] test --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4c814de..a6bddae 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ hf: - python src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class HF_CPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 + python src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 ds-inference: deepspeed --num_gpus 1 src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class DS_Inference_Pipeline --model_class BLOOM From 2fbb6c3cbd75f15710c48f4a3e5c37221d4679cf Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Thu, 1 Dec 2022 14:11:18 +0530 Subject: [PATCH 11/43] test --- Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Makefile b/Makefile index a6bddae..b34dcf8 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,11 @@ +install-mqa-transformers: + git clone https://github.com/bigcode-project/transformers.git; \ + cd transformers; \ + git checkout multi_query; \ + pip install .; \ + cd ..; \ + rm -rf transformers; + hf: python src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 From 1090704ebccd555be42c5bcc36673d7d8e438ae0 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Thu, 1 Dec 2022 15:39:51 +0530 Subject: [PATCH 12/43] test --- src/pipelines/pipeline.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py index 551dc23..3596c6b 100644 --- a/src/pipelines/pipeline.py +++ b/src/pipelines/pipeline.py @@ -60,10 +60,21 @@ def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2 n_positions=args.n_positions, bos_token_id=tokenizer.bos_token_id, eos_token_id=tokenizer.eos_token_id, - attention_type=args.attention_type, + attention_type=get_attention_type(args.attention_type), print_details=False, 
vocab_size=len(tokenizer), ) model_class = GPT2LMHeadModel return config, tokenizer, model_class + + +def get_attention_type(attention_type: int): + from transformers.models.gpt2.modeling_gpt2 import AttentionType + + if attention_type == 1: + return AttentionType.MULTI_HEAD + elif attention_type == 2: + return AttentionType.MULTI_QUERY + elif attention_type == 3: + return AttentionType.MULTI_QUERY_1 From b94ea81b776791c1fe23e83bd5e1b27d75f2db45 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 01:06:28 +0530 Subject: [PATCH 13/43] fp32, bf16, int8 --- Makefile | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- run.sh | 10 ++++++++++ 2 files changed, 62 insertions(+), 4 deletions(-) create mode 100644 run.sh diff --git a/Makefile b/Makefile index b34dcf8..2e75294 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,5 @@ +batch_size := 1 + install-mqa-transformers: git clone https://github.com/bigcode-project/transformers.git; \ cd transformers; \ @@ -6,8 +8,54 @@ install-mqa-transformers: cd ..; \ rm -rf transformers; -hf: - python src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 +# BLOOM AliBi +hf-1b-bloom-fp32: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype float32 --batch_size ${batch_size} + +hf-1b-bloom-bf16: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype bfloat16 --batch_size ${batch_size} + +hf-1b-bloom-int8: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype int8 --batch_size ${batch_size} + +ds-inference-1b-bloom: + deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class BLOOM --batch_size ${batch_size} + +# GPT2 MHA +hf-1b-GPT2-mha-fp32: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype float32 --batch_size ${batch_size} + +hf-1b-GPT2-mha-bf16: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype bfloat16 --batch_size ${batch_size} + +hf-1b-GPT2-mha-int8: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} + +ds-inference-1b-GPT2-mha: + deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --batch_size ${batch_size} + +# GPT2 MQA +hf-1b-GPT2-mqa-fp32: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype float32 --batch_size ${batch_size} + +hf-1b-GPT2-mqa-bf16: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype bfloat16 --batch_size ${batch_size} + +hf-1b-GPT2-mqa-int8: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype int8 --batch_size ${batch_size} + 
+ds-inference-1b-GPT2-mqa: + deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --batch_size ${batch_size} + +# GPT2 MQA1 +hf-1b-GPT2-mqa1-fp32: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype float32 --batch_size ${batch_size} + +hf-1b-GPT2-mqa1-bf16: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype bfloat16 --batch_size ${batch_size} + +hf-1b-GPT2-mqa1-int8: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size ${batch_size} -ds-inference: - deepspeed --num_gpus 1 src/main.py --hidden_size 6144 --n_head 32 --n_layer 30 --pipeline_class DS_Inference_Pipeline --model_class BLOOM +ds-inference-1b-GPT2-mqa1: + deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --batch_size ${batch_size} diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..dc05add --- /dev/null +++ b/run.sh @@ -0,0 +1,10 @@ +for bs in {1,2,4,8,16,32,64} +do + make $1 batch_size=$bs +done + +for i in {1..20} +do + bs=$(($i*128)) + make $1 batch_size=$bs +done From 17534fd97953297b78b19f41cabce61f624879bb Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 01:18:14 +0530 Subject: [PATCH 14/43] fp32, bf16, int8 --- src/pipelines/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py index 3596c6b..d812474 100644 --- a/src/pipelines/pipeline.py +++ b/src/pipelines/pipeline.py @@ -39,7 +39,7 @@ def get_num_parameters(self) -> int: def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2Config]: - tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom") + tokenizer = AutoTokenizer.from_pretrained("gpt2") if args.model_class.lower() == "bloom": config = BloomConfig( From e7230b5d2ee8b39005b305740c4522d0bf284194 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 01:20:46 +0530 Subject: [PATCH 15/43] fp32, bf16, int8 --- src/pipelines/pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py index d812474..7e6bca3 100644 --- a/src/pipelines/pipeline.py +++ b/src/pipelines/pipeline.py @@ -40,6 +40,7 @@ def get_num_parameters(self) -> int: def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2Config]: tokenizer = AutoTokenizer.from_pretrained("gpt2") + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) if args.model_class.lower() == "bloom": config = BloomConfig( From 38c616b09fb5eb9a9fdcaa1ed34c2376104beca6 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 01:35:48 +0530 Subject: [PATCH 16/43] use_cache --- src/pipelines/pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py index 7e6bca3..7fd15ad 100644 --- a/src/pipelines/pipeline.py +++ b/src/pipelines/pipeline.py @@ -51,6 +51,7 @@ def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2 vocab_size=len(tokenizer), bos_token_id=tokenizer.bos_token_id, 
eos_token_id=tokenizer.eos_token_id, + use_cache=True, ) model_class = BloomForCausalLM elif args.model_class.lower() == "gpt2": @@ -64,6 +65,7 @@ def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2 attention_type=get_attention_type(args.attention_type), print_details=False, vocab_size=len(tokenizer), + use_cache=True, ) model_class = GPT2LMHeadModel From 15a2c80fe5f683d5a5b245087f2507cc25168649 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 02:23:32 +0530 Subject: [PATCH 17/43] use_cache --- README.md | 3 ++- run.sh | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cbb2100..6c21a4e 100644 --- a/README.md +++ b/README.md @@ -1 +1,2 @@ -# bigcode-inference-benchmark \ No newline at end of file +# bigcode-inference-benchmark + diff --git a/run.sh b/run.sh index dc05add..3a8e1d1 100644 --- a/run.sh +++ b/run.sh @@ -1,3 +1,5 @@ +export CUDA_VISIBLE_DEVICES=0 + for bs in {1,2,4,8,16,32,64} do make $1 batch_size=$bs From 80ba9bbe93086c6c6c8b16746ececaad869c937f Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 03:51:40 +0530 Subject: [PATCH 18/43] gc --- README.md | 39 +++++++++++++++++++++++++++++++++++++++ src/utils/arguments.py | 1 + src/utils/benchmark.py | 7 ++++--- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6c21a4e..a87d23a 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,41 @@ # bigcode-inference-benchmark +BLOOM\ +HF-accelerate\ +A100 80GB + +tokens/sec | msec/token +| batch_size | fp32 | bf16 | int8
LLM.int8() | +|:----------:|:---------------:|:----:|:------------------:| +| 1 | 45.04 \| 22.20 | | | +| 2 | 84.39 \| 11.85 | | | +| 4 | 167.85 \| 5.96 | | | +| 8 | 326.72 \| 3.06 | | | +| 16 | 682.63 \| 1.46 | | | +| 32 | 1374.97 \| 0.73 | | | +| 64 | 2380.43 \| 0.42 | | | +| 128 | 2684.96 \| 0.37 | | | +| 256 | 3322.43 \| 0.30 | | | +| 384 | 2585.01 \| 0.39 | | | +| 512 | 2618.64 \| 0.38 | | | +| 640 | 2672.61 \| 0.37 | | | +| 768 | 2630.32 \| 0.38 | | | +| 896 | 2558.04 \| 0.39 | | | + +sec +| batch_size | fp32 | bf16 | int8
LLM.int8() | +|:----------:|:-----:|:----:|:------------------:| +| 1 | 2.22 | | | +| 2 | 2.37 | | | +| 4 | 2.38 | | | +| 8 | 2.45 | | | +| 16 | 2.34 | | | +| 32 | 2.33 | | | +| 64 | 2.69 | | | +| 128 | 4.77 | | | +| 256 | 7.71 | | | +| 384 | 14.85 | | | +| 512 | 19.55 | | | +| 640 | 23.95 | | | +| 768 | 29.20 | | | +| 896 | 35.03 | | | \ No newline at end of file diff --git a/src/utils/arguments.py b/src/utils/arguments.py index a2322e0..79f2497 100644 --- a/src/utils/arguments.py +++ b/src/utils/arguments.py @@ -17,6 +17,7 @@ def get_arg_parser() -> ArgumentParser: parser.add_argument("--n_head", type=int) parser.add_argument("--n_layer", type=int) parser.add_argument("--benchmark_cycles", type=int, default=5) + parser.add_argument("--clear_every_run", action="store_true") return parser diff --git a/src/utils/benchmark.py b/src/utils/benchmark.py index 709c2fd..aa14961 100644 --- a/src/utils/benchmark.py +++ b/src/utils/benchmark.py @@ -53,9 +53,10 @@ def benchmark_end_to_end(args: Namespace, pipeline_class: Pipeline, text: List[s if args.benchmark_cycles > 0: print_rank_n(f"*** Running benchmark") - torch.cuda.empty_cache() - gc.collect() - torch.cuda.synchronize() + if args.clear_every_run: + torch.cuda.empty_cache() + gc.collect() + torch.cuda.synchronize() # benchmark total_new_tokens_generated, benchmark_time = run_and_log_time( From f28f8ac15425065f75682768264d5d7f539b4531 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 04:37:57 +0530 Subject: [PATCH 19/43] benchmark --- README.md | 56 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index a87d23a..bc98092 100644 --- a/README.md +++ b/README.md @@ -7,35 +7,37 @@ A100 80GB tokens/sec | msec/token | batch_size | fp32 | bf16 | int8
LLM.int8() | |:----------:|:---------------:|:----:|:------------------:| -| 1 | 45.04 \| 22.20 | | | -| 2 | 84.39 \| 11.85 | | | -| 4 | 167.85 \| 5.96 | | | -| 8 | 326.72 \| 3.06 | | | -| 16 | 682.63 \| 1.46 | | | -| 32 | 1374.97 \| 0.73 | | | -| 64 | 2380.43 \| 0.42 | | | -| 128 | 2684.96 \| 0.37 | | | -| 256 | 3322.43 \| 0.30 | | | -| 384 | 2585.01 \| 0.39 | | | -| 512 | 2618.64 \| 0.38 | | | -| 640 | 2672.61 \| 0.37 | | | -| 768 | 2630.32 \| 0.38 | | | -| 896 | 2558.04 \| 0.39 | | | +| 1 | 45.31 \| 22.07 | | | +| 2 | 86.60 \| 11.55 | | | +| 4 | 171.38 \| 5.83 | | | +| 8 | 325.98 \| 3.07 | | | +| 16 | 655.23 \| 1.53 | | | +| 32 | 1356.57 \| 0.74 | | | +| 64 | 2373.14 \| 0.42 | | | +| 128 | 2688.91 \| 0.37 | | | +| 256 | 3325.01 \| 0.30 | | | +| 384 | 3261.28 \| 0.31 | | | +| 512 | 3369.69 \| 0.30 | | | +| 640 | 3506.41 \| 0.29 | | | +| 768 | 3461.95 \| 0.29 | | | +| 896 | 3346.01 \| 0.30 | | | +| 1024 | oom | | | sec | batch_size | fp32 | bf16 | int8
LLM.int8() | |:----------:|:-----:|:----:|:------------------:| -| 1 | 2.22 | | | -| 2 | 2.37 | | | -| 4 | 2.38 | | | +| 1 | 2.21 | | | +| 2 | 2.31 | | | +| 4 | 2.33 | | | | 8 | 2.45 | | | -| 16 | 2.34 | | | -| 32 | 2.33 | | | -| 64 | 2.69 | | | -| 128 | 4.77 | | | -| 256 | 7.71 | | | -| 384 | 14.85 | | | -| 512 | 19.55 | | | -| 640 | 23.95 | | | -| 768 | 29.20 | | | -| 896 | 35.03 | | | \ No newline at end of file +| 16 | 2.44 | | | +| 32 | 2.36 | | | +| 64 | 2.70 | | | +| 128 | 4.76 | | | +| 256 | 7.70 | | | +| 384 | 11.77 | | | +| 512 | 15.19 | | | +| 640 | 18.25 | | | +| 768 | 22.18 | | | +| 896 | 26.78 | | | +| 1024 | oom | | | \ No newline at end of file From d04dc1496f683a39dcb04dc7857bae7845bc33ff Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 07:06:23 +0530 Subject: [PATCH 20/43] benchmark --- README.md | 82 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index bc98092..320931b 100644 --- a/README.md +++ b/README.md @@ -5,39 +5,53 @@ HF-accelerate\ A100 80GB tokens/sec | msec/token -| batch_size | fp32 | bf16 | int8
LLM.int8() | -|:----------:|:---------------:|:----:|:------------------:| -| 1 | 45.31 \| 22.07 | | | -| 2 | 86.60 \| 11.55 | | | -| 4 | 171.38 \| 5.83 | | | -| 8 | 325.98 \| 3.07 | | | -| 16 | 655.23 \| 1.53 | | | -| 32 | 1356.57 \| 0.74 | | | -| 64 | 2373.14 \| 0.42 | | | -| 128 | 2688.91 \| 0.37 | | | -| 256 | 3325.01 \| 0.30 | | | -| 384 | 3261.28 \| 0.31 | | | -| 512 | 3369.69 \| 0.30 | | | -| 640 | 3506.41 \| 0.29 | | | -| 768 | 3461.95 \| 0.29 | | | -| 896 | 3346.01 \| 0.30 | | | -| 1024 | oom | | | +| batch_size | fp32 | bf16 | int8
LLM.int8() | +|:----------:|:---------------:|:---------------:|:------------------:| +| 1 | 45.31 \| 22.07 | 42.23 \| 23.68 | | +| 2 | 86.60 \| 11.55 | 83.79 \| 11.93 | | +| 4 | 171.38 \| 5.83 | 168.91 \| 5.92 | | +| 8 | 325.98 \| 3.07 | 328.11 \| 3.05 | | +| 16 | 655.23 \| 1.53 | 669.15 \| 1.49 | | +| 32 | 1356.57 \| 0.74 | 1277.78 \| 0.78 | | +| 64 | 2373.14 \| 0.42 | 2605.26 \| 0.38 | | +| 128 | 2688.91 \| 0.37 | 4780.32 \| 0.21 | | +| 256 | 3325.01 \| 0.30 | 6549.67 \| 0.15 | | +| 384 | 3261.28 \| 0.31 | 7319.86 \| 0.14 | | +| 512 | 3369.69 \| 0.30 | 7425.47 \| 0.13 | | +| 640 | 3506.41 \| 0.29 | 7553.05 \| 0.13 | | +| 768 | 3461.95 \| 0.29 | 7681.78 \| 0.13 | | +| 896 | 3346.01 \| 0.30 | 7544.19 \| 0.13 | | +| 1024 | oom | 7703.84 \| 0.13 | | +| 1152 | oom | 7728.71 \| 0.13 | | +| 1280 | oom | 7799.99 \| 0.13 | | +| 1408 | oom | 7776.64 \| 0.13 | | +| 1536 | oom | 7802.61 \| 0.13 | | +| 1664 | oom | 7783.20 \| 0.13 | | +| 1792 | oom | 7738.55 \| 0.13 | | +| 1920 | oom | oom | | sec -| batch_size | fp32 | bf16 | int8
LLM.int8() | -|:----------:|:-----:|:----:|:------------------:| -| 1 | 2.21 | | | -| 2 | 2.31 | | | -| 4 | 2.33 | | | -| 8 | 2.45 | | | -| 16 | 2.44 | | | -| 32 | 2.36 | | | -| 64 | 2.70 | | | -| 128 | 4.76 | | | -| 256 | 7.70 | | | -| 384 | 11.77 | | | -| 512 | 15.19 | | | -| 640 | 18.25 | | | -| 768 | 22.18 | | | -| 896 | 26.78 | | | -| 1024 | oom | | | \ No newline at end of file +| batch_size | fp32 | bf16 | int8
LLM.int8() | +|:----------:|:-----:|:-----:|:------------------:| +| 1 | 2.21 | 2.37 | | +| 2 | 2.31 | 2.39 | | +| 4 | 2.33 | 2.37 | | +| 8 | 2.45 | 2.44 | | +| 16 | 2.44 | 2.39 | | +| 32 | 2.36 | 2.50 | | +| 64 | 2.70 | 2.46 | | +| 128 | 4.76 | 2.68 | | +| 256 | 7.70 | 3.91 | | +| 384 | 11.77 | 5.25 | | +| 512 | 15.19 | 6.90 | | +| 640 | 18.25 | 8.47 | | +| 768 | 22.18 | 10.00 | | +| 896 | 26.78 | 11.88 | | +| 1024 | oom | 13.29 | | +| 1152 | oom | 14.91 | | +| 1280 | oom | 16.41 | | +| 1408 | oom | 18.11 | | +| 1536 | oom | 19.69 | | +| 1664 | oom | 21.38 | | +| 1792 | oom | 23.16 | | +| 1920 | oom | oom | | \ No newline at end of file From 9dc52684b0bbf293b51907e191973e9f473b4fda Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 07:16:01 +0530 Subject: [PATCH 21/43] benchmark --- README.md | 97 +++++++++++++++++++++++++++---------------------------- 1 file changed, 48 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index 320931b..fad6142 100644 --- a/README.md +++ b/README.md @@ -1,57 +1,56 @@ # bigcode-inference-benchmark BLOOM\ -HF-accelerate\ A100 80GB tokens/sec | msec/token -| batch_size | fp32 | bf16 | int8
LLM.int8() | -|:----------:|:---------------:|:---------------:|:------------------:| -| 1 | 45.31 \| 22.07 | 42.23 \| 23.68 | | -| 2 | 86.60 \| 11.55 | 83.79 \| 11.93 | | -| 4 | 171.38 \| 5.83 | 168.91 \| 5.92 | | -| 8 | 325.98 \| 3.07 | 328.11 \| 3.05 | | -| 16 | 655.23 \| 1.53 | 669.15 \| 1.49 | | -| 32 | 1356.57 \| 0.74 | 1277.78 \| 0.78 | | -| 64 | 2373.14 \| 0.42 | 2605.26 \| 0.38 | | -| 128 | 2688.91 \| 0.37 | 4780.32 \| 0.21 | | -| 256 | 3325.01 \| 0.30 | 6549.67 \| 0.15 | | -| 384 | 3261.28 \| 0.31 | 7319.86 \| 0.14 | | -| 512 | 3369.69 \| 0.30 | 7425.47 \| 0.13 | | -| 640 | 3506.41 \| 0.29 | 7553.05 \| 0.13 | | -| 768 | 3461.95 \| 0.29 | 7681.78 \| 0.13 | | -| 896 | 3346.01 \| 0.30 | 7544.19 \| 0.13 | | -| 1024 | oom | 7703.84 \| 0.13 | | -| 1152 | oom | 7728.71 \| 0.13 | | -| 1280 | oom | 7799.99 \| 0.13 | | -| 1408 | oom | 7776.64 \| 0.13 | | -| 1536 | oom | 7802.61 \| 0.13 | | -| 1664 | oom | 7783.20 \| 0.13 | | -| 1792 | oom | 7738.55 \| 0.13 | | -| 1920 | oom | oom | | +| batch_size | HF (fp32) | HF (bf16) | HF (int8) | +|:----------:|:---------------:|:---------------:|:---------------:| +| 1 | 45.31 \| 22.07 | 42.23 \| 23.68 | | +| 2 | 86.60 \| 11.55 | 83.79 \| 11.93 | | +| 4 | 171.38 \| 5.83 | 168.91 \| 5.92 | | +| 8 | 325.98 \| 3.07 | 328.11 \| 3.05 | | +| 16 | 655.23 \| 1.53 | 669.15 \| 1.49 | | +| 32 | 1356.57 \| 0.74 | 1277.78 \| 0.78 | | +| 64 | 2373.14 \| 0.42 | 2605.26 \| 0.38 | | +| 128 | 2688.91 \| 0.37 | 4780.32 \| 0.21 | | +| 256 | 3325.01 \| 0.30 | 6549.67 \| 0.15 | | +| 384 | 3261.28 \| 0.31 | 7319.86 \| 0.14 | | +| 512 | 3369.69 \| 0.30 | 7425.47 \| 0.13 | | +| 640 | 3506.41 \| 0.29 | 7553.05 \| 0.13 | | +| 768 | 3461.95 \| 0.29 | 7681.78 \| 0.13 | | +| 896 | 3346.01 \| 0.30 | 7544.19 \| 0.13 | | +| 1024 | oom | 7703.84 \| 0.13 | | +| 1152 | oom | 7728.71 \| 0.13 | | +| 1280 | oom | 7799.99 \| 0.13 | | +| 1408 | oom | 7776.64 \| 0.13 | | +| 1536 | oom | 7802.61 \| 0.13 | | +| 1664 | oom | 7783.20 \| 0.13 | | +| 1792 | oom | 7738.55 \| 0.13 | | +| 1920 | oom | oom | | sec -| batch_size | fp32 | bf16 | int8
LLM.int8() | -|:----------:|:-----:|:-----:|:------------------:| -| 1 | 2.21 | 2.37 | | -| 2 | 2.31 | 2.39 | | -| 4 | 2.33 | 2.37 | | -| 8 | 2.45 | 2.44 | | -| 16 | 2.44 | 2.39 | | -| 32 | 2.36 | 2.50 | | -| 64 | 2.70 | 2.46 | | -| 128 | 4.76 | 2.68 | | -| 256 | 7.70 | 3.91 | | -| 384 | 11.77 | 5.25 | | -| 512 | 15.19 | 6.90 | | -| 640 | 18.25 | 8.47 | | -| 768 | 22.18 | 10.00 | | -| 896 | 26.78 | 11.88 | | -| 1024 | oom | 13.29 | | -| 1152 | oom | 14.91 | | -| 1280 | oom | 16.41 | | -| 1408 | oom | 18.11 | | -| 1536 | oom | 19.69 | | -| 1664 | oom | 21.38 | | -| 1792 | oom | 23.16 | | -| 1920 | oom | oom | | \ No newline at end of file +| batch_size | HF (fp32) | HF (bf16) | HF (int8) | +|:----------:|:---------:|:---------:|:---------:| +| 1 | 2.21 | 2.37 | | +| 2 | 2.31 | 2.39 | | +| 4 | 2.33 | 2.37 | | +| 8 | 2.45 | 2.44 | | +| 16 | 2.44 | 2.39 | | +| 32 | 2.36 | 2.50 | | +| 64 | 2.70 | 2.46 | | +| 128 | 4.76 | 2.68 | | +| 256 | 7.70 | 3.91 | | +| 384 | 11.77 | 5.25 | | +| 512 | 15.19 | 6.90 | | +| 640 | 18.25 | 8.47 | | +| 768 | 22.18 | 10.00 | | +| 896 | 26.78 | 11.88 | | +| 1024 | oom | 13.29 | | +| 1152 | oom | 14.91 | | +| 1280 | oom | 16.41 | | +| 1408 | oom | 18.11 | | +| 1536 | oom | 19.69 | | +| 1664 | oom | 21.38 | | +| 1792 | oom | 23.16 | | +| 1920 | oom | oom | | \ No newline at end of file From 23a5eb1a6064571c0acffef6c6896462ce7aa453 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 18:56:19 +0530 Subject: [PATCH 22/43] fix --- run.sh | 2 ++ src/pipelines/ds_inference.py | 2 +- src/pipelines/hf.py | 6 +++++- src/pipelines/pipeline.py | 4 ++++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/run.sh b/run.sh index 3a8e1d1..342b4ff 100644 --- a/run.sh +++ b/run.sh @@ -1,5 +1,7 @@ export CUDA_VISIBLE_DEVICES=0 +rm -rf ./tmp + for bs in {1,2,4,8,16,32,64} do make $1 batch_size=$bs diff --git a/src/pipelines/ds_inference.py b/src/pipelines/ds_inference.py index 31d4d8e..96a27d5 100644 --- a/src/pipelines/ds_inference.py +++ b/src/pipelines/ds_inference.py @@ -15,7 +15,7 @@ def __init__(self, args: Namespace) -> None: # with deepspeed.OnDevice(dtype=torch.bfloat16, device="meta"): # model = BloomForCausalLM._from_config(config, torch_dtype=torch.bfloat16) - self.model = self.model_class._from_config(self.config, torch_dtype=torch.bfloat16) + self.model = self.model_class.from_pretrained("tmp", torch_dtype=torch.bfloat16) self.model.eval() # checkpoints_json = os.path.join(args.model_name, "checkpoints.json") diff --git a/src/pipelines/hf.py b/src/pipelines/hf.py index 60b7a77..48aeb56 100644 --- a/src/pipelines/hf.py +++ b/src/pipelines/hf.py @@ -10,13 +10,17 @@ def __init__(self, args: Namespace, device: str = "cpu") -> None: super().__init__(args) model_kwargs = {} + + if device.startswith("cuda"): + model_kwargs["device_map"] = "balanced" + if args.dtype == torch.int8: model_kwargs["load_in_8bit"] = True else: model_kwargs["torch_dtype"] = args.dtype self.input_device = device - self.model = self.model_class._from_config(self.config, **model_kwargs).to(self.input_device) + self.model = self.model_class.from_pretrained("tmp", **model_kwargs) self.model.eval() diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py index 7fd15ad..e7d0830 100644 --- a/src/pipelines/pipeline.py +++ b/src/pipelines/pipeline.py @@ -1,3 +1,4 @@ +import os from argparse import Namespace from typing import List, Tuple, Union @@ -69,6 +70,9 @@ def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2 ) model_class = GPT2LMHeadModel + if 
not os.path.exists("tmp"): + model_class._from_config(config).save_pretrained("tmp") + return config, tokenizer, model_class From 391e05597beb94787a61eaf674a47c3a0104b209 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 20:04:43 +0530 Subject: [PATCH 23/43] fix --- src/pipelines/hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipelines/hf.py b/src/pipelines/hf.py index 48aeb56..41405af 100644 --- a/src/pipelines/hf.py +++ b/src/pipelines/hf.py @@ -12,7 +12,7 @@ def __init__(self, args: Namespace, device: str = "cpu") -> None: model_kwargs = {} if device.startswith("cuda"): - model_kwargs["device_map"] = "balanced" + model_kwargs["device_map"] = {0: "80GIB"} if args.dtype == torch.int8: model_kwargs["load_in_8bit"] = True From 856c77b285176a01ca7dfd4e35e82d1f8a197d41 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 20:07:32 +0530 Subject: [PATCH 24/43] fix --- src/pipelines/hf.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/pipelines/hf.py b/src/pipelines/hf.py index 41405af..a3dca81 100644 --- a/src/pipelines/hf.py +++ b/src/pipelines/hf.py @@ -11,16 +11,13 @@ def __init__(self, args: Namespace, device: str = "cpu") -> None: model_kwargs = {} - if device.startswith("cuda"): - model_kwargs["device_map"] = {0: "80GIB"} - if args.dtype == torch.int8: model_kwargs["load_in_8bit"] = True else: model_kwargs["torch_dtype"] = args.dtype self.input_device = device - self.model = self.model_class.from_pretrained("tmp", **model_kwargs) + self.model = self.model_class.from_pretrained("tmp", **model_kwargs).to(self.input_device) self.model.eval() From 9d99f463271b1d4b74166966ccb496899cc085c8 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 22:14:12 +0530 Subject: [PATCH 25/43] fp32 --- README.md | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index fad6142..462050f 100644 --- a/README.md +++ b/README.md @@ -6,20 +6,20 @@ A100 80GB tokens/sec | msec/token | batch_size | HF (fp32) | HF (bf16) | HF (int8) | |:----------:|:---------------:|:---------------:|:---------------:| -| 1 | 45.31 \| 22.07 | 42.23 \| 23.68 | | -| 2 | 86.60 \| 11.55 | 83.79 \| 11.93 | | -| 4 | 171.38 \| 5.83 | 168.91 \| 5.92 | | -| 8 | 325.98 \| 3.07 | 328.11 \| 3.05 | | -| 16 | 655.23 \| 1.53 | 669.15 \| 1.49 | | -| 32 | 1356.57 \| 0.74 | 1277.78 \| 0.78 | | -| 64 | 2373.14 \| 0.42 | 2605.26 \| 0.38 | | -| 128 | 2688.91 \| 0.37 | 4780.32 \| 0.21 | | -| 256 | 3325.01 \| 0.30 | 6549.67 \| 0.15 | | -| 384 | 3261.28 \| 0.31 | 7319.86 \| 0.14 | | -| 512 | 3369.69 \| 0.30 | 7425.47 \| 0.13 | | -| 640 | 3506.41 \| 0.29 | 7553.05 \| 0.13 | | -| 768 | 3461.95 \| 0.29 | 7681.78 \| 0.13 | | -| 896 | 3346.01 \| 0.30 | 7544.19 \| 0.13 | | +| 1 | 44.38 \| 22.53 | 42.23 \| 23.68 | | +| 2 | 85.82 \| 11.65 | 83.79 \| 11.93 | | +| 4 | 171.77 \| 5.82 | 168.91 \| 5.92 | | +| 8 | 334.21 \| 2.99 | 328.11 \| 3.05 | | +| 16 | 658.77 \| 1.52 | 669.15 \| 1.49 | | +| 32 | 1312.31 \| 0.76 | 1277.78 \| 0.78 | | +| 64 | 2312.48 \| 0.43 | 2605.26 \| 0.38 | | +| 128 | 2686.37 \| 0.37 | 4780.32 \| 0.21 | | +| 256 | 3313.04 \| 0.30 | 6549.67 \| 0.15 | | +| 384 | 3253.52 \| 0.31 | 7319.86 \| 0.14 | | +| 512 | 3361.34 \| 0.30 | 7425.47 \| 0.13 | | +| 640 | 3497.55 \| 0.29 | 7553.05 \| 0.13 | | +| 768 | 3460.71 \| 0.29 | 7681.78 \| 0.13 | | +| 896 | 3339.99 \| 0.30 | 7544.19 \| 0.13 | | | 1024 | oom | 7703.84 \| 0.13 | | | 1152 | oom | 7728.71 \| 0.13 | | | 1280 | oom | 
7799.99 \| 0.13 | | @@ -32,20 +32,20 @@ tokens/sec | msec/token sec | batch_size | HF (fp32) | HF (bf16) | HF (int8) | |:----------:|:---------:|:---------:|:---------:| -| 1 | 2.21 | 2.37 | | -| 2 | 2.31 | 2.39 | | +| 1 | 2.25 | 2.37 | | +| 2 | 2.33 | 2.39 | | | 4 | 2.33 | 2.37 | | -| 8 | 2.45 | 2.44 | | -| 16 | 2.44 | 2.39 | | -| 32 | 2.36 | 2.50 | | -| 64 | 2.70 | 2.46 | | +| 8 | 2.39 | 2.44 | | +| 16 | 2.43 | 2.39 | | +| 32 | 2.44 | 2.50 | | +| 64 | 2.77 | 2.46 | | | 128 | 4.76 | 2.68 | | -| 256 | 7.70 | 3.91 | | -| 384 | 11.77 | 5.25 | | -| 512 | 15.19 | 6.90 | | -| 640 | 18.25 | 8.47 | | -| 768 | 22.18 | 10.00 | | -| 896 | 26.78 | 11.88 | | +| 256 | 7.73 | 3.91 | | +| 384 | 11.80 | 5.25 | | +| 512 | 15.23 | 6.90 | | +| 640 | 18.30 | 8.47 | | +| 768 | 22.19 | 10.00 | | +| 896 | 26.83 | 11.88 | | | 1024 | oom | 13.29 | | | 1152 | oom | 14.91 | | | 1280 | oom | 16.41 | | From dfe8cb32b44ce07597d9cc57b16ccc173c40cf83 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 23:03:31 +0530 Subject: [PATCH 26/43] bf16 --- README.md | 76 +++++++++++++++++++++++++++---------------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 462050f..26724ab 100644 --- a/README.md +++ b/README.md @@ -6,51 +6,51 @@ A100 80GB tokens/sec | msec/token | batch_size | HF (fp32) | HF (bf16) | HF (int8) | |:----------:|:---------------:|:---------------:|:---------------:| -| 1 | 44.38 \| 22.53 | 42.23 \| 23.68 | | -| 2 | 85.82 \| 11.65 | 83.79 \| 11.93 | | -| 4 | 171.77 \| 5.82 | 168.91 \| 5.92 | | -| 8 | 334.21 \| 2.99 | 328.11 \| 3.05 | | -| 16 | 658.77 \| 1.52 | 669.15 \| 1.49 | | -| 32 | 1312.31 \| 0.76 | 1277.78 \| 0.78 | | -| 64 | 2312.48 \| 0.43 | 2605.26 \| 0.38 | | -| 128 | 2686.37 \| 0.37 | 4780.32 \| 0.21 | | -| 256 | 3313.04 \| 0.30 | 6549.67 \| 0.15 | | -| 384 | 3253.52 \| 0.31 | 7319.86 \| 0.14 | | -| 512 | 3361.34 \| 0.30 | 7425.47 \| 0.13 | | -| 640 | 3497.55 \| 0.29 | 7553.05 \| 0.13 | | -| 768 | 3460.71 \| 0.29 | 7681.78 \| 0.13 | | -| 896 | 3339.99 \| 0.30 | 7544.19 \| 0.13 | | -| 1024 | oom | 7703.84 \| 0.13 | | -| 1152 | oom | 7728.71 \| 0.13 | | -| 1280 | oom | 7799.99 \| 0.13 | | -| 1408 | oom | 7776.64 \| 0.13 | | -| 1536 | oom | 7802.61 \| 0.13 | | -| 1664 | oom | 7783.20 \| 0.13 | | -| 1792 | oom | 7738.55 \| 0.13 | | +| 1 | 44.38 \| 22.53 | 41.00 \| 24.39 | | +| 2 | 85.82 \| 11.65 | 79.20 \| 12.63 | | +| 4 | 171.77 \| 5.82 | 160.72 \| 6.22 | | +| 8 | 334.21 \| 2.99 | 317.56 \| 3.15 | | +| 16 | 658.77 \| 1.52 | 644.14 \| 1.55 | | +| 32 | 1312.31 \| 0.76 | 1277.62 \| 0.78 | | +| 64 | 2312.48 \| 0.43 | 2683.15 \| 0.37 | | +| 128 | 2686.37 \| 0.37 | 4766.97 \| 0.21 | | +| 256 | 3313.04 \| 0.30 | 6578.06 \| 0.15 | | +| 384 | 3253.52 \| 0.31 | 7285.37 \| 0.14 | | +| 512 | 3361.34 \| 0.30 | 7436.71 \| 0.13 | | +| 640 | 3497.55 \| 0.29 | 7554.05 \| 0.13 | | +| 768 | 3460.71 \| 0.29 | 7678.89 \| 0.13 | | +| 896 | 3339.99 \| 0.30 | 7542.81 \| 0.13 | | +| 1024 | oom | 7702.06 \| 0.13 | | +| 1152 | oom | 7719.68 \| 0.13 | | +| 1280 | oom | 7786.51 \| 0.13 | | +| 1408 | oom | 7770.26 \| 0.13 | | +| 1536 | oom | 7783.86 \| 0.13 | | +| 1664 | oom | 7772.43 \| 0.13 | | +| 1792 | oom | 7747.92 \| 0.13 | | | 1920 | oom | oom | | sec | batch_size | HF (fp32) | HF (bf16) | HF (int8) | |:----------:|:---------:|:---------:|:---------:| -| 1 | 2.25 | 2.37 | | -| 2 | 2.33 | 2.39 | | -| 4 | 2.33 | 2.37 | | -| 8 | 2.39 | 2.44 | | -| 16 | 2.43 | 2.39 | | +| 1 | 2.25 | 2.44 | | +| 2 | 2.33 | 2.53 | | +| 4 | 2.33 | 2.49 | | +| 8 | 2.39 | 2.52 | | +| 
16 | 2.43 | 2.48 | | | 32 | 2.44 | 2.50 | | -| 64 | 2.77 | 2.46 | | -| 128 | 4.76 | 2.68 | | -| 256 | 7.73 | 3.91 | | -| 384 | 11.80 | 5.25 | | -| 512 | 15.23 | 6.90 | | +| 64 | 2.77 | 2.39 | | +| 128 | 4.76 | 2.69 | | +| 256 | 7.73 | 3.89 | | +| 384 | 11.80 | 5.27 | | +| 512 | 15.23 | 6.88 | | | 640 | 18.30 | 8.47 | | | 768 | 22.19 | 10.00 | | | 896 | 26.83 | 11.88 | | -| 1024 | oom | 13.29 | | -| 1152 | oom | 14.91 | | -| 1280 | oom | 16.41 | | -| 1408 | oom | 18.11 | | -| 1536 | oom | 19.69 | | -| 1664 | oom | 21.38 | | -| 1792 | oom | 23.16 | | +| 1024 | oom | 13.30 | | +| 1152 | oom | 14.92 | | +| 1280 | oom | 16.44 | | +| 1408 | oom | 18.12 | | +| 1536 | oom | 19.73 | | +| 1664 | oom | 21.41 | | +| 1792 | oom | 23.13 | | | 1920 | oom | oom | | \ No newline at end of file From 7344ae0c1de3891561afb70c4a66e4c44f1b7e7a Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sun, 4 Dec 2022 23:09:00 +0530 Subject: [PATCH 27/43] bf16 --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 2e75294..2ea810d 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ hf-1b-bloom-bf16: hf-1b-bloom-int8: python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype int8 --batch_size ${batch_size} -ds-inference-1b-bloom: +ds-inference-1b-bloom-fp16: deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class BLOOM --batch_size ${batch_size} # GPT2 MHA @@ -31,7 +31,7 @@ hf-1b-GPT2-mha-bf16: hf-1b-GPT2-mha-int8: python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} -ds-inference-1b-GPT2-mha: +ds-inference-1b-GPT2-mha-fp16: deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --batch_size ${batch_size} # GPT2 MQA @@ -44,7 +44,7 @@ hf-1b-GPT2-mqa-bf16: hf-1b-GPT2-mqa-int8: python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype int8 --batch_size ${batch_size} -ds-inference-1b-GPT2-mqa: +ds-inference-1b-GPT2-mqa-fp16: deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --batch_size ${batch_size} # GPT2 MQA1 @@ -57,5 +57,5 @@ hf-1b-GPT2-mqa1-bf16: hf-1b-GPT2-mqa1-int8: python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size ${batch_size} -ds-inference-1b-GPT2-mqa1: +ds-inference-1b-GPT2-mqa1-fp16: deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --batch_size ${batch_size} From a4c3b81cf0f295422e313edd77e2edf9384fcbcf Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 5 Dec 2022 00:30:16 +0530 Subject: [PATCH 28/43] ds-inference --- README.md | 107 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 56 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index 26724ab..6552df0 100644 --- a/README.md +++ b/README.md @@ -1,56 +1,61 @@ # bigcode-inference-benchmark +A100 
80GB BLOOM\ -A100 80GB +```python +hidden_size = 2048 +n_head = 16 +n_layer = 24 +``` -tokens/sec | msec/token -| batch_size | HF (fp32) | HF (bf16) | HF (int8) | -|:----------:|:---------------:|:---------------:|:---------------:| -| 1 | 44.38 \| 22.53 | 41.00 \| 24.39 | | -| 2 | 85.82 \| 11.65 | 79.20 \| 12.63 | | -| 4 | 171.77 \| 5.82 | 160.72 \| 6.22 | | -| 8 | 334.21 \| 2.99 | 317.56 \| 3.15 | | -| 16 | 658.77 \| 1.52 | 644.14 \| 1.55 | | -| 32 | 1312.31 \| 0.76 | 1277.62 \| 0.78 | | -| 64 | 2312.48 \| 0.43 | 2683.15 \| 0.37 | | -| 128 | 2686.37 \| 0.37 | 4766.97 \| 0.21 | | -| 256 | 3313.04 \| 0.30 | 6578.06 \| 0.15 | | -| 384 | 3253.52 \| 0.31 | 7285.37 \| 0.14 | | -| 512 | 3361.34 \| 0.30 | 7436.71 \| 0.13 | | -| 640 | 3497.55 \| 0.29 | 7554.05 \| 0.13 | | -| 768 | 3460.71 \| 0.29 | 7678.89 \| 0.13 | | -| 896 | 3339.99 \| 0.30 | 7542.81 \| 0.13 | | -| 1024 | oom | 7702.06 \| 0.13 | | -| 1152 | oom | 7719.68 \| 0.13 | | -| 1280 | oom | 7786.51 \| 0.13 | | -| 1408 | oom | 7770.26 \| 0.13 | | -| 1536 | oom | 7783.86 \| 0.13 | | -| 1664 | oom | 7772.43 \| 0.13 | | -| 1792 | oom | 7747.92 \| 0.13 | | -| 1920 | oom | oom | | +Throughput (tokens/sec | msec/token) +| batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | +|:----------:|:---------------:|:---------------:|:---------------:|:-------------------:| +| 1 | 44.38 \| 22.53 | 41.00 \| 24.39 | | 61.61 | 16.23 | +| 2 | 85.82 \| 11.65 | 79.20 \| 12.63 | | 121.55 | 8.23 | +| 4 | 171.77 \| 5.82 | 160.72 \| 6.22 | | 240.06 | 4.17 | +| 8 | 334.21 \| 2.99 | 317.56 \| 3.15 | | 492.42 | 2.03 | +| 16 | 658.77 \| 1.52 | 644.14 \| 1.55 | | 970.59 | 1.03 | +| 32 | 1312.31 \| 0.76 | 1277.62 \| 0.78 | | 1999.04 | 0.50 | +| 64 | 2312.48 \| 0.43 | 2683.15 \| 0.37 | | 3971.09 | 0.25 | +| 128 | 2686.37 \| 0.37 | 4766.97 \| 0.21 | | 7514.59 | 0.13 | +| 256 | 3313.04 \| 0.30 | 6578.06 \| 0.15 | | 10226.50 | 0.10 | +| 384 | 3253.52 \| 0.31 | 7285.37 \| 0.14 | | 11094.27 | 0.09 | +| 512 | 3361.34 \| 0.30 | 7436.71 \| 0.13 | | 11390.85 | 0.09 | +| 640 | 3497.55 \| 0.29 | 7554.05 \| 0.13 | | 11625.71 | 0.09 | +| 768 | 3460.71 \| 0.29 | 7678.89 \| 0.13 | | 11814.31 | 0.08 | +| 896 | 3339.99 \| 0.30 | 7542.81 \| 0.13 | | 11744.38 | 0.09 | +| 1024 | oom | 7702.06 \| 0.13 | | 11534.95 | 0.09 | +| 1152 | oom | 7719.68 \| 0.13 | | oom | +| 1280 | oom | 7786.51 \| 0.13 | | oom | +| 1408 | oom | 7770.26 \| 0.13 | | oom | +| 1536 | oom | 7783.86 \| 0.13 | | oom | +| 1664 | oom | 7772.43 \| 0.13 | | oom | +| 1792 | oom | 7747.92 \| 0.13 | | oom | +| 1920 | oom | oom | | oom | -sec -| batch_size | HF (fp32) | HF (bf16) | HF (int8) | -|:----------:|:---------:|:---------:|:---------:| -| 1 | 2.25 | 2.44 | | -| 2 | 2.33 | 2.53 | | -| 4 | 2.33 | 2.49 | | -| 8 | 2.39 | 2.52 | | -| 16 | 2.43 | 2.48 | | -| 32 | 2.44 | 2.50 | | -| 64 | 2.77 | 2.39 | | -| 128 | 4.76 | 2.69 | | -| 256 | 7.73 | 3.89 | | -| 384 | 11.80 | 5.27 | | -| 512 | 15.23 | 6.88 | | -| 640 | 18.30 | 8.47 | | -| 768 | 22.19 | 10.00 | | -| 896 | 26.83 | 11.88 | | -| 1024 | oom | 13.30 | | -| 1152 | oom | 14.92 | | -| 1280 | oom | 16.44 | | -| 1408 | oom | 18.12 | | -| 1536 | oom | 19.73 | | -| 1664 | oom | 21.41 | | -| 1792 | oom | 23.13 | | -| 1920 | oom | oom | | \ No newline at end of file +Latency (sec) +| batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | +|:----------:|:---------:|:---------:|:---------:|:-------------------:| +| 1 | 2.25 | 2.44 | | 1.62 | +| 2 | 2.33 | 2.53 | | 1.65 | +| 4 | 2.33 | 2.49 | | 1.67 | +| 8 | 2.39 | 2.52 | | 1.62 | +| 16 | 2.43 | 2.48 
| | 1.65 | +| 32 | 2.44 | 2.50 | | 1.60 | +| 64 | 2.77 | 2.39 | | 1.61 | +| 128 | 4.76 | 2.69 | | 1.70 | +| 256 | 7.73 | 3.89 | | 2.50 | +| 384 | 11.80 | 5.27 | | 3.46 | +| 512 | 15.23 | 6.88 | | 4.49 | +| 640 | 18.30 | 8.47 | | 5.51 | +| 768 | 22.19 | 10.00 | | 6.50 | +| 896 | 26.83 | 11.88 | | 7.63 | +| 1024 | oom | 13.30 | | 8.88 | +| 1152 | oom | 14.92 | | oom | +| 1280 | oom | 16.44 | | oom | +| 1408 | oom | 18.12 | | oom | +| 1536 | oom | 19.73 | | oom | +| 1664 | oom | 21.41 | | oom | +| 1792 | oom | 23.13 | | oom | +| 1920 | oom | oom | | oom | \ No newline at end of file From a0f308db975d271fd9fcf0cba27a576e59ae60dd Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 5 Dec 2022 03:23:15 +0530 Subject: [PATCH 29/43] device map --- src/pipelines/hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipelines/hf.py b/src/pipelines/hf.py index a3dca81..e7d2ac0 100644 --- a/src/pipelines/hf.py +++ b/src/pipelines/hf.py @@ -9,7 +9,7 @@ class HF_Pipeline(Pipeline): def __init__(self, args: Namespace, device: str = "cpu") -> None: super().__init__(args) - model_kwargs = {} + model_kwargs = {"device_map": "auto"} if args.dtype == torch.int8: model_kwargs["load_in_8bit"] = True From 0947688aca02e3ccc859178de7aa69b24187b2a8 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 5 Dec 2022 03:24:41 +0530 Subject: [PATCH 30/43] device map --- src/pipelines/hf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pipelines/hf.py b/src/pipelines/hf.py index e7d2ac0..1050074 100644 --- a/src/pipelines/hf.py +++ b/src/pipelines/hf.py @@ -9,10 +9,11 @@ class HF_Pipeline(Pipeline): def __init__(self, args: Namespace, device: str = "cpu") -> None: super().__init__(args) - model_kwargs = {"device_map": "auto"} + model_kwargs = {} if args.dtype == torch.int8: model_kwargs["load_in_8bit"] = True + model_kwargs["device_map"] = "auto" else: model_kwargs["torch_dtype"] = args.dtype From 379bfd9739a94161aece6757c02694e5d545b216 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 5 Dec 2022 03:38:36 +0530 Subject: [PATCH 31/43] fix --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2ea810d..a47032a 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ batch_size := 1 install-mqa-transformers: git clone https://github.com/bigcode-project/transformers.git; \ cd transformers; \ - git checkout multi_query; \ + git checkout mayank/multi_query; \ pip install .; \ cd ..; \ rm -rf transformers; From 6dc0c0786ec40eec6fadc11f1a282be0d5360f62 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 5 Dec 2022 13:29:08 +0530 Subject: [PATCH 32/43] fp32 --- README.md | 56 +++++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 6552df0..b285956 100644 --- a/README.md +++ b/README.md @@ -11,20 +11,20 @@ n_layer = 24 Throughput (tokens/sec | msec/token) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------------:|:---------------:|:---------------:|:-------------------:| -| 1 | 44.38 \| 22.53 | 41.00 \| 24.39 | | 61.61 | 16.23 | -| 2 | 85.82 \| 11.65 | 79.20 \| 12.63 | | 121.55 | 8.23 | -| 4 | 171.77 \| 5.82 | 160.72 \| 6.22 | | 240.06 | 4.17 | -| 8 | 334.21 \| 2.99 | 317.56 \| 3.15 | | 492.42 | 2.03 | -| 16 | 658.77 \| 1.52 | 644.14 \| 1.55 | | 970.59 | 1.03 | -| 32 | 1312.31 \| 0.76 | 1277.62 \| 0.78 | | 1999.04 | 0.50 | -| 64 | 2312.48 \| 0.43 | 2683.15 \| 0.37 | | 3971.09 | 0.25 | -| 128 | 
2686.37 \| 0.37 | 4766.97 \| 0.21 | | 7514.59 | 0.13 | -| 256 | 3313.04 \| 0.30 | 6578.06 \| 0.15 | | 10226.50 | 0.10 | -| 384 | 3253.52 \| 0.31 | 7285.37 \| 0.14 | | 11094.27 | 0.09 | -| 512 | 3361.34 \| 0.30 | 7436.71 \| 0.13 | | 11390.85 | 0.09 | -| 640 | 3497.55 \| 0.29 | 7554.05 \| 0.13 | | 11625.71 | 0.09 | -| 768 | 3460.71 \| 0.29 | 7678.89 \| 0.13 | | 11814.31 | 0.08 | -| 896 | 3339.99 \| 0.30 | 7542.81 \| 0.13 | | 11744.38 | 0.09 | +| 1 | 51.59 \| 19.38 | 41.00 \| 24.39 | | 61.61 | 16.23 | +| 2 | 103.92 \| 9.62 | 79.20 \| 12.63 | | 121.55 | 8.23 | +| 4 | 211.96 \| 4.72 | 160.72 \| 6.22 | | 240.06 | 4.17 | +| 8 | 411.79 \| 2.43 | 317.56 \| 3.15 | | 492.42 | 2.03 | +| 16 | 804.55 \| 1.24 | 644.14 \| 1.55 | | 970.59 | 1.03 | +| 32 | 1574.68 \| 0.64 | 1277.62 \| 0.78 | | 1999.04 | 0.50 | +| 64 | 2712.46 \| 0.37 | 2683.15 \| 0.37 | | 3971.09 | 0.25 | +| 128 | 2974.36 \| 0.34 | 4766.97 \| 0.21 | | 7514.59 | 0.13 | +| 256 | 3695.44 \| 0.27 | 6578.06 \| 0.15 | | 10226.50 | 0.10 | +| 384 | 3591.13 \| 0.28 | 7285.37 \| 0.14 | | 11094.27 | 0.09 | +| 512 | 3708.54 \| 0.27 | 7436.71 \| 0.13 | | 11390.85 | 0.09 | +| 640 | 3859.43 \| 0.26 | 7554.05 \| 0.13 | | 11625.71 | 0.09 | +| 768 | 3804.82 \| 0.26 | 7678.89 \| 0.13 | | 11814.31 | 0.08 | +| 896 | 3652.42 \| 0.27 | 7542.81 \| 0.13 | | 11744.38 | 0.09 | | 1024 | oom | 7702.06 \| 0.13 | | 11534.95 | 0.09 | | 1152 | oom | 7719.68 \| 0.13 | | oom | | 1280 | oom | 7786.51 \| 0.13 | | oom | @@ -37,20 +37,20 @@ Throughput (tokens/sec | msec/token) Latency (sec) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------:|:---------:|:---------:|:-------------------:| -| 1 | 2.25 | 2.44 | | 1.62 | -| 2 | 2.33 | 2.53 | | 1.65 | -| 4 | 2.33 | 2.49 | | 1.67 | -| 8 | 2.39 | 2.52 | | 1.62 | -| 16 | 2.43 | 2.48 | | 1.65 | -| 32 | 2.44 | 2.50 | | 1.60 | -| 64 | 2.77 | 2.39 | | 1.61 | -| 128 | 4.76 | 2.69 | | 1.70 | -| 256 | 7.73 | 3.89 | | 2.50 | -| 384 | 11.80 | 5.27 | | 3.46 | -| 512 | 15.23 | 6.88 | | 4.49 | -| 640 | 18.30 | 8.47 | | 5.51 | -| 768 | 22.19 | 10.00 | | 6.50 | -| 896 | 26.83 | 11.88 | | 7.63 | +| 1 | 1.94 | 2.44 | | 1.62 | +| 2 | 1.92 | 2.53 | | 1.65 | +| 4 | 1.89 | 2.49 | | 1.67 | +| 8 | 1.94 | 2.52 | | 1.62 | +| 16 | 1.99 | 2.48 | | 1.65 | +| 32 | 2.03 | 2.50 | | 1.60 | +| 64 | 2.36 | 2.39 | | 1.61 | +| 128 | 4.30 | 2.69 | | 1.70 | +| 256 | 6.93 | 3.89 | | 2.50 | +| 384 | 10.69 | 5.27 | | 3.46 | +| 512 | 14.82 | 6.88 | | 4.49 | +| 640 | 19.85 | 8.47 | | 5.51 | +| 768 | 20.18 | 10.00 | | 6.50 | +| 896 | 24.53 | 11.88 | | 7.63 | | 1024 | oom | 13.30 | | 8.88 | | 1152 | oom | 14.92 | | oom | | 1280 | oom | 16.44 | | oom | From 7dc67ea0a3b28b27048cd96b18ac32911e9d9d61 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 5 Dec 2022 14:13:10 +0530 Subject: [PATCH 33/43] bf16 --- README.md | 84 +++++++++++++++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index b285956..ff22562 100644 --- a/README.md +++ b/README.md @@ -11,51 +11,51 @@ n_layer = 24 Throughput (tokens/sec | msec/token) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------------:|:---------------:|:---------------:|:-------------------:| -| 1 | 51.59 \| 19.38 | 41.00 \| 24.39 | | 61.61 | 16.23 | -| 2 | 103.92 \| 9.62 | 79.20 \| 12.63 | | 121.55 | 8.23 | -| 4 | 211.96 \| 4.72 | 160.72 \| 6.22 | | 240.06 | 4.17 | -| 8 | 411.79 \| 2.43 | 317.56 \| 3.15 | | 492.42 | 2.03 | -| 16 | 804.55 \| 1.24 | 644.14 \| 1.55 | | 970.59 | 
1.03 | -| 32 | 1574.68 \| 0.64 | 1277.62 \| 0.78 | | 1999.04 | 0.50 | -| 64 | 2712.46 \| 0.37 | 2683.15 \| 0.37 | | 3971.09 | 0.25 | -| 128 | 2974.36 \| 0.34 | 4766.97 \| 0.21 | | 7514.59 | 0.13 | -| 256 | 3695.44 \| 0.27 | 6578.06 \| 0.15 | | 10226.50 | 0.10 | -| 384 | 3591.13 \| 0.28 | 7285.37 \| 0.14 | | 11094.27 | 0.09 | -| 512 | 3708.54 \| 0.27 | 7436.71 \| 0.13 | | 11390.85 | 0.09 | -| 640 | 3859.43 \| 0.26 | 7554.05 \| 0.13 | | 11625.71 | 0.09 | -| 768 | 3804.82 \| 0.26 | 7678.89 \| 0.13 | | 11814.31 | 0.08 | -| 896 | 3652.42 \| 0.27 | 7542.81 \| 0.13 | | 11744.38 | 0.09 | -| 1024 | oom | 7702.06 \| 0.13 | | 11534.95 | 0.09 | -| 1152 | oom | 7719.68 \| 0.13 | | oom | -| 1280 | oom | 7786.51 \| 0.13 | | oom | -| 1408 | oom | 7770.26 \| 0.13 | | oom | -| 1536 | oom | 7783.86 \| 0.13 | | oom | -| 1664 | oom | 7772.43 \| 0.13 | | oom | -| 1792 | oom | 7747.92 \| 0.13 | | oom | +| 1 | 51.59 \| 19.38 | 47.46 \| 21.07 | | 61.61 | 16.23 | +| 2 | 103.92 \| 9.62 | 96.88 \| 10.32 | | 121.55 | 8.23 | +| 4 | 211.96 \| 4.72 | 193.72 \| 5.16 | | 240.06 | 4.17 | +| 8 | 411.79 \| 2.43 | 370.67 \| 2.70 | | 492.42 | 2.03 | +| 16 | 804.55 \| 1.24 | 781.29 \| 1.28 | | 970.59 | 1.03 | +| 32 | 1574.68 \| 0.64 | 1539.19 \| 0.65 | | 1999.04 | 0.50 | +| 64 | 2712.46 \| 0.37 | 3038.01 \| 0.33 | | 3971.09 | 0.25 | +| 128 | 2974.36 \| 0.34 | 5795.97 \| 0.17 | | 7514.59 | 0.13 | +| 256 | 3695.44 \| 0.27 | 8216.27 \| 0.12 | | 10226.50 | 0.10 | +| 384 | 3591.13 \| 0.28 | 9328.18 \| 0.11 | | 11094.27 | 0.09 | +| 512 | 3708.54 \| 0.27 | 9446.34 \| 0.11 | | 11390.85 | 0.09 | +| 640 | 3859.43 \| 0.26 | 9572.53 \| 0.10 | | 11625.71 | 0.09 | +| 768 | 3804.82 \| 0.26 | 9464.75 \| 0.11 | | 11814.31 | 0.08 | +| 896 | 3652.42 \| 0.27 | 9482.11 \| 0.11 | | 11744.38 | 0.09 | +| 1024 | oom | 9710.46 \| 0.10 | | 11534.95 | 0.09 | +| 1152 | oom | 9712.39 \| 0.10 | | oom | +| 1280 | oom | 9667.19 \| 0.10 | | oom | +| 1408 | oom | 9771.91 \| 0.10 | | oom | +| 1536 | oom | 9744.56 \| 0.10 | | oom | +| 1664 | oom | 9719.82 \| 0.10 | | oom | +| 1792 | oom | 9690.61 \| 0.10 | | oom | | 1920 | oom | oom | | oom | Latency (sec) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------:|:---------:|:---------:|:-------------------:| -| 1 | 1.94 | 2.44 | | 1.62 | -| 2 | 1.92 | 2.53 | | 1.65 | -| 4 | 1.89 | 2.49 | | 1.67 | -| 8 | 1.94 | 2.52 | | 1.62 | -| 16 | 1.99 | 2.48 | | 1.65 | -| 32 | 2.03 | 2.50 | | 1.60 | -| 64 | 2.36 | 2.39 | | 1.61 | -| 128 | 4.30 | 2.69 | | 1.70 | -| 256 | 6.93 | 3.89 | | 2.50 | -| 384 | 10.69 | 5.27 | | 3.46 | -| 512 | 14.82 | 6.88 | | 4.49 | -| 640 | 19.85 | 8.47 | | 5.51 | -| 768 | 20.18 | 10.00 | | 6.50 | -| 896 | 24.53 | 11.88 | | 7.63 | -| 1024 | oom | 13.30 | | 8.88 | -| 1152 | oom | 14.92 | | oom | -| 1280 | oom | 16.44 | | oom | -| 1408 | oom | 18.12 | | oom | -| 1536 | oom | 19.73 | | oom | -| 1664 | oom | 21.41 | | oom | -| 1792 | oom | 23.13 | | oom | +| 1 | 1.94 | 2.11 | | 1.62 | +| 2 | 1.92 | 2.06 | | 1.65 | +| 4 | 1.89 | 2.06 | | 1.67 | +| 8 | 1.94 | 2.16 | | 1.62 | +| 16 | 1.99 | 2.05 | | 1.65 | +| 32 | 2.03 | 2.08 | | 1.60 | +| 64 | 2.36 | 2.11 | | 1.61 | +| 128 | 4.30 | 2.21 | | 1.70 | +| 256 | 6.93 | 3.12 | | 2.50 | +| 384 | 10.69 | 4.12 | | 3.46 | +| 512 | 14.82 | 5.42 | | 4.49 | +| 640 | 19.85 | 6.69 | | 5.51 | +| 768 | 20.18 | 8.11 | | 6.50 | +| 896 | 24.53 | 9.45 | | 7.63 | +| 1024 | oom | 10.55 | | 8.88 | +| 1152 | oom | 11.86 | | oom | +| 1280 | oom | 13.24 | | oom | +| 1408 | oom | 14.41 | | oom | +| 1536 | oom | 15.76 | | oom | +| 1664 | oom | 
17.12 | | oom | +| 1792 | oom | 18.49 | | oom | | 1920 | oom | oom | | oom | \ No newline at end of file From 2ac761d640702292abccfd6cc810c9630096bc8e Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 5 Dec 2022 17:45:13 +0530 Subject: [PATCH 34/43] int8 --- README.md | 88 +++++++++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index ff22562..f68f6cb 100644 --- a/README.md +++ b/README.md @@ -11,51 +11,51 @@ n_layer = 24 Throughput (tokens/sec | msec/token) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------------:|:---------------:|:---------------:|:-------------------:| -| 1 | 51.59 \| 19.38 | 47.46 \| 21.07 | | 61.61 | 16.23 | -| 2 | 103.92 \| 9.62 | 96.88 \| 10.32 | | 121.55 | 8.23 | -| 4 | 211.96 \| 4.72 | 193.72 \| 5.16 | | 240.06 | 4.17 | -| 8 | 411.79 \| 2.43 | 370.67 \| 2.70 | | 492.42 | 2.03 | -| 16 | 804.55 \| 1.24 | 781.29 \| 1.28 | | 970.59 | 1.03 | -| 32 | 1574.68 \| 0.64 | 1539.19 \| 0.65 | | 1999.04 | 0.50 | -| 64 | 2712.46 \| 0.37 | 3038.01 \| 0.33 | | 3971.09 | 0.25 | -| 128 | 2974.36 \| 0.34 | 5795.97 \| 0.17 | | 7514.59 | 0.13 | -| 256 | 3695.44 \| 0.27 | 8216.27 \| 0.12 | | 10226.50 | 0.10 | -| 384 | 3591.13 \| 0.28 | 9328.18 \| 0.11 | | 11094.27 | 0.09 | -| 512 | 3708.54 \| 0.27 | 9446.34 \| 0.11 | | 11390.85 | 0.09 | -| 640 | 3859.43 \| 0.26 | 9572.53 \| 0.10 | | 11625.71 | 0.09 | -| 768 | 3804.82 \| 0.26 | 9464.75 \| 0.11 | | 11814.31 | 0.08 | -| 896 | 3652.42 \| 0.27 | 9482.11 \| 0.11 | | 11744.38 | 0.09 | -| 1024 | oom | 9710.46 \| 0.10 | | 11534.95 | 0.09 | -| 1152 | oom | 9712.39 \| 0.10 | | oom | -| 1280 | oom | 9667.19 \| 0.10 | | oom | -| 1408 | oom | 9771.91 \| 0.10 | | oom | -| 1536 | oom | 9744.56 \| 0.10 | | oom | -| 1664 | oom | 9719.82 \| 0.10 | | oom | -| 1792 | oom | 9690.61 \| 0.10 | | oom | -| 1920 | oom | oom | | oom | +| 1 | 51.59 \| 19.38 | 47.46 \| 21.07 | 16.53 \| 60.49 | 61.61 \| 16.23 | +| 2 | 103.92 \| 9.62 | 96.88 \| 10.32 | 33.79 \| 29.60 | 121.55 \| 8.23 | +| 4 | 211.96 \| 4.72 | 193.72 \| 5.16 | 67.38 \| 14.84 | 240.06 \| 4.17 | +| 8 | 411.79 \| 2.43 | 370.67 \| 2.70 | 134.34 \| 7.44 | 492.42 \| 2.03 | +| 16 | 804.55 \| 1.24 | 781.29 \| 1.28 | 275.69 \| 3.63 | 970.59 \| 1.03 | +| 32 | 1574.68 \| 0.64 | 1539.19 \| 0.65 | 537.14 \| 1.86 | 1999.04 \| 0.50 | +| 64 | 2712.46 \| 0.37 | 3038.01 \| 0.33 | 1070.50 \| 0.93 | 3971.09 \| 0.25 | +| 128 | 2974.36 \| 0.34 | 5795.97 \| 0.17 | 2055.34 \| 0.49 | 7514.59 \| 0.13 | +| 256 | 3695.44 \| 0.27 | 8216.27 \| 0.12 | 3523.77 \| 0.28 | 10226.50 \| 0.10 | +| 384 | 3591.13 \| 0.28 | 9328.18 \| 0.11 | 4585.33 \| 0.22 | 11094.27 \| 0.09 | +| 512 | 3708.54 \| 0.27 | 9446.34 \| 0.11 | 5416.48 \| 0.18 | 11390.85 \| 0.09 | +| 640 | 3859.43 \| 0.26 | 9572.53 \| 0.10 | 6113.65 \| 0.16 | 11625.71 \| 0.09 | +| 768 | 3804.82 \| 0.26 | 9464.75 \| 0.11 | 6582.52 \| 0.15 | 11814.31 \| 0.08 | +| 896 | 3652.42 \| 0.27 | 9482.11 \| 0.11 | 7111.08 \| 0.14 | 11744.38 \| 0.09 | +| 1024 | oom | 9710.46 \| 0.10 | 7486.36 \| 0.13 | 11534.95 \| 0.09 | +| 1152 | oom | 9712.39 \| 0.10 | 7544.99 \| 0.13 | oom | +| 1280 | oom | 9667.19 \| 0.10 | 7858.91 \| 0.13 | oom | +| 1408 | oom | 9771.91 \| 0.10 | 8116.30 \| 0.12 | oom | +| 1536 | oom | 9744.56 \| 0.10 | 8201.28 \| 0.12 | oom | +| 1664 | oom | 9719.82 \| 0.10 | 8227.56 \| 0.12 | oom | +| 1792 | oom | 9690.61 \| 0.10 | 8344.36 \| 0.12 | oom | +| 1920 | oom | oom | oom | oom | Latency (sec) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | 
DS-inference (fp16) | |:----------:|:---------:|:---------:|:---------:|:-------------------:| -| 1 | 1.94 | 2.11 | | 1.62 | -| 2 | 1.92 | 2.06 | | 1.65 | -| 4 | 1.89 | 2.06 | | 1.67 | -| 8 | 1.94 | 2.16 | | 1.62 | -| 16 | 1.99 | 2.05 | | 1.65 | -| 32 | 2.03 | 2.08 | | 1.60 | -| 64 | 2.36 | 2.11 | | 1.61 | -| 128 | 4.30 | 2.21 | | 1.70 | -| 256 | 6.93 | 3.12 | | 2.50 | -| 384 | 10.69 | 4.12 | | 3.46 | -| 512 | 14.82 | 5.42 | | 4.49 | -| 640 | 19.85 | 6.69 | | 5.51 | -| 768 | 20.18 | 8.11 | | 6.50 | -| 896 | 24.53 | 9.45 | | 7.63 | -| 1024 | oom | 10.55 | | 8.88 | -| 1152 | oom | 11.86 | | oom | -| 1280 | oom | 13.24 | | oom | -| 1408 | oom | 14.41 | | oom | -| 1536 | oom | 15.76 | | oom | -| 1664 | oom | 17.12 | | oom | -| 1792 | oom | 18.49 | | oom | -| 1920 | oom | oom | | oom | \ No newline at end of file +| 1 | 1.94 | 2.11 | 6.05 | 1.62 | +| 2 | 1.92 | 2.06 | 5.92 | 1.65 | +| 4 | 1.89 | 2.06 | 5.94 | 1.67 | +| 8 | 1.94 | 2.16 | 5.96 | 1.62 | +| 16 | 1.99 | 2.05 | 5.80 | 1.65 | +| 32 | 2.03 | 2.08 | 5.96 | 1.60 | +| 64 | 2.36 | 2.11 | 5.98 | 1.61 | +| 128 | 4.30 | 2.21 | 6.23 | 1.70 | +| 256 | 6.93 | 3.12 | 7.26 | 2.50 | +| 384 | 10.69 | 4.12 | 8.37 | 3.46 | +| 512 | 14.82 | 5.42 | 9.45 | 4.49 | +| 640 | 19.85 | 6.69 | 10.47 | 5.51 | +| 768 | 20.18 | 8.11 | 11.67 | 6.50 | +| 896 | 24.53 | 9.45 | 12.60 | 7.63 | +| 1024 | oom | 10.55 | 13.68 | 8.88 | +| 1152 | oom | 11.86 | 15.27 | oom | +| 1280 | oom | 13.24 | 16.29 | oom | +| 1408 | oom | 14.41 | 17.35 | oom | +| 1536 | oom | 15.76 | 18.73 | oom | +| 1664 | oom | 17.12 | 20.22 | oom | +| 1792 | oom | 18.49 | 21.48 | oom | +| 1920 | oom | oom | oom | oom | \ No newline at end of file From 28e1e715df9835759389839d497b80991a700747 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 5 Dec 2022 19:24:05 +0530 Subject: [PATCH 35/43] attention_type --- README.md | 11 +++++++++-- src/pipelines/pipeline.py | 13 +------------ 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index f68f6cb..8c9ffd8 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # bigcode-inference-benchmark A100 80GB -BLOOM\ +BLOOM ```python hidden_size = 2048 n_head = 16 @@ -58,4 +58,11 @@ Latency (sec) | 1536 | oom | 15.76 | 18.73 | oom | | 1664 | oom | 17.12 | 20.22 | oom | | 1792 | oom | 18.49 | 21.48 | oom | -| 1920 | oom | oom | oom | oom | \ No newline at end of file +| 1920 | oom | oom | oom | oom | + +GPT2 MHA +```python +hidden_size = 2048 +n_head = 16 +n_layer = 24 +``` diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py index e7d0830..20a0988 100644 --- a/src/pipelines/pipeline.py +++ b/src/pipelines/pipeline.py @@ -63,7 +63,7 @@ def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2 n_positions=args.n_positions, bos_token_id=tokenizer.bos_token_id, eos_token_id=tokenizer.eos_token_id, - attention_type=get_attention_type(args.attention_type), + attention_type=args.attention_type, print_details=False, vocab_size=len(tokenizer), use_cache=True, @@ -74,14 +74,3 @@ def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2 model_class._from_config(config).save_pretrained("tmp") return config, tokenizer, model_class - - -def get_attention_type(attention_type: int): - from transformers.models.gpt2.modeling_gpt2 import AttentionType - - if attention_type == 1: - return AttentionType.MULTI_HEAD - elif attention_type == 2: - return AttentionType.MULTI_QUERY - elif attention_type == 3: - return AttentionType.MULTI_QUERY_1 From b2c7de7c9c7fa61bc30fdc7f2c8c7f4c2e1237d3 Mon 
Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 5 Dec 2022 21:05:36 +0530 Subject: [PATCH 36/43] fp32 --- README.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/README.md b/README.md index 8c9ffd8..fdb7e1b 100644 --- a/README.md +++ b/README.md @@ -66,3 +66,29 @@ hidden_size = 2048 n_head = 16 n_layer = 24 ``` + +Throughput (tokens/sec | msec/token) +| batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | +|:----------:|:---------------:|:---------------:|:---------------:|:-------------------:| +| 1 | 41.34 \| 24.19 | 47.46 \| 21.07 | 16.53 \| 60.49 | 61.61 \| 16.23 | +| 2 | 79.21 \| 12.62 | 96.88 \| 10.32 | 33.79 \| 29.60 | 121.55 \| 8.23 | +| 4 | 160.78 \| 6.22 | 193.72 \| 5.16 | 67.38 \| 14.84 | 240.06 \| 4.17 | +| 8 | 324.26 \| 3.08 | 370.67 \| 2.70 | 134.34 \| 7.44 | 492.42 \| 2.03 | +| 16 | 637.18 \| 1.57 | 781.29 \| 1.28 | 275.69 \| 3.63 | 970.59 \| 1.03 | +| 32 | 1310.62 \| 0.76 | 1539.19 \| 0.65 | 537.14 \| 1.86 | 1999.04 \| 0.50 | +| 64 | 2092.72 \| 0.48 | 3038.01 \| 0.33 | 1070.50 \| 0.93 | 3971.09 \| 0.25 | +| 128 | 2854.47 \| 0.35 | 5795.97 \| 0.17 | 2055.34 \| 0.49 | 7514.59 \| 0.13 | +| 256 | 3504.34 \| 0.29 | 8216.27 \| 0.12 | 3523.77 \| 0.28 | 10226.50 \| 0.10 | +| 384 | 3811.93 \| 0.26 | 9328.18 \| 0.11 | 4585.33 \| 0.22 | 11094.27 \| 0.09 | +| 512 | 3794.15 \| 0.26 | 9446.34 \| 0.11 | 5416.48 \| 0.18 | 11390.85 \| 0.09 | +| 640 | 4120.75 \| 0.24 | 9572.53 \| 0.10 | 6113.65 \| 0.16 | 11625.71 \| 0.09 | +| 768 | 3946.79 \| 0.25 | 9464.75 \| 0.11 | 6582.52 \| 0.15 | 11814.31 \| 0.08 | +| 896 | 3925.22 \| 0.25 | 9482.11 \| 0.11 | 7111.08 \| 0.14 | 11744.38 \| 0.09 | +| 1024 | oom | 9710.46 \| 0.10 | 7486.36 \| 0.13 | 11534.95 \| 0.09 | +| 1152 | oom | 9712.39 \| 0.10 | 7544.99 \| 0.13 | oom | +| 1280 | oom | 9667.19 \| 0.10 | 7858.91 \| 0.13 | oom | +| 1408 | oom | 9771.91 \| 0.10 | 8116.30 \| 0.12 | oom | +| 1536 | oom | 9744.56 \| 0.10 | 8201.28 \| 0.12 | oom | +| 1664 | oom | 9719.82 \| 0.10 | 8227.56 \| 0.12 | oom | +| 1792 | oom | 9690.61 \| 0.10 | 8344.36 \| 0.12 | oom | +| 1920 | oom | oom | oom | oom | From 76b3b8df3fadca7d133fa8492389c707e31674d9 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Tue, 6 Dec 2022 10:02:43 +0530 Subject: [PATCH 37/43] bf16 --- README.md | 76 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index fdb7e1b..c6e2b8b 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ Latency (sec) | 1792 | oom | 18.49 | 21.48 | oom | | 1920 | oom | oom | oom | oom | -GPT2 MHA +GPT2 Multi-Head Attention ```python hidden_size = 2048 n_head = 16 @@ -68,27 +68,53 @@ n_layer = 24 ``` Throughput (tokens/sec | msec/token) -| batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | -|:----------:|:---------------:|:---------------:|:---------------:|:-------------------:| -| 1 | 41.34 \| 24.19 | 47.46 \| 21.07 | 16.53 \| 60.49 | 61.61 \| 16.23 | -| 2 | 79.21 \| 12.62 | 96.88 \| 10.32 | 33.79 \| 29.60 | 121.55 \| 8.23 | -| 4 | 160.78 \| 6.22 | 193.72 \| 5.16 | 67.38 \| 14.84 | 240.06 \| 4.17 | -| 8 | 324.26 \| 3.08 | 370.67 \| 2.70 | 134.34 \| 7.44 | 492.42 \| 2.03 | -| 16 | 637.18 \| 1.57 | 781.29 \| 1.28 | 275.69 \| 3.63 | 970.59 \| 1.03 | -| 32 | 1310.62 \| 0.76 | 1539.19 \| 0.65 | 537.14 \| 1.86 | 1999.04 \| 0.50 | -| 64 | 2092.72 \| 0.48 | 3038.01 \| 0.33 | 1070.50 \| 0.93 | 3971.09 \| 0.25 | -| 128 | 2854.47 \| 0.35 | 5795.97 \| 0.17 | 2055.34 \| 0.49 | 7514.59 \| 0.13 | -| 256 | 3504.34 \| 0.29 | 
8216.27 \| 0.12 | 3523.77 \| 0.28 | 10226.50 \| 0.10 | -| 384 | 3811.93 \| 0.26 | 9328.18 \| 0.11 | 4585.33 \| 0.22 | 11094.27 \| 0.09 | -| 512 | 3794.15 \| 0.26 | 9446.34 \| 0.11 | 5416.48 \| 0.18 | 11390.85 \| 0.09 | -| 640 | 4120.75 \| 0.24 | 9572.53 \| 0.10 | 6113.65 \| 0.16 | 11625.71 \| 0.09 | -| 768 | 3946.79 \| 0.25 | 9464.75 \| 0.11 | 6582.52 \| 0.15 | 11814.31 \| 0.08 | -| 896 | 3925.22 \| 0.25 | 9482.11 \| 0.11 | 7111.08 \| 0.14 | 11744.38 \| 0.09 | -| 1024 | oom | 9710.46 \| 0.10 | 7486.36 \| 0.13 | 11534.95 \| 0.09 | -| 1152 | oom | 9712.39 \| 0.10 | 7544.99 \| 0.13 | oom | -| 1280 | oom | 9667.19 \| 0.10 | 7858.91 \| 0.13 | oom | -| 1408 | oom | 9771.91 \| 0.10 | 8116.30 \| 0.12 | oom | -| 1536 | oom | 9744.56 \| 0.10 | 8201.28 \| 0.12 | oom | -| 1664 | oom | 9719.82 \| 0.10 | 8227.56 \| 0.12 | oom | -| 1792 | oom | 9690.61 \| 0.10 | 8344.36 \| 0.12 | oom | -| 1920 | oom | oom | oom | oom | +| batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | +|:----------:|:---------------:|:----------------:|:---------------:|:-------------------:| +| 1 | 41.34 \| 24.19 | 40.69 \| 24.57 | 16.53 \| 60.49 | 61.61 \| 16.23 | +| 2 | 79.21 \| 12.62 | 80.87 \| 12.37 | 33.79 \| 29.60 | 121.55 \| 8.23 | +| 4 | 160.78 \| 6.22 | 154.98 \| 6.45 | 67.38 \| 14.84 | 240.06 \| 4.17 | +| 8 | 324.26 \| 3.08 | 332.90 \| 3.00 | 134.34 \| 7.44 | 492.42 \| 2.03 | +| 16 | 637.18 \| 1.57 | 669.27 \| 1.49 | 275.69 \| 3.63 | 970.59 \| 1.03 | +| 32 | 1310.62 \| 0.76 | 1287.95 \| 0.78 | 537.14 \| 1.86 | 1999.04 \| 0.50 | +| 64 | 2092.72 \| 0.48 | 2487.35 \| 0.40 | 1070.50 \| 0.93 | 3971.09 \| 0.25 | +| 128 | 2854.47 \| 0.35 | 4268.99 \| 0.23 | 2055.34 \| 0.49 | 7514.59 \| 0.13 | +| 256 | 3504.34 \| 0.29 | 6917.01 \| 0.14 | 3523.77 \| 0.28 | 10226.50 \| 0.10 | +| 384 | 3811.93 \| 0.26 | 8821.31 \| 0.11 | 4585.33 \| 0.22 | 11094.27 \| 0.09 | +| 512 | 3794.15 \| 0.26 | 10068.51 \| 0.10 | 5416.48 \| 0.18 | 11390.85 \| 0.09 | +| 640 | 4120.75 \| 0.24 | 10547.88 \| 0.09 | 6113.65 \| 0.16 | 11625.71 \| 0.09 | +| 768 | 3946.79 \| 0.25 | 10675.09 \| 0.09 | 6582.52 \| 0.15 | 11814.31 \| 0.08 | +| 896 | 3925.22 \| 0.25 | 10780.82 \| 0.09 | 7111.08 \| 0.14 | 11744.38 \| 0.09 | +| 1024 | oom | 11192.55 \| 0.09 | 7486.36 \| 0.13 | 11534.95 \| 0.09 | +| 1152 | oom | 11178.30 \| 0.09 | 7544.99 \| 0.13 | oom | +| 1280 | oom | 11383.98 \| 0.09 | 7858.91 \| 0.13 | oom | +| 1408 | oom | 11477.66 \| 0.09 | 8116.30 \| 0.12 | oom | +| 1536 | oom | 11382.66 \| 0.09 | 8201.28 \| 0.12 | oom | +| 1664 | oom | 11571.52 \| 0.09 | 8227.56 \| 0.12 | oom | +| 1792 | oom | 11394.20 \| 0.09 | 8344.36 \| 0.12 | oom | +| 1920 | oom | oom | oom | oom | + +Latency (sec) +| batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | +|:----------:|:---------:|:---------:|:---------:|:-------------------:| +| 1 | 1.94 | 2.46 | 6.05 | 1.62 | +| 2 | 1.92 | 2.47 | 5.92 | 1.65 | +| 4 | 1.89 | 2.58 | 5.94 | 1.67 | +| 8 | 1.94 | 2.40 | 5.96 | 1.62 | +| 16 | 1.99 | 2.39 | 5.80 | 1.65 | +| 32 | 2.03 | 2.48 | 5.96 | 1.60 | +| 64 | 2.36 | 2.57 | 5.98 | 1.61 | +| 128 | 4.30 | 3.00 | 6.23 | 1.70 | +| 256 | 6.93 | 3.70 | 7.26 | 2.50 | +| 384 | 10.69 | 4.35 | 8.37 | 3.46 | +| 512 | 14.82 | 5.09 | 9.45 | 4.49 | +| 640 | 19.85 | 6.07 | 10.47 | 5.51 | +| 768 | 20.18 | 7.19 | 11.67 | 6.50 | +| 896 | 24.53 | 8.31 | 12.60 | 7.63 | +| 1024 | oom | 9.15 | 13.68 | 8.88 | +| 1152 | oom | 10.31 | 15.27 | oom | +| 1280 | oom | 11.24 | 16.29 | oom | +| 1408 | oom | 12.27 | 17.35 | oom | +| 1536 | oom | 13.49 | 18.73 | oom | +| 1664 | oom | 14.38 | 
20.22 | oom | +| 1792 | oom | 15.73 | 21.48 | oom | +| 1920 | oom | oom | oom | oom | From c149ee91fa6454534018aa827e95427d645076c3 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Tue, 6 Dec 2022 15:41:42 +0530 Subject: [PATCH 38/43] fp32 --- README.md | 56 +++++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index c6e2b8b..8844d5c 100644 --- a/README.md +++ b/README.md @@ -70,20 +70,20 @@ n_layer = 24 Throughput (tokens/sec | msec/token) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------------:|:----------------:|:---------------:|:-------------------:| -| 1 | 41.34 \| 24.19 | 40.69 \| 24.57 | 16.53 \| 60.49 | 61.61 \| 16.23 | -| 2 | 79.21 \| 12.62 | 80.87 \| 12.37 | 33.79 \| 29.60 | 121.55 \| 8.23 | -| 4 | 160.78 \| 6.22 | 154.98 \| 6.45 | 67.38 \| 14.84 | 240.06 \| 4.17 | -| 8 | 324.26 \| 3.08 | 332.90 \| 3.00 | 134.34 \| 7.44 | 492.42 \| 2.03 | -| 16 | 637.18 \| 1.57 | 669.27 \| 1.49 | 275.69 \| 3.63 | 970.59 \| 1.03 | -| 32 | 1310.62 \| 0.76 | 1287.95 \| 0.78 | 537.14 \| 1.86 | 1999.04 \| 0.50 | -| 64 | 2092.72 \| 0.48 | 2487.35 \| 0.40 | 1070.50 \| 0.93 | 3971.09 \| 0.25 | -| 128 | 2854.47 \| 0.35 | 4268.99 \| 0.23 | 2055.34 \| 0.49 | 7514.59 \| 0.13 | -| 256 | 3504.34 \| 0.29 | 6917.01 \| 0.14 | 3523.77 \| 0.28 | 10226.50 \| 0.10 | -| 384 | 3811.93 \| 0.26 | 8821.31 \| 0.11 | 4585.33 \| 0.22 | 11094.27 \| 0.09 | -| 512 | 3794.15 \| 0.26 | 10068.51 \| 0.10 | 5416.48 \| 0.18 | 11390.85 \| 0.09 | -| 640 | 4120.75 \| 0.24 | 10547.88 \| 0.09 | 6113.65 \| 0.16 | 11625.71 \| 0.09 | -| 768 | 3946.79 \| 0.25 | 10675.09 \| 0.09 | 6582.52 \| 0.15 | 11814.31 \| 0.08 | -| 896 | 3925.22 \| 0.25 | 10780.82 \| 0.09 | 7111.08 \| 0.14 | 11744.38 \| 0.09 | +| 1 | 43.11 \| 23.20 | 40.69 \| 24.57 | 16.53 \| 60.49 | 61.61 \| 16.23 | +| 2 | 80.76 \| 12.38 | 80.87 \| 12.37 | 33.79 \| 29.60 | 121.55 \| 8.23 | +| 4 | 160.38 \| 6.24 | 154.98 \| 6.45 | 67.38 \| 14.84 | 240.06 \| 4.17 | +| 8 | 328.62 \| 3.04 | 332.90 \| 3.00 | 134.34 \| 7.44 | 492.42 \| 2.03 | +| 16 | 662.08 \| 1.51 | 669.27 \| 1.49 | 275.69 \| 3.63 | 970.59 \| 1.03 | +| 32 | 1314.92 \| 0.76 | 1287.95 \| 0.78 | 537.14 \| 1.86 | 1999.04 \| 0.50 | +| 64 | 2118.17 \| 0.47 | 2487.35 \| 0.40 | 1070.50 \| 0.93 | 3971.09 \| 0.25 | +| 128 | 2860.26 \| 0.35 | 4268.99 \| 0.23 | 2055.34 \| 0.49 | 7514.59 \| 0.13 | +| 256 | 3487.86 \| 0.29 | 6917.01 \| 0.14 | 3523.77 \| 0.28 | 10226.50 \| 0.10 | +| 384 | 3794.16 \| 0.26 | 8821.31 \| 0.11 | 4585.33 \| 0.22 | 11094.27 \| 0.09 | +| 512 | 3804.37 \| 0.26 | 10068.51 \| 0.10 | 5416.48 \| 0.18 | 11390.85 \| 0.09 | +| 640 | 4124.01 \| 0.24 | 10547.88 \| 0.09 | 6113.65 \| 0.16 | 11625.71 \| 0.09 | +| 768 | 3950.39 \| 0.25 | 10675.09 \| 0.09 | 6582.52 \| 0.15 | 11814.31 \| 0.08 | +| 896 | 3937.28 \| 0.25 | 10780.82 \| 0.09 | 7111.08 \| 0.14 | 11744.38 \| 0.09 | | 1024 | oom | 11192.55 \| 0.09 | 7486.36 \| 0.13 | 11534.95 \| 0.09 | | 1152 | oom | 11178.30 \| 0.09 | 7544.99 \| 0.13 | oom | | 1280 | oom | 11383.98 \| 0.09 | 7858.91 \| 0.13 | oom | @@ -96,20 +96,20 @@ Throughput (tokens/sec | msec/token) Latency (sec) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------:|:---------:|:---------:|:-------------------:| -| 1 | 1.94 | 2.46 | 6.05 | 1.62 | -| 2 | 1.92 | 2.47 | 5.92 | 1.65 | -| 4 | 1.89 | 2.58 | 5.94 | 1.67 | -| 8 | 1.94 | 2.40 | 5.96 | 1.62 | -| 16 | 1.99 | 2.39 | 5.80 | 1.65 | -| 32 | 2.03 | 2.48 | 5.96 | 1.60 | -| 64 | 2.36 | 
2.57 | 5.98 | 1.61 | -| 128 | 4.30 | 3.00 | 6.23 | 1.70 | -| 256 | 6.93 | 3.70 | 7.26 | 2.50 | -| 384 | 10.69 | 4.35 | 8.37 | 3.46 | -| 512 | 14.82 | 5.09 | 9.45 | 4.49 | -| 640 | 19.85 | 6.07 | 10.47 | 5.51 | -| 768 | 20.18 | 7.19 | 11.67 | 6.50 | -| 896 | 24.53 | 8.31 | 12.60 | 7.63 | +| 1 | 2.32 | 2.46 | 6.05 | 1.62 | +| 2 | 2.48 | 2.47 | 5.92 | 1.65 | +| 4 | 2.49 | 2.58 | 5.94 | 1.67 | +| 8 | 2.43 | 2.40 | 5.96 | 1.62 | +| 16 | 2.42 | 2.39 | 5.80 | 1.65 | +| 32 | 2.43 | 2.48 | 5.96 | 1.60 | +| 64 | 3.02 | 2.57 | 5.98 | 1.61 | +| 128 | 4.48 | 3.00 | 6.23 | 1.70 | +| 256 | 7.34 | 3.70 | 7.26 | 2.50 | +| 384 | 10.12 | 4.35 | 8.37 | 3.46 | +| 512 | 13.46 | 5.09 | 9.45 | 4.49 | +| 640 | 15.52 | 6.07 | 10.47 | 5.51 | +| 768 | 19.44 | 7.19 | 11.67 | 6.50 | +| 896 | 22.76 | 8.31 | 12.60 | 7.63 | | 1024 | oom | 9.15 | 13.68 | 8.88 | | 1152 | oom | 10.31 | 15.27 | oom | | 1280 | oom | 11.24 | 16.29 | oom | From 8427b9434bc78815d3689e8a90bf97863c0ef796 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Tue, 6 Dec 2022 23:43:00 +0530 Subject: [PATCH 39/43] int8 --- README.md | 90 +++++++++++++++++++++++++++---------------------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index 8844d5c..b5f264e 100644 --- a/README.md +++ b/README.md @@ -68,53 +68,53 @@ n_layer = 24 ``` Throughput (tokens/sec | msec/token) -| batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | -|:----------:|:---------------:|:----------------:|:---------------:|:-------------------:| -| 1 | 43.11 \| 23.20 | 40.69 \| 24.57 | 16.53 \| 60.49 | 61.61 \| 16.23 | -| 2 | 80.76 \| 12.38 | 80.87 \| 12.37 | 33.79 \| 29.60 | 121.55 \| 8.23 | -| 4 | 160.38 \| 6.24 | 154.98 \| 6.45 | 67.38 \| 14.84 | 240.06 \| 4.17 | -| 8 | 328.62 \| 3.04 | 332.90 \| 3.00 | 134.34 \| 7.44 | 492.42 \| 2.03 | -| 16 | 662.08 \| 1.51 | 669.27 \| 1.49 | 275.69 \| 3.63 | 970.59 \| 1.03 | -| 32 | 1314.92 \| 0.76 | 1287.95 \| 0.78 | 537.14 \| 1.86 | 1999.04 \| 0.50 | -| 64 | 2118.17 \| 0.47 | 2487.35 \| 0.40 | 1070.50 \| 0.93 | 3971.09 \| 0.25 | -| 128 | 2860.26 \| 0.35 | 4268.99 \| 0.23 | 2055.34 \| 0.49 | 7514.59 \| 0.13 | -| 256 | 3487.86 \| 0.29 | 6917.01 \| 0.14 | 3523.77 \| 0.28 | 10226.50 \| 0.10 | -| 384 | 3794.16 \| 0.26 | 8821.31 \| 0.11 | 4585.33 \| 0.22 | 11094.27 \| 0.09 | -| 512 | 3804.37 \| 0.26 | 10068.51 \| 0.10 | 5416.48 \| 0.18 | 11390.85 \| 0.09 | -| 640 | 4124.01 \| 0.24 | 10547.88 \| 0.09 | 6113.65 \| 0.16 | 11625.71 \| 0.09 | -| 768 | 3950.39 \| 0.25 | 10675.09 \| 0.09 | 6582.52 \| 0.15 | 11814.31 \| 0.08 | -| 896 | 3937.28 \| 0.25 | 10780.82 \| 0.09 | 7111.08 \| 0.14 | 11744.38 \| 0.09 | -| 1024 | oom | 11192.55 \| 0.09 | 7486.36 \| 0.13 | 11534.95 \| 0.09 | -| 1152 | oom | 11178.30 \| 0.09 | 7544.99 \| 0.13 | oom | -| 1280 | oom | 11383.98 \| 0.09 | 7858.91 \| 0.13 | oom | -| 1408 | oom | 11477.66 \| 0.09 | 8116.30 \| 0.12 | oom | -| 1536 | oom | 11382.66 \| 0.09 | 8201.28 \| 0.12 | oom | -| 1664 | oom | 11571.52 \| 0.09 | 8227.56 \| 0.12 | oom | -| 1792 | oom | 11394.20 \| 0.09 | 8344.36 \| 0.12 | oom | -| 1920 | oom | oom | oom | oom | +| batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | +|:----------:|:---------------:|:----------------:|:----------------:|:-------------------:| +| 1 | 43.11 \| 23.20 | 40.69 \| 24.57 | 32.29 \| 30.97 | 61.61 \| 16.23 | +| 2 | 80.76 \| 12.38 | 80.87 \| 12.37 | 63.54 \| 15.74 | 121.55 \| 8.23 | +| 4 | 160.38 \| 6.24 | 154.98 \| 6.45 | 131.00 \| 7.63 | 240.06 \| 4.17 | +| 8 | 328.62 \| 3.04 | 332.90 \| 3.00 | 260.16 \| 3.84 | 
492.42 \| 2.03 | +| 16 | 662.08 \| 1.51 | 669.27 \| 1.49 | 523.29 \| 1.91 | 970.59 \| 1.03 | +| 32 | 1314.92 \| 0.76 | 1287.95 \| 0.78 | 1055.57 \| 0.95 | 1999.04 \| 0.50 | +| 64 | 2118.17 \| 0.47 | 2487.35 \| 0.40 | 1969.26 \| 0.51 | 3971.09 \| 0.25 | +| 128 | 2860.26 \| 0.35 | 4268.99 \| 0.23 | 3581.49 \| 0.28 | 7514.59 \| 0.13 | +| 256 | 3487.86 \| 0.29 | 6917.01 \| 0.14 | 6132.47 \| 0.16 | 10226.50 \| 0.10 | +| 384 | 3794.16 \| 0.26 | 8821.31 \| 0.11 | 7774.37 \| 0.13 | 11094.27 \| 0.09 | +| 512 | 3804.37 \| 0.26 | 10068.51 \| 0.10 | 8872.88 \| 0.11 | 11390.85 \| 0.09 | +| 640 | 4124.01 \| 0.24 | 10547.88 \| 0.09 | 9956.58 \| 0.10 | 11625.71 \| 0.09 | +| 768 | 3950.39 \| 0.25 | 10675.09 \| 0.09 | 10584.21 \| 0.09 | 11814.31 \| 0.08 | +| 896 | 3937.28 \| 0.25 | 10780.82 \| 0.09 | 10994.00 \| 0.09 | 11744.38 \| 0.09 | +| 1024 | oom | 11192.55 \| 0.09 | 11306.37 \| 0.09 | 11534.95 \| 0.09 | +| 1152 | oom | 11178.30 \| 0.09 | 11290.51 \| 0.09 | oom | +| 1280 | oom | 11383.98 \| 0.09 | 11459.89 \| 0.09 | oom | +| 1408 | oom | 11477.66 \| 0.09 | 11565.90 \| 0.09 | oom | +| 1536 | oom | 11382.66 \| 0.09 | 11491.99 \| 0.09 | oom | +| 1664 | oom | 11571.52 \| 0.09 | 11603.73 \| 0.09 | oom | +| 1792 | oom | 11394.20 \| 0.09 | 11412.46 \| 0.09 | oom | +| 1920 | oom | oom | oom | oom | Latency (sec) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------:|:---------:|:---------:|:-------------------:| -| 1 | 2.32 | 2.46 | 6.05 | 1.62 | -| 2 | 2.48 | 2.47 | 5.92 | 1.65 | -| 4 | 2.49 | 2.58 | 5.94 | 1.67 | -| 8 | 2.43 | 2.40 | 5.96 | 1.62 | -| 16 | 2.42 | 2.39 | 5.80 | 1.65 | -| 32 | 2.43 | 2.48 | 5.96 | 1.60 | -| 64 | 3.02 | 2.57 | 5.98 | 1.61 | -| 128 | 4.48 | 3.00 | 6.23 | 1.70 | -| 256 | 7.34 | 3.70 | 7.26 | 2.50 | -| 384 | 10.12 | 4.35 | 8.37 | 3.46 | -| 512 | 13.46 | 5.09 | 9.45 | 4.49 | -| 640 | 15.52 | 6.07 | 10.47 | 5.51 | -| 768 | 19.44 | 7.19 | 11.67 | 6.50 | -| 896 | 22.76 | 8.31 | 12.60 | 7.63 | -| 1024 | oom | 9.15 | 13.68 | 8.88 | -| 1152 | oom | 10.31 | 15.27 | oom | -| 1280 | oom | 11.24 | 16.29 | oom | -| 1408 | oom | 12.27 | 17.35 | oom | -| 1536 | oom | 13.49 | 18.73 | oom | -| 1664 | oom | 14.38 | 20.22 | oom | -| 1792 | oom | 15.73 | 21.48 | oom | +| 1 | 2.32 | 2.46 | 3.10 | 1.62 | +| 2 | 2.48 | 2.47 | 3.15 | 1.65 | +| 4 | 2.49 | 2.58 | 3.05 | 1.67 | +| 8 | 2.43 | 2.40 | 3.07 | 1.62 | +| 16 | 2.42 | 2.39 | 3.06 | 1.65 | +| 32 | 2.43 | 2.48 | 3.03 | 1.60 | +| 64 | 3.02 | 2.57 | 3.25 | 1.61 | +| 128 | 4.48 | 3.00 | 3.57 | 1.70 | +| 256 | 7.34 | 3.70 | 4.17 | 2.50 | +| 384 | 10.12 | 4.35 | 4.94 | 3.46 | +| 512 | 13.46 | 5.09 | 5.77 | 4.49 | +| 640 | 15.52 | 6.07 | 6.43 | 5.51 | +| 768 | 19.44 | 7.19 | 7.26 | 6.50 | +| 896 | 22.76 | 8.31 | 8.15 | 7.63 | +| 1024 | oom | 9.15 | 9.06 | 8.88 | +| 1152 | oom | 10.31 | 10.20 | oom | +| 1280 | oom | 11.24 | 11.17 | oom | +| 1408 | oom | 12.27 | 12.17 | oom | +| 1536 | oom | 13.49 | 13.37 | oom | +| 1664 | oom | 14.38 | 14.34 | oom | +| 1792 | oom | 15.73 | 15.70 | oom | | 1920 | oom | oom | oom | oom | From 487954f3ad43fcd7224954e889b6efb36adec24b Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Wed, 7 Dec 2022 02:12:34 +0530 Subject: [PATCH 40/43] fp16 --- README.md | 61 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index b5f264e..e083012 100644 --- a/README.md +++ b/README.md @@ -65,26 +65,27 @@ GPT2 Multi-Head Attention hidden_size = 2048 n_head = 16 n_layer = 24 +total_params = 1315725312 ``` 
Throughput (tokens/sec | msec/token) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------------:|:----------------:|:----------------:|:-------------------:| -| 1 | 43.11 \| 23.20 | 40.69 \| 24.57 | 32.29 \| 30.97 | 61.61 \| 16.23 | -| 2 | 80.76 \| 12.38 | 80.87 \| 12.37 | 63.54 \| 15.74 | 121.55 \| 8.23 | -| 4 | 160.38 \| 6.24 | 154.98 \| 6.45 | 131.00 \| 7.63 | 240.06 \| 4.17 | -| 8 | 328.62 \| 3.04 | 332.90 \| 3.00 | 260.16 \| 3.84 | 492.42 \| 2.03 | -| 16 | 662.08 \| 1.51 | 669.27 \| 1.49 | 523.29 \| 1.91 | 970.59 \| 1.03 | -| 32 | 1314.92 \| 0.76 | 1287.95 \| 0.78 | 1055.57 \| 0.95 | 1999.04 \| 0.50 | -| 64 | 2118.17 \| 0.47 | 2487.35 \| 0.40 | 1969.26 \| 0.51 | 3971.09 \| 0.25 | -| 128 | 2860.26 \| 0.35 | 4268.99 \| 0.23 | 3581.49 \| 0.28 | 7514.59 \| 0.13 | -| 256 | 3487.86 \| 0.29 | 6917.01 \| 0.14 | 6132.47 \| 0.16 | 10226.50 \| 0.10 | -| 384 | 3794.16 \| 0.26 | 8821.31 \| 0.11 | 7774.37 \| 0.13 | 11094.27 \| 0.09 | -| 512 | 3804.37 \| 0.26 | 10068.51 \| 0.10 | 8872.88 \| 0.11 | 11390.85 \| 0.09 | -| 640 | 4124.01 \| 0.24 | 10547.88 \| 0.09 | 9956.58 \| 0.10 | 11625.71 \| 0.09 | -| 768 | 3950.39 \| 0.25 | 10675.09 \| 0.09 | 10584.21 \| 0.09 | 11814.31 \| 0.08 | -| 896 | 3937.28 \| 0.25 | 10780.82 \| 0.09 | 10994.00 \| 0.09 | 11744.38 \| 0.09 | -| 1024 | oom | 11192.55 \| 0.09 | 11306.37 \| 0.09 | 11534.95 \| 0.09 | +| 1 | 43.11 \| 23.20 | 40.69 \| 24.57 | 32.29 \| 30.97 | 122.76 \| 8.15 | +| 2 | 80.76 \| 12.38 | 80.87 \| 12.37 | 63.54 \| 15.74 | 247.85 \| 4.03 | +| 4 | 160.38 \| 6.24 | 154.98 \| 6.45 | 131.00 \| 7.63 | 503.52 \| 1.99 | +| 8 | 328.62 \| 3.04 | 332.90 \| 3.00 | 260.16 \| 3.84 | 1022.20 \| 0.98 | +| 16 | 662.08 \| 1.51 | 669.27 \| 1.49 | 523.29 \| 1.91 | 2027.35 \| 0.49 | +| 32 | 1314.92 \| 0.76 | 1287.95 \| 0.78 | 1055.57 \| 0.95 | 4231.82 \| 0.24 | +| 64 | 2118.17 \| 0.47 | 2487.35 \| 0.40 | 1969.26 \| 0.51 | 8311.39 \| 0.12 | +| 128 | 2860.26 \| 0.35 | 4268.99 \| 0.23 | 3581.49 \| 0.28 | 15879.15 \| 0.06 | +| 256 | 3487.86 \| 0.29 | 6917.01 \| 0.14 | 6132.47 \| 0.16 | 21635.49 \| 0.05 | +| 384 | 3794.16 \| 0.26 | 8821.31 \| 0.11 | 7774.37 \| 0.13 | 23872.25 \| 0.04 | +| 512 | 3804.37 \| 0.26 | 10068.51 \| 0.10 | 8872.88 \| 0.11 | 25009.06 \| 0.04 | +| 640 | 4124.01 \| 0.24 | 10547.88 \| 0.09 | 9956.58 \| 0.10 | oom | +| 768 | 3950.39 \| 0.25 | 10675.09 \| 0.09 | 10584.21 \| 0.09 | oom | +| 896 | 3937.28 \| 0.25 | 10780.82 \| 0.09 | 10994.00 \| 0.09 | oom | +| 1024 | oom | 11192.55 \| 0.09 | 11306.37 \| 0.09 | oom | | 1152 | oom | 11178.30 \| 0.09 | 11290.51 \| 0.09 | oom | | 1280 | oom | 11383.98 \| 0.09 | 11459.89 \| 0.09 | oom | | 1408 | oom | 11477.66 \| 0.09 | 11565.90 \| 0.09 | oom | @@ -96,21 +97,21 @@ Throughput (tokens/sec | msec/token) Latency (sec) | batch_size | HF (fp32) | HF (bf16) | HF (int8) | DS-inference (fp16) | |:----------:|:---------:|:---------:|:---------:|:-------------------:| -| 1 | 2.32 | 2.46 | 3.10 | 1.62 | -| 2 | 2.48 | 2.47 | 3.15 | 1.65 | -| 4 | 2.49 | 2.58 | 3.05 | 1.67 | -| 8 | 2.43 | 2.40 | 3.07 | 1.62 | -| 16 | 2.42 | 2.39 | 3.06 | 1.65 | -| 32 | 2.43 | 2.48 | 3.03 | 1.60 | -| 64 | 3.02 | 2.57 | 3.25 | 1.61 | -| 128 | 4.48 | 3.00 | 3.57 | 1.70 | -| 256 | 7.34 | 3.70 | 4.17 | 2.50 | -| 384 | 10.12 | 4.35 | 4.94 | 3.46 | -| 512 | 13.46 | 5.09 | 5.77 | 4.49 | -| 640 | 15.52 | 6.07 | 6.43 | 5.51 | -| 768 | 19.44 | 7.19 | 7.26 | 6.50 | -| 896 | 22.76 | 8.31 | 8.15 | 7.63 | -| 1024 | oom | 9.15 | 9.06 | 8.88 | +| 1 | 2.32 | 2.46 | 3.10 | 0.81 | +| 2 | 2.48 | 2.47 | 3.15 | 0.81 | +| 4 | 2.49 
| 2.58 | 3.05 | 0.79 | +| 8 | 2.43 | 2.40 | 3.07 | 0.78 | +| 16 | 2.42 | 2.39 | 3.06 | 0.79 | +| 32 | 2.43 | 2.48 | 3.03 | 0.76 | +| 64 | 3.02 | 2.57 | 3.25 | 0.77 | +| 128 | 4.48 | 3.00 | 3.57 | 0.81 | +| 256 | 7.34 | 3.70 | 4.17 | 1.18 | +| 384 | 10.12 | 4.35 | 4.94 | 1.61 | +| 512 | 13.46 | 5.09 | 5.77 | 2.05 | +| 640 | 15.52 | 6.07 | 6.43 | oom | +| 768 | 19.44 | 7.19 | 7.26 | oom | +| 896 | 22.76 | 8.31 | 8.15 | oom | +| 1024 | oom | 9.15 | 9.06 | oom | | 1152 | oom | 10.31 | 10.20 | oom | | 1280 | oom | 11.24 | 11.17 | oom | | 1408 | oom | 12.27 | 12.17 | oom | From 0253839c6785646fc1b7563c6fcd19c369f660ef Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Wed, 7 Dec 2022 02:19:28 +0530 Subject: [PATCH 41/43] total params --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e083012..3610f5a 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ BLOOM hidden_size = 2048 n_head = 16 n_layer = 24 +total_params = 1311535104 ``` Throughput (tokens/sec | msec/token) From 893c5217387b16f1c2311efba57ded4607e1d008 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Wed, 7 Dec 2022 02:39:38 +0530 Subject: [PATCH 42/43] models --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3610f5a..9d4ec25 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # bigcode-inference-benchmark A100 80GB -BLOOM +## BLOOM ```python hidden_size = 2048 n_head = 16 @@ -61,7 +61,7 @@ Latency (sec) | 1792 | oom | 18.49 | 21.48 | oom | | 1920 | oom | oom | oom | oom | -GPT2 Multi-Head Attention +## GPT2 Multi-Head Attention ```python hidden_size = 2048 n_head = 16 From daea92dad31daf066bb275d005a1aa3c6c5ec4f0 Mon Sep 17 00:00:00 2001 From: Alex Gu Date: Tue, 6 Dec 2022 15:39:14 -0600 Subject: [PATCH 43/43] Add code to vary input length (#5) * input length experiments * sort input lengths in ascending order * make default max input length -1 make some updates to Alex's code --- Makefile | 7 +++++++ run.sh => run_batch_size.sh | 0 run_input_length.sh | 8 ++++++++ src/main.py | 3 ++- src/utils/arguments.py | 1 + src/utils/dummy.py | 6 ++++-- 6 files changed, 22 insertions(+), 3 deletions(-) rename run.sh => run_batch_size.sh (100%) mode change 100644 => 100755 create mode 100755 run_input_length.sh diff --git a/Makefile b/Makefile index a47032a..6f0cbde 100644 --- a/Makefile +++ b/Makefile @@ -59,3 +59,10 @@ hf-1b-GPT2-mqa1-int8: ds-inference-1b-GPT2-mqa1-fp16: deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --batch_size ${batch_size} + +# Input length experiments +hf-1b-GPT2-mqa1-int8-input-length: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length} + +hf-1b-GPT2-mha-int8-input-length: + python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length} diff --git a/run.sh b/run_batch_size.sh old mode 100644 new mode 100755 similarity index 100% rename from run.sh rename to run_batch_size.sh diff --git a/run_input_length.sh b/run_input_length.sh new file mode 100755 index 0000000..0bda0d5 --- /dev/null +++ b/run_input_length.sh @@ -0,0 +1,8 @@ +export 
CUDA_VISIBLE_DEVICES=0 + +rm -rf ./tmp + +for max_input_length in {4,8,16,32,64,128,256,512,1024,1536,1900} +do + make $1 batch_size=32 max_input_length=$max_input_length +done diff --git a/src/main.py b/src/main.py index 2046829..30ec6a1 100644 --- a/src/main.py +++ b/src/main.py @@ -7,7 +7,8 @@ def main() -> None: args = get_args(get_arg_parser()) - inputs = get_dummy_batch(args.batch_size) + inputs = get_dummy_batch(args.batch_size, args.max_input_length) + generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=False) pipeline_class = getattr(pipelines, args.pipeline_class) diff --git a/src/utils/arguments.py b/src/utils/arguments.py index 79f2497..158fbe3 100644 --- a/src/utils/arguments.py +++ b/src/utils/arguments.py @@ -9,6 +9,7 @@ def get_arg_parser() -> ArgumentParser: parser.add_argument("--model_class", default="GPT2", type=str) parser.add_argument("--batch_size", default=1, type=int) parser.add_argument("--dtype", default="bfloat16", type=str) + parser.add_argument("--max_input_length", default=-1, type=int) parser.add_argument("--max_new_tokens", default=100, type=int) parser.add_argument("--local_rank", type=int) parser.add_argument("--hidden_size", type=int) diff --git a/src/utils/dummy.py b/src/utils/dummy.py index ed06cdb..e1055a0 100644 --- a/src/utils/dummy.py +++ b/src/utils/dummy.py @@ -15,9 +15,11 @@ ] -def get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> List[str]: - if input_sentences is None: +def get_dummy_batch(batch_size: int, max_input_length: int = -1) -> List[str]: + if max_input_length == -1: input_sentences = copy.deepcopy(dummy_input_sentences) + else: + input_sentences = batch_size * ["Hello " * max_input_length] if batch_size > len(input_sentences): input_sentences *= math.ceil(batch_size / len(input_sentences))
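
Note on the input-length experiments in the final patch: the hunks above show only the changed lines of `src/utils/dummy.py`, so the following is a minimal, self-contained sketch of the resulting batch construction. The sentence list and the final truncation to `batch_size` are assumptions (the repository ships its own `dummy_input_sentences`, and the tail of `get_dummy_batch` is not visible in the diff); when `max_input_length` is set, each prompt is `"Hello "` repeated, which tokenizes to roughly one token per repetition.

```python
import copy
import math
from typing import List

# Illustrative stand-in for the repository's dummy_input_sentences.
dummy_input_sentences = [
    "DeepSpeed is a machine learning framework",
    "He is working on",
    "He has a",
    "He got all",
]


def get_dummy_batch(batch_size: int, max_input_length: int = -1) -> List[str]:
    if max_input_length == -1:
        # Default path: reuse the fixed prompt list.
        input_sentences = copy.deepcopy(dummy_input_sentences)
    else:
        # Input-length experiments: every prompt is "Hello " repeated
        # max_input_length times, giving roughly max_input_length tokens each.
        input_sentences = batch_size * ["Hello " * max_input_length]

    # Tile the list until it covers the requested batch size.
    if batch_size > len(input_sentences):
        input_sentences *= math.ceil(batch_size / len(input_sentences))

    # Assumed final step: trim to exactly batch_size prompts.
    return input_sentences[:batch_size]


# Mirrors one step of run_input_length.sh, e.g.
# `make hf-1b-GPT2-mha-int8-input-length batch_size=32 max_input_length=512`.
batch = get_dummy_batch(batch_size=32, max_input_length=512)
assert len(batch) == 32 and batch[0] == "Hello " * 512
```

This keeps the prompt length fixed within a run, so sweeping `max_input_length` in `run_input_length.sh` isolates the cost of the input context from the 100 generated tokens set by the `--max_new_tokens` default.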