Merged
56 commits
2025ac2
init new test
SaulLu Feb 28, 2022
c33343d
test pad vocab size to
SaulLu Feb 28, 2022
390b4dc
add logs
SaulLu Feb 28, 2022
784b751
log to warning
SaulLu Feb 28, 2022
6f3a472
change TP
SaulLu Feb 28, 2022
1d9649a
fix loop
SaulLu Feb 28, 2022
7fa5c10
revert
SaulLu Feb 28, 2022
bcc6d8d
remove hack size
SaulLu Feb 28, 2022
9e17a4f
this new test should pass
SaulLu Feb 28, 2022
92614bf
test not divisible by num tp
SaulLu Feb 28, 2022
8322f89
Revert "remove hack size"
SaulLu Feb 28, 2022
6d72073
Revert "Revert "remove hack size""
SaulLu Feb 28, 2022
84333d3
Revert "test not divisible by num tp"
SaulLu Feb 28, 2022
b2382d8
Revert "this new test should pass"
SaulLu Feb 28, 2022
d4a15a3
change info to warning
SaulLu Feb 28, 2022
cd5e8b4
change to print
SaulLu Feb 28, 2022
a6ee894
add print
SaulLu Feb 28, 2022
f534c43
test 2
SaulLu Feb 28, 2022
0a1167b
new print
SaulLu Feb 28, 2022
34bfd60
woups
SaulLu Feb 28, 2022
50cb3ca
more
SaulLu Feb 28, 2022
786e02d
woups
SaulLu Feb 28, 2022
20d08a8
comment
SaulLu Feb 28, 2022
915bd6c
raise errors
SaulLu Feb 28, 2022
119a0d2
woups
SaulLu Feb 28, 2022
5c6dec0
pad to save vocab size
SaulLu Feb 28, 2022
de3353f
simplify test
SaulLu Feb 28, 2022
8485770
assert test raised
SaulLu Feb 28, 2022
df24492
print error msg
SaulLu Feb 28, 2022
46fc9da
check msg error
SaulLu Feb 28, 2022
9ffafb1
check error
SaulLu Feb 28, 2022
1eb5baa
woups
SaulLu Feb 28, 2022
56af695
clean
SaulLu Feb 28, 2022
3ea0c6b
simplify
SaulLu Feb 28, 2022
be2e371
remove unused print
SaulLu Feb 28, 2022
8986962
add comment
SaulLu Feb 28, 2022
a72fa03
add test multiple of tp size
SaulLu Feb 28, 2022
1e5b2af
add print
SaulLu Feb 28, 2022
8d8be7e
add check
SaulLu Feb 28, 2022
b2867a7
clean
SaulLu Feb 28, 2022
ef61e89
Update megatron/mpu/layers.py
SaulLu Feb 28, 2022
c10a359
Update megatron/tokenizer/tokenizer.py
SaulLu Feb 28, 2022
fc975b4
chnage micro-batch-size
SaulLu Feb 28, 2022
a2b86b7
use tiny vocab
SaulLu Feb 28, 2022
ae9f83c
fix data dir
SaulLu Feb 28, 2022
ecdda50
fix arg
SaulLu Feb 28, 2022
c170fd9
change micro-batch-size
SaulLu Feb 28, 2022
c82d615
adept input ids
SaulLu Feb 28, 2022
3587b52
assertIn
SaulLu Feb 28, 2022
a90a8f9
change micro batch size
SaulLu Feb 28, 2022
982d88c
Fix test TP
SaulLu Mar 1, 2022
78b7686
unused var
SaulLu Mar 1, 2022
c922204
add test make_vocab_size_divisible_by
SaulLu Mar 1, 2022
806cbb5
fix test_tokenizer_vocab_size_multiple_of_tp_size test
SaulLu Mar 1, 2022
f515b67
Fix padded vocab size on preprocessing scripts (#257)
thomasw21 Mar 1, 2022
02f86f5
documentation
SaulLu Mar 1, 2022
4 changes: 4 additions & 0 deletions megatron/arguments.py
@@ -369,6 +369,10 @@ def _add_network_size_args(parser):
group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
help='Pad the vocab size to be divisible by this value.'
'This is added for computational efficieny reasons.')
group.add_argument('--pad-vocab-size-to', type=int, default=None,
Member
Suggested change
group.add_argument('--pad-vocab-size-to', type=int, default=None,
group.add_argument('--pad-embedding-size-to', type=int, default=None,

Collaborator Author
IMO, I liked the naming "vocab" because it emphasizes the fact that we're really choosing our tokenizer's vocabulary size.

Member
actually works for me, I just didn't want to indicate we were modifying the tokenizer, but really just the embedding layer. But in Meg-DS they seem to use padded_vocab_size so your solution makes more sense.

help='Pad the vocab size to this value.'
'This value must be greater than the initial size of the tokenizer'
', needs to be divisible by TP size and `make-vocab-size-divisible-by`.')
group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
help='Layer norm epsilon.')
group.add_argument('--apply-residual-connection-post-layernorm',
5 changes: 5 additions & 0 deletions megatron/mpu/layers.py
@@ -217,6 +217,9 @@ def __init__(self, num_embeddings, embedding_dim,


def forward(self, input_):
if torch.any(input_ >= self.num_embeddings):
raise ValueError(f"There is an input id in the input that is greater than the highest possible input id.\nInput: {input_}\nnum_embeddings: {self.num_embeddings}")

@stas00 (Contributor), Feb 28, 2022
Killing the training at run time because the input is broken? This we can't afford to support.

Additionally, this assert can't be acted upon: the operator would have no idea how to fix it, since you're not including the input id and the sample id.

If there is a need to validate data before the training, it should happen separately from the training.

The worst-case scenario that can be supported is probably to skip the bad input, i.e. do the checking at the dataloader retrieval stage. But still, this feels like bad design in the software.

Contributor
The forward should be as lean as possible and not need to check anything, so that it can run as fast as possible.

@thomasw21 (Member), Feb 28, 2022
I don't agree: we need this sanity check so that we're not doing something wrong. Right now, the model would silently bypass this issue if you use TP>2. IMO it should kill the training, as we are doing something VERY bad. (Essentially, if you use TP=1, this does get killed when you call F.embedding on it.)

Contributor
You don't do data sanity checks in forward. All sanity checks should ideally be done beforehand, at the dataloader level.

Which case are you guarding against - all of the data is completely borked or only some inputs are invalid?

Contributor
And practically, let's do an imaginary scenario: you started the training and this assert happens 5 days in. What do you do? I fail to see how this assert is actionable.

@stas00 (Contributor), Feb 28, 2022
Sorry, unfortunately I don't have the resources to dive into this right now, as I have to finish lots of things before the launch. So I trust you will do the best thing you can, and if things break during the training I will ping you and you will know what to do.

It's not great to have this sort of last-minute change that hasn't been thoroughly tested, but what can you do. It's not the only last-minute change; e.g. the whole bf16 optimizer was rewritten last week.

Contributor
Thomas and I discussed this; he helped me clear up an important misunderstanding and he will post an update.

Thank you for working on this, @SaulLu and @DanielHesslow - my apologies that I can't be involved at a deeper level at the moment.

@thomasw21 (Member), Feb 28, 2022
Okay, talked to @stas00:

  • This PR makes sense, in that the model should check for this. @DanielHesslow said it better, but if we get an out-of-bounds index, we need to raise an exception.
  • There's the issue of "if the training set is bad for X reason, what do we do?" After this PR it will throw badly, so we need a strategy. Given the number of possible root causes, @stas00 suggests having a skip mechanism in the data loader. Why? Because we're able to keep training in case the issue "isn't that bad". In the worst-case scenario, we kill the job and relaunch from scratch. But this is orthogonal to this PR IMO. Another approach I'm hoping to pursue is to actually go through the data loader with a check before the training, so we don't get bad surprises.

Contributor
Yes, so the idea is to do a check at the dataloader level, and warn when skipping a bad sample. If it's a lot of samples but not the majority, we continue training while someone fixes the data.

Of course, if the data is very broken then it'll be skipping them all and we can't train.

But the point is: don't crash the training unless you have to.

Contributor
And additionally, we can pre-check all our data outside of the training so that it doesn't have any bad samples.

This PR is mainly for future users, but in our training we should make sure it never hits this assert.
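For reference, a minimal sketch (not part of this PR) of the dataloader-level skip mechanism discussed above; it assumes a plain iterable of token-id sequences, and the function name and logging are illustrative:

```python
import logging

def skip_invalid_samples(sample_iter, vocab_size):
    """Yield only samples whose token ids fall inside [0, vocab_size); warn about and skip the rest."""
    for idx, token_ids in enumerate(sample_iter):
        bad = [t for t in token_ids if t < 0 or t >= vocab_size]
        if bad:
            logging.warning("skipping sample %d: out-of-range token ids %s", idx, bad)
            continue
        yield token_ids
```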

if self.tensor_model_parallel_size > 1:
# Build the mask.
input_mask = (input_ < self.vocab_start_index) | \
@@ -225,7 +228,9 @@ def forward(self, input_):
masked_input = input_.clone() - self.vocab_start_index
masked_input[input_mask] = 0
else:
# input_ is garanted to be in the range [0:self.vocab_end_index - self.vocab_start_index] thanks to the first check
masked_input = input_

# Get the embeddings.
output_parallel = F.embedding(masked_input, self.weight,
self.padding_idx, self.max_norm,
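To illustrate the point above about ids slipping through silently once TP > 1, here is a toy two-rank simulation of the masking logic in this hunk (a standalone sketch, not the actual Megatron class; the sizes, ids, and weights are made up):

```python
import torch
import torch.nn.functional as F

# Toy setup: padded vocab of 8 split across 2 tensor-parallel ranks, 4 rows each.
padded_vocab_size, tp_size, hidden = 8, 2, 3
per_rank = padded_vocab_size // tp_size
weights = [torch.randn(per_rank, hidden) for _ in range(tp_size)]

input_ = torch.tensor([[1, 5, 9]])  # 9 is out of range (valid ids are 0..7)

partial_outputs = []
for rank in range(tp_size):
    start, end = rank * per_rank, (rank + 1) * per_rank
    input_mask = (input_ < start) | (input_ >= end)   # ids this rank does not own
    masked_input = input_.clone() - start
    masked_input[input_mask] = 0                      # the invalid id 9 is zeroed on every rank
    out = F.embedding(masked_input, weights[rank])
    out[input_mask, :] = 0.0
    partial_outputs.append(out)

# The all-reduce across ranks is just a sum here: id 9 silently contributes a zero
# embedding instead of raising, unlike TP=1 where F.embedding would throw an IndexError.
print(sum(partial_outputs))
```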
27 changes: 19 additions & 8 deletions megatron/tokenizer/tokenizer.py
@@ -68,14 +68,25 @@ def build_tokenizer(args):


def _vocab_size_with_padding(orig_vocab_size, args):
"""Pad vocab size so it is divisible by model parallel size and
still having GPU friendly size."""

after = orig_vocab_size
multiple = args.make_vocab_size_divisible_by * \
args.tensor_model_parallel_size
while (after % multiple) != 0:
after += 1
"""Apply the requested rules to change the size of the vocabulary"""
if args.pad_vocab_size_to is not None:
if args.pad_vocab_size_to < orig_vocab_size:
raise ValueError(
f"You asked to pad the vocabulary to {args.pad_vocab_size_to} when the initial vocabulary size is "
f"{orig_vocab_size}. You can only pad to a higher value."
)

if args.make_vocab_size_divisible_by is not None and (args.pad_vocab_size_to % args.make_vocab_size_divisible_by) != 0:
raise ValueError(f"{args.pad_vocab_size_to} is not divisible by {args.make_vocab_size_divisible_by}")

after = args.pad_vocab_size_to
else:
# Pad vocab size so it is divisible by model parallel size and still having GPU friendly size.
after = orig_vocab_size
multiple = args.make_vocab_size_divisible_by * \
args.tensor_model_parallel_size
while (after % multiple) != 0:
after += 1
if args.rank == 0:
print(' > padded vocab (size: {}) with {} dummy tokens '
'(new size: {})'.format(
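A standalone sketch of the padding rules in the hunk above, with illustrative numbers. It mirrors `_vocab_size_with_padding` in simplified form; the TP-size divisibility requirement mentioned in the help text is enforced in a part of the diff not shown in this hunk and is not reproduced here:

```python
def padded_vocab_size(orig_vocab_size, pad_vocab_size_to=None,
                      make_vocab_size_divisible_by=128, tp_size=1):
    if pad_vocab_size_to is not None:
        if pad_vocab_size_to < orig_vocab_size:
            raise ValueError(f"cannot pad vocab of size {orig_vocab_size} down to {pad_vocab_size_to}")
        if pad_vocab_size_to % make_vocab_size_divisible_by != 0:
            raise ValueError(f"{pad_vocab_size_to} is not divisible by {make_vocab_size_divisible_by}")
        return pad_vocab_size_to
    # Default behaviour: round up to a multiple of the divisor times the TP size.
    multiple = make_vocab_size_divisible_by * tp_size
    after = orig_vocab_size
    while after % multiple != 0:
        after += 1
    return after

print(padded_vocab_size(50257, make_vocab_size_divisible_by=128, tp_size=2))  # 50432
print(padded_vocab_size(5000, pad_vocab_size_to=5120))                        # 5120 (i.e. 128 * 40)
```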
114 changes: 108 additions & 6 deletions tests/test_tensor_parallel.py
@@ -25,6 +25,7 @@
class MegDSTestTP(TestCasePlus):
def get_default_args(self):
"""return a dictionary with key as argument name and value as additional arguments"""
data_dir = f"{self.data_dir}/gpt2"
return {
# GPT_ARGS
"--num-layers": "2",
@@ -39,8 +40,9 @@ def get_default_args(self):
"--lr": "0.00015",
"--min-lr": "1.0e-5",
"--train-iters": "5000",
"--tokenizer-type": "PretrainedFromHF",
"--tokenizer-name-or-path": "gpt2",
"--tokenizer-type": "GPT2BPETokenizer",
"--merge-file": f"{data_dir}/gpt2-tiny-merges.txt",
"--vocab-file": f"{data_dir}/gpt2-tiny-vocab.json",
"--data-impl": "mmap",
"--split": "949,50,1",
"--distributed-backend": "nccl",
@@ -111,8 +113,6 @@ def create_model_inputs(tokens):
initialize_megatron()
args = get_args()

args.vocab_size = args.padded_vocab_size = 1024

tokenizer = get_tokenizer()

model, _, _ = setup_model_and_optimizer(gpt_model_provider)
@@ -141,7 +141,6 @@ def create_model_inputs(tokens):
else:
token_ids = torch.tensor(token_ids)


model.micro_batches = 1
model.set_batch_fn(create_model_inputs)
# process batch
@@ -156,7 +155,7 @@ def create_model_inputs(tokens):

output = model.eval_batch(iter([token_ids]), compute_loss = False, reduce_output = None)[0]

output = gather_from_tensor_model_parallel_region(output)[..., :tokenizer.vocab_size]
output = gather_from_tensor_model_parallel_region(output)

if save != None:
args.save = save
Expand All @@ -169,6 +168,7 @@ def test_alibi_tp(self):
cp_dir = self.get_auto_remove_tmp_dir()

command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary
command_args["--position-embedding-type"] = "alibi"
command_args["--tensor-model-parallel-size"] = "1"

@@ -192,5 +192,107 @@ def test_alibi_tp(self):
logging.getLogger().critical(output-output2)
self.assertTrue(np.allclose(output,output2, atol=5e-3, rtol=0), "Different results when running with TP=1 and TP=2")



def test_embedding_matrix_tp(self):
mp.set_start_method('spawn', force=True)
cp_dir = self.get_auto_remove_tmp_dir()

command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary
command_args["--seq-length"] = "4"
command_args["--micro-batch-size"] = "2"
tokens = [[5119, 0, 1, 5100],[0, 1, 5111, 5101]]

command_args["--tensor-model-parallel-size"] = "1"

pool = Pool(1)
# tp_index, tp_size, command_args, token_ids, save, load
result = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, tokens, cp_dir, None))])
pool.close()
pool.join()

output, _ = result[0]
logging.getLogger().info("First done!")

command_args["--tensor-model-parallel-size"] = "2"

pool = Pool(2)
result = pool.map(MegDSTestTP.infer_model, [((0, 2, command_args, tokens, None, cp_dir)), ((1, 2, command_args, tokens, None, cp_dir))])
pool.close()
pool.join()

output2, _ = result[0]

logging.getLogger().critical(output-output2)
self.assertTrue(np.allclose(output,output2, atol=5e-3, rtol=0), "Different results when running with TP=1 and TP=2")


def test_embedding_matrix_tp_with_invalid_tokens_ids(self):
mp.set_start_method('spawn', force=True)

command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary
command_args["--seq-length"] = "4"
command_args["--micro-batch-size"] = "2"
tokens = [[5120, 0, 1, 2],[0, 1, 3, 4]]

command_args["--tensor-model-parallel-size"] = "1"

pool = Pool(1)
with pytest.raises(Exception) as exc_info:
_ = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, tokens, None, None))])
pool.close()
pool.join()

self.assertIn("There is an input id in the input that is greater than the highest possible input id" , str(exc_info.value))

logging.getLogger().info("First done!")

command_args["--tensor-model-parallel-size"] = "2"

pool = Pool(2)
with pytest.raises(Exception) as exc_info:
_ = pool.map(MegDSTestTP.infer_model, [((0, 2, command_args, tokens, None, None)), ((1, 2, command_args, tokens, None, None))])
pool.close()
pool.join()

self.assertIn("There is an input id in the input that is greater than the highest possible input id", str(exc_info.value))


def test_tokenizer_vocab_size_multiple_of_tp_size(self):
mp.set_start_method('spawn', force=True)

command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5121" # This is equal to 128 * 40 + 1 which is above the len of gp2-tiny vocabulary
command_args["--micro-batch-size"] = "4"
command_args["--tensor-model-parallel-size"] = "2"
command_args["--make-vocab-size-divisible-by"] = "1"

pool = Pool(2)
with pytest.raises(Exception) as exc_info:
_ = pool.map(MegDSTestTP.infer_model, [((0, 2, command_args, None, None, None)), ((1, 2, command_args, None, None, None))])
pool.close()
pool.join()

self.assertEqual(str(exc_info.value), "5121 is not divisible by 2")

def test_tokenizer_raise_error_make_vocab_size_divisible_by(self):
mp.set_start_method('spawn', force=True)

command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5121" # This is equal to 128 * 40 + 1 which is above the len of gp2-tiny vocabulary
command_args["--micro-batch-size"] = "4"


pool = Pool(2)
with pytest.raises(Exception) as exc_info:
_ = pool.map(MegDSTestTP.infer_model, [((0, 2, command_args, None, None, None)), ((1, 2, command_args, None, None, None))])
pool.close()
pool.join()

self.assertEqual(str(exc_info.value), "5121 is not divisible by 128")


if __name__ == '__main__':
unittest.main()
9 changes: 8 additions & 1 deletion tools/preprocess_data.py
@@ -119,6 +119,14 @@ def get_args():
help='Append an <eod> token to the end of a document.')
group.add_argument("--tokenizer-name-or-path", type=str, default=None,
help="Name or path of the huggingface tokenizer.")
group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
help='Pad the vocab size to be divisible by this value.'
'This is added for computational efficieny reasons.')
group.add_argument('--pad-vocab-size-to', type=int, default=None,
help='Pad the vocab size to be divisible by this value.'
'Value of the size of the vocabulary of the tokenizer to reach. This value must be greater than'
' the initial size of the tokenizer. If this argument is used the value of '
'`make-vocab-size-divisible-by` will be ignored.')

group = parser.add_argument_group(title='output data')
group.add_argument('--output-prefix', type=str, required=True,
@@ -140,7 +148,6 @@ def get_args():

# some default/dummy values for the tokenizer
args.rank = 0
args.make_vocab_size_divisible_by = 128
args.tensor_model_parallel_size = 1
args.vocab_extra_ids = 0

10 changes: 9 additions & 1 deletion tools/preprocess_data_dist.py
@@ -167,6 +167,15 @@ def get_args():
help='Path to binary output file without suffix')
group.add_argument('--dataset-impl', type=str, default='mmap',
choices=['lazy', 'cached', 'mmap'])
group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
help='Pad the vocab size to be divisible by this value.'
'This is added for computational efficieny reasons.')
group.add_argument('--pad-vocab-size-to', type=int, default=None,
help='Pad the vocab size to be divisible by this value.'
'Value of the size of the vocabulary of the tokenizer to reach. This value must be greater than'
' the initial size of the tokenizer. If this argument is used the value of '
'`make-vocab-size-divisible-by` will be ignored.')


group = parser.add_argument_group(title='runtime')
group.add_argument('--torch-backend', type=str, default='gloo', choices=['gloo', 'mpi'],
@@ -198,7 +207,6 @@ def get_args():
args.numranks = args.distctx.numranks

# some default/dummy values for the tokenizer
args.make_vocab_size_divisible_by = 128
args.tensor_model_parallel_size = 1
args.vocab_extra_ids = 0

9 changes: 8 additions & 1 deletion tools/preprocess_data_many_cores.py
@@ -185,6 +185,14 @@ def get_args():
help='Append an <eod> token to the end of a document.')
group.add_argument("--tokenizer-name-or-path", type=str, default=None,
help="Name or path of the huggingface tokenizer.")
group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
help='Pad the vocab size to be divisible by this value.'
'This is added for computational efficieny reasons.')
group.add_argument('--pad-vocab-size-to', type=int, default=None,
help='Pad the vocab size to be divisible by this value.'
'Value of the size of the vocabulary of the tokenizer to reach. This value must be greater than'
' the initial size of the tokenizer. If this argument is used the value of '
'`make-vocab-size-divisible-by` will be ignored.')

group = parser.add_argument_group(title='output data')
group.add_argument('--output-prefix', type=str, required=True,
@@ -206,7 +214,6 @@ def get_args():

# some default/dummy values for the tokenizer
args.rank = 0
args.make_vocab_size_divisible_by = 128
args.tensor_model_parallel_size = 1
args.vocab_extra_ids = 0
