Import bitsandbytes only if it's going to be used #180

Merged · 1 commit · Jan 5, 2023
src/petals/utils/convert_block.py: 9 changes (6 additions, 3 deletions)
@@ -4,7 +4,6 @@
 import re
 from typing import Sequence
 
-import bitsandbytes as bnb
 import tensor_parallel as tp
 import torch
 import torch.nn as nn
@@ -14,7 +13,6 @@
 from transformers.models.bloom.modeling_bloom import BloomAttention
 
 from petals.bloom.block import WrappedBloomBlock
-from petals.utils.linear8bitlt_patch import CustomLinear8bitLt
 
 use_hivemind_log_handler("in_root_logger")
 logger = get_logger(__file__)
@@ -75,6 +73,12 @@ def replace_8bit_linear(model: nn.Module, threshold=6.0):
     `int8_threshold` for outlier detection as described in the aforementioned paper. This parameter is set to
     `6.0` as described by the paper.
     """
+
+    # Import bitsandbytes only when necessary, so Petals runs on platforms not supported by bitsandbytes
+    import bitsandbytes as bnb
+
+    from petals.utils.linear8bitlt_patch import CustomLinear8bitLt
+
     for n, module in model.named_children():
         if len(list(module.children())) > 0:
             replace_8bit_linear(module, threshold)
@@ -98,7 +102,6 @@
 def make_tensor_parallel(
     block: WrappedBloomBlock, model_config: BloomConfig, devices: Sequence[torch.device], output_device: torch.device
 ):
-    assert isinstance(block, (WrappedBloomBlock, CustomLinear8bitLt))
     tp_config = get_bloom_config(model_config, devices)
     del tp_config.state_rules[re.compile(".*word_embeddings.weight$")]
     tp_block = tp.TensorParallel(block, devices, config=tp_config, output_device=output_device, delay_init=True)
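The change applies the standard deferred-import pattern: a platform-specific dependency is imported inside the only function that needs it, so importing the module itself never fails on platforms where the dependency is unavailable. A minimal sketch of the pattern, assuming a hypothetical quantize_linears helper (the function name, the error message, and the layer-replacement details are illustrative, not part of this PR):

import torch.nn as nn

def quantize_linears(model: nn.Module) -> None:
    """Recursively replace nn.Linear layers with 8-bit equivalents (sketch)."""
    # Deferred import: this module stays importable on platforms where
    # bitsandbytes is unavailable; only calling this function requires it.
    try:
        import bitsandbytes as bnb
    except ImportError as e:
        raise RuntimeError("8-bit quantization requires bitsandbytes") from e

    for name, child in model.named_children():
        if isinstance(child, nn.Linear):
            # Sketch only: swap in bitsandbytes' 8-bit linear layer
            # (weight copying omitted for brevity).
            setattr(
                model,
                name,
                bnb.nn.Linear8bitLt(child.in_features, child.out_features, bias=child.bias is not None),
            )
        else:
            quantize_linears(child)

With the imports moved inside replace_8bit_linear, CustomLinear8bitLt is no longer available at module scope, which is presumably why the isinstance assert in make_tensor_parallel is removed in the same commit.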