RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument tensors in method wrapper_CUDA_cat)
#575 · Closed · jmikedupont2 opened this issue on Apr 11, 2024 · 2 comments
```
The seq_len argument is deprecated and unused. It will be removed in v4.39.
Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/mnt/data1/nix/time/2023/09/22/petals/src/petals/cli/run_server.py", line 235, in <module>
    main()
  File "/mnt/data1/nix/time/2023/09/22/petals/src/petals/cli/run_server.py", line 219, in main
    server = Server(
  File "/mnt/data1/nix/time/2023/09/22/petals/src/petals/server/server.py", line 237, in __init__
    throughput_info = get_server_throughput(
  File "/mnt/data1/nix/time/2023/09/22/petals/src/petals/server/throughput.py", line 83, in get_server_throughput
    cache[cache_key] = measure_throughput_info(
  File "/mnt/data1/nix/time/2023/09/22/petals/src/petals/server/throughput.py", line 123, in measure_throughput_info
    "inference_rps": measure_compute_rps(
  File "/mnt/data1/nix/time/2023/09/22/petals/src/petals/server/throughput.py", line 218, in measure_compute_rps
    cache = step(cache)
  File "/mnt/data1/nix/time/2023/09/22/petals/src/petals/server/throughput.py", line 215, in step
    outputs = block.forward(dummy_input, use_cache=inference, layer_past=cache if inference else None)
  File "/mnt/data1/nix/time/2023/09/22/petals/.venv-omain/lib/python3.10/site-packages/tensor_parallel/tensor_parallel.py", line 99, in forward
    return [self.module_shards[0](*args, **kwargs)][self.output_device_index]
  File "/mnt/data1/nix/time/2023/09/22/petals/.venv-omain/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/mnt/data1/nix/time/2023/09/22/petals/.venv-omain/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/mnt/data1/nix/time/2023/09/22/petals/src/petals/models/llama/block.py", line 264, in forward
    outputs = super().forward(
  File "/mnt/data1/nix/time/2023/09/22/petals/src/petals/models/llama/block.py", line 193, in forward
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
  File "/mnt/data1/nix/time/2023/09/22/petals/.venv-omain/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/mnt/data1/nix/time/2023/09/22/petals/.venv-omain/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/mnt/data1/nix/time/2023/09/22/petals/src/petals/models/llama/block.py", line 103, in forward
    key_states = torch.cat([past_key_value[0], key_states], dim=2)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument tensors in method wrapper_CUDA_cat)
```
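The traceback bottoms out in a plain device mismatch: `torch.cat` requires every input tensor to live on one device, and here the cached `past_key_value` tensor is still on the CPU while the freshly projected `key_states` are on `cuda:0`. A minimal sketch of the mismatch and the usual remedy (shapes are illustrative, not Petals' actual cache layout; assumes a CUDA-capable machine):

```python
import torch

# A KV-cache entry left on the CPU vs. new key projections on the GPU.
past_key = torch.zeros(1, 8, 4, 64)                      # cpu
key_states = torch.zeros(1, 8, 1, 64, device="cuda:0")   # cuda:0

try:
    torch.cat([past_key, key_states], dim=2)  # raises the RuntimeError above
except RuntimeError as err:
    print(err)

# Remedy: colocate the tensors before concatenating along the sequence dim.
merged = torch.cat([past_key.to(key_states.device), key_states], dim=2)
print(merged.shape)  # torch.Size([1, 8, 5, 64])
```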
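Since the failing cache comes from the benchmark loop in `measure_compute_rps` (`cache = step(cache)` passed in as `layer_past`), the likely culprit is a dummy cache allocated on the CPU. One hedged local workaround, sketched against the concat at `block.py` line 103 (variable names follow the traceback; `value_states` is assumed by analogy with standard Llama attention, and this is not presented as the upstream fix):

```python
# Hypothetical guard before the concat in src/petals/models/llama/block.py;
# the surrounding attention code is elided.
if past_key_value is not None:
    # Move stale cache entries to whichever device holds the new projections.
    past_keys = past_key_value[0].to(key_states.device)
    past_values = past_key_value[1].to(value_states.device)
    key_states = torch.cat([past_keys, key_states], dim=2)
    value_states = torch.cat([past_values, value_states], dim=2)
```

Equivalently, the benchmark could allocate its dummy cache on the block's device up front; either way, both arguments to `torch.cat` end up colocated.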