-
Notifications
You must be signed in to change notification settings - Fork 193
Description
Traceback (most recent call last):
File "/fs/fast/ROLL/examples/start_rlvr_vl_custom_pipeline.py", line 34, in <module>
main()
File "/fs/fast/ROLL/examples/start_rlvr_vl_custom_pipeline.py", line 30, in main
pipeline.run()
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/fs/fast/ROLL/roll/pipeline/rlvr/rlvr_custom_vlm_pipeline.py", line 309, in run
model_update_metrics: Dict = self.model_update(global_step)
File "/fs/fast/ROLL/roll/pipeline/base_pipeline.py", line 70, in model_update
metrics.update(model_update_group.model_update(global_step))
File "/fs/fast/ROLL/roll/distributed/executor/model_update_group.py", line 155, in model_update
data = ray.get(refs)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
return fn(*args, **kwargs)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
return func(*args, **kwargs)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/ray/_private/worker.py", line 2849, in get
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/ray/_private/worker.py", line 937, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(TypeError): ray::ActorWorker.start_model_update() (pid=40841, ip=10.0.0.2, actor_id=1384351ca8622b742df6c0b801000000, repr=ActorWorker(actor_train-0))
File "/fs/fast/ROLL/roll/distributed/executor/worker.py", line 180, in start_model_update
exec_metrics: Dict = self.strategy.model_update(*args, **kwargs)
File "/fs/fast/ROLL/roll/distributed/strategy/deepspeed_strategy.py", line 616, in model_update
ray.get(refs)
ray.exceptions.RayTaskError(TypeError): ray::ActorWorker.update_parameter() (pid=41390, ip=10.0.0.2, actor_id=30efa8a9405d10c1a9e1907b01000000, repr=ActorWorker(actor_infer-0))
File "/fs/fast/ROLL/roll/distributed/executor/worker.py", line 191, in update_parameter
self.strategy.update_parameter(*args, **kwargs)
File "/fs/fast/ROLL/roll/distributed/strategy/vllm_strategy.py", line 354, in update_parameter
self.model.update_parameter(parameter_name, weight, ranks_in_worker, is_lora)
TypeError: Llm084.update_parameter() takes 4 positional arguments but 5 were given
Traceback (most recent call last):
File "/fs/fast/ROLL/examples/start_rlvr_vl_custom_pipeline.py", line 34, in <module>
main()
File "/fs/fast/ROLL/examples/start_rlvr_vl_custom_pipeline.py", line 30, in main
pipeline.run()
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/fs/fast/ROLL/roll/pipeline/rlvr/rlvr_custom_vlm_pipeline.py", line 309, in run
model_update_metrics: Dict = self.model_update(global_step)
File "/fs/fast/ROLL/roll/pipeline/base_pipeline.py", line 70, in model_update
metrics.update(model_update_group.model_update(global_step))
File "/fs/fast/ROLL/roll/distributed/executor/model_update_group.py", line 155, in model_update
data = ray.get(refs)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
return fn(*args, **kwargs)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
return func(*args, **kwargs)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/ray/_private/worker.py", line 2849, in get
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/ray/_private/worker.py", line 937, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(TypeError): ray::ActorWorker.start_model_update() (pid=40841, ip=10.0.0.2, actor_id=1384351ca8622b742df6c0b801000000, repr=ActorWorker(actor_train-0))
File "/fs/fast/ROLL/roll/distributed/executor/worker.py", line 180, in start_model_update
exec_metrics: Dict = self.strategy.model_update(*args, **kwargs)
File "/fs/fast/ROLL/roll/distributed/strategy/deepspeed_strategy.py", line 616, in model_update
ray.get(refs)
ray.exceptions.RayTaskError(TypeError): ray::ActorWorker.update_parameter() (pid=41390, ip=10.0.0.2, actor_id=30efa8a9405d10c1a9e1907b01000000, repr=ActorWorker(actor_infer-0))
File "/fs/fast/ROLL/roll/distributed/executor/worker.py", line 191, in update_parameter
self.strategy.update_parameter(*args, **kwargs)
File "/fs/fast/ROLL/roll/distributed/strategy/vllm_strategy.py", line 354, in update_parameter
self.model.update_parameter(parameter_name, weight, ranks_in_worker, is_lora)
TypeError: Llm084.update_parameter() takes 4 positional arguments but 5 were given