File "../../../funasr/bin/train.py", line 48, in main_hydra
main(**kwargs)
File "../../../funasr/bin/train.py", line 188, in main
trainer.train_epoch(
File "/home/wang/FunASR/funasr/train_utils/trainer.py", line 294, in train_epoch
retval = model(**batch)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/wang/FunASR/funasr/models/paraformer/model.py", line 211, in forward
loss_att, acc_att, cer_att, wer_att, loss_pre, pre_loss_att = self._calc_att_loss(
File "/home/wang/FunASR/funasr/models/paraformer/model.py", line 305, in _calc_att_loss
sematic_embeds, decoder_out_1st = self.sampler(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens,
File "/home/wang/FunASR/funasr/models/paraformer/model.py", line 345, in sampler
decoder_outs = self.decoder(
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/wang/FunASR/funasr/models/paraformer/decoder.py", line 386, in forward
x, tgt_mask, memory, memory_mask, _ = self.decoders(
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/wang/FunASR/funasr/models/transformer/utils/repeat.py", line 32, in forward
args = m(*args)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/wang/FunASR/funasr/models/paraformer/decoder.py", line 103, in forward
x, _ = self.self_attn(tgt, tgt_mask)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/wang/FunASR/funasr/models/sanm/attention.py", line 490, in forward
inputs = inputs * mask
RuntimeError: The size of tensor a (0) must match the size of tensor b (75) at non-singleton dimension 1
[2024-04-29 12:37:07,161] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 44546) of binary: /home/wangbq/.conda/envs/wenet/bin/python
Traceback (most recent call last):
File "/home/wangbq/.conda/envs/wenet/bin/torchrun", line 8, in
sys.exit(main())
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
Notice: In order to resolve issues more efficiently, please raise issue following the template.
🐛 Bug
To Reproduce
Steps to reproduce the behavior (always include the command you ran):
1. Use the bundled default training script FunASR/examples/industrial_data_pretraining/paraformer/train_from_local.sh, adding the parameter ++train_conf.use_fp16=true:
torchrun \
  --nnodes 1 \
  --nproc_per_node ${gpu_num} \
  ../../../funasr/bin/train.py \
  --config-path "${local_path}" \
  --config-name "${config_name}" \
  ++train_data_set_list="${train_data}" \
  ++valid_data_set_list="${val_data}" \
  ++dataset_conf.batch_size=10000 \
  ++dataset_conf.batch_type="token" \
  ++dataset_conf.num_workers=4 \
  ++train_conf.max_epoch=50 \
  ++train_conf.log_interval=10 \
  ++train_conf.resume=false \
  ++train_conf.validate_interval=15 \
  ++train_conf.save_checkpoint_interval=15 \
  ++train_conf.use_fp16=true \
  ++train_conf.keep_nbest_models=50 \
  ++optim_conf.lr=0.0002 \
  ++init_param="${init_param}" \
  ++tokenizer_conf.token_list="${tokens}" \
  ++frontend_conf.cmvn_file="${cmvn_file}" \
  ++output_dir="${output_dir}"
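For context, a plain-PyTorch sketch of what a use_fp16 flag usually toggles: an automatic-mixed-precision training step with a GradScaler. Whether FunASR's trainer implements the flag exactly this way is an assumption on my part; the model, shapes, and names below are illustrative only:

```python
import torch

# Hypothetical stand-ins; FunASR's real trainer wraps its own model/optimizer.
model = torch.nn.Linear(80, 40).cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
scaler = torch.cuda.amp.GradScaler()  # rescales the loss to avoid fp16 underflow

speech = torch.randn(4, 100, 80, device="cuda")  # fake (batch, time, feat) batch

optimizer.zero_grad()
with torch.cuda.amp.autocast(dtype=torch.float16):
    loss = model(speech).mean()        # forward runs in fp16 where safe
scaler.scale(loss).backward()          # backward pass on the scaled loss
scaler.step(optimizer)                 # unscales grads; skips the step on inf/nan
scaler.update()
```

2. Training then fails with the traceback below: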
File "../../../funasr/bin/train.py", line 48, in main_hydra
main(**kwargs)
File "../../../funasr/bin/train.py", line 188, in main
trainer.train_epoch(
File "/home/wang/FunASR/funasr/train_utils/trainer.py", line 294, in train_epoch
retval = model(**batch)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/wang/FunASR/funasr/models/paraformer/model.py", line 211, in forward
loss_att, acc_att, cer_att, wer_att, loss_pre, pre_loss_att = self._calc_att_loss(
File "/home/wang/FunASR/funasr/models/paraformer/model.py", line 305, in _calc_att_loss
sematic_embeds, decoder_out_1st = self.sampler(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens,
File "/home/wang/FunASR/funasr/models/paraformer/model.py", line 345, in sampler
decoder_outs = self.decoder(
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/wang/FunASR/funasr/models/paraformer/decoder.py", line 386, in forward
x, tgt_mask, memory, memory_mask, _ = self.decoders(
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/wang/FunASR/funasr/models/transformer/utils/repeat.py", line 32, in forward
args = m(*args)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/wang/FunASR/funasr/models/paraformer/decoder.py", line 103, in forward
x, _ = self.self_attn(tgt, tgt_mask)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/wang/FunASR/funasr/models/sanm/attention.py", line 490, in forward
inputs = inputs * mask
RuntimeError: The size of tensor a (0) must match the size of tensor b (75) at non-singleton dimension 1
[2024-04-29 12:37:07,161] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 44546) of binary: /home/wangbq/.conda/envs/wenet/bin/python
Traceback (most recent call last):
File "/home/wangbq/.conda/envs/wenet/bin/torchrun", line 8, in
sys.exit(main())
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/wangbq/.conda/envs/wenet/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
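The failure itself is a plain broadcasting mismatch at funasr/models/sanm/attention.py line 490: by the time inputs = inputs * mask runs, the tensor reaching the SANM attention block is empty along the time dimension (size 0) while the mask still covers the original 75 decoder steps. A minimal sketch that reproduces the exact RuntimeError (the shapes are illustrative, not pulled from FunASR):

```python
import torch

inputs = torch.zeros(2, 0, 256)  # (batch, time=0, feat) -- degenerate, empty time axis
mask = torch.ones(2, 75, 1)      # (batch, time=75, 1)   -- padding mask for 75 steps

# RuntimeError: The size of tensor a (0) must match the size of tensor b (75)
# at non-singleton dimension 1
inputs = inputs * mask
```

Since the run only crashes with ++train_conf.use_fp16=true, the empty tensor presumably originates somewhere on the fp16 path upstream of the attention layer (e.g., in the sampler); that is an inference from the flag, not something the traceback itself proves.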
Environment
OS (e.g., Linux:ubuntu 20.04):
FunASR Version (1.0.25):
ModelScope Version (1.11.0):
PyTorch Version (2.1.0):
How you installed funasr (pip, source): pip
Python version: 3.8
GPU (RTX 3060)
CUDA/cuDNN version (e.g., cuda11.8):
Docker version (e.g., funasr-runtime-sdk-cpu-0.4.1)
Any other relevant information: