Issue report: vLLM fails to load an AWQ-quantized model exported with the GEMV kernel version.
WARNING 11-21 15:03:40 config.py:140] awq quantization is not fully optimized yet. The speed can be slower than non-quantized models.
INFO 11-21 15:03:40 llm_engine.py:72] Initializing an LLM engine with config: model='/home/fubo/code/vicuna-13b-awq-gemv', tokenizer='/home/fubo/code/vicuna-13b-awq-gemv', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, quantization=awq, seed=0)
Traceback (most recent call last):
File "vllm_worker.py", line 157, in
engine = AsyncLLMEngine.from_engine_args(engine_args)
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/engine/async_llm_engine.py", line 486, in from_engine_args
engine = cls(parallel_config.worker_use_ray,
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/engine/async_llm_engine.py", line 269, in init
self.engine = self._init_engine(*args, **kwargs)
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/engine/async_llm_engine.py", line 305, in _init_engine
return engine_class(*args, **kwargs)
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/engine/llm_engine.py", line 110, in init
self._init_workers(distributed_init_method)
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/engine/llm_engine.py", line 142, in _init_workers
self._run_workers(
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/engine/llm_engine.py", line 700, in _run_workers
output = executor(*args, **kwargs)
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/worker/worker.py", line 70, in init_model
self.model = get_model(self.model_config)
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/model_executor/model_loader.py", line 98, in get_model
model.load_weights(model_config.model, model_config.download_dir,
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/model_executor/models/llama.py", line 330, in load_weights
weight_loader(param, loaded_weight)
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/model_executor/layers/linear.py", line 512, in weight_loader
loaded_weight = loaded_weight.narrow(input_dim, start_idx,
RuntimeError: start (0) + length (13824) exceeds dimension size (5120).
The text was updated successfully, but these errors were encountered:
I can load the GEMM-version model, but when I quantize with the GEMV config, loading fails.
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMV" }
WARNING 11-21 15:03:40 config.py:140] awq quantization is not fully optimized yet. The speed can be slower than non-quantized models.
INFO 11-21 15:03:40 llm_engine.py:72] Initializing an LLM engine with config: model='/home/fubo/code/vicuna-13b-awq-gemv', tokenizer='/home/fubo/code/vicuna-13b-awq-gemv', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, quantization=awq, seed=0)
Traceback (most recent call last):
File "vllm_worker.py", line 157, in
engine = AsyncLLMEngine.from_engine_args(engine_args)
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/engine/async_llm_engine.py", line 486, in from_engine_args
engine = cls(parallel_config.worker_use_ray,
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/engine/async_llm_engine.py", line 269, in init
self.engine = self._init_engine(*args, **kwargs)
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/engine/async_llm_engine.py", line 305, in _init_engine
return engine_class(*args, **kwargs)
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/engine/llm_engine.py", line 110, in init
self._init_workers(distributed_init_method)
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/engine/llm_engine.py", line 142, in _init_workers
self._run_workers(
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/engine/llm_engine.py", line 700, in _run_workers
output = executor(*args, **kwargs)
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/worker/worker.py", line 70, in init_model
self.model = get_model(self.model_config)
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/model_executor/model_loader.py", line 98, in get_model
model.load_weights(model_config.model, model_config.download_dir,
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/model_executor/models/llama.py", line 330, in load_weights
weight_loader(param, loaded_weight)
File "/home/fubo/.conda/envs/vllm/lib/python3.8/site-packages/vllm/model_executor/layers/linear.py", line 512, in weight_loader
loaded_weight = loaded_weight.narrow(input_dim, start_idx,
RuntimeError: start (0) + length (13824) exceeds dimension size (5120).
The text was updated successfully, but these errors were encountered: