from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_name_or_path = "TheBloke/Llama-2-7B-chat-AWQ"

# Load the quantized model with fused attention layers
model = AutoAWQForCausalLM.from_quantized(model_name_or_path, fuse_layers=True,
                                          trust_remote_code=False, safetensors=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)
prompt = "Tell me about AI"
prompt_template = f'''[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>
{prompt}[/INST]
'''
print("\n\n*** Generate:")
tokens = tokenizer(
    prompt_template,
    return_tensors='pt'
).input_ids.cuda()
# Generate output
generation_output = model.generate(
    tokens,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    max_new_tokens=512
)
print("Output: ", tokenizer.decode(generation_output[0]))
Error:
TypeError Traceback (most recent call last)
Cell In[12], line 24
18 tokens = tokenizer(
19 prompt_template,
20 return_tensors='pt'
21 ).input_ids.cuda()
23 # Generate output
---> 24 generation_output = model.generate(
25 tokens,
26 do_sample=True,
27 temperature=0.7,
28 top_p=0.95,
29 top_k=40,
30 max_new_tokens=512
31 )
33 print("Output: ", tokenizer.decode(generation_output[0]))
File /usr/local/lib/python3.10/dist-packages/awq/models/base.py:36, in BaseAWQForCausalLM.generate(self, *args, **kwargs)
34 def generate(self, *args, **kwargs):
35 with torch.inference_mode():
---> 36 return self.model.generate(*args, **kwargs)
File /usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File /usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1652, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
1644 input_ids, model_kwargs = self._expand_inputs_for_generation(
1645 input_ids=input_ids,
1646 expand_size=generation_config.num_return_sequences,
1647 is_encoder_decoder=self.config.is_encoder_decoder,
1648 **model_kwargs,
1649 )
1651 # 13. run sample
-> 1652 return self.sample(
1653 input_ids,
1654 logits_processor=logits_processor,
1655 logits_warper=logits_warper,
1656 stopping_criteria=stopping_criteria,
1657 pad_token_id=generation_config.pad_token_id,
1658 eos_token_id=generation_config.eos_token_id,
1659 output_scores=generation_config.output_scores,
1660 return_dict_in_generate=generation_config.return_dict_in_generate,
1661 synced_gpus=synced_gpus,
1662 streamer=streamer,
1663 **model_kwargs,
1664 )
1666 elif generation_mode == GenerationMode.BEAM_SEARCH:
1667 # 11. prepare beam search scorer
1668 beam_scorer = BeamSearchScorer(
1669 batch_size=batch_size,
1670 num_beams=generation_config.num_beams,
(...)
1675 max_length=generation_config.max_length,
1676 )
File /usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:2734, in GenerationMixin.sample(self, input_ids, logits_processor, stopping_criteria, logits_warper, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
2731 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
2733 # forward pass to get next token
-> 2734 outputs = self(
2735 **model_inputs,
2736 return_dict=True,
2737 output_attentions=output_attentions,
2738 output_hidden_states=output_hidden_states,
2739 )
2741 if synced_gpus and this_peer_finished:
2742 continue # don't waste resources running the code we don't need
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py:1034, in LlamaForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
1031 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1033 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1034 outputs = self.model(
1035 input_ids=input_ids,
1036 attention_mask=attention_mask,
1037 position_ids=position_ids,
1038 past_key_values=past_key_values,
1039 inputs_embeds=inputs_embeds,
1040 use_cache=use_cache,
1041 output_attentions=output_attentions,
1042 output_hidden_states=output_hidden_states,
1043 return_dict=return_dict,
1044 )
1046 hidden_states = outputs[0]
1047 if self.config.pretraining_tp > 1:
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py:921, in LlamaModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
917 layer_outputs = torch.utils.checkpoint.checkpoint(
918 create_custom_forward(decoder_layer), hidden_states, attention_mask, position_ids
919 )
920 else:
--> 921 layer_outputs = decoder_layer(
922 hidden_states,
923 attention_mask=attention_mask,
924 position_ids=position_ids,
925 past_key_value=past_key_value,
926 output_attentions=output_attentions,
927 use_cache=use_cache,
928 padding_mask=padding_mask,
929 )
931 hidden_states = layer_outputs[0]
933 if use_cache:
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py:631, in LlamaDecoderLayer.forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, padding_mask)
628 hidden_states = self.input_layernorm(hidden_states)
630 # Self Attention
--> 631 hidden_states, self_attn_weights, present_key_value = self.self_attn(
632 hidden_states=hidden_states,
633 attention_mask=attention_mask,
634 position_ids=position_ids,
635 past_key_value=past_key_value,
636 output_attentions=output_attentions,
637 use_cache=use_cache,
638 padding_mask=padding_mask,
639 )
640 hidden_states = residual + hidden_states
642 # Fully Connected
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
TypeError: QuantAttentionFused.forward() got an unexpected keyword argument 'padding_mask'
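
This looks like a version mismatch between transformers and autoawq: as the traceback shows, LlamaModel.forward passes a padding_mask keyword argument into each attention module, but the fused QuantAttentionFused module that fuse_layers=True installs does not accept it. A minimal workaround sketch, assuming the fused attention modules are the only code path rejecting the keyword, is to load the model without layer fusion (slower inference, but it falls back to the stock transformers attention):

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_name_or_path = "TheBloke/Llama-2-7B-chat-AWQ"

# fuse_layers=False avoids the fused QuantAttentionFused modules entirely,
# so generate() never hits the unexpected padding_mask keyword.
model = AutoAWQForCausalLM.from_quantized(model_name_or_path, fuse_layers=False,
                                          trust_remote_code=False, safetensors=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)

Alternatively, aligning the two package versions (upgrading autoawq, or pinning transformers to the version the installed autoawq release targets) should remove the mismatch; the exact compatible pair depends on which releases you have installed, so check the autoawq release notes rather than treating any specific pin as definitive.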