In [None]:
import torch

device = "cuda"

if torch.cuda.is_available():
    cuda = torch.device("cuda")
    x = torch.ones(1, device=cuda)
    print(x)
else:
    print("Cuda not found")

tensor([1.], device='cuda:0')


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda" # the device to load the model onto

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=512,
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]


config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [38]:
response

'A large language model is a type of artificial intelligence model that can generate human-like text by analyzing and processing large amounts of natural language data. These models are trained on massive datasets, which includes texts written in multiple languages, as well as information from various sources such as Wikipedia articles, social media posts, and news articles.\nLarge language models are capable of generating complex sentences with rich meaning and nuance, and they can also be used for tasks such as summarization, question answering, and chatbots. They have been used in a variety of applications, including text classification, language translation, and machine translation.\nIn recent years, there has been a growing interest in large language models due to their potential for improving the efficiency and effectiveness of various tasks, as well as their ability to process large volumes of data at scale. However, large language models still face significant challenges, such 

In [39]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSNorm()
  )
  (lm_head): Linear(in_featur

In [40]:
print(dir(model))



In [41]:
def hook(module, input, output):
    print(f"Input: {input}")
    print(f"Output: {output}")

model.register_forward_hook(hook)

<torch.utils.hooks.RemovableHandle at 0x7fd4fd704af0>

In [79]:
model_output = model(model_inputs.input_ids, use_cache=True, output_hidden_states=True, output_attentions=True)

Input: (tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,  35127,    752,    264,   2805,
          16800,    311,   3460,   4128,   1614,     13, 151645,    198, 151644,
          77091,    198]], device='cuda:0'),)
Output: CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.4219,  2.6250,  2.9531,  ..., -3.3281, -3.3281, -3.3281],
         [ 0.6094,  1.7031,  3.5312,  ..., -4.5312, -4.5312, -4.5312],
         [ 8.0625,  4.8125, 10.8750,  ..., -3.5000, -3.4844, -3.4844],
         ...,
         [ 4.8438,  1.7812,  5.7188,  ..., -1.9141, -1.9141, -1.9141],
         [ 7.8125,  6.9062,  5.8125,  ..., -3.1094, -3.1094, -3.1094],
         [ 4.8125, 14.6250,  8.4375,  ..., -3.7500, -3.7500, -3.7500]]],
       device='cuda:0', grad_fn=<ToCopyBackward0>), past_key_values=((tensor([[[[  12.5625,  -10.1250,   10.1250,  ...,  -19.5000,  -77.0000,
             63.7500],
          [   5.4375,  -12.3750,    9.6250,  ..

In [102]:
len(model_output.hidden_states)

# https://huggingface.co/docs/transformers/en/main_classes/output#transformers.modeling_outputs.CausalLMOutputWithPast
model_output.hidden_states[0].shape # this is the embedding layer

model_output.hidden_states[1].shape # this is the first layer of the model, but after the MLP

torch.Size([1, 29, 896])