<a href="https://colab.research.google.com/github/Zohreh6384NKH/LLM_Project/blob/main/chapter3_looking_inside_Transformer_LLMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map="cuda",
    torch_dtype="auto",                    # model
    trust_remote_code=True,                # custom code for this code to be used
)

# Create a pipeline
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=50,
    do_sample=False,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Input and Output of a trained transformer LLM




In [9]:
prompt = "Write an email apologizing to Sarah for the tragic gardening mishap. Explain how it happened."
output = generator(prompt)
print(output)   #it returns a list of outputs


[{'generated_text': " Mention the steps you're taking to prevent it in the future.\n\nDear Sarah,\n\nI hope this message finds you well. I am writing to express my deepest apologies for the unfortunate incident that occurred in"}]


In [10]:
print(output[0])    # access to first element in the list which is a dictionary containing keys.

{'generated_text': " Mention the steps you're taking to prevent it in the future.\n\nDear Sarah,\n\nI hope this message finds you well. I am writing to express my deepest apologies for the unfortunate incident that occurred in"}


In [11]:
print(output[0]['generated_text'])

 Mention the steps you're taking to prevent it in the future.

Dear Sarah,

I hope this message finds you well. I am writing to express my deepest apologies for the unfortunate incident that occurred in


In [12]:
print(model)

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3072, out_features=3206

## Choosing a single token from the probability distribution (sampling / decoding)

In [13]:
prompt = "The capital of France is"
#tokenize the input prompt
input_ids = tokenizer(prompt, return_tensors = 'pt')
print(input_ids)
input_ids = input_ids['input_ids']
print(type(input_ids))
input_ids = input_ids.to('cuda')  # run input_ids on gpu for faster processing
print(type(input_ids))

# in transformer architecture model.model(input_ids) gives the hidden state(core transformer) before they are passed through lm_head.
# model.lm_head(input_ids) on the other hand is responsible for converting those hidden states into predictions over the vocabulary.(final prediction layer)

# Get the output of the model before the lm_head
model_output = model.model(input_ids)
print(type(model_output))

#get the output of the lm_head
lm_head_output = model.lm_head(model_output[0])  #selecting the first dimention in the batch
print(lm_head_output.shape)



{'input_ids': tensor([[ 450, 7483,  310, 3444,  338]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'transformers.modeling_outputs.BaseModelOutputWithPast'>
torch.Size([1, 5, 32064])


In [14]:
token_id = lm_head_output
print(token_id)

token_id = lm_head_output[0, -1] #retrieves the tensor at the very last position in the sequence
print(token_id.shape)
print(token_id)


tensor([[[24.7500, 24.8750, 22.7500,  ..., 19.0000, 19.0000, 19.0000],
         [31.1250, 31.5000, 26.0000,  ..., 26.0000, 26.0000, 26.0000],
         [31.5000, 28.8750, 31.1250,  ..., 26.2500, 26.2500, 26.2500],
         [33.0000, 31.8750, 36.0000,  ..., 27.8750, 27.8750, 27.8750],
         [27.8750, 29.5000, 28.1250,  ..., 20.5000, 20.5000, 20.5000]]],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<UnsafeViewBackward0>)
torch.Size([32064])
tensor([27.8750, 29.5000, 28.1250,  ..., 20.5000, 20.5000, 20.5000],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<SelectBackward0>)


In [15]:
token_id = token_id = lm_head_output[0, -1].argmax(-1)  #finding the index of highest probability
print(token_id)
output = tokenizer.decode(token_id)
print(output)

tensor(3681, device='cuda:0')
Paris


# Speeding up generation by caching keys and values


In [16]:
prompt = "Write a very long email apologizing to Sarah for the tragic gardening mishap. Explain how it happened."

# Tokenize the input prompt
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to("cuda")

In [17]:
%%timeit -n 1
# Generate the text
generation_output = model.generate(
  input_ids=input_ids,
  max_new_tokens=100,
  use_cache=True
)

4.71 s ± 362 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
%%timeit -n 1
# Generate the text
generation_output = model.generate(
  input_ids=input_ids,
  max_new_tokens=100,
  use_cache=False
)

32 s ± 753 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
