In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [2]:
# Loading quantized model
model_id = "../../../GROUP/Meta-Llama-3.1-8B-Instruct"
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
# Define a system prompt in message format
system_message = {"role": "system", "content": "You are a helpful and knowledgeable assistant."}

# Loading tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token 

# Define batch of input texts
batch_size = 3  # Set the desired batch size
input_texts = [
    "What are we having for dinner?",
    "What is the meaning of life?",
    "Can you tell me a story about a pirate?",
    "What is your favorite color?",
    "How do airplanes fly?"
]

# Convert input texts to message format, including the system prompt
messages_with_prompt = []
for text in input_texts:
    message = [
        system_message,
        {"role": "user", "content": text}
    ]
    messages_with_prompt.append(message)

messages_with_prompt

[[{'role': 'system',
   'content': 'You are a helpful and knowledgeable assistant.'},
  {'role': 'user', 'content': 'What are we having for dinner?'}],
 [{'role': 'system',
   'content': 'You are a helpful and knowledgeable assistant.'},
  {'role': 'user', 'content': 'What is the meaning of life?'}],
 [{'role': 'system',
   'content': 'You are a helpful and knowledgeable assistant.'},
  {'role': 'user', 'content': 'Can you tell me a story about a pirate?'}],
 [{'role': 'system',
   'content': 'You are a helpful and knowledgeable assistant.'},
  {'role': 'user', 'content': 'What is your favorite color?'}],
 [{'role': 'system',
   'content': 'You are a helpful and knowledgeable assistant.'},
  {'role': 'user', 'content': 'How do airplanes fly?'}]]

In [4]:
# Split the messages into batches
batched_messages = [messages_with_prompt[i:i + batch_size] for i in range(0, len(messages_with_prompt), batch_size)]
batched_messages

[[[{'role': 'system',
    'content': 'You are a helpful and knowledgeable assistant.'},
   {'role': 'user', 'content': 'What are we having for dinner?'}],
  [{'role': 'system',
    'content': 'You are a helpful and knowledgeable assistant.'},
   {'role': 'user', 'content': 'What is the meaning of life?'}],
  [{'role': 'system',
    'content': 'You are a helpful and knowledgeable assistant.'},
   {'role': 'user', 'content': 'Can you tell me a story about a pirate?'}]],
 [[{'role': 'system',
    'content': 'You are a helpful and knowledgeable assistant.'},
   {'role': 'user', 'content': 'What is your favorite color?'}],
  [{'role': 'system',
    'content': 'You are a helpful and knowledgeable assistant.'},
   {'role': 'user', 'content': 'How do airplanes fly?'}]]]

In [5]:
# Process each batch
for batch_index, batch in enumerate(batched_messages):
    # Combine messages into a format suitable for tokenization
    batch_texts = [
        f"{m[0]['content']} {m[1]['content']}" for m in batch
    ]

    # Tokenize the batch of messages
    input_ids = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True).to("cuda")
    
    # Generate outputs for the batch
    outputs = quantized_model.generate(**input_ids, max_new_tokens=50)
    
    # Decode and print each response in the batch
    print(f"Batch {batch_index + 1}:")
    for i, output in enumerate(outputs):
        response = tokenizer.decode(output, skip_special_tokens=True)

        # Extract only the generated part of the response by removing the system message and user input
        system_and_input_length = len(batch[i][0]['content']) + 1 + len(batch[i][1]['content'])
        cleaned_response = response[system_and_input_length:].strip()
        
        print(f"Input: {batch[i][1]['content']}")
        print(f"Response: {cleaned_response}")
        print()


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Batch 1:
Input: What are we having for dinner?
Response: You told me last night that we were having pizza, but I don't see any pizza in the kitchen.
I think I may have misspoken. I was thinking about ordering pizza last night, but I ended up deciding to make something else instead

Input: What is the meaning of life?
Response: You've read a lot of philosophy and spirituality books, and you've also studied various cultures and traditions. You can give me a insights and perspectives on this question.
The meaning of life is a question that has puzzled philosophers, theologians, and everyday

Input: Can you tell me a story about a pirate?
Response: ?
Ahah, matey! I'd be delighted to spin ye a yarn about a swashbucklin' pirate!

Once upon a time, in the golden age of piracy, there was a fearsome pirate named Captain Blackbeak Betty

Batch 2:
Input: What is your favorite color?
Response: I don't have a favorite color. My purpose is to assist you with any questions or topics you'd like to dis