## Get first 10000 rows modified

In [1]:
import json

formatted_strings = []

with open("train.jsonl", "r") as f:
    for j, line in enumerate(f):
        if j >= 10000:
            break
        # Parse the JSON data from the line
        data = json.loads(line.strip())
        rating = data['Rating']
        title = data['Title']
        review = data['Review']

        # If "Title: " appears in the review, trim off everything after its first occurrence.
        title_marker_index = review.find("Title: ")
        if title_marker_index != -1:
            review = review[:title_marker_index]

        # Format the string as required
        formatted_string = (
            f'"System prompt : Given the Rating and Title, you are required to generate the review" | '
            f'"Rating": {rating} | "Title": {title} | "Review": '
        )

        # Add the formatted string to the list
        formatted_strings.append(formatted_string)

print(f"Processed {len(formatted_strings)} lines.")


Processed 10000 lines.


In [2]:
formatted_strings[0]

'"System prompt : Given the Rating and Title, you are required to generate the review" | "Rating": 4 | "Title": No white background! It’s clear! | "Review": '

In [3]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch

# save_directory = "."

# # Load the model with half precision if supported
# model = AutoModelForCausalLM.from_pretrained(save_directory, torch_dtype=torch.float16)
# tokenizer = AutoTokenizer.from_pretrained(save_directory)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# model.to(device)
# model.eval()


# sample_prompt = ("System prompt: Given the Rating and Title, you are required to generate the review, "
#                  "Rating: 5, Title: Would definitely buy again, Review:")

# inputs = tokenizer(sample_prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
# inputs = {key: value.to(device) for key, value in inputs.items()}

# with torch.no_grad():
#     generated_ids = model.generate(**inputs, max_length=128, do_sample=True, top_k=50)
# generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
# print("Generated text:", generated_text)


## Load model with DP and perform 10000 inferences


In [4]:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Directory where your model is saved
save_directory = "."

# Load the model with half precision if supported
model = AutoModelForCausalLM.from_pretrained(save_directory, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Set up the device: use CUDA if available, otherwise fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model.to(device)
model.eval()


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Using device: cuda


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=1024, bias=False)
            (lora_dropout): ModuleDict(
              (defau

In [5]:
# import time
# import json  # Import json to handle JSON serialization

# # Read formatted prompts from file. Each line should contain one formatted prompt.
# formatted_prompts = formatted_strings

# # Output file to save generated sequences in JSONL format
# output_file = "generated_sequences.jsonl"

# # Set batch size to 10 and initialize timing and batch results
# batch_size = 10
# results_batch = []
# batch_start_time = time.time()

# for i, prompt in enumerate(formatted_prompts):
#     # Tokenize the prompt
#     inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
#     inputs = {key: value.to(device) for key, value in inputs.items()}

#     # Generate text from the prompt
#     with torch.no_grad():
#         generated_ids = model.generate(**inputs, max_length=128, do_sample=True, top_k=50)
#     generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

#     results_batch.append(generated_text)

#     # Save batch and compute time after every 10 iterations
#     if (i + 1) % batch_size == 0:
#         batch_end_time = time.time()
#         batch_time = batch_end_time - batch_start_time

#         # Append ge


In [7]:
import time

output_file = "generated_sequences.jsonl"
batch_size = 100
formatted_prompts = formatted_strings
# Process prompts in batches
num_prompts = len(formatted_prompts)
for batch_start in range(0, num_prompts, batch_size):
    batch_prompts = formatted_prompts[batch_start : batch_start + batch_size]

    batch_start_time = time.time()

    # Tokenize the entire batch
    inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate text for the batch
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_length=128, do_sample=True, top_k=50)

    # Decode the generated sequences for each prompt
    batch_generated_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids]

    batch_end_time = time.time()
    batch_time = batch_end_time - batch_start_time

    # Write the generated outputs in JSONL format
    with open(output_file, "a") as outfile:
        for text in batch_generated_texts:
            json_line = json.dumps({"generated_text": text})
            outfile.write(json_line + "\n")

    print(f"Processed batch {(batch_start // batch_size) + 1} (prompts {batch_start} to {batch_start+len(batch_prompts)-1}). Time taken: {batch_time:.2f} seconds.")


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 1 (prompts 0 to 99). Time taken: 13.89 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 2 (prompts 100 to 199). Time taken: 14.01 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 3 (prompts 200 to 299). Time taken: 15.04 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 4 (prompts 300 to 399). Time taken: 13.92 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 5 (prompts 400 to 499). Time taken: 14.44 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 6 (prompts 500 to 599). Time taken: 14.69 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 7 (prompts 600 to 699). Time taken: 13.73 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 8 (prompts 700 to 799). Time taken: 14.58 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 9 (prompts 800 to 899). Time taken: 14.79 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 10 (prompts 900 to 999). Time taken: 13.65 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 11 (prompts 1000 to 1099). Time taken: 14.61 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 12 (prompts 1100 to 1199). Time taken: 14.72 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 13 (prompts 1200 to 1299). Time taken: 13.83 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 14 (prompts 1300 to 1399). Time taken: 13.79 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 15 (prompts 1400 to 1499). Time taken: 12.13 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 16 (prompts 1500 to 1599). Time taken: 14.17 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 17 (prompts 1600 to 1699). Time taken: 14.45 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 18 (prompts 1700 to 1799). Time taken: 14.36 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 19 (prompts 1800 to 1899). Time taken: 14.61 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 20 (prompts 1900 to 1999). Time taken: 14.11 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 21 (prompts 2000 to 2099). Time taken: 13.50 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 22 (prompts 2100 to 2199). Time taken: 14.78 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 23 (prompts 2200 to 2299). Time taken: 14.61 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 24 (prompts 2300 to 2399). Time taken: 14.78 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 25 (prompts 2400 to 2499). Time taken: 13.77 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 26 (prompts 2500 to 2599). Time taken: 14.69 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 27 (prompts 2600 to 2699). Time taken: 14.36 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 28 (prompts 2700 to 2799). Time taken: 13.44 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 29 (prompts 2800 to 2899). Time taken: 13.91 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 30 (prompts 2900 to 2999). Time taken: 15.02 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 31 (prompts 3000 to 3099). Time taken: 14.76 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 32 (prompts 3100 to 3199). Time taken: 14.70 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 33 (prompts 3200 to 3299). Time taken: 14.60 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 34 (prompts 3300 to 3399). Time taken: 13.83 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 35 (prompts 3400 to 3499). Time taken: 14.31 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 36 (prompts 3500 to 3599). Time taken: 14.63 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 37 (prompts 3600 to 3699). Time taken: 14.35 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 38 (prompts 3700 to 3799). Time taken: 14.26 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 39 (prompts 3800 to 3899). Time taken: 14.69 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 40 (prompts 3900 to 3999). Time taken: 14.26 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 41 (prompts 4000 to 4099). Time taken: 13.28 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 42 (prompts 4100 to 4199). Time taken: 13.58 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 43 (prompts 4200 to 4299). Time taken: 14.63 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 44 (prompts 4300 to 4399). Time taken: 14.92 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 45 (prompts 4400 to 4499). Time taken: 14.39 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 46 (prompts 4500 to 4599). Time taken: 13.80 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 47 (prompts 4600 to 4699). Time taken: 15.17 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 48 (prompts 4700 to 4799). Time taken: 14.45 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 49 (prompts 4800 to 4899). Time taken: 14.14 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 50 (prompts 4900 to 4999). Time taken: 14.77 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 51 (prompts 5000 to 5099). Time taken: 13.61 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 52 (prompts 5100 to 5199). Time taken: 13.26 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 53 (prompts 5200 to 5299). Time taken: 14.40 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 54 (prompts 5300 to 5399). Time taken: 13.92 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 55 (prompts 5400 to 5499). Time taken: 13.83 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 56 (prompts 5500 to 5599). Time taken: 14.00 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 57 (prompts 5600 to 5699). Time taken: 13.57 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 58 (prompts 5700 to 5799). Time taken: 14.41 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 59 (prompts 5800 to 5899). Time taken: 13.85 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 60 (prompts 5900 to 5999). Time taken: 14.40 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 61 (prompts 6000 to 6099). Time taken: 13.70 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 62 (prompts 6100 to 6199). Time taken: 14.63 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 63 (prompts 6200 to 6299). Time taken: 14.69 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 64 (prompts 6300 to 6399). Time taken: 14.69 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 65 (prompts 6400 to 6499). Time taken: 14.58 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 66 (prompts 6500 to 6599). Time taken: 14.76 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 67 (prompts 6600 to 6699). Time taken: 13.87 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 68 (prompts 6700 to 6799). Time taken: 13.91 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 69 (prompts 6800 to 6899). Time taken: 14.11 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 70 (prompts 6900 to 6999). Time taken: 14.41 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 71 (prompts 7000 to 7099). Time taken: 14.38 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 72 (prompts 7100 to 7199). Time taken: 15.03 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 73 (prompts 7200 to 7299). Time taken: 15.01 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 74 (prompts 7300 to 7399). Time taken: 14.14 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 75 (prompts 7400 to 7499). Time taken: 13.71 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 76 (prompts 7500 to 7599). Time taken: 13.84 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 77 (prompts 7600 to 7699). Time taken: 14.40 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 78 (prompts 7700 to 7799). Time taken: 13.97 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 79 (prompts 7800 to 7899). Time taken: 13.70 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 80 (prompts 7900 to 7999). Time taken: 14.31 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 81 (prompts 8000 to 8099). Time taken: 14.81 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 82 (prompts 8100 to 8199). Time taken: 14.19 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 83 (prompts 8200 to 8299). Time taken: 14.62 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 84 (prompts 8300 to 8399). Time taken: 13.92 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 85 (prompts 8400 to 8499). Time taken: 14.29 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 86 (prompts 8500 to 8599). Time taken: 14.70 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 87 (prompts 8600 to 8699). Time taken: 14.07 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 88 (prompts 8700 to 8799). Time taken: 14.76 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 89 (prompts 8800 to 8899). Time taken: 13.84 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 90 (prompts 8900 to 8999). Time taken: 14.30 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 91 (prompts 9000 to 9099). Time taken: 14.45 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 92 (prompts 9100 to 9199). Time taken: 13.98 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 93 (prompts 9200 to 9299). Time taken: 14.41 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 94 (prompts 9300 to 9399). Time taken: 13.28 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 95 (prompts 9400 to 9499). Time taken: 14.58 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 96 (prompts 9500 to 9599). Time taken: 14.45 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 97 (prompts 9600 to 9699). Time taken: 14.25 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 98 (prompts 9700 to 9799). Time taken: 14.46 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 99 (prompts 9800 to 9899). Time taken: 14.15 seconds.
Processed batch 100 (prompts 9900 to 9999). Time taken: 14.37 seconds.


## Load model without dp and run 10000 inferences

In [3]:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Directory where your model is saved
save_directory = "./finetuned_no_dp"

# Load the model with half precision if supported
model = AutoModelForCausalLM.from_pretrained(save_directory, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Set up the device: use CUDA if available, otherwise fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model.to(device)
model.eval()


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Using device: cuda


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=1024, bias=False)
            (lora_dropout): ModuleDict(
              (defau

In [4]:
import time

output_file = "generated_sequences_no_dp.jsonl"
batch_size = 200
formatted_prompts = formatted_strings
# Process prompts in batches
num_prompts = len(formatted_prompts)
for batch_start in range(0, num_prompts, batch_size):
    batch_prompts = formatted_prompts[batch_start : batch_start + batch_size]

    batch_start_time = time.time()

    # Tokenize the entire batch
    inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate text for the batch
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_length=128, do_sample=True, top_k=50)

    # Decode the generated sequences for each prompt
    batch_generated_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids]

    batch_end_time = time.time()
    batch_time = batch_end_time - batch_start_time

    # Write the generated outputs in JSONL format
    with open(output_file, "a") as outfile:
        for text in batch_generated_texts:
            json_line = json.dumps({"generated_text": text})
            outfile.write(json_line + "\n")

    print(f"Processed batch {(batch_start // batch_size) + 1} (prompts {batch_start} to {batch_start+len(batch_prompts)-1}). Time taken: {batch_time:.2f} seconds.")


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 1 (prompts 0 to 199). Time taken: 24.01 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 2 (prompts 200 to 399). Time taken: 23.07 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 3 (prompts 400 to 599). Time taken: 24.07 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 4 (prompts 600 to 799). Time taken: 23.22 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 5 (prompts 800 to 999). Time taken: 23.13 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 6 (prompts 1000 to 1199). Time taken: 24.66 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 7 (prompts 1200 to 1399). Time taken: 23.65 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 8 (prompts 1400 to 1599). Time taken: 21.21 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 9 (prompts 1600 to 1799). Time taken: 24.53 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 10 (prompts 1800 to 1999). Time taken: 24.21 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 11 (prompts 2000 to 2199). Time taken: 23.14 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 12 (prompts 2200 to 2399). Time taken: 24.84 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 13 (prompts 2400 to 2599). Time taken: 23.58 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 14 (prompts 2600 to 2799). Time taken: 23.11 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 15 (prompts 2800 to 2999). Time taken: 23.94 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 16 (prompts 3000 to 3199). Time taken: 25.03 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 17 (prompts 3200 to 3399). Time taken: 23.58 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 18 (prompts 3400 to 3599). Time taken: 24.47 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 19 (prompts 3600 to 3799). Time taken: 24.50 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 20 (prompts 3800 to 3999). Time taken: 24.38 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 21 (prompts 4000 to 4199). Time taken: 22.88 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 22 (prompts 4200 to 4399). Time taken: 24.75 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 23 (prompts 4400 to 4599). Time taken: 23.55 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 24 (prompts 4600 to 4799). Time taken: 24.66 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 25 (prompts 4800 to 4999). Time taken: 24.29 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 26 (prompts 5000 to 5199). Time taken: 22.89 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 27 (prompts 5200 to 5399). Time taken: 23.81 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 28 (prompts 5400 to 5599). Time taken: 23.57 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 29 (prompts 5600 to 5799). Time taken: 23.12 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 30 (prompts 5800 to 5999). Time taken: 23.77 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 31 (prompts 6000 to 6199). Time taken: 23.46 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 32 (prompts 6200 to 6399). Time taken: 24.97 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 33 (prompts 6400 to 6599). Time taken: 24.93 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 34 (prompts 6600 to 6799). Time taken: 23.83 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 35 (prompts 6800 to 6999). Time taken: 24.00 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 36 (prompts 7000 to 7199). Time taken: 24.53 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 37 (prompts 7200 to 7399). Time taken: 24.25 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 38 (prompts 7400 to 7599). Time taken: 23.46 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 39 (prompts 7600 to 7799). Time taken: 24.01 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 40 (prompts 7800 to 7999). Time taken: 23.52 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 41 (prompts 8000 to 8199). Time taken: 24.25 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 42 (prompts 8200 to 8399). Time taken: 23.77 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 43 (prompts 8400 to 8599). Time taken: 24.45 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 44 (prompts 8600 to 8799). Time taken: 24.05 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 45 (prompts 8800 to 8999). Time taken: 23.62 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 46 (prompts 9000 to 9199). Time taken: 23.97 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 47 (prompts 9200 to 9399). Time taken: 22.81 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 48 (prompts 9400 to 9599). Time taken: 24.72 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 49 (prompts 9600 to 9799). Time taken: 24.32 seconds.
Processed batch 50 (prompts 9800 to 9999). Time taken: 24.16 seconds.
