In [2]:
!pip install unsloth



In [2]:
from unsloth import FastLanguageModel
import torch
from transformers import TextStreamer
import time
import os

# Set RTX 3090 Ti as the device (it's device 1 in your setup)
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Clear any existing CUDA memory
torch.cuda.empty_cache()

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)

FastLanguageModel.for_inference(model)

def generate_queries(passage):
    prompt = f"Given a passage generate 80 most relevant and diverse queries for the passage:\n\n{passage}\n\nGenerate one query per line, separated by <next>"
    
    messages = [
        {"role": "user", "content": prompt},
    ]
    
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True,
        return_tensors = "pt"
    ).to("cuda")
    
    text_streamer = TextStreamer(tokenizer, skip_prompt = True)
    
    outputs = model.generate(
        input_ids = inputs,
        streamer = text_streamer,
        max_new_tokens = 1024,
        temperature = 1.2,
        min_p = 0.1,
        do_sample = True,
        use_cache = True
    )
    
    result = tokenizer.decode(outputs[0])
    
    end_time = time.time()
    execution_time = end_time - start_time
    
    print(f"\nExecution time: {execution_time:.2f} seconds")
    return result

# Example usage
test_passage = """Claude is an AI assistant created by Anthropic. 
    It aims to be helpful while being direct and honest about its capabilities. 
    It can assist with analysis, writing, math, coding and various other tasks."""

results = generate_queries(test_passage)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA GeForce GT 1030. Max memory: 1.953 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 6.1. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [4]:
import torch

# Print info for all available GPUs
print("Available GPUs:")
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"GPU {i}: {props.name} (Total memory: {props.total_memory / 1024**2:.0f}MB)")

Available GPUs:
GPU 0: NVIDIA GeForce GT 1030 (Total memory: 2000MB)


In [3]:
from unsloth import FastLanguageModel
import torch
from transformers import TextStreamer
import time
import os

# Set RTX 3090 Ti as the device (it's device 1 in your setup)
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Clear any existing CUDA memory
torch.cuda.empty_cache()

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)

FastLanguageModel.for_inference(model)

def generate_queries(passage):
    prompt = f"Given a passage generate 80 most relevant and diverse queries for the passage:\n\n{passage}\n\nGenerate one query per line, separated by <next>"
    
    messages = [
        {"role": "user", "content": prompt},
    ]
    
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True,
        return_tensors = "pt"
    ).to("cuda")
    
    text_streamer = TextStreamer(tokenizer, skip_prompt = True)
    
    outputs = model.generate(
        input_ids = inputs,
        streamer = text_streamer,
        max_new_tokens = 1024,
        temperature = 1.2,
        min_p = 0.1,
        do_sample = True,
        use_cache = True
    )
    
    result = tokenizer.decode(outputs[0])
    
    end_time = time.time()
    execution_time = end_time - start_time
    
    print(f"\nExecution time: {execution_time:.2f} seconds")
    return result

# Example usage
test_passage = """Claude is an AI assistant created by Anthropic. 
    It aims to be helpful while being direct and honest about its capabilities. 
    It can assist with analysis, writing, math, coding and various other tasks."""

results = generate_queries(test_passage)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA GeForce GT 1030. Max memory: 1.953 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 6.1. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [1]:
import time
from vllm import LLM, SamplingParams
import os
import gc
import torch

# torch.cuda.empty_cache()
# gc.collect()

# Set environment variable to use 3090 Ti
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Initialize model with explicit dtype
llm = LLM(
    model="unsloth/Llama-3.2-3B-Instruct",
    tensor_parallel_size=1,
    gpu_memory_utilization=0.90,
    trust_remote_code=True,
    max_model_len=2048,
    dtype="float16"
)

sampling_params = SamplingParams(
        temperature=0.7,  # Lower temperature for more focused output
        top_p=0.9,
        max_tokens=2048,
        presence_penalty=0.1,
        frequency_penalty=0.1,
        best_of=1,
        stop=["</s>", "<pad>"]  # Add stop tokens to prevent rambling
    )

def generate_queries_vllm(passage):
    start_time = time.time()
    
    prompt = f"Given a passage generate 80 most relevant and diverse queries for the passage:\n\n{passage}\n\nGenerate one query per line, separated by <next>"
    
    outputs = llm.generate([prompt], sampling_params)
    result = outputs[0].outputs[0].text
    
    end_time = time.time()
    print(f"\nGeneration time: {end_time - start_time:.2f} seconds")
    return result

# Example usage
test_passage = """Claude is an AI assistant created by Anthropic. 
    It aims to be helpful while being direct and honest about its capabilities. 
    It can assist with analysis, writing, math, coding and various other tasks."""

results = generate_queries_vllm(test_passage)
print(results)

INFO 12-08 23:02:19 config.py:350] This model supports multiple tasks: {'embedding', 'generate'}. Defaulting to 'generate'.
INFO 12-08 23:02:19 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='unsloth/Llama-3.2-3B-Instruct', speculative_config=None, tokenizer='unsloth/Llama-3.2-3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=unsloth/Llama-3.2-3B-I

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 12-08 23:02:22 model_runner.py:1077] Loading model weights took 6.0160 GB
INFO 12-08 23:02:22 worker.py:232] Memory profiling results: total_gpu_memory=23.68GiB initial_memory_usage=6.49GiB peak_torch_memory=7.20GiB memory_usage_post_profile=6.52GiB non_torch_memory=0.50GiB kv_cache_size=13.61GiB gpu_memory_utilization=0.90
INFO 12-08 23:02:22 gpu_executor.py:113] # GPU blocks: 7965, # CPU blocks: 2340
INFO 12-08 23:02:22 gpu_executor.py:117] Maximum concurrency for 2048 tokens per request: 62.23x
INFO 12-08 23:02:24 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 12-08 23:02:24 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 1

Processed prompts: 100%|██████████| 1/1 [00:19<00:00, 19.03s/it, est. speed input: 3.78 toks/s, output: 103.91 toks/s]


Generation time: 19.03 seconds
 to make it easier to copy and paste.

1. What type of AI assistant is Claude created by Anthropic?
<next>
2. What are the key characteristics of Claude's personality?
<next>
3. What tasks can Claude assist with?
<next>
4. What specific capabilities does Claude have?
<next>
5. What kind of AI assistant is Claude different from?
<next>
6. How does Claude approach its assistance?
<next>
7. What are the benefits of using Claude as an AI assistant?
<next>
8. What are the limitations of Claude's capabilities?
<next>
9. Can Claude be used for creative tasks?
<next>
10. How does Claude's honesty about its capabilities impact its user experience?
<next>
11. What sets Claude apart from other AI assistants?
<next>
12. Can Claude be used for complex problem-solving tasks?
<next>
13. How does Claude's directness impact its user interaction?
<next>
14. What kind of feedback can users expect from Claude?
<next>
15. Is Claude a general-purpose AI assistant or specializ




: 

In [5]:
print(torch.cuda.get_device_name(0))  # Should show RTX 3090 Ti

NVIDIA GeForce GT 1030


In [11]:
import torch
print("Available GPUs:")
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"GPU {i}: {props.name} (Compute Capability: {props.major}.{props.minor})")

Available GPUs:
GPU 0: NVIDIA GeForce RTX 3090 Ti (Compute Capability: 8.6)
GPU 1: NVIDIA GeForce GT 1030 (Compute Capability: 6.1)


In [4]:
# NEW GPU SET UP THINGS
from unsloth import FastLanguageModel
import torch
from transformers import TextStreamer
import time
import os

# Set RTX 3090 Ti as the device (it's device 0 in PyTorch)
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Clear any existing CUDA memory
torch.cuda.empty_cache()

# Verify we're using the 3090 Ti
print(f"Using GPU: {torch.cuda.get_device_name()}")


Using GPU: NVIDIA GeForce GT 1030


In [9]:
if "CUDA_VISIBLE_DEVICES" in os.environ:
    del os.environ["CUDA_VISIBLE_DEVICES"]

In [3]:
print(f"Number of CUDA devices: {torch.cuda.device_count()}")

Number of CUDA devices: 1


In [1]:
import torch
import gc
import os

# First, clear all CUDA memory and cache
torch.cuda.empty_cache()
gc.collect()

# Print initial GPU info
print("Initial GPU setup:")
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"GPU {i}: {props.name} ({props.total_memory / 1024**2:.0f}MB)")

# Set the 3090 Ti (find it by memory size ~24GB)
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    if props.total_memory > 20000 * 2**20:  # More than 20GB = 3090 Ti
        os.environ["CUDA_VISIBLE_DEVICES"] = str(i)
        torch.cuda.set_device(0)  # After setting VISIBLE_DEVICES, it becomes device 0
        print(f"\nSet RTX 3090 Ti (device {i}) as the active device")
        break

# Verify current device
print(f"\nCurrent active device: {torch.cuda.get_device_name()}")

Initial GPU setup:
GPU 0: NVIDIA GeForce RTX 3090 Ti (24245MB)
GPU 1: NVIDIA GeForce GT 1030 (2000MB)

Set RTX 3090 Ti (device 0) as the active device

Current active device: NVIDIA GeForce RTX 3090 Ti


In [3]:
from unsloth import FastLanguageModel
import torch
from transformers import TextStreamer
import time
import os

# Clear any existing CUDA memory
torch.cuda.empty_cache()

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = 2560,  # Increased to accommodate input(512) + output(2048)
    dtype = None,
    load_in_4bit = True,
)

FastLanguageModel.for_inference(model)

def generate_queries(passage):
    prompt = f"Given a passage generate 80 most relevant and diverse queries for the passage:\n\n{passage}\n\nGenerate one query per line, separated by <next>"
    
    messages = [
        {"role": "user", "content": prompt},
    ]
    
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True,
        return_tensors = "pt"
    ).to("cuda")
    
    # Check input length
    input_length = inputs.shape[1]
    print(f"Input length: {input_length} tokens")
    
    text_streamer = TextStreamer(tokenizer, skip_prompt = True)
    
    outputs = model.generate(
        input_ids = inputs,
        streamer = text_streamer,
        max_new_tokens = 2048,  # Increased to 2048 for output
        temperature = 1.2,
        min_p = 0.1,
        do_sample = True,
        use_cache = True
    )
    
    result = tokenizer.decode(outputs[0])
    
    end_time = time.time()
    execution_time = end_time - start_time
    
    print(f"\nExecution time: {execution_time:.2f} seconds")
    return result

# Example usage
test_passage = """Claude is an AI assistant created by Anthropic. 
    It aims to be helpful while being direct and honest about its capabilities. 
    It can assist with analysis, writing, math, coding and various other tasks."""

results = generate_queries(test_passage)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA GeForce RTX 3090 Ti. Max memory: 23.677 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Input length: 106 tokens
Here are the 80 queries:

1. What tasks can Claude assist with?
2. Who created Claude AI assistant?
3. What are Claude's capabilities?
4. Is Claude a direct and honest AI?
5. What type of analysis can Claude do?
6. Can Claude assist with writing tasks?
7. Can Claude help with math problems?
8. What coding skills does Claude possess?
9. Can Claude assist with language translation?
10. What kind of support does Claude offer?
11. How does Claude operate?
12. What is the main goal of Claude's creators?


In [2]:
from unsloth import FastLanguageModel
import torch
from transformers import TextStreamer
import time
from torch.cuda.amp import autocast
from concurrent.futures import ThreadPoolExecutor
import queue

# Initialize model and tokenizer globally for reuse
def init_model():
    torch.cuda.empty_cache()
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Llama-3.2-3B-Instruct",
        max_seq_length=2560,
        dtype=None,
        load_in_4bit=True,
    )
    FastLanguageModel.for_inference(model)
    return model, tokenizer

def process_single_passage(args):
    passage, model, tokenizer = args
    prompt = f"Given a passage generate 80 most relevant and diverse queries for the passage:\n\n{passage}\n\nGenerate one query per line, separated by <next>"
    
    messages = [
        {"role": "user", "content": prompt},
    ]
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")
    
    with autocast():  # Enable automatic mixed precision
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=2048,
            temperature=1.2,
            min_p=0.1,
            do_sample=True,
            use_cache=True
        )
    
    return tokenizer.decode(outputs[0])

def batch_process_passages(passages, num_workers=2):
    model, tokenizer = init_model()
    
    results = []
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Create args tuples for each passage
        args = [(passage, model, tokenizer) for passage in passages]
        
        # Submit all tasks and collect futures
        futures = [executor.submit(process_single_passage, arg) for arg in args]
        
        # Get results as they complete
        for future in futures:
            try:
                result = future.result()
                results.append(result)
            except Exception as e:
                print(f"Error processing passage: {e}")
    
    return results

# Example usage
if __name__ == "__main__":
    test_passages = [
        """Claude is an AI assistant created by Anthropic. 
        It aims to be helpful while being direct and honest about its capabilities.
        It can assist with analysis, writing, math, coding and various other tasks.""",
        """The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.""",
        """The Manhattan Project and its atomic bomb helped bring an end to World War II. Its legacy of peaceful uses of atomic energy continues to have an impact on history and science""",
        """Essay on The Manhattan Project - The Manhattan Project The Manhattan Project was to see if making an atomic bomb possible. The success of this project would forever change the world forever making it known that something this powerful can be manmade."""
    ]
    
    start_time = time.time()
    results = batch_process_passages(test_passages)
    end_time = time.time()
    
    print(f"Total execution time for {len(test_passages)} passages: {end_time - start_time:.2f} seconds")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA GeForce RTX 3090 Ti. Max memory: 23.677 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


  with autocast():  # Enable automatic mixed precision
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Error processing passage: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Error processing passage: 'LlamaAttention' object has no attribute 'temp_QA'
Error processing passage: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Error processing passage: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assert

../aten/src/ATen/native/cuda/TensorCompare.cu:110: _assert_async_cuda_kernel: block: [0,0,0], thread: [0,0,0] Assertion `probability tensor contains either `inf`, `nan` or element < 0` failed.


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from transformers import TextStreamer
import time

# Clear any existing CUDA memory
torch.cuda.empty_cache()

def init_model():
    model_id = "meta-llama/Llama-3.2-3B"
    
    # Initialize tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,  # Use float16 for better memory efficiency
        device_map="auto"  # Automatically choose best device
    )
    
    return model, tokenizer

def generate_queries(passage):
    model, tokenizer = init_model()
    
    prompt = f"Given a passage generate 80 most relevant and diverse queries for the passage:\n\n{passage}\n\nGenerate one query per line, separated by <next>"
    
    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    start_time = time.time()
    
    text_streamer = TextStreamer(tokenizer, skip_prompt=True)
    
    # Generate output
    outputs = model.generate(
        **inputs,
        streamer=text_streamer,
        max_new_tokens=2048,
        temperature=1.2,
        min_p=0.1,
        do_sample=True,
        use_cache=True
    )
    
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    end_time = time.time()
    execution_time = end_time - start_time
    
    print(f"\nExecution time: {execution_time:.2f} seconds")
    return result

# Example usage
if __name__ == "__main__":
    test_passage = """Claude is an AI assistant created by Anthropic. 
    It aims to be helpful while being direct and honest about its capabilities.
    It can assist with analysis, writing, math, coding and various other tasks."""
    
    results = generate_queries(test_passage)
    print(results)

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.2-3B.
403 Client Error. (Request ID: Root=1-67575c7c-584d7ed3594a36a4501124a4;95d8c545-2aac-45b9-8822-123aec276e93)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-3B is restricted and you are not in the authorized list. Visit https://huggingface.co/meta-llama/Llama-3.2-3B to ask for access.

In [3]:
# Generating with num_sequence_length = 80
from unsloth import FastLanguageModel
import torch
from transformers import TextStreamer
import time
import os

# Clear any existing CUDA memory
torch.cuda.empty_cache()

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=2560,
    dtype=None,
    load_in_4bit=True,
)

FastLanguageModel.for_inference(model)

def generate_queries(passage):
    prompt = f"Given a passage generate a query that is relevant and diverse queries for the passage:\n\n{passage}.  Do not produce anything else."
    
    messages = [
        {"role": "user", "content": prompt},
    ]
    
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")
    
    # Check input length
    input_length = inputs.shape[1]
    print(f"Input length: {input_length} tokens")
    
    text_streamer = TextStreamer(tokenizer, skip_prompt=True)
    
    # Generate multiple sequences
    outputs = model.generate(
        input_ids=inputs,
        streamer=text_streamer,
        max_new_tokens=50,  # Reduced to 50 tokens per sequence
        num_return_sequences=80,  # Generate 80 different sequences
        temperature=1.2,
        min_p=0.1,
        do_sample=True,
        use_cache=True,
        pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
        num_beams=1  # Using sampling instead of beam search for diversity
    )
    
    # Decode all sequences
    results = []
    for output in outputs:
        decoded = tokenizer.decode(output, skip_special_tokens=True)
        results.append(decoded)
    
    end_time = time.time()
    execution_time = end_time - start_time
    
    print(f"\nExecution time: {execution_time:.2f} seconds")
    return results

# Example usage
test_passage = """Claude is an AI assistant created by Anthropic.
    It aims to be helpful while being direct and honest about its capabilities.
    It can assist with analysis, writing, math, coding and various other tasks."""

results = generate_queries(test_passage)

# Print all generated queries
for i, result in enumerate(results, 1):
    print(f"Query {i}: {result}")

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA GeForce RTX 3090 Ti. Max memory: 23.677 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Input length: 101 tokens
HereHereYouYouToHereHereHereYouHereYouHereHereHereHereYouToToHereYouHereHereYouYouHereHereHereYouHereHereHereHereYouHereYouYouHereHereHereYouHereYouHereHereYouYouHereHereHereYouHereHereHereHereHereYouYouHereToHereHereYouYouHereToHereHereYouYouHereHereYouHereHereYouYouYouYouYouHere are are can can generate are are are can are can are are are are can generate generate are can are are can can are are are can are are are are can are can can are are are can are can are are can can are are are can are are

In [4]:
from unsloth import FastLanguageModel
import torch
import time
import os

# Clear any existing CUDA memory
torch.cuda.empty_cache()

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=2560,
    dtype=None,
    load_in_4bit=True,
)

FastLanguageModel.for_inference(model)

def generate_queries(passage):
    prompt = f"Generate one relevant query for this passage:\n\n{passage}"
    
    messages = [
        {"role": "user", "content": prompt},
    ]
    
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")
    
    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=50,  # Short output for single query
        num_return_sequences=80,  # Generate 80 variations
        temperature=0.9,
        top_k=50,
        do_sample=True,
        use_cache=True,
        pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
    )
    
    results = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    
    end_time = time.time()
    print(f"\nExecution time: {time.time() - start_time:.2f} seconds")
    
    return results

# Example usage
test_passage = """Claude is an AI assistant created by Anthropic.
    It aims to be helpful while being direct and honest about its capabilities.
    It can assist with analysis, writing, math, coding and various other tasks."""

results = generate_queries(test_passage)

# Print clean queries only
for i, query in enumerate(results, 1):
    clean_query = query.strip().split('\n')[0]  # Take only first line
    if 'system' not in clean_query.lower() and 'assistant' not in clean_query.lower():
        print(f"{i}: {clean_query}")

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA GeForce RTX 3090 Ti. Max memory: 23.677 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

Execution time: 1.85 seconds


In [11]:
from unsloth import FastLanguageModel
import torch
import time
import os

# Clear any existing CUDA memory
torch.cuda.empty_cache()

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=2560,
    dtype=None,
    load_in_4bit=True,
)

FastLanguageModel.for_inference(model)

def generate_queries(passage):
    # Modified prompt to force more direct query generation
    prompt = f"Write a single question about this passage. The question must start with 'What', 'How', 'Why', 'Can', or 'Which'. Do not add any other text:\n\n{passage}"
    
    messages = [
        {"role": "user", "content": prompt},
    ]
    
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")
    
    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=30,  # Reduced for tighter responses
        num_return_sequences=80,
        temperature=0.8,  # Slightly reduced for more focused outputs
        top_k=50,
        do_sample=True,
        use_cache=True,
        pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
        repetition_penalty=1.2  # Added to reduce duplicates
    )
    
    results = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    
    end_time = time.time()
    print(f"\nExecution time: {time.time() - start_time:.2f} seconds")
    
    # Improved cleaning function
    def clean_query(query):
        query = query.strip()
        # Split on common terminators and take first part
        for term in ['\n', '?', '.']:
            query = query.split(term)[0]
        # Add question mark if missing
        if not query.endswith('?'):
            query += '?'
        return query

    # Better filtering of results
    clean_results = []
    for query in results:
        query = clean_query(query)
        # Check if it's a valid question
        if (query.startswith(('What', 'How', 'Why', 'Can', 'Which')) and 
            len(query) > 15 and
            'system' not in query.lower() and 
            'assistant' not in query.lower() and
            'human' not in query.lower()):
            clean_results.append(query)
    
    return clean_results

# Example usage
test_passage = """Claude is an AI assistant created by Anthropic.
    It aims to be helpful while being direct and honest about its capabilities.
    It can assist with analysis, writing, math, coding and various other tasks."""

results = generate_queries(test_passage)

# Print clean queries
for i, query in enumerate(results, 1):
    print(f"{i}: {query}")

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA GeForce RTX 3090 Ti. Max memory: 23.677 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

Execution time: 1.82 seconds


In [15]:
from unsloth import FastLanguageModel
import torch
import time
import os

# Clear any existing CUDA memory
torch.cuda.empty_cache()

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct",
    max_seq_length=2560,
    dtype=None,
    load_in_4bit=True,
)

FastLanguageModel.for_inference(model)

def generate_queries(passage):
    prompt = f"Generate one relevant query for this passage:\n\n{passage}. Do not produce anything else."
    
    messages = [
        {"role": "user", "content": prompt},
    ]
    
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")
    
    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=50,
        num_return_sequences=80,
        temperature=0.9,
        top_k=50,
        do_sample=True,
        use_cache=True,
        pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
    )
    
    results = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    
    end_time = time.time()
    print(f"\nExecution time: {time.time() - start_time:.2f} seconds")
    
    return results

# Example usage
test_passage = """Claude is an AI assistant created by Anthropic.
    It aims to be helpful while being direct and honest about its capabilities.
    It can assist with analysis, writing, math, coding and various other tasks."""

results = generate_queries(test_passage)

# Print raw outputs
print("\nRAW OUTPUTS:")
print("=" * 50)
for i, raw_output in enumerate(results, 1):
    print(f"\nOutput {i}:")
    print(raw_output)
    print("-" * 50)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA GeForce RTX 3090 Ti. Max memory: 23.677 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

Execution time: 1.02 seconds

RAW OUTPUTS:

Output 1:
system

Cutting Knowledge Date: December 2023
Today Date: 09 Dec 2024

user

Generate one relevant query for this passage:

Claude is an AI assistant created by Anthropic.
    It aims to be helpful while being direct and honest about its capabilities.
    It can assist with analysis, writing, math, coding and various other tasks.. Do not produce anything else.assistant

Here's a relevant query:

"What capabilities does Anthropic's AI assistant Claude have?"
------------

In [16]:
test_passage = """The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated."""

results = generate_queries(test_passage)

# Print raw outputs
print("\nRAW OUTPUTS:")
print("=" * 50)
for i, raw_output in enumerate(results, 1):
    print(f"\nOutput {i}:")
    print(raw_output)
    print("-" * 50)


Execution time: 1.02 seconds

RAW OUTPUTS:

Output 1:
system

Cutting Knowledge Date: December 2023
Today Date: 09 Dec 2024

user

Generate one relevant query for this passage:

The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.. Do not produce anything else.assistant

The Manhattan Project's success was not solely due to scientific intellect, but rather the collaboration of thousands of scientists and engineers from various disciplines.
--------------------------------------------------

Output 2:
system

Cutting Knowledge Date: December 2023
Today Date: 09 Dec 2024

user

Generate one relevant query for this passage:

The presence of communication amid scientific minds was equally important to the success of the 

In [7]:
from unsloth import FastLanguageModel
import torch
import time
import os
from transformers import AutoConfig
import torch.onnx
import onnxruntime as ort

# Enable CUDA optimizations
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.benchmark = True

# Clear any existing CUDA memory
torch.cuda.empty_cache()

def optimize_model():
    # Initialize model with optimizations
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Llama-3.2-3B-Instruct",
        max_seq_length=2560,
        dtype=None,
        load_in_4bit=True,
    )
    
    # Set model to inference mode and optimize
    model.eval()
    FastLanguageModel.for_inference(model)
    
    # Create ONNX session
    providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
    sess_options = ort.SessionOptions()
    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    sess_options.enable_mem_pattern = True
    sess_options.enable_cpu_mem_arena = True
    
    return model, tokenizer

def generate_queries(passage, batch_size=80):
    prompt = f"Generate one relevant query for this passage:\n\n{passage}"
    
    messages = [
        {"role": "user", "content": prompt},
    ]
    
    start_time = time.time()
    
    # Batch processing of inputs
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")
    
    with torch.cuda.amp.autocast(dtype=torch.float16):
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs,
                max_new_tokens=50,
                num_return_sequences=batch_size,
                temperature=0.9,
                top_k=50,
                do_sample=True,
                use_cache=True,
                pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
            )
    
    results = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    
    end_time = time.time()
    print(f"\nExecution time: {time.time() - start_time:.2f} seconds")
    print(f"Queries per second: {batch_size/(end_time-start_time):.2f}")
    
    return results

# Initialize optimized model
model, tokenizer = optimize_model()

# Example usage
test_passage = """Claude is an AI assistant created by Anthropic.
    It aims to be helpful while being direct and honest about its capabilities.
    It can assist with analysis, writing, math, coding and various other tasks."""

results = generate_queries(test_passage)

# Print outputs
for i, raw_output in enumerate(results, 1):
    print(f"\nOutput {i}:")
    print(raw_output)
    print("-" * 50)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA GeForce RTX 3090 Ti. Max memory: 23.677 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


  with torch.cuda.amp.autocast(dtype=torch.float16):



Execution time: 2.26 seconds
Queries per second: 35.37

Output 1:
system

Cutting Knowledge Date: December 2023
Today Date: 09 Dec 2024

user

Generate one relevant query for this passage:

Claude is an AI assistant created by Anthropic.
    It aims to be helpful while being direct and honest about its capabilities.
    It can assist with analysis, writing, math, coding and various other tasks.assistant

What tasks can Claude, the AI assistant created by Anthropic, assist with besides being direct and honest about its capabilities?
--------------------------------------------------

Output 2:
system

Cutting Knowledge Date: December 2023
Today Date: 09 Dec 2024

user

Generate one relevant query for this passage:

Claude is an AI assistant created by Anthropic.
    It aims to be helpful while being direct and honest about its capabilities.
    It can assist with analysis, writing, math, coding and various other tasks.assistant

What tasks can Claude, the AI assistant created by Anthro

In [8]:
test_passage = """Claude is an AI assistant created by Anthropic.
    It aims to be helpful while being direct and honest about its capabilities.
    It can assist with analysis, writing, math, coding and various other tasks."""

results = generate_queries(test_passage)

  with torch.cuda.amp.autocast(dtype=torch.float16):



Execution time: 2.12 seconds
Queries per second: 37.82


In [10]:
for i, raw_output in enumerate(results, 1):
    print(f"\nOutput {i}:")
    print(raw_output)
    print("-" * 50)


Output 1:
system

Cutting Knowledge Date: December 2023
Today Date: 09 Dec 2024

user

Generate one relevant query for this passage:

Claude is an AI assistant created by Anthropic.
    It aims to be helpful while being direct and honest about its capabilities.
    It can assist with analysis, writing, math, coding and various other tasks.assistant

What tasks does Claude, the AI assistant created by Anthropic, assist with?
--------------------------------------------------

Output 2:
system

Cutting Knowledge Date: December 2023
Today Date: 09 Dec 2024

user

Generate one relevant query for this passage:

Claude is an AI assistant created by Anthropic.
    It aims to be helpful while being direct and honest about its capabilities.
    It can assist with analysis, writing, math, coding and various other tasks.assistant

What specific tasks or areas of assistance can Claude, the AI assistant created by Anthropic, provide?
--------------------------------------------------

Output 3:
sy