In [1]:
import torch
import gc
import os

# First, clear all CUDA memory and cache
torch.cuda.empty_cache()
gc.collect()

# Print initial GPU info
print("Initial GPU setup:")
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"GPU {i}: {props.name} ({props.total_memory / 1024**2:.0f}MB)")

# Set the 3090 Ti (find it by memory size ~24GB)
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    if props.total_memory > 20000 * 2**20:  # More than 20GB = 3090 Ti
        os.environ["CUDA_VISIBLE_DEVICES"] = str(i)
        torch.cuda.set_device(0)  # After setting VISIBLE_DEVICES, it becomes device 0
        print(f"\nSet RTX 3090 Ti (device {i}) as the active device")
        break

# Verify current device
print(f"\nCurrent active device: {torch.cuda.get_device_name()}")

Initial GPU setup:
GPU 0: NVIDIA GeForce RTX 3090 Ti (24245MB)
GPU 1: NVIDIA GeForce GT 1030 (2000MB)

Set RTX 3090 Ti (device 0) as the active device

Current active device: NVIDIA GeForce RTX 3090 Ti


In [2]:
from unsloth import FastLanguageModel
import torch
import time
import os

# Clear any existing CUDA memory
torch.cuda.empty_cache()

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=512,
    dtype=None,
    load_in_4bit=True,
)

FastLanguageModel.for_inference(model)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA GeForce RTX 3090 Ti. Max memory: 23.677 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaExtendedRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): Llam

In [3]:
def process_tsv_and_generate(input_file, output_dir="queries3b", num_lines=1000):
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    with open(input_file, 'r', encoding='utf-8') as f:
        for i in range(num_lines):
            line = f.readline().strip()
            if not line:
                break
                
            # Split line into doc_id and document
            try:
                doc_id, document = line.split('\t', 1)
            except ValueError:
                print(f"Skipping malformed line {i}")
                continue
            
            # Generate queries
            prompt = f"Generate one relevant query for this passage:\n\n{document}. Do not produce anything else."
            
            messages = [
                {"role": "user", "content": prompt},
            ]
            
            start_time = time.time()
            
            inputs = tokenizer.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt"
            ).to("cuda")
            
            outputs = model.generate(
                input_ids=inputs,
                max_new_tokens=50,
                num_return_sequences=80,
                temperature=0.9,
                top_k=50,
                do_sample=True,
                use_cache=True,
                pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
            )
            
            results = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
            
            # Save to individual file for this doc_id
            output_file = os.path.join(output_dir, f"queries_{doc_id}.txt")
            with open(output_file, 'w', encoding='utf-8') as out_f:
                for j, result in enumerate(results, 1):
                    out_f.write(f"Output {j}:\n{result}\n{'-' * 50}\n")
                out_f.write(f"\nExecution time: {time.time() - start_time:.2f} seconds\n")
            
            print(f"Processed document {doc_id} ({i+1}/{num_lines})")
            
            # Optional: Clear GPU memory periodically
            if i % 10 == 0:
                torch.cuda.empty_cache()

# Run the processing
input_file = "/drive_reader/as16386/Datasets/msmarco-2018.tsv"
process_tsv_and_generate(input_file)

Processed document 0 (1/1000)
Processed document 1 (2/1000)
Processed document 2 (3/1000)
Processed document 3 (4/1000)
Processed document 4 (5/1000)
Processed document 5 (6/1000)
Processed document 6 (7/1000)
Processed document 7 (8/1000)
Processed document 8 (9/1000)
Processed document 9 (10/1000)
Processed document 10 (11/1000)
Processed document 11 (12/1000)
Processed document 12 (13/1000)
Processed document 13 (14/1000)
Processed document 14 (15/1000)
Processed document 15 (16/1000)
Processed document 16 (17/1000)
Processed document 17 (18/1000)
Processed document 18 (19/1000)
Processed document 19 (20/1000)
Processed document 20 (21/1000)
Processed document 21 (22/1000)
Processed document 22 (23/1000)
Processed document 23 (24/1000)
Processed document 24 (25/1000)
Processed document 25 (26/1000)
Processed document 26 (27/1000)
Processed document 27 (28/1000)
Processed document 28 (29/1000)
Processed document 29 (30/1000)
Processed document 30 (31/1000)
Processed document 31 (32/1

# average time: 2.331 passage / second

In [10]:
from unsloth import FastLanguageModel
import torch
import time
import os

# Clear any existing CUDA memory
torch.cuda.empty_cache()

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct",
    max_seq_length=512,
    dtype=None,
    load_in_4bit=True,
)

FastLanguageModel.for_inference(model)

def generate_queries(passage):
    prompt = f"Generate one relevant query for this passage:\n\n{passage}. Do not produce anything else."
    
    messages = [
        {"role": "user", "content": prompt},
    ]
    
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")
    
    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=50,
        num_return_sequences=40,
        temperature=0.9,
        top_k=50,
        do_sample=True,
        use_cache=True,
        pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
    )
    
    results = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    
    end_time = time.time()
    print(f"\nExecution time: {time.time() - start_time:.2f} seconds")
    
    return results

# Example usage
test_passage = """Claude is an AI assistant created by Anthropic.
    It aims to be helpful while being direct and honest about its capabilities.
    It can assist with analysis, writing, math, coding and various other tasks."""

results = generate_queries(test_passage)

# Print raw outputs
print("\nRAW OUTPUTS:")
print("=" * 50)
for i, raw_output in enumerate(results, 1):
    print(f"\nOutput {i}:")
    print(raw_output)
    print("-" * 50)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA GeForce RTX 3090 Ti. Max memory: 23.677 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

Execution time: 0.55 seconds

RAW OUTPUTS:

Output 1:
system

Cutting Knowledge Date: December 2023
Today Date: 20 Jan 2025

user

Generate one relevant query for this passage:

Claude is an AI assistant created by Anthropic.
    It aims to be helpful while being direct and honest about its capabilities.
    It can assist with analysis, writing, math, coding and various other tasks.. Do not produce anything else.assistant

What is the primary function of Claude, an AI assistant created by Anthropic?
-----------------------