In [1]:
# Set this at the very start of your notebook
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use RTX 3090 Ti

# Now import other libraries
import torch
import gc

# Clear GPU memory
torch.cuda.empty_cache()
gc.collect()

# Verify we're using the right GPU
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device count: {torch.cuda.device_count()}")
print(f"Device name: {torch.cuda.get_device_name(0)}")
print(f"Total memory: {torch.cuda.get_device_properties(0).total_memory / (1024**3):.1f} GB")


CUDA available: True
Device count: 1
Device name: NVIDIA GeForce RTX 3090 Ti
Total memory: 23.7 GB


In [2]:
!pip install unsloth transformers datasets tqdm accelerate bitsandbytes




In [2]:
import os
import torch
import random
import numpy as np
from datasets import Dataset
from tqdm.notebook import tqdm


from transformers import TrainingArguments, DataCollatorForSeq2Seq
from trl import SFTTrainer

# Set random seed for reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

In [3]:
# Set the path to your document-query TSV file
data_path = "/drive_reader/as16386/collections/msmarco-passage/document-query-pairs.train.tsv"  # Adjust this path

# Function to load a sample of data from the TSV file
def load_doc_query_sample(file_path, num_samples=2000):
    documents = []
    queries = []
    
    with open(file_path, 'r', encoding='utf-8') as f:
        # Read all lines and randomly sample
        lines = f.readlines()
        
        # Randomly sample num_samples lines
        if len(lines) > num_samples:
            lines = random.sample(lines, num_samples)
        
        for line in tqdm(lines, desc="Loading data"):
            parts = line.strip().split('\t')
            if len(parts) == 2:
                documents.append(parts[0])
                queries.append(parts[1])
    
    return {"document": documents, "query": queries}

# Load a small sample of data
sample_data = load_doc_query_sample(data_path, num_samples=2000)
dataset = Dataset.from_dict(sample_data)

# Let's see a few examples
print(f"Loaded {len(dataset)} document-query pairs")
print("\nExample 1:")
print(f"Document: {dataset[0]['document'][:200]}...")
print(f"Query: {dataset[0]['query']}")

print("\nExample 2:")
print(f"Document: {dataset[1]['document'][:200]}...")
print(f"Query: {dataset[1]['query']}")

Loading data:   0%|          | 0/2000 [00:00<?, ?it/s]

Loaded 2000 document-query pairs

Example 1:
Document: Diabetes also can causes problems with blood flow ... The ulcer itself is usually caused by: Repetitive trauma or pressure on the foot; Puncture wound on the foot; Objects in the ... Tell your doctor ...
Query: what causes legs to swell and acne and have blisters

Example 2:
Document: However, many fans can recall that shows do not get cancelled because of ratings alone. According to the publication, Blue Bloods Season 7 may be its last or close to the finale because the production...
Query: was blue bloods cancelled


In [None]:
# Training

import torch
import random
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    BitsAndBytesConfig,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    Trainer
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from tqdm.auto import tqdm

# Set random seed for reproducibility
random.seed(42)
torch.manual_seed(42)

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Model configuration
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"  # or "meta-llama/Llama-3.2-1B-Instruct" for smaller model
OUTPUT_DIR = "doc2query-llama-3.2"
MAX_LENGTH = 512  # Maximum sequence length

# Format data for instruction fine-tuning
def generate_prompt(document, query=None):
    """Format the document and query into an instruction prompt."""
    if query:
        # For training examples with both document and expected query
        return f"[INST] Generate a search query for this document: {document} [/INST] {query}"
    else:
        # For inference (query generation)
        return f"[INST] Generate a search query for this document: {document} [/INST]"

def preprocess_function(examples):
    """Process multiple examples in the dataset."""
    prompts = [generate_prompt(doc, query) for doc, query in zip(examples["document"], examples["query"])]
    
    # Tokenize the examples
    tokenized_inputs = tokenizer(
        prompts,
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
        return_tensors=None  # Change to None instead of "pt" - let the Dataset handle conversion
    )
    
    # Create labels (using list copying instead of tensor copying)
    tokenized_inputs["labels"] = [ids.copy() for ids in tokenized_inputs["input_ids"]]
    
    # Mask out the loss for the instruction part
    for i, prompt in enumerate(prompts):
        instruction_pos = prompt.find("[/INST]") + len("[/INST]")
        instruction_text = prompt[:instruction_pos]
        # Tokenize just the instruction part to find its length
        instruction_ids = tokenizer(instruction_text, add_special_tokens=False)["input_ids"]
        instruction_tok_len = len(instruction_ids)
        
        # Set the instruction part labels to -100 (ignore in loss calculation)
        tokenized_inputs["labels"][i][:instruction_tok_len] = [-100] * instruction_tok_len
    
    return tokenized_inputs

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Process dataset
print("Processing dataset...")
processed_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names,
    desc="Processing dataset",
)

# Configure 4-bit quantization
print("Configuring quantization...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model with quantization
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)

# Prepare model for training
print("Preparing model for training...")
model = prepare_model_for_kbit_training(model)

# LoRA configuration
peft_config = LoraConfig(
    r=16,  # Rank of the update matrices
    lora_alpha=32,  # Parameter for scaling
    lora_dropout=0.05,  # Dropout probability for LoRA layers
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # Attention matrices
        "gate_proj", "up_proj", "down_proj"      # MLP matrices
    ],
    bias="none",
    task_type="CAUSAL_LM",
)

# Apply LoRA
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Configure training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,  # Adjust based on your GPU memory
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
    save_total_limit=3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    report_to="none",  # Can use "tensorboard" or "wandb"
    bf16=True if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else False,
    gradient_checkpointing=True,
    fp16=False,
    optim="paged_adamw_32bit",
)

# Define data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    pad_to_multiple_of=8,
    return_tensors="pt"
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
    data_collator=data_collator,
)

# Start training
print("Starting training...")
trainer.train()

# Save the model
print(f"Saving model to {OUTPUT_DIR}")
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

In [5]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 1. Load the fine-tuned model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = "/drive_reader/as16386/github/models/doc2query-llama-1b"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load base model with same quantization as training
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    ),
    device_map="auto",
)

# Load the LoRA weights
model = PeftModel.from_pretrained(model, model_path)
model.eval()

# 2. Define a function to generate multiple queries
def generate_multiple_queries(document, num_queries=80, max_new_tokens=30):
    prompt = f"[INST] Generate a search query for this document: {document} [/INST]"
    
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    # Set pad token id to eos token id if not set
    if model.config.pad_token_id is None:
        model.config.pad_token_id = model.config.eos_token_id
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.85,  # Slightly higher temperature for diversity
            top_p=0.92,
            top_k=50,
            num_return_sequences=num_queries,  # Generate multiple sequences
            num_beams=1,  # Must be 1 when using num_return_sequences with do_sample=True
            return_dict_in_generate=True,
            output_scores=False
        )
    
    generated_texts = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)
    
    # Extract just the generated responses (after the instruction)
    responses = [text.split("[/INST]")[1].strip() for text in generated_texts]
    return responses

# 3. Test with one document from your dataset
# Choose an interesting document from your dataset
test_document = dataset[0]["document"]  # You could pick any index that has an interesting document
print(f"Document: {test_document[:200]}...\n")  # Show just the beginning to save space

# Generate 80 possible queries
queries = generate_multiple_queries(test_document, num_queries=80)

# Print the results
print(f"Generated {len(queries)} alternative queries:")
for i, query in enumerate(queries):
    print(f"{i+1}. {query}")

# 4. Optional: Save results to file
with open("generated_queries.txt", "w") as f:
    f.write(f"DOCUMENT:\n{test_document}\n\nGENERATED QUERIES:\n")
    for i, query in enumerate(queries):
        f.write(f"{i+1}. {query}\n")

print("\nQueries also saved to 'generated_queries.txt'")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Document: Diabetes also can causes problems with blood flow ... The ulcer itself is usually caused by: Repetitive trauma or pressure on the foot; Puncture wound on the foot; Objects in the ... Tell your doctor ...



From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


Generated 80 alternative queries:
1. does diabetes cause foot ulcers
2. what causes ulcers in your leg
3. what can cause blistering of feet and legs
4. what causes skin blisters and ulcers
5. what causes soreness in lower leg
6. causes of foot pain with ulcers
7. what causes blisters on the legs
8. what causes blood flow to legs to swell
9. what causes pain and swelling in the legs
10. what is causing my feet to hurt
11. what causes sunburn on lower leg
12. what causes ulcers in the feet
13. what causes pain in feet and ankles
14. causes of foot ulcers
15. what can cause blistering and pain in the feet
16. what causes ulcers on your feet
17. what causes painful sores on legs and feet
18. what causes sores on the foot
19. does tanning bed cause blisters
20. what causes sores and blisters on the feet
21. can uv light cause foot pain
22. can uv light cause foot pain
23. what causes foot ulcer
24. can sunlamps cause swelling in the legs
25. what cause sunburn
26. what causes a foot ulcer
2

# LORA Inference

In [18]:
# VLLM INFERENCE
# Install VLLM
# pip install vllm

from vllm import LLM, SamplingParams

# Step 1: Merge your LoRA weights with the base model (one-time operation)
from peft import PeftModel
import torch

# Load models
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct", 
    torch_dtype=torch.float16,
    device_map="auto"
)
lora_model = PeftModel.from_pretrained(base_model, "/drive_reader/as16386/github/models/doc2query-llama-1b")

# Merge and save
merged_model = lora_model.merge_and_unload()
merged_model.save_pretrained("doc2query-llama-3.2-merged")
tokenizer.save_pretrained("doc2query-llama-3.2-merged")

# Step 2: Use VLLM for fast inference
llm = LLM(model="doc2query-llama-3.2-merged", dtype="float16")

# For single document
def generate_query_vllm(document):
    prompt = f"[INST] Generate a search query for this document: {document} [/INST]"
    sampling_params = SamplingParams(temperature = 0.9, top_p=0.95, top_k = 50, max_tokens=50)
    outputs = llm.generate(prompt, sampling_params)
    response = outputs[0].outputs[0].text.strip()
    return response

# For batch processing - VLLM really shines here!
def batch_generate_queries_vllm(documents, num_queries=1):
    prompts = [f"[INST] Generate a search query for this document: {doc} [/INST]" for doc in documents]
    sampling_params = SamplingParams(
        temperature=0.9,
        top_p=0.95, 
        top_k = 50,
        max_tokens=50,
        n=num_queries  # Generate multiple queries per document
    )
    outputs = llm.generate(prompts, sampling_params)
    
    results = []
    for output in outputs:
        document_queries = [out.text.strip() for out in output.outputs]
        results.append(document_queries)
    
    return results

Some parameters are on the meta device because they were offloaded to the cpu.
Some parameters are on the meta device because they were offloaded to the cpu.


Saving checkpoint shards:   0%|          | 0/1 [00:00<?, ?it/s]

INFO 03-29 16:30:31 config.py:350] This model supports multiple tasks: {'embedding', 'generate'}. Defaulting to 'generate'.
INFO 03-29 16:30:31 config.py:1136] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 03-29 16:30:31 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='doc2query-llama-3.2-merged', speculative_config=None, tokenizer='doc2query-llama-3.2-merged', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_ti

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-29 16:30:33 model_runner.py:1077] Loading model weights took 2.3029 GB
INFO 03-29 16:30:33 worker.py:232] Memory profiling results: total_gpu_memory=23.68GiB initial_memory_usage=22.34GiB peak_torch_memory=22.50GiB memory_usage_post_profile=22.34GiB non_torch_memory=1.00GiB kv_cache_size=-2.19GiB gpu_memory_utilization=0.90
INFO 03-29 16:30:33 gpu_executor.py:113] # GPU blocks: 0, # CPU blocks: 8192
INFO 03-29 16:30:33 gpu_executor.py:117] Maximum concurrency for 131072 tokens per request: 0.00x


ValueError: No available memory for the cache blocks. Try increasing `gpu_memory_utilization` when initializing the engine.

In [6]:
# Test documents (replace with your own examples)
test_documents = [
    "Diabetes also can causes problems with blood flow. The ulcer itself is usually caused by: Repetitive trauma or pressure on the foot; Puncture wound on the foot. Tell your doctor if you have any of these symptoms: Sores, ulcers, or blisters on the foot or lower leg; Pain.",
    
    "However, many fans can recall that shows do not get cancelled because of ratings alone. According to the publication, Blue Bloods Season 7 may be its last or close to the finale because the production costs are getting too high.",
    
    "The Golden State Warriors are an American professional basketball team based in San Francisco. The Warriors compete in the National Basketball Association (NBA), as a member of the league's Western Conference Pacific Division.",
    
    "Machine learning is a branch of artificial intelligence (AI) and computer science which focuses on the use of data and algorithms to imitate the way that humans learn, gradually improving its accuracy.",
    
    "Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation. Python is dynamically typed and garbage-collected.",

    "Hypertension, or high blood pressure, can lead to severe health complications if left untreated. Common symptoms include headaches, shortness of breath, and dizziness. Lifestyle changes and medication can help manage the condition effectively.",

    "The latest season of Stranger Things has captivated audiences worldwide. Critics praise its nostalgic 80s references and compelling storytelling. However, some fans argue that the pacing in the latest episodes feels slower compared to previous seasons.",

    "Real Madrid is one of the most successful football clubs in history, boasting multiple UEFA Champions League titles. Known for their attacking playstyle, the club has produced legends like Cristiano Ronaldo, Zinedine Zidane, and Raúl.",

    "Cloud computing allows users to store and access data remotely, reducing dependency on local storage. Major providers like Amazon Web Services (AWS), Google Cloud, and Microsoft Azure dominate the market with their scalable infrastructure.",

    "JavaScript is a versatile programming language used primarily for web development. It enables dynamic interactions on web pages and is supported by all major browsers. Frameworks like React and Angular simplify the development process."




]

In [7]:

# 2. Test batch processing with multiple queries per document
print("\nBATCH PROCESSING (80 queries per document):")
print("-" * 80)
results = batch_generate_queries_vllm(test_documents, num_queries=80)

for i, doc_queries in enumerate(results):
    print(f"Document {i+1}: {test_documents[i][:100]}...")
    print("Generated queries:")
    for j, query in enumerate(doc_queries):
        print(f"  {j+1}. {query}")
    print("-" * 80)


BATCH PROCESSING (80 queries per document):
--------------------------------------------------------------------------------


Processed prompts:   0%|          | 0/800 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   1%|▏         | 10/800 [00:02<03:41,  3.57it/s, est. speed input: 207.26 toks/s, output: 1925.99 toks/s]

Document 1: Diabetes also can causes problems with blood flow. The ulcer itself is usually caused by: Repetitive...
Generated queries:
  1. what causes a foot ulcer
  2. what are the causes of a foot ulcer
  3. what causes foot ulcers to heal
  4. what causes an ulcer
  5. diabetes foot ulcers causes
  6. symptoms of a foot ulcer
  7. what causes foot ulcers
  8. causes of foot ulcers
  9. what can cause a foot ulcer
  10. what causes blood to flow in the legs when there is no blood pressure in the body
  11. what causes blisters on your leg
  12. can diabetes cause ulcers on legs
  13. what causes an ulcer?
  14. what can cause a diabetic ulcer
  15. diabetes and foot ulcers
  16. what can cause a sore in your foot
  17. can diabetes cause foot ulcers
  18. can diabetes cause foot ulcers
  19. can diabetes cause foot ulcers
  20. diabetes causes of foot ulcers
  21. causes of foot ulcers
  22. causes of foot ulcers
  23. what causes a foot ulcer
  24. causes of ulcers in feet
  25. ca




In [7]:
document = "The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated."
test_documents = [document]

In [8]:

# 2. Test batch processing with multiple queries per document
print("\nBATCH PROCESSING (80 queries per document):")
print("-" * 80)
results = batch_generate_queries_vllm(test_documents, num_queries=80)

for i, doc_queries in enumerate(results):
    print(f"Document {i+1}: {test_documents[i][:100]}...")
    print("Generated queries:")
    for j, query in enumerate(doc_queries):
        print(f"  {j+1}. {query}")
    print("-" * 80)


BATCH PROCESSING (80 queries per document):
--------------------------------------------------------------------------------


Processed prompts:   1%|▏         | 1/80 [00:00<00:57,  1.37it/s, est. speed input: 91.82 toks/s, output: 1210.03 toks/s]

Document 1: The presence of communication amid scientific minds was equally important to the success of the Manh...
Generated queries:
  1. what was the significance of the manhattan project
  2. what did the united states use for the manhattan project
  3. what was an important factor in the success of the atomic bomb project
  4. how important was communication for the scientists
  5. why was the manhattan project significant
  6. what is a scientific intellect
  7. what is the significance of manhattan project?
  8. what was the significance of the manhattan project
  9. significance of manhattan project
  10. why was the manhattan project so important to americans
  11. what caused the success of the atomic bomb
  12. what impact did scientists have on the manhattan project
  13. how did the success of the manhattan project impacted american society
  14. what were the challenges of working on the manhattan project?
  15. why was the manhattan project important
  16. what was the m




In [9]:
results[0]

['what was the significance of the manhattan project',
 'what did the united states use for the manhattan project',
 'what was an important factor in the success of the atomic bomb project',
 'how important was communication for the scientists',
 'why was the manhattan project significant',
 'what is a scientific intellect',
 'what is the significance of manhattan project?',
 'what was the significance of the manhattan project',
 'significance of manhattan project',
 'why was the manhattan project so important to americans',
 'what caused the success of the atomic bomb',
 'what impact did scientists have on the manhattan project',
 'how did the success of the manhattan project impacted american society',
 'what were the challenges of working on the manhattan project?',
 'why was the manhattan project important',
 "what was the manhattan project's major achievement",
 'was the development of the atomic bomb an important part of the manhattan project',
 'what are some important accomplis

In [10]:
results_str = ' '.join(results[0])

In [14]:
results_str

"what was the significance of the manhattan project what did the united states use for the manhattan project what was an important factor in the success of the atomic bomb project how important was communication for the scientists why was the manhattan project significant what is a scientific intellect what is the significance of manhattan project? what was the significance of the manhattan project significance of manhattan project why was the manhattan project so important to americans what caused the success of the atomic bomb what impact did scientists have on the manhattan project how did the success of the manhattan project impacted american society what were the challenges of working on the manhattan project? why was the manhattan project important what was the manhattan project's major achievement was the development of the atomic bomb an important part of the manhattan project what are some important accomplishments of the scientific community in the development of the atomic b

In [16]:
unique_joined = document

for words in results_str.split():
    if words not in unique_joined:
        unique_joined += words
        unique_joined += " "


In [17]:
unique_joined

"The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.significance manhattan project did united states use for factor bomb how scientists why significant project? so americans caused impact have impacted society were challenges working project's major development part are some accomplishments community science contribute involved decisions made guide importance called getting bombs role world history learn from mindset benefit benefits greater than war. considered many lost there enough happened that successful bomb? discovery important? does relate related play successful? during affect with special about knowledge energy had people same "

In [15]:
document

'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.'