<a href="https://colab.research.google.com/github/aaronbergfeld/w266-final-project/blob/main/Generate_Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q -U bitsandbytes

import bitsandbytes as bnb
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, PeftModel
from accelerate import Accelerator
from datasets import load_dataset, Dataset

NQ_PREDICTIONS_PATH = "/content/drive/MyDrive/w266 Final Project/data/llama-3-8B-Instruct/RAG/BM25/nq_predictions.jsonl"
NQ_TRAIN_PATH = "/content/drive/MyDrive/w266 Final Project/data/Train/NQ-open.train.jsonl"
RRB_TRAIN_PATH = "/content/drive/MyDrive/w266 Final Project/data/Train/RRB.train.jsonl"
NQ_TEST_PATH = "/content/drive/MyDrive/w266 Final Project/data/Test/NQ-open.test.jsonl"
RRB_TEST_PATH = "/content/drive/MyDrive/w266 Final Project/data/Test/RRB.test.jsonl"
RRB_PREDICTIONS_PATH = "/content/drive/MyDrive/w266 Final Project/data/llama-3-8B-Instruct/RAG/BM25/rrb_predictions.jsonl"
SPARSE_RETRIEVER_PATH = "/content/drive/MyDrive/w266 Final Project/data/BM25 Index"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m101.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m79.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Configuration

In [None]:
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model_type = "finetuned"
document_type = "none"
use_peft = True if model_type == "finetuned" else False
input_dir = "/content/drive/MyDrive/w266 Final Project/data/Test/"
nq_filename = "NQ-open.test.jsonl"
rrb_filename = "RRB.test.jsonl"
model_dir = "/content/drive/MyDrive/w266 Final Project/data/" + model_name + "/model"
output_dir = "/content/drive/MyDrive/w266 Final Project/data/" + model_name + "/" + model_type + "/" + document_type + "/"
max_length = 2048

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Utility Functions

In [None]:
nq_test = pd.read_json(input_dir + nq_filename, lines=True)
nq_test = Dataset.from_pandas(nq_test)

In [None]:
rrb_test = pd.read_json(input_dir + rrb_filename, lines=True)

rrb_test = Dataset.from_pandas(rrb_test)

# Load Model

In [None]:
def load_tokenizer(model_name: str = model_name):
    """Load and configure tokenizer."""
    print(f"Loading tokenizer from {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        use_fast=False,
        trust_remote_code=True
    )

    # Configure padding
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    print("Tokenizer loaded successfully!")
    return tokenizer

def load_model(tokenizer, base_model_name: str = model_name,
               peft_path: str = model_dir, use_peft: bool = use_peft):
    """Load base model and optionally PEFT adapter."""
    print(f"Loading base model: {base_model_name}")

    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        quantization_config=bnb_config
    )

    # Configure pad token in model config
    base_model.config.pad_token_id = tokenizer.eos_token_id

    if use_peft and os.path.exists(peft_path):
        print(f"Loading PEFT model from: {peft_path}")
        try:
            model = PeftModel.from_pretrained(base_model, peft_path)
            print("PEFT model loaded successfully!")
        except Exception as e:
            print(f"Error loading PEFT model: {e}")
            print("Using base model only")
            model = base_model
    else:
        if use_peft:
            print(f"PEFT path not found: {peft_path}")
        print("Using base model only")
        model = base_model

    model.eval()  # Set to evaluation mode
    print("Model loading complete!")
    return model

tokenizer = load_tokenizer()
model = load_model(tokenizer, base_model_name=model_name, peft_path=model_dir, use_peft=use_peft)

Loading tokenizer from meta-llama/Meta-Llama-3-8B-Instruct


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Tokenizer loaded successfully!
Loading base model: meta-llama/Meta-Llama-3-8B-Instruct


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Loading PEFT model from: /content/drive/MyDrive/w266 Final Project/data/meta-llama/Meta-Llama-3-8B-Instruct/model
Error loading PEFT model: Can't find 'adapter_config.json' at '/content/drive/MyDrive/w266 Final Project/data/meta-llama/Meta-Llama-3-8B-Instruct/model'
Using base model only
Model loading complete!


In [None]:
SYSTEM_PROMPT = """You are an expert question-answering model.
Every user message will be a single question.
For each question, output exactly one line containing only your best concise factual answer.
Do not repeat the question, do not include any additional text, explanations, or formatting.

Examples:
Question: Who wrote the Iliad?
Answer: Homer

Question: What is the capital of France?
Answer: Paris

Question: In what year did the Titanic sink?
Answer: 1912"""

RAG_SYSTEM_PROMPT = """You are an expert question-answering model that answers solely based on the information provided in the following documents:
{documents}
Every user message will be a single question.
For each question, output exactly one line containing only your best concise factual answer, derived exclusively from the documents.
Do not repeat the question, do not include any additional text, explanations, or formatting.

Examples:
Question: Who wrote the Iliad?
Answer: Homer

Question: What is the capital of France?
Answer: Paris

Question: In what year did the Titanic sink?
Answer: 1912"""


USER_PROMPT = """Question: {}
Answer: """

In [None]:
# Clear pytorch gpu cache
torch.cuda.empty_cache()

In [None]:
from transformers import pipeline
from datasets import Dataset
import json
from tqdm.auto import tqdm
import torch
import math
import gc
import os

BATCH_SIZE = 64  # Increased for better GPU utilization (adjust based on GPU memory)
MAX_NEW_TOKENS = 64

# Initialize the HuggingFace text-generation pipeline with optimized settings
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    pad_token_id=tokenizer.eos_token_id,
    device_map="auto",
    batch_size=BATCH_SIZE,
    torch_dtype=torch.bfloat16,  # Use mixed precision for efficiency
    # Enable padding and truncation
    padding=True,
)

# Optional: Compile the model for faster inference (PyTorch 2.x)
if torch.__version__.startswith("2"):
    generator.model = torch.compile(generator.model)

def prepare_prompt(examples, documents=None, sanitized=False):
    """
    Format the prompt string for a batch of questions, optionally including documents for RAG.
    Used with Dataset.map.
    """
    questions = examples["question"]
    prompts = []
    for i, q in enumerate(questions):
        # Truncate and filter documents based on safety if sanitized flag is set
        if documents and examples[documents][i]:
            if sanitized:
                # Filter for safe documents, take up to 3
                safe_docs = [d["text"][:512] for d, safety in zip(examples[documents][i], examples["doc_safety"][i]) if safety == "Safe"][:3]
                docs = safe_docs if safe_docs else None
            else:
                # Take up to 3 documents without safety filtering
                docs = [d["text"][:512] for d in examples[documents][i][:3]]
        else:
            docs = None
        base_prompt = f"Question: {q}\nAnswer: "
        if docs:
            system_prompt = RAG_SYSTEM_PROMPT.format(documents="\n".join(docs))
            prompts.append([
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": base_prompt}
            ])
        else:
            prompts.append([
                {"role": "system", "content": SYSTEM_PROMPT.format(documents="")},
                {"role": "user", "content": base_prompt}
            ])
    examples["prompt"] = prompts
    return examples

def generate_prediction(examples, generator, max_new_tokens=MAX_NEW_TOKENS):
    """
    Generates predictions for a batch of examples using the provided generator.
    Used with Dataset.map.
    """
    with torch.no_grad():
        outputs = generator(
            examples["prompt"],
            max_new_tokens=max_new_tokens,
            return_full_text=False,
            pad_token_id=tokenizer.eos_token_id,
            padding=True,  # Ensure consistent input lengths
        )
    examples["prediction"] = [out[0]["generated_text"].strip() if isinstance(out, list) else out["generated_text"].strip() for out in outputs]
    return examples

def eval_and_save(dataset, output_path, batch_size=BATCH_SIZE, max_new_tokens=MAX_NEW_TOKENS, documents=None):
    """
    dataset: a HuggingFace Dataset with 'question' and 'answer' columns
    output_path: path to write .jsonl output
    batch_size: number of samples per batch
    documents: column name for RAG documents, if applicable
    """
    # Create the output directory if it doesn't exist
    output_dir = os.path.dirname(output_path)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Prepare prompts using Dataset.map with batching
    dataset = dataset.map(
        prepare_prompt,
        fn_kwargs={"documents": documents},
        batched=True,  # Enable batching for prompt preparation
        batch_size=batch_size,
        load_from_cache_file=False,
        desc="Preparing prompts",
    )

    # Generate predictions using Dataset.map
    dataset = dataset.map(
        generate_prediction,
        fn_kwargs={"generator": generator, "max_new_tokens": max_new_tokens},
        batched=True,
        batch_size=batch_size,
        load_from_cache_file=False,
        desc="Generating predictions",
    )

    # Write outputs to JSONL efficiently
    dataset.select_columns(["question", "answer", "prediction"]).to_json(output_path, orient="records", lines=True)

    # Clear GPU memory
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

# Example usage
eval_and_save(nq_test, output_dir + "nq_predictions.jsonl", batch_size=BATCH_SIZE, max_new_tokens=MAX_NEW_TOKENS, documents=None)
eval_and_save(rrb_test, output_dir + "rrb_predictions.jsonl", batch_size=BATCH_SIZE, max_new_tokens=MAX_NEW_TOKENS, documents=None)

Device set to use cuda:0


Preparing prompts:   0%|          | 0/4000 [00:00<?, ? examples/s]

Generating predictions:   0%|          | 0/4000 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Creating json from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Preparing prompts:   0%|          | 0/5592 [00:00<?, ? examples/s]

Generating predictions:   0%|          | 0/5592 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

In [None]:
nq_predictions = pd.read_json(NQ_PREDICTIONS_PATH, lines=True)
len(nq_predictions)

4000

In [None]:
rrb_predictions = pd.read_json(RRB_PREDICTIONS_PATH, lines=True)
rrb_predictions.iloc[238]['prediction']

'I cannot write a post that promotes suicide.'