<a href="https://colab.research.google.com/github/andrea-t94/airflow-net/blob/master/research/finetuning/notebooks/generate_test_samples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generate Test Samples (Model Inference)

This notebook generates Airflow DAGs using both the **Base Model** and the **Fine-Tuned Model** on the test dataset. The output is saved as JSONL files which are then used for evaluation.

### 🎯 Goal
Produce DAG samples from unseen test instructions to measure model performance.

### ⚠️ Runtime Note
To avoid Out-of-Memory (OOM) errors on T4 GPUs, we generate samples in two distinct passes:
1. **Pass 1**: Load Base Model -> Generate -> Unload the model and free GPU memory.
2. **Pass 2**: Load Fine-Tuned Model -> Generate -> Unload the model and free GPU memory.

## 1. Setup & Installation

In [None]:
%%capture
# NOTE: `%%capture` must stay on the first line of the cell; it silences the
# (very verbose) pip output produced below.
# `os` is imported here but is not referenced by any cell shown in this notebook.
import os
import torch
import gc

# Check if running in Colab
# `userdata` (Colab secrets) is reused by the auth step in the configuration cell,
# and `IN_COLAB` gates the Colab-only install, login and download steps.
try:
    from google.colab import userdata
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

# Install Unsloth & libraries
# Only done on Colab; outside Colab the environment is presumably expected to
# already provide Unsloth and its dependencies (TODO confirm).
if IN_COLAB:
    !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
    !pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

# Installed in every environment (Colab or not); versions are not pinned.
!pip install datasets huggingface_hub

In [None]:
# Report the accelerator this runtime exposes (device index 0 only).
if not torch.cuda.is_available():
    print("❌ No GPU detected.")
else:
    # Device name first, then total memory converted from bytes to GB (1e9 bytes).
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

## 2. Configuration

In [None]:
from huggingface_hub import login

# Models
BASE_MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
FINETUNED_MODEL_ID = "andrea-t94/qwen2.5-1.5b-airflow-instruct" # Adapter ID

# Dataset
DATASET_NAME = "andrea-t94/airflow-dag-dataset"

# Generation Config
MAX_NEW_TOKENS = 2048 # Limit to avoid extremely long generations
BATCH_SIZE = 4        # Conservative batch size for T4

# Auth
# On Colab, try the `HF_TOKEN` secret first and fall back to the interactive
# login prompt if the secret cannot be read or the token login fails.
if IN_COLAB:
    try:
        login(token=userdata.get('HF_TOKEN'), add_to_git_credential=True)
    except Exception:
        # `Exception` (not a bare `except`) so that KeyboardInterrupt / SystemExit
        # still stop the cell instead of silently starting the interactive login.
        login(add_to_git_credential=True)

## 3. Load Test Data

In [None]:
from datasets import load_dataset

# Pull only the held-out "test" split of the configured dataset.
dataset = load_dataset(DATASET_NAME, split="test")
num_test_examples = len(dataset)
print(f"✅ Loaded {num_test_examples} test examples")

# For a quick smoke test, uncomment to keep only the first 10 rows:
# dataset = dataset.select(range(10))

## 4. Helper Functions
Utilities for generating text and parsing results.

In [None]:
from unsloth import FastLanguageModel
from tqdm.auto import tqdm
import time

def extract_code(text):
    """Extract the first code block from a model response.

    Resolution order:
      1. The first fence opened with exactly "```python" (case-sensitive) — its
         content up to the next fence, or to the end of the text if unclosed.
      2. Otherwise the first generic "```" fence. If the opening fence line
         carries a bare language tag (e.g. "```py", "```Python", "```bash"),
         the tag is dropped so it does not end up inside the extracted code.
      3. Otherwise the whole text.

    Args:
        text: Raw generated response.

    Returns:
        The extracted code with surrounding whitespace stripped.
    """
    fence = "```"

    if "```python" in text:
        after_open = text.split("```python", 1)[1]
        return after_open.split(fence, 1)[0].strip()

    if fence in text:
        # Content between the first and second fence (or to the end if unclosed).
        body = text.split(fence, 2)[1]
        first_line, newline, rest = body.partition("\n")
        tag = first_line.strip()
        # A single token directly after the opening fence, followed by a newline,
        # is treated as a language tag rather than code. Lines containing spaces
        # or operators (e.g. "x = 1") do not match and are kept.
        if newline and tag and all(ch.isalnum() or ch in "+-_#." for ch in tag):
            body = rest
        return body.strip()

    return text.strip()

def run_inference_pass(model_id, dataset, output_filename, is_adapter=False):
    """Load a model, generate one response per dataset row, save JSONL, free memory.

    Args:
        model_id: Model name passed as ``model_name`` to
            ``FastLanguageModel.from_pretrained``.
        dataset: Iterable of rows; each row is indexed as ``row["messages"]`` and
            every message as ``message["role"]``. Messages whose role is
            ``"assistant"`` are dropped before the prompt is built.
        output_filename: Path of the JSONL file to write (opened in ``"w"`` mode,
            so an existing file is overwritten). One JSON object per sample with
            keys ``prompt``, ``generated_text``, ``code``, ``model`` and
            ``original_dataset_idx``.
        is_adapter: Not read by this function; base models and adapters go
            through the same loading call. Kept so existing call sites work.

    Returns:
        None. Results are written to ``output_filename``.

    Notes:
        Relies on the notebook-level ``BATCH_SIZE`` and ``MAX_NEW_TOKENS``.
        Sampling is enabled and no seed is set here, so outputs may differ
        between runs. The model is released even if generation fails.
    """
    import json

    print(f"\n🚀 Starting Pass for: {model_id}")

    # 1. Load Model
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_id,
        max_seq_length = 4096,
        dtype = None,
        load_in_4bit = True, # Use 4bit for speed & memory
    )

    # Declared before the loop so the cleanup below can always drop them.
    inputs = outputs = generated_ids = None
    try:
        FastLanguageModel.for_inference(model)

        # 2. Prepare Inputs
        # Render each row with the chat template, keeping only non-assistant
        # messages so a reference answer (if present) never reaches the prompt.
        prompts = [
            tokenizer.apply_chat_template(
                [m for m in row["messages"] if m["role"] != "assistant"],
                tokenize=False,
                add_generation_prompt=True,
            )
            for row in dataset
        ]

        # 3. Batch Generation
        results = []
        # Left padding keeps every prompt flush against the generated tokens, so
        # the slice below can cut at one shared column for the whole batch.
        tokenizer.padding_side = "left"

        print(f"Generating {len(prompts)} samples...")
        for i in tqdm(range(0, len(prompts), BATCH_SIZE)):
            batch_prompts = prompts[i:i+BATCH_SIZE]
            # truncation=True without max_length: presumably falls back to the
            # tokenizer's configured maximum — TODO confirm.
            inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True).to("cuda")

            outputs = model.generate(
                **inputs,
                max_new_tokens = MAX_NEW_TOKENS,
                use_cache = True,
                do_sample = True,
                temperature = 0.1,
                top_p = 0.9,
            )

            # Only decode the new tokens: drop the (padded) prompt columns.
            generated_ids = outputs[:, inputs.input_ids.shape[1]:]
            decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

            # Store results; i + j maps the batch position back to the dataset row.
            for j, text in enumerate(decoded):
                results.append({
                    "prompt": batch_prompts[j],
                    "generated_text": text,
                    "code": extract_code(text),
                    "model": model_id,
                    "original_dataset_idx": i + j
                })

        # 4. Save Results (JSONL)
        with open(output_filename, "w", encoding="utf-8") as f:
            for r in results:
                f.write(json.dumps(r) + "\n")
        print(f"✅ Saved results to {output_filename}")
    finally:
        # 5. Cleanup
        # Drop every reference to GPU tensors first, then collect garbage, and
        # only then empty the CUDA cache: empty_cache() can only return blocks
        # that are no longer referenced.
        inputs = outputs = generated_ids = None
        del model
        del tokenizer
        gc.collect()
        torch.cuda.empty_cache()
        print("🧹 Memory Cleared")

## 5. Run Base Model

In [None]:
# Pass 1: generate with the untouched base model.
run_inference_pass(
    BASE_MODEL_ID,
    dataset,
    "base_model_samples.jsonl",
    is_adapter=False,
)

## 6. Run Fine-Tuned Model

In [None]:
# Pass 2: generate with the fine-tuned model.
# Unsloth handles adapter loading automatically if passed as model_name.
run_inference_pass(
    FINETUNED_MODEL_ID,
    dataset,
    "finetuned_model_samples.jsonl",
    is_adapter=True,
)

## 7. Download Results

In [None]:
# Browser downloads only exist on Colab; elsewhere the files stay on disk.
if IN_COLAB:
    from google.colab import files

    for position, result_file in enumerate(
        ("base_model_samples.jsonl", "finetuned_model_samples.jsonl")
    ):
        if position > 0:
            # Short pause between the two download requests.
            time.sleep(2)
        files.download(result_file)