# Evaluation: Model Assessment

This notebook evaluates trained models on test sets and benchmark prompts.

For comprehensive evaluation, use the evaluation scripts from the terminal.


In [None]:
import sys, os
from pathlib import Path
# Move one directory up so the relative paths below resolve -- presumably the
# notebook lives one level below the project root (TODO confirm).
# The guard keeps this cell idempotent: once the working directory already
# contains `src`, re-running the cell must not climb another level.
if not (Path.cwd() / 'src').is_dir():
    os.chdir('..')

# Put `<cwd>/src` at the front of the import path, without stacking a
# duplicate entry every time the cell is executed again.
src_dir = str(Path.cwd() / 'src')
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

# Config
MODEL_PATH = 'models/qwen_7b_contemplative'
TEST_SPLIT = 'data/splits/default_split.json'

print(f"Model: {MODEL_PATH}")
print(f"Test split: {TEST_SPLIT}")


## Load Model and Test Data


In [None]:
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# Hub id shared by the tokenizer and the base weights.
BASE_MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

print("Loading model...")

# The tokenizer is read from the base checkpoint, not from MODEL_PATH.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

# Base weights: float16, placed via device_map="cuda".
load_kwargs = {"torch_dtype": torch.float16, "device_map": "cuda"}
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **load_kwargs)

# Wrap the base model with the PEFT weights stored at MODEL_PATH.
model = PeftModel.from_pretrained(base_model, MODEL_PATH)

print("✅ Model loaded")


## Generate Test Responses


In [None]:
# Prompts used for a quick qualitative check of the loaded model.
test_prompts = [
    "How can I cultivate more compassion in difficult situations?",
    "What does it mean to see things as they really are?",
    "How should I approach conflicts with mindfulness?"
]

# One {'prompt', 'response'} record per prompt, in prompt order.
results = []
separator = '=' * 60
for prompt in test_prompts:
    print(f"\n{separator}")
    print(f"Prompt: {prompt}")
    print(separator)

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7
        )

    # Full decoded sequence (prompt + continuation); this is what gets saved,
    # so the stored record keeps the same content as before.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # For display, cut at the prompt's token count rather than its character
    # count: decoded text is not guaranteed to reproduce the prompt
    # character-for-character, so slicing the string can clip or leak text.
    prompt_token_count = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(
        outputs[0][prompt_token_count:],
        skip_special_tokens=True,
    ).strip()
    print(f"\nResponse:\n{completion}")

    results.append({'prompt': prompt, 'response': response})


In [None]:
# Save and sync results
import yaml
from utils.sagemaker_utils import sync_to_s3

eval_file = 'results/evaluation_results.json'

# open(..., 'w') does not create missing parent directories, so make sure
# `results/` exists before writing.
Path(eval_file).parent.mkdir(parents=True, exist_ok=True)
with open(eval_file, 'w') as f:
    json.dump(results, f, indent=2)

print(f"\n✅ Evaluation results saved to {eval_file}")

# Sync to S3
with open('configs/sagemaker_configs.yaml') as f:
    cfg = yaml.safe_load(f)

bucket = cfg['s3']['bucket']
# Upload is skipped for this specific bucket name -- presumably the
# unedited placeholder from the config template (TODO confirm).
if bucket != "your-bucket-contemplative-ai":
    sync_to_s3(eval_file, f"s3://{bucket}/results/evaluations/evaluation_results.json")
    print("✅ Results synced to S3")
