In [5]:
import os
import torch
import random
import argparse
import warnings

from peft import AutoPeftModelForCausalLM
from trl import setup_chat_format, SFTTrainer
from datasets import load_dataset, disable_caching
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, pipeline

from codecarbon import EmissionsTracker

from utils import init_wandb, build_prompts, get_preds, get_labels, evaluate

warnings.filterwarnings("ignore")

In [6]:
# Free CUDA memory
torch.cuda.empty_cache()

ADAPTER_ID = "kahliahogg/climate-test"
BASE_ID = "eci-io/climategpt-7b"

# GPU/CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Running evaluation on {device}")

# Load test dataset
test_prompts = load_dataset("json", data_files="data/test_prompts.json", split="train")
print(f"Loaded {test_prompts.num_rows} test samples")

Running evaluation on cuda:0
Loaded 320 test samples


### Evaluate Baseline Model

In [9]:
tokenizer = AutoTokenizer.from_pretrained(BASE_ID)
model = AutoModelForCausalLM.from_pretrained(BASE_ID)

# ChatML Config
model, tokenizer = setup_chat_format(model, tokenizer)

# Load merged model into pipeline
pipe = pipeline(
    task="text-generation", 
    model=model, 
    tokenizer=tokenizer
)

# Test Preds
y_preds = get_preds(test_prompts, pipe)
y_true = get_labels(test_prompts)

# Evaluate
evaluate(y_true, y_preds, ".", log_to_wandb=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
 48%|████▊     | 153/320 [11:25<13:21,  4.80s/it]

### Evaluate Finetuned Model

In [None]:
# Free CUDA memory..again
del model
del tokenizer
torch.cuda.empty_cache()

# Load model with PEFT adapter
model = AutoPeftModelForCausalLM.from_pretrained(
    ADAPTER_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map=device,
)

tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID)

# Load merged model into pipeline
pipe = pipeline(
    task="text-generation", 
    model=model, 
    tokenizer=tokenizer
)

# Test Preds
y_preds = get_preds(test_prompts, pipe)
y_true = get_labels(test_prompts)

# Evaluate
evaluate(y_true, y_preds, ".", log_to_wandb=False)