# AgricGPT - Fine-tuning with CROP-benchmark Evaluation

- **QLoRA** fine-tuning on AI4Agr/CROP-dataset
- **Evaluation** using official CROP-benchmark (accuracy metrics)
- **Model card** with benchmark results pushed to HuggingFace

In [None]:
%pip install -q torch transformers datasets peft bitsandbytes accelerate huggingface_hub

In [None]:
# Authenticate this session with the Hugging Face Hub. login() is called with
# no arguments — presumably it prompts for an access token; confirm against
# the huggingface_hub documentation.
from huggingface_hub import login
login()

## Configuration

In [None]:
import torch, math, json, re, os
from collections import defaultdict

# Model
MODEL_NAME = "microsoft/phi-2"        # base checkpoint to quantize and fine-tune
OUTPUT_DIR = "./agri_model_results"   # checkpoints, metrics JSON and model card go here

# HuggingFace
HF_MODEL_NAME = "agricgpt-phi2"       # used as hub_model_id when pushing
PUSH_TO_HUB = True
SAVE_STEPS = 100                      # passed to TrainingArguments(save_steps=...)

# Data
DATASET_SIZE = 5000                   # max training records; a falsy value keeps them all
VALIDATION_SPLIT = 0.1                # fraction held out by train_test_split
MAX_SEQ_LENGTH = 512                  # tokenizer truncation length
BENCHMARK_SAMPLE_SIZE = 500  # English benchmark questions

# LoRA
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
TARGET_MODULES = ["fc1", "fc2", "q_proj", "k_proj", "v_proj", "dense"]

# Training
NUM_EPOCHS = 3
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
LEARNING_RATE = 2e-4
LOGGING_STEPS = 10
EVAL_STEPS = 50

# Fail fast with an explicit exception: a bare `assert` is removed when Python
# runs with -O, which would let the notebook continue without a GPU.
if not torch.cuda.is_available():
    raise RuntimeError("GPU required!")
print(f"GPU: {torch.cuda.get_device_name(0)}")

## Load Model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, GenerationConfig, pipeline

# 4-bit NF4 quantization settings with float16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# The tokenizer reuses its end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Load the quantized base model; device_map={"": 0} maps the whole model to device 0.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map={"": 0},
)
model.config.use_cache = False
print(f"Loaded: {MODEL_NAME}")

## Load CROP-benchmark (English)

In [None]:
from datasets import load_dataset

# Load the "train" split of the AI4Agr/CROP-benchmark dataset.
benchmark = load_dataset("AI4Agr/CROP-benchmark", split="train")

def is_english(text):
    """Heuristically decide whether ``text`` is English.

    Returns True when strictly more than 70% of the characters are ASCII
    (code point below 128). Empty or otherwise falsy input returns False.
    """
    if not text:
        return False

    ascii_chars = 0
    for ch in text:
        if ord(ch) < 128:
            ascii_chars += 1

    return ascii_chars / len(text) > 0.7

# Keep only the rows whose "Question" field passes the ASCII-ratio heuristic.
english_benchmark = benchmark.filter(
    lambda row: is_english(row.get("Question", ""))
)
print(f"English questions: {len(english_benchmark)}")

# Optionally evaluate on a fixed-seed random subset of the English questions.
if BENCHMARK_SAMPLE_SIZE:
    english_benchmark = (
        english_benchmark
        .shuffle(seed=42)
        .select(range(min(BENCHMARK_SAMPLE_SIZE, len(english_benchmark))))
    )
    print(f"Using {len(english_benchmark)} for evaluation")

## MCQ Evaluation Functions

In [None]:
def format_mcq_prompt(q, opts):
    return f"""### Instruction:
Answer the following agricultural question by selecting the correct option (A, B, C, or D).

Question: {q}

A) {opts['A']}
B) {opts['B']}
C) {opts['C']}
D) {opts['D']}

Reply with only the letter of the correct answer.

### Response:
"""

def extract_answer(response):
    """Extract the chosen option letter (A-D) from a model response.

    The checks run in order of decreasing reliability:

    1. A letter at the very start that is followed by ``)``, ``.``, ``:``,
       ``,`` or the end of the text (e.g. ``"b) nitrogen"``, ``"C"``).
    2. The first standalone *capital* A-D in the original text, so the
       lowercase article "a" in a sentence is not read as answer "A".
    3. The first standalone a-d in any case.
    4. The first character of the response, if it is one of A-D.

    Args:
        response: Raw generated text; may be empty or None.

    Returns:
        One of "A", "B", "C", "D", or None when no letter can be found.
    """
    if not response:
        return None
    text = response.strip()
    if not text:
        return None

    leading = re.match(r'\(?([ABCD])(?:[).:,]|\s*$)', text, re.IGNORECASE)
    if leading:
        return leading.group(1).upper()

    upper_text = text.upper()
    # Original casing first; fall back to the case-insensitive search.
    standalone = re.search(r'\b([ABCD])\b', text) or re.search(r'\b([ABCD])\b', upper_text)
    if standalone:
        return standalone.group(1)

    first_char = upper_text[:1]
    return first_char if first_char and first_char in 'ABCD' else None

def evaluate_mcq(pipe, gen_config, data):
    """Score a text-generation pipeline on multiple-choice questions.

    Args:
        pipe: Callable invoked as ``pipe(prompt, generation_config=gen_config)``
            that returns a list whose first item has a ``'generated_text'`` key.
        gen_config: Generation configuration forwarded to ``pipe`` unchanged.
        data: Iterable of mappings with the keys "Question", "Option A" to
            "Option D", "Answer" and, optionally, "Level".

    Returns:
        Dict with "accuracy" (0 when nothing was scored), "correct", "total"
        and "by_level", which maps each level to its own correct/total counts.
        Items with an empty question or answer are skipped.
    """
    correct, total = 0, 0
    by_level = defaultdict(lambda: {"correct": 0, "total": 0})

    for item in data:
        q = item.get("Question") or ""
        opts = {letter: item.get(f"Option {letter}", "") for letter in "ABCD"}
        # A missing or None answer becomes "" instead of raising on .strip().
        ans = str(item.get("Answer") or "").strip().upper()
        level = item.get("Level", "Unknown")
        if level is None:
            level = "Unknown"

        if not q or not ans:
            continue

        torch.manual_seed(42)
        prompt = format_mcq_prompt(q, opts)
        result = pipe(prompt, generation_config=gen_config)
        generated = result[0]['generated_text']

        # Strip the prompt by prefix. Splitting on the marker and taking the
        # last piece would pick the wrong segment if the model emits another
        # "### Response:" in its continuation.
        if generated.startswith(prompt):
            completion = generated[len(prompt):]
        else:
            completion = generated.split("### Response:")[-1]
        pred = extract_answer(completion)

        total += 1
        by_level[level]["total"] += 1
        if pred == ans:
            correct += 1
            by_level[level]["correct"] += 1

    return {"accuracy": correct/total if total else 0, "correct": correct, "total": total, "by_level": dict(by_level)}

# Confirms that the helper definitions in this cell have been executed.
print("Evaluation functions ready")

## Base Model Evaluation (BEFORE Training)

In [None]:
# Baseline: score the base model before any fine-tuning so the later
# fine-tuned score has a reference point.
base_pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# Greedy decoding (do_sample=False), capped at 10 new tokens.
gen_config = GenerationConfig(max_new_tokens=10, do_sample=False, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.eos_token_id)

print("Evaluating base model on CROP-benchmark...")
base_results = evaluate_mcq(base_pipe, gen_config, english_benchmark)
# The emoji in this message was mis-encoded (UTF-8 read as cp1252); restored.
print(f"\n📊 Base Model Accuracy: {base_results['accuracy']:.2%} ({base_results['correct']}/{base_results['total']})")

## Prepare Training Data

In [None]:
# Load the English JSON files of the CROP dataset.
dataset = load_dataset("AI4Agr/CROP-dataset", data_files="**/*_en/**/*.json", split="train")

# Shuffle with a fixed seed before truncating. Selecting the first N rows of a
# dataset assembled from many files would keep only the leading files; this
# also mirrors how the benchmark subset is drawn (shuffle(seed=42).select(...)).
if DATASET_SIZE:
    dataset = dataset.shuffle(seed=42).select(range(min(DATASET_SIZE, len(dataset))))

def format_instruction(s):
    """Render one record as a single training string.

    Reads ``s['instruction']`` and ``s['output']`` and returns a dict with a
    "text" key in the "### Instruction / ### Response" layout, terminated by
    the tokenizer's end-of-sequence token.
    """
    instruction_part = f"### Instruction:\n{s['instruction']}\n\n"
    response_part = f"### Response:\n{s['output']}{tokenizer.eos_token}"
    return {"text": instruction_part + response_part}

# Add the "text" column, then hold out VALIDATION_SPLIT of the rows (seeded).
dataset = dataset.map(format_instruction).train_test_split(test_size=VALIDATION_SPLIT, seed=42)
train_ds = dataset["train"]
eval_ds = dataset["test"]

def tokenize(examples):
    """Tokenize a batch of "text" strings, truncated to MAX_SEQ_LENGTH.

    No padding is applied here. The DataCollatorForLanguageModeling passed to
    the Trainer in this file pads each batch to its own longest sequence, so
    padding every example to MAX_SEQ_LENGTH only added wasted computation on
    pad tokens.
    """
    return tokenizer(examples["text"], truncation=True, max_length=MAX_SEQ_LENGTH)

# Drop the original columns so only the tokenizer outputs remain.
tok_train = train_ds.map(tokenize, batched=True, remove_columns=train_ds.column_names)
tok_eval = eval_ds.map(tokenize, batched=True, remove_columns=eval_ds.column_names)
print(f"Train: {len(tok_train)}, Eval: {len(tok_eval)}")

## Configure LoRA

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Prepare the quantized model for k-bit training, then wrap it with LoRA adapters.
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_config)

# Report how many parameters will receive gradients.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
trainable = sum(size for size, needs_grad in param_sizes if needs_grad)
total = sum(size for size, _ in param_sizes)
print(f"Trainable: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")

## Training

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Evaluate every EVAL_STEPS and checkpoint every SAVE_STEPS; the checkpoint
# with the lowest eval_loss is reloaded when training finishes.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR, num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE, per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, learning_rate=LEARNING_RATE,
    logging_steps=LOGGING_STEPS, fp16=True, optim="paged_adamw_32bit",
    warmup_ratio=0.03, lr_scheduler_type="cosine",
    eval_strategy="steps", eval_steps=EVAL_STEPS,
    save_strategy="steps", save_steps=SAVE_STEPS, save_total_limit=3,
    load_best_model_at_end=True, metric_for_best_model="eval_loss", greater_is_better=False,
    push_to_hub=PUSH_TO_HUB, hub_model_id=HF_MODEL_NAME if PUSH_TO_HUB else None,
    hub_strategy="every_save", report_to="none", seed=42
)

_lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)


def collate_with_eos_labels(features):
    """Collate a batch and rebuild the labels from the attention mask.

    DataCollatorForLanguageModeling sets the label to -100 wherever the token
    id equals pad_token_id. This notebook sets pad_token to eos_token, so the
    real end-of-sequence token appended to every training example would be
    masked too and the model would never be trained to stop. Masking by
    attention_mask ignores only the true padding positions.
    """
    batch = _lm_collator(features)
    labels = batch["input_ids"].clone()
    labels[batch["attention_mask"] == 0] = -100
    batch["labels"] = labels
    return batch


trainer = Trainer(model=model, train_dataset=tok_train, eval_dataset=tok_eval, args=training_args,
                  data_collator=collate_with_eos_labels)

print("Starting training...")
trainer.train()

## Fine-tuned Model Evaluation (AFTER Training)

In [None]:
# Raise the transformers log threshold to CRITICAL before building the
# pipeline around the fine-tuned model.
from transformers import logging
logging.set_verbosity(logging.CRITICAL)
model.eval()

ft_pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# Same questions and generation settings as the base-model run, so the two
# accuracies are directly comparable.
print("Evaluating fine-tuned model on CROP-benchmark...")
ft_results = evaluate_mcq(ft_pipe, gen_config, english_benchmark)
# The emoji in this message was mis-encoded (UTF-8 read as cp1252); restored.
print(f"\n📊 Fine-tuned Accuracy: {ft_results['accuracy']:.2%} ({ft_results['correct']}/{ft_results['total']})")

## Evaluation Summary

In [None]:
# Perplexity
def calc_ppl(model, tokenizer, texts, max_samples=100, max_length=512):
    """Compute token-level perplexity of ``model`` over ``texts``.

    Args:
        model: Causal LM called as ``model(**inputs, labels=input_ids)`` whose
            output exposes a scalar ``.loss``; must also provide ``.device``.
        tokenizer: Callable returning a mapping of tensors with "input_ids".
        texts: Sliceable sequence of strings.
        max_samples: Only the first ``max_samples`` texts are scored; None
            scores all of them.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        ``exp`` of the average per-token loss, or ``nan`` when nothing could
        be scored (no texts, or only sequences shorter than two tokens).
    """
    model.eval()
    total_loss, total_tokens = 0.0, 0
    with torch.no_grad():
        for text in texts[:max_samples]:
            inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
            # A causal LM predicts tokens 2..n, so its mean loss averages over
            # n - 1 positions; weight by that count, not the full length n.
            predicted = inputs["input_ids"].size(1) - 1
            if predicted < 1:
                continue  # nothing to predict for a sequence under two tokens
            out = model(**inputs, labels=inputs["input_ids"])
            total_loss += out.loss.item() * predicted
            total_tokens += predicted
    if total_tokens == 0:
        return float("nan")
    return math.exp(total_loss / total_tokens)

# Perplexity of the fine-tuned model on the held-out formatted texts.
perplexity = calc_ppl(model, tokenizer, [s["text"] for s in eval_ds])

# Get losses: training entries carry 'loss', evaluation entries 'eval_loss'.
history = trainer.state.log_history
train_losses = [(h['step'], h['loss']) for h in history if 'loss' in h and 'eval_loss' not in h]
eval_losses = [(h['step'], h['eval_loss']) for h in history if 'eval_loss' in h]

# Signed accuracy change in percentage points. The ":+" format prints the
# correct sign; a literal "+" prefix would show "+-1.2" on a regression.
accuracy_delta = (ft_results['accuracy'] - base_results['accuracy']) * 100

# The emoji and arrows below were mis-encoded (UTF-8 read as cp1252); restored.
print("\n" + "="*60)
print("EVALUATION SUMMARY")
print("="*60)
print(f"\n📉 Training Loss: {train_losses[0][1]:.4f} → {train_losses[-1][1]:.4f}")
print(f"📈 Validation Loss: {eval_losses[0][1]:.4f} → {min(e[1] for e in eval_losses):.4f}")
print(f"🎯 Perplexity: {perplexity:.2f}")
print(f"\n📋 CROP-benchmark Results:")
print(f"   Base Model:      {base_results['accuracy']:.2%}")
print(f"   Fine-tuned:      {ft_results['accuracy']:.2%}")
print(f"   Improvement:     {accuracy_delta:+.1f}%")

print(f"\n📊 By Difficulty:")
for level, stats in sorted(ft_results['by_level'].items()):
    acc = stats['correct']/stats['total'] if stats['total'] else 0
    print(f"   Level {level}: {acc:.2%} ({stats['correct']}/{stats['total']})")

## Save & Push to HuggingFace

In [None]:
# Save results
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Per-level accuracy of the fine-tuned model; levels with no scored
# questions are left out.
accuracy_by_level = {}
for level_key, level_counts in ft_results['by_level'].items():
    if level_counts['total'] > 0:
        accuracy_by_level[level_key] = level_counts['correct'] / level_counts['total']

# Headline metrics written next to the checkpoints.
results = {
    "benchmark": "CROP-benchmark",
    "base_accuracy": base_results['accuracy'],
    "finetuned_accuracy": ft_results['accuracy'],
    "perplexity": perplexity,
    "best_eval_loss": min(loss for _, loss in eval_losses),
    "accuracy_by_level": accuracy_by_level,
}

with open(f"{OUTPUT_DIR}/evaluation_results.json", "w") as f:
    json.dump(results, f, indent=2)

if PUSH_TO_HUB:
    from huggingface_hub import HfApi

    # Signed accuracy change in percentage points. The ":+" format prints the
    # correct sign; a literal "+" prefix would show "+-1.2" on a regression.
    accuracy_delta = (ft_results['accuracy'] - base_results['accuracy']) * 100

    # Model card: YAML front matter (doubled braces are literal braces inside
    # the f-string) followed by the markdown body.
    card = f"""---
language: [en]
license: apache-2.0
tags: [agriculture, phi-2, qlora, crop-science]
datasets: [AI4Agr/CROP-dataset]
base_model: microsoft/phi-2
model-index:
- name: AgricGPT-Phi2
  results:
  - task: {{type: question-answering, name: Agricultural MCQ}}
    dataset: {{name: CROP-benchmark, type: AI4Agr/CROP-benchmark}}
    metrics:
    - {{type: accuracy, value: {ft_results['accuracy']*100:.1f}, name: Accuracy}}
---

# AgricGPT - Agricultural QA Model

## Benchmark Results (CROP-benchmark)

| Model | Accuracy |
|-------|----------|
| Base Phi-2 | {base_results['accuracy']*100:.1f}% |
| **AgricGPT** | **{ft_results['accuracy']*100:.1f}%** ({accuracy_delta:+.1f}%) |

| Metric | Value |
|--------|-------|
| Perplexity | {perplexity:.2f} |
| Val Loss | {results['best_eval_loss']:.4f} |
"""
    # Explicit encoding so the card is written the same way on every platform.
    with open(f"{OUTPUT_DIR}/README.md", "w", encoding="utf-8") as f:
        f.write(card)

    trainer.push_to_hub()

    api = HfApi()
    # Resolve the account name once instead of calling whoami() twice.
    repo_id = f"{api.whoami()['name']}/{HF_MODEL_NAME}"
    # upload_file takes keyword-only arguments. Passed positionally, the
    # fourth value would also land on `token` rather than `repo_type`.
    api.upload_file(
        path_or_fileobj=f"{OUTPUT_DIR}/README.md",
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type="model",
    )
    # The check mark was mis-encoded (UTF-8 read as cp1252); restored.
    print(f"\n✅ Done! https://huggingface.co/{repo_id}")