# AgricGPT - Agricultural Domain Instruction Tuning with QLoRA

This notebook fine-tunes **Microsoft Phi-2** on the **AI4Agr/CROP-dataset** for agricultural Q&A using:
- **QLoRA** (4-bit quantization + Low-Rank Adaptation)
- **Instruction tuning** format

**Requirements**: T4 GPU or better

## 1. Install Dependencies

Run this cell and **restart the runtime** if prompted.

In [None]:
# Version specifiers are quoted: unquoted, the shell parses `>` as an output
# redirection, so pip would see bare package names (no version constraint)
# and a stray file named e.g. `=2.0.0` would be created.
!pip install -q \
    "torch>=2.0.0" \
    "transformers>=4.40.0" \
    "datasets>=2.0.0" \
    "peft>=0.10.0" \
    "bitsandbytes>=0.43.0" \
    "accelerate>=0.27.0" \
    huggingface_hub

## 2. Configuration

In [None]:
import torch

# --- Base model and local output location ---
MODEL_NAME = "microsoft/phi-2"       # Hub id of the checkpoint that gets fine-tuned
OUTPUT_DIR = "./agri_model_results"  # training output and the final save live here

# --- Hugging Face Hub ---
HF_MODEL_NAME = "agricgpt-phi2"  # repo name used when pushing -- change this!

# --- Dataset ---
DATASET_SIZE = 5000   # number of samples to keep; None keeps the full dataset
MAX_SEQ_LENGTH = 512  # per-example token length used when tokenizing

# --- LoRA adapter hyper-parameters ---
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
# Names of the sub-modules that receive LoRA adapters.
TARGET_MODULES = [
    "fc1",
    "fc2",
    "q_proj",
    "k_proj",
    "v_proj",
    "dense",
]

# --- Training schedule ---
NUM_EPOCHS = 3
BATCH_SIZE = 2                   # per-device batch size
GRADIENT_ACCUMULATION_STEPS = 4  # batches accumulated per optimizer step
LEARNING_RATE = 2e-4
LOGGING_STEPS = 10

# Fail fast when no CUDA device is visible; otherwise report which GPU is in use.
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    raise ValueError("GPU required!")

## 3. Load Model with 4-bit Quantization

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Seed both the CPU and the CUDA random generators.
torch.manual_seed(42)
torch.cuda.manual_seed(42)

# Tokenizer: the EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with float16 compute and no double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized base model with every module placed on device 0.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map={"": 0},
    trust_remote_code=True,
    quantization_config=bnb_config,
)
# Turn the generation cache off on the model config.
model.config.use_cache = False

print(f"Model loaded: {MODEL_NAME}")

## 4. Base Model Output (BEFORE Training)

Let's see how the model responds **before** fine-tuning on agricultural data.

In [None]:
from transformers import GenerationConfig, pipeline

# Wrap the not-yet-fine-tuned model in a text-generation pipeline.
base_pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Sampling settings for the baseline run; the EOS id doubles as the pad id.
base_gen_config = GenerationConfig(
    do_sample=True,
    temperature=0.7,
    max_new_tokens=150,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Agricultural test question in the Instruction/Response prompt layout.
torch.manual_seed(42)  # fixed seed before sampling
test_prompt = "### Instruction:\nWhat is crop rotation?\n\n### Response:\n"

print("=" * 60, "BASE MODEL OUTPUT (before training)", "=" * 60, sep="\n")
result = base_pipe(test_prompt, generation_config=base_gen_config)
print(result[0]["generated_text"])

## 5. Load and Prepare Dataset

In [None]:
from datasets import load_dataset

# Load every English JSON file of the CROP dataset as a single train split.
dataset = load_dataset(
    "AI4Agr/CROP-dataset",
    data_files="**/*_en/**/*.json",
    split="train"
)

if DATASET_SIZE:
    # The glob pulls in many files and rows arrive in file order, so taking
    # the first N rows would over-represent whichever files load first.
    # Shuffle with a fixed seed before subsampling to get a reproducible,
    # representative subset.
    dataset = dataset.shuffle(seed=42).select(
        range(min(DATASET_SIZE, len(dataset)))
    )

print(f"Dataset size: {len(dataset)} samples")

In [None]:
def format_instruction(sample):
    """Render one dataset row as an Instruction/Response training string.

    Reads ``sample['instruction']`` and ``sample['output']`` and appends the
    tokenizer's EOS token after the response.

    Returns:
        A dict with a single ``"text"`` key holding the rendered prompt.
    """
    instruction_part = f"### Instruction:\n{sample['instruction']}\n\n"
    response_part = f"### Response:\n{sample['output']}{tokenizer.eos_token}"
    return {"text": instruction_part + response_part}


# Add the rendered "text" column to every row.
dataset = dataset.map(format_instruction)

def tokenize_function(examples):
    """Tokenize a batch of rendered prompts.

    Every entry of ``examples["text"]`` is truncated and padded to exactly
    ``MAX_SEQ_LENGTH`` tokens.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=MAX_SEQ_LENGTH,
        truncation=True,
        padding="max_length",
    )
    return encoded


# Tokenize in batches and drop all original columns, keeping only tokenizer output.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)
print(f"Tokenized: {len(tokenized_dataset)} samples")

## 6. Configure LoRA

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Run PEFT's preparation step for k-bit training on the quantized model.
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings, taken from the configuration constants.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)

model = get_peft_model(model, peft_config)

# Tally trainable vs. total parameter counts in a single pass.
trainable = 0
total = 0
for param in model.parameters():
    param_count = param.numel()
    total += param_count
    if param.requires_grad:
        trainable += param_count
print(f"Trainable: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")

## 7. Training

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Hyper-parameters come from the configuration constants; fp16 mixed precision
# with a paged AdamW optimizer, cosine LR schedule and one checkpoint per epoch.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    logging_steps=LOGGING_STEPS,
    fp16=True,
    optim="paged_adamw_32bit",
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    report_to="none",
    seed=42
)

# DataCollatorForLanguageModeling(mlm=False) sets the label to -100 for every
# position whose token id equals pad_token_id. The pad token was set to the
# EOS token, so the genuine EOS that terminates each training example would be
# masked out of the loss as well and the model would never learn to stop.
lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)


def collate_keep_eos(features):
    """Collate a causal-LM batch whose labels ignore padding but keep EOS.

    Args:
        features: List of tokenized examples (``input_ids`` and
            ``attention_mask``) as produced by the tokenization step.

    Returns:
        The batch from ``DataCollatorForLanguageModeling`` with ``labels``
        rebuilt from ``input_ids``: positions where ``attention_mask`` is 0
        are set to -100, all attended tokens (including EOS) are kept.
    """
    batch = lm_collator(features)
    labels = batch["input_ids"].clone()
    # Mask by attention rather than by token id so real EOS tokens survive.
    labels[batch["attention_mask"] == 0] = -100
    batch["labels"] = labels
    return batch


trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset,
    args=training_args,
    data_collator=collate_keep_eos
)

print("Starting training...")
trainer.train()

## 8. Fine-Tuned Model Output (AFTER Training)

In [None]:
from transformers import logging

# Only let CRITICAL messages from transformers through while generating.
logging.set_verbosity(logging.CRITICAL)

# Switch the fine-tuned model to evaluation mode.
model.eval()

# Sampling settings for the fine-tuned model; the EOS id doubles as the pad id.
generation_config = GenerationConfig(
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.2,
    max_new_tokens=256,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Text-generation pipeline around the fine-tuned model.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
)

def ask_agrigpt(question: str) -> str:
    """Ask the fine-tuned model a question and return only its answer.

    The question is wrapped in the Instruction/Response template used for
    training and passed to the text-generation pipeline.

    Args:
        question: The user's question in plain text.

    Returns:
        The model's answer to ``question``, with the prompt removed and any
        follow-up "### Instruction:" turn the model invented cut off.
    """
    prompt = f"### Instruction:\n{question}\n\n### Response:\n"
    result = pipe(prompt, generation_config=generation_config)
    generated = result[0]['generated_text']

    # Strip the known prompt prefix rather than taking the text after the
    # LAST "### Response:" marker: if the model keeps generating extra
    # instruction/response turns, the last marker belongs to an invented
    # follow-up, not to the answer for `question`.
    if generated.startswith(prompt):
        completion = generated[len(prompt):]
    else:
        # Fallback: take everything after the first response marker.
        completion = generated.split("### Response:\n", 1)[-1]

    # Drop any follow-up turn the model generated after its answer.
    return completion.split("### Instruction:")[0].strip()

# Ask the same question used for the base-model run so the outputs can be compared.
torch.manual_seed(42)  # same seed as the base-model run
print("=" * 60, "FINE-TUNED MODEL OUTPUT (after training)", "=" * 60, sep="\n")
print("Q: What is crop rotation?")
print(f"A: {ask_agrigpt('What is crop rotation?')}")

## 9. More Test Questions

In [None]:
# Additional agricultural questions for the fine-tuned model.
questions = [
    "How can I prevent soil erosion on my farm?",
    "What are the benefits of organic farming?",
    "How do I manage pests naturally?",
]

# Print each question, the model's answer, and a divider line.
for q in questions:
    print(f"Q: {q}")
    print(f"A: {ask_agrigpt(q)}", "-" * 40, sep="\n")

## 10. Save & Push to Hugging Face Hub 🤗

In [None]:
# Write the trained model and the tokenizer side by side under OUTPUT_DIR.
save_path = OUTPUT_DIR + "/final_model"

trainer.save_model(save_path)         # model as saved by the Trainer
tokenizer.save_pretrained(save_path)  # tokenizer files, same directory

print(f"Saved to: {save_path}")

In [None]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub; enter your HF token when prompted.
login()

In [None]:
from huggingface_hub import whoami

# Upload the model and the tokenizer to the same Hub repository.
print(f"Pushing to Hugging Face as '{HF_MODEL_NAME}'...")
model.push_to_hub(HF_MODEL_NAME)
tokenizer.push_to_hub(HF_MODEL_NAME)

# Build the real repo URL instead of printing a "YOUR_USERNAME" placeholder:
# a bare repo name lives under the logged-in account, while a name that
# already contains a namespace ("org/name") is used as-is.
if "/" in HF_MODEL_NAME:
    repo_id = HF_MODEL_NAME
else:
    repo_id = f"{whoami()['name']}/{HF_MODEL_NAME}"
print(f"✅ Done! View at: https://huggingface.co/{repo_id}")