# AgricGPT - Agricultural Domain Instruction Tuning with QLoRA

This notebook fine-tunes **Microsoft Phi-2** on the **AI4Agr/CROP-dataset** for agricultural Q&A using:
- **QLoRA** (4-bit quantization + Low-Rank Adaptation)
- **Instruction tuning** format

**Requirements**: T4 GPU or better

## 1. Install Dependencies

Run this cell and **restart the runtime** if prompted.

In [None]:
# Each requirement is quoted: without quotes the shell treats ">" as output
# redirection, so e.g. `torch>=2.0.0` would install an unconstrained torch and
# write pip's output to a file named "=2.0.0".
!pip install -q \
    "torch>=2.0.0" \
    "transformers>=4.40.0" \
    "datasets>=2.0.0" \
    "peft>=0.10.0" \
    "bitsandbytes>=0.43.0" \
    "accelerate>=0.27.0" \
    huggingface_hub

## 2. Configuration

All hyperparameters in one place for easy experimentation.

In [None]:
import torch

# --- Base model and where training artifacts are written ---
MODEL_NAME = "microsoft/phi-2"
OUTPUT_DIR = "./agri_model_results"

# --- Repository name used when pushing to the Hugging Face Hub ---
HF_MODEL_NAME = "agricgpt-phi2"  # rename to whatever you want the repo called

# --- Data ---
DATASET_SIZE = 5000   # number of samples to keep; None keeps the full dataset
MAX_SEQ_LENGTH = 512  # token limit passed to the tokenizer

# --- LoRA adapter hyperparameters ---
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
TARGET_MODULES = ["fc1", "fc2", "q_proj", "k_proj", "v_proj", "dense"]

# --- Optimisation ---
NUM_EPOCHS = 3
BATCH_SIZE = 2                   # per-device batch size
GRADIENT_ACCUMULATION_STEPS = 4  # 2 x 4 = 8 samples per optimizer step per device
LEARNING_RATE = 2e-4
LOGGING_STEPS = 10

# --- Fail fast when no CUDA device is visible; otherwise report which one ---
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    raise ValueError("GPU required! Enable T4 GPU in Runtime > Change runtime type")

## 3. Load Model with 4-bit Quantization

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Set seeds
# The CPU and CUDA random generators are seeded with the same fixed value.
torch.manual_seed(42)
torch.cuda.manual_seed(42)

# 4-bit quantization config (QLoRA)
# NF4 4-bit weights, float16 compute dtype, double quantization switched off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load tokenizer
# NOTE(review): trust_remote_code=True presumably lets code shipped in the
# model repository run locally -- confirm it is still required for this model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# The EOS token doubles as the padding token.
# NOTE(review): with pad == EOS, anything that masks labels by pad-token id
# will presumably also mask genuine EOS tokens -- confirm against the data
# collator used for training.
tokenizer.pad_token = tokenizer.eos_token

# Load model
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    trust_remote_code=True,
    # Maps the root module ("") to device index 0; presumably this places the
    # whole model on the first GPU -- TODO confirm.
    device_map={"":0}
)
# Turn off the generation cache on the model config (presumably because it is
# not needed while training -- TODO confirm).
model.config.use_cache = False

print(f"Model loaded: {MODEL_NAME}")

## 4. Load and Prepare Dataset

In [None]:
from datasets import load_dataset

# Fetch the "train" split of AI4Agr/CROP-dataset, restricted to JSON files
# that live under a directory whose name ends in "_en".
dataset = load_dataset(
    path="AI4Agr/CROP-dataset",
    split="train",
    data_files="**/*_en/**/*.json",
)

# Pilot runs keep only the first DATASET_SIZE rows (or every row when the
# dataset is smaller); a falsy DATASET_SIZE leaves the dataset untouched.
if DATASET_SIZE:
    dataset = dataset.select(range(min(len(dataset), DATASET_SIZE)))

print(f"Dataset size: {len(dataset)} samples")
print(f"Sample: {dataset[0]}")

In [None]:
# Format as instruction-response pairs with EOS token
def format_instruction(sample):
    """Build the training text for a single dataset record.

    The record's ``instruction`` and ``output`` fields are placed under
    "### Instruction:" and "### Response:" headers, and the tokenizer's EOS
    token is appended directly after the response.

    Args:
        sample: mapping that provides the ``instruction`` and ``output`` keys.

    Returns:
        A dict with a single ``text`` key holding the formatted string.
    """
    instruction_block = f"### Instruction:\n{sample['instruction']}\n\n"
    response_block = f"### Response:\n{sample['output']}{tokenizer.eos_token}"
    return {"text": f"{instruction_block}{response_block}"}


# Add the formatted "text" column to every record.
dataset = dataset.map(format_instruction)

# Preview the first 500 characters of the first formatted record.
print("Formatted example:", dataset[0]["text"][:500], sep="\n")

In [None]:
# Tokenize
def tokenize_function(examples):
    """Tokenize a batch of formatted examples without padding them.

    Sequences are truncated to ``MAX_SEQ_LENGTH`` tokens but left at their
    natural length. Padding is deferred to the data collator, which pads each
    batch only up to its longest member; padding every sample to
    ``MAX_SEQ_LENGTH`` here would make every training step process full-length
    sequences regardless of how short the actual texts are.

    Args:
        examples: batch mapping with a ``text`` column (list of strings).

    Returns:
        The tokenizer output for the batch (``input_ids`` and
        ``attention_mask`` lists of varying length).
    """
    return tokenizer(
        examples["text"],
        truncation=True,  # NOTE: texts longer than the limit lose their trailing EOS
        max_length=MAX_SEQ_LENGTH,
    )


# Tokenize in batches and drop the original columns so only the tokenizer
# outputs remain in the dataset.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names
)

print(f"Tokenized dataset ready: {len(tokenized_dataset)} samples")

## 5. Configure LoRA Adapters

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Get the quantized base model ready for k-bit fine-tuning.
model = prepare_model_for_kbit_training(model)

# Describe the LoRA adapters using the hyperparameters from the config cell.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)

# Wrap the model with the adapters defined above.
model = get_peft_model(model, peft_config)

# Report how many parameters will receive gradients versus the overall count.
param_counts = [(p.numel(), p.requires_grad) for p in model.parameters()]
total = sum(count for count, _ in param_counts)
trainable = sum(count for count, needs_grad in param_counts if needs_grad)
print(f"Trainable: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")

## 6. Training

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Trainer hyperparameters; values come from the configuration cell.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    logging_steps=LOGGING_STEPS,
    fp16=True,
    optim="paged_adamw_32bit",
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    report_to="none",
    seed=42
)


def causal_lm_collator(features):
    """Pad a batch and build causal-LM labels that keep the real EOS token.

    In this notebook the tokenizer's pad token is its EOS token.
    ``DataCollatorForLanguageModeling(mlm=False)`` sets every label whose id
    equals the pad token id to -100, which also removes the EOS appended to
    each response from the loss, so the model is never trained to stop.
    Masking on ``attention_mask`` instead ignores only genuine padding.

    Args:
        features: list of tokenized examples, each a dict with ``input_ids``
            and ``attention_mask``.

    Returns:
        A batch with ``input_ids``, ``attention_mask`` and ``labels`` tensors.
    """
    batch = tokenizer.pad(features, return_tensors="pt")
    labels = batch["input_ids"].clone()
    # -100 is the index the loss ignores; apply it to padding positions only.
    labels[batch["attention_mask"] == 0] = -100
    batch["labels"] = labels
    return batch


trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset,
    args=training_args,
    data_collator=causal_lm_collator
)

print("Starting training...")
trainer.train()

## 7. Inference Helper

In [None]:
from transformers import GenerationConfig, pipeline, logging

# Raise the transformers log threshold to CRITICAL so lower-severity messages
# are not shown during generation.
logging.set_verbosity(logging.CRITICAL)
# Switch the model to evaluation mode before generating.
model.eval()

# Text-generation pipeline wrapping the fine-tuned model and its tokenizer.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# Sampling settings passed to the pipeline by ask_agrigpt: up to 256 new
# tokens, sampling with temperature 0.7 / top-p 0.9 and a repetition penalty.
# EOS is used both as the stop token and as the padding token.
generation_config = GenerationConfig(
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.2,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id
)

def ask_agrigpt(question: str) -> str:
    """Ask AgricGPT a question about agriculture.

    The question is wrapped in the same "### Instruction / ### Response"
    template used for training, sent through the text-generation pipeline,
    and the model's answer to *this* question is extracted from the output.

    Args:
        question: the user's question in plain text.

    Returns:
        The generated answer with surrounding whitespace stripped. Anything
        the model produced from a following "### Instruction:" header onward
        is discarded.
    """
    prompt = f"### Instruction:\n{question}\n\n### Response:\n"
    result = pipe(prompt, generation_config=generation_config)
    generated = result[0]["generated_text"]

    # Strip the prompt by prefix rather than taking the text after the LAST
    # "### Response:" marker: if the model keeps going and invents a follow-up
    # instruction/response pair, the last marker belongs to that invented
    # pair, not to the user's question.
    if generated.startswith(prompt):
        answer = generated[len(prompt):]
    else:
        # Fallback when the output does not echo the prompt verbatim: keep
        # what follows the first response marker.
        answer = generated.split("### Response:\n", 1)[-1]

    # Cut off any runaway continuation that starts a new instruction block.
    return answer.split("### Instruction:")[0].strip()


print("Inference helper ready!")

## 8. Test the Model

In [None]:
# Test with crop rotation question
# Generation samples (do_sample=True), so the torch seed is reset first;
# presumably this is to make the printed answer repeatable between runs.
torch.manual_seed(42)
question = "What is crop rotation?"
print(f"Q: {question}")
print(f"A: {ask_agrigpt(question)}")

In [None]:
# Test with soil erosion question
# Generation samples (do_sample=True), so the torch seed is reset first;
# presumably this is to make the printed answer repeatable between runs.
torch.manual_seed(42)
question = "How can I prevent soil erosion on my farm?"
print(f"Q: {question}")
print(f"A: {ask_agrigpt(question)}")

In [None]:
# Test with organic farming question
# Generation samples (do_sample=True), so the torch seed is reset first;
# presumably this is to make the printed answer repeatable between runs.
torch.manual_seed(42)
question = "What are the benefits of organic farming?"
print(f"Q: {question}")
print(f"A: {ask_agrigpt(question)}")

## 9. Save Model Locally

In [None]:
# Save the fine-tuned model locally
# Both the model (via the trainer) and the tokenizer are written to a
# "final_model" directory under OUTPUT_DIR.
# NOTE(review): since the model is wrapped with PEFT, this presumably stores
# the LoRA adapter weights rather than a merged full model -- confirm.
save_path = f"{OUTPUT_DIR}/final_model"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print(f"Model saved to: {save_path}")

## 10. Push to Hugging Face Hub 🤗

Upload your trained model to your Hugging Face account.

1. Get your token at: https://huggingface.co/settings/tokens
2. Make sure you have **write** access enabled

In [None]:
from huggingface_hub import login

# Login to Hugging Face (will prompt for your token)
# The token needs write access for the upload in the next cell to succeed.
login()

In [None]:
# Push model and tokenizer to Hugging Face Hub
# Both are uploaded to the same repository, named by HF_MODEL_NAME.
print(f"Pushing model to Hugging Face as '{HF_MODEL_NAME}'...")

model.push_to_hub(HF_MODEL_NAME)
tokenizer.push_to_hub(HF_MODEL_NAME)

# The check-mark character was previously mis-encoded in this message.
print("\n✅ Model uploaded successfully!")
# YOUR_USERNAME is a placeholder: substitute your Hugging Face account name.
print(f"View at: https://huggingface.co/YOUR_USERNAME/{HF_MODEL_NAME}")