In [1]:
# %% 
from datasets import load_dataset

ds = load_dataset("YuvrajSingh9886/Agriculture-Soil-QA-Pairs-Dataset")

In [2]:
# %% 
ds

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'ANSWER', 'QUESTION.question', 'QUESTION.paragraph'],
        num_rows: 3447
    })
})

In [3]:
# %% 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import evaluate
import re
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

from transformers import pipeline
from transformers import set_seed
set_seed(42)


In [4]:
# %% 
split_dataset = ds["train"].train_test_split(test_size=0.2, seed=42)
split_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'ANSWER', 'QUESTION.question', 'QUESTION.paragraph'],
        num_rows: 2757
    })
    test: Dataset({
        features: ['Unnamed: 0', 'ANSWER', 'QUESTION.question', 'QUESTION.paragraph'],
        num_rows: 690
    })
})

In [5]:
# %% 
questions = ["question: " + q for q in split_dataset['train']['QUESTION.question']]
answers = [a for a in split_dataset['train']['ANSWER']]

df = pd.DataFrame({'questions': questions, 'answers': answers})
df

Unnamed: 0,questions,answers
0,question: What is often rewarded under agri-en...,Semi-natural habitats
1,question: When can the harvesting process for ...,The harvesting process for these crops can gen...
2,question: What can help optimize nutrient cycl...,Variety and species mixtures (intercrops).
3,question: What is included in the assessment o...,Number of earthworm burrows and extent of visi...
4,question: How can soil-borne pest build-up ass...,Soil-borne pest build-up can be avoided by pra...
...,...,...
2752,question: Why is it important to incorporate t...,To ensure minimal nitrogen loss
2753,question: How do microbes contribute to nutrie...,Microbes contribute to nutrient availability b...
2754,question: How does the hand texturing method c...,"In the hand texturing method, soil that forms ..."
2755,question: List methods to minimize soil compac...,"To minimize soil compaction from machinery, on..."


In [7]:
# %% 
# Use a fine-tuned agriculture-specific T5 model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_name = "google/mt5-small"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Ensure CUDA is used if available
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define max lengths
max_input_length = 256
max_target_length = 64

# Define the preprocessing function
def preprocess(batch):
    inputs = ["question: " + q.strip() for q in batch["QUESTION.question"]]
    targets = [a[0] if isinstance(a, list) else a for a in batch["ANSWER"]]
    
    model_inputs = tokenizer(inputs, 
                             max_length=max_input_length, 
                             truncation=True, 
                             padding=True)
    
    labels = tokenizer(targets, 
                       max_length=max_target_length, 
                       truncation=True, 
                       padding=True)
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the preprocessing
train_dataset = split_dataset["train"].map(preprocess, batched=True)
test_dataset = split_dataset["test"].map(preprocess, batched=True)

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/2757 [00:00<?, ? examples/s]

Map:   0%|          | 0/690 [00:00<?, ? examples/s]

In [8]:
# %% 
rouge = evaluate.load("rouge")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if hasattr(preds, "dim"):
        preds = preds.argmax(dim=-1)
    else:
        preds = np.argmax(preds, axis=-1)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = [[(l if l != -100 else tokenizer.pad_token_id) for l in label] for label in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = ["\n".join(pred.strip().split(". ")) for pred in decoded_preds]
    decoded_labels = ["\n".join(label.strip().split(". ")) for label in decoded_labels]

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    
    result = {key: round(value * 100, 4) for key, value in result.items()}

    return result

In [9]:
# %% 
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir = "./t5-agri-qa",
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 4,
    num_train_epochs = 3,
    learning_rate = 5e-5,
    evaluation_strategy = "epoch",      
    save_strategy = "epoch",             
    logging_dir = "./logs",             
    logging_steps = 10,
    predict_with_generate = True,
    fp16 = True,
    save_total_limit = 2,
)

trainer = Seq2SeqTrainer(
    model             = model,
    args              = training_args,
    train_dataset     = train_dataset,
    eval_dataset      = test_dataset,
    data_collator     = data_collator,
    tokenizer         = tokenizer,
    compute_metrics   = compute_metrics,
)

In [10]:
# %% 
trainer.train()
trainer.save_model("./t5-agri-qa") 

  0%|          | 0/258 [00:00<?, ?it/s]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 0.12}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 0.23}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 0.35}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 0.46}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 0.58}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 0.7}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 0.81}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 0.93}




  0%|          | 0/87 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_rouge1': 0.0, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0, 'eval_rougeLsum': 0.0, 'eval_runtime': 160.9356, 'eval_samples_per_second': 4.287, 'eval_steps_per_second': 0.541, 'epoch': 1.0}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 1.04}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 1.16}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 1.28}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 1.39}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 1.51}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 1.62}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 1.74}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 1.86}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 1.97}




  0%|          | 0/87 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_rouge1': 0.0, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0, 'eval_rougeLsum': 0.0, 'eval_runtime': 159.9271, 'eval_samples_per_second': 4.314, 'eval_steps_per_second': 0.544, 'epoch': 1.99}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 2.09}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 2.2}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 2.32}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 2.43}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 2.55}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 2.67}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 2.78}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 2.9}




  0%|          | 0/87 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_rouge1': 0.0, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0, 'eval_rougeLsum': 0.0, 'eval_runtime': 239.9257, 'eval_samples_per_second': 2.876, 'eval_steps_per_second': 0.363, 'epoch': 2.99}
{'train_runtime': 2292.9369, 'train_samples_per_second': 3.607, 'train_steps_per_second': 0.113, 'train_loss': 0.0, 'epoch': 2.99}


In [1]:

# %% 
import json

# Read all log files (they are saved in 'trainer_state.json')
with open("t5-agri-qa/checkpoint-258/trainer_state.json", "r") as f:
    logs = json.load(f)

training_logs = logs["log_history"]

# Extract rougeL scores and epochs
epochs = []
rougeL = []

for log in training_logs:
    if "eval_rougeL" in log:
        epochs.append(log["epoch"])
        rougeL.append(log["eval_rougeL"])

# Plot
plt.figure(figsize=(8,5))
plt.plot(epochs, rougeL, marker="o", label="ROUGE-L Score")
plt.title("ROUGE-L Score vs Epochs")
plt.xlabel("Epochs")
plt.ylabel("ROUGE-L (%)")
plt.grid(True)
plt.legend()
plt.show()


NameError: name 'plt' is not defined