In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback
import torch
import evaluate
import joblib

In [None]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
device

'cuda'

In [None]:
# Read the cleaned dataset (disease names and their text descriptions) and display it.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — no mount
# cell is visible in this notebook; confirm before a fresh "Run All".
df = pd.read_csv("/content/drive/MyDrive/cleaned_dataset.csv")
df

Unnamed: 0,Disease_Name,Descriptions
0,Tomato healthy,A vibrant green and healthy tomato leaf with s...
1,Tomato healthy,"A healthy Solanum lycopersicum leaf, free of d..."
2,Tomato healthy,"A fresh tomato leaf outdoors, glowing in sunli..."
3,Tomato healthy,"A clean and healthy tomato leaf image, perfect..."
4,Tomato Late blight,A tomato leaf showing dark brown lesions and w...
...,...,...
82547,Tomato Spider mites Two spotted spider mite,An image of a tomato leaf displaying symptoms ...
82548,Tomato Septoria leaf spot,"A tomato leaf showing small, circular brown sp..."
82549,Tomato Septoria leaf spot,A tomato plant leaf infected with Septoria lyc...
82550,Tomato Septoria leaf spot,A tomato leaf outdoors with visible signs of S...


In [None]:
# Number of distinct disease classes (missing values, if any, are counted as one class).
df["Disease_Name"].nunique(dropna=False)

15

In [None]:
# Fit the label encoder on the disease names and attach the resulting integer ids
# to the frame as the `label` column in a single step.
encoder = LabelEncoder()
df["label"] = encoder.fit_transform(df["Disease_Name"].tolist())
df

Unnamed: 0,Disease_Name,Descriptions,label
0,Tomato healthy,A vibrant green and healthy tomato leaf with s...,13
1,Tomato healthy,"A healthy Solanum lycopersicum leaf, free of d...",13
2,Tomato healthy,"A fresh tomato leaf outdoors, glowing in sunli...",13
3,Tomato healthy,"A clean and healthy tomato leaf image, perfect...",13
4,Tomato Late blight,A tomato leaf showing dark brown lesions and w...,7
...,...,...,...
82547,Tomato Spider mites Two spotted spider mite,An image of a tomato leaf displaying symptoms ...,10
82548,Tomato Septoria leaf spot,"A tomato leaf showing small, circular brown sp...",9
82549,Tomato Septoria leaf spot,A tomato plant leaf infected with Septoria lyc...,9
82550,Tomato Septoria leaf spot,A tomato leaf outdoors with visible signs of S...,9


In [None]:
# Persist the fitted encoder so predictions can be mapped back to disease names later.
joblib.dump(value=encoder, filename="/content/drive/MyDrive/encoder.pkl")
print("✅ Encoder saved successfully!")

✅ Encoder saved successfully!


# Train/Test split

In [None]:
# Hold out 20% of the rows for evaluation.
# - random_state fixes the shuffle so the split (and every metric derived from it)
#   is reproducible after a kernel restart.
# - stratify keeps each disease class in the same proportion in both splits.
# NOTE(review): the training log shows accuracy 1.0 from the first epoch; it is worth
# checking whether identical Descriptions end up in both splits — TODO confirm.
df_train, df_test = train_test_split(
    df,
    train_size=0.8,
    random_state=42,
    stratify=df["label"],
)

#Reset index so the shuffled pandas index is not carried into the Dataset as an extra column
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

print(train_dataset.features)
print(test_dataset.features)


{'Disease_Name': Value('string'), 'Descriptions': Value('string'), 'label': Value('int64')}
{'Disease_Name': Value('string'), 'Descriptions': Value('string'), 'label': Value('int64')}


# Model Definition

In [None]:
# Name of the pretrained checkpoint passed to `from_pretrained` for both the tokenizer
# and the classifier.
# NOTE: `model` is rebound to the loaded model object further down, so this cell must be
# re-run before re-running any cell that expects the checkpoint string.
model = "distilbert-base-uncased"

In [None]:
# Load the tokenizer for the checkpoint named in `model`
# (run this while `model` still holds the checkpoint string).
tokenizer = AutoTokenizer.from_pretrained(model)

In [None]:
def mapper_function(data):
  """Tokenize the `Descriptions` field of a dataset batch.

  Over-long inputs are truncated; no padding is applied here.

  Args:
    data: Mapping with a 'Descriptions' entry holding the text(s) to tokenize.

  Returns:
    The tokenizer output for those texts.
  """
  descriptions = data['Descriptions']
  encoded = tokenizer(descriptions, truncation=True)
  return encoded

In [None]:
# Tokenize both splits in batches.
tokenized_train = train_dataset.map(mapper_function, batched=True)
tokenized_test = test_dataset.map(mapper_function, batched=True)
# Backward-compatible alias: the original name was a typo ("tokenizer_test") that is easy
# to confuse with the tokenizer itself; cells that still use it keep working.
tokenizer_test = tokenized_test

Map:   0%|          | 0/66041 [00:00<?, ? examples/s]

Map:   0%|          | 0/16511 [00:00<?, ? examples/s]

In [None]:
# Build the label mapping from the fitted encoder instead of hard-coding the class count,
# and store the readable class names in the model config so the saved checkpoint
# carries its own id <-> disease-name mapping.
id2label = {idx: str(name) for idx, name in enumerate(encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

# NOTE: this rebinds `model` from the checkpoint string to the loaded model object.
model = AutoModelForSequenceClassification.from_pretrained(
    model,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training the Model

In [None]:
# Collator that pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Load the "accuracy" metric from the `evaluate` library; compute_metrics uses it below.
eval_metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
  """Score a batch of evaluation predictions with `eval_metric`.

  Args:
    eval_pred: A pair that unpacks to (logits, labels).

  Returns:
    Whatever `eval_metric.compute` returns for the arg-max class ids versus the labels.
  """
  logits, labels = eval_pred
  # Highest-scoring class along axis 1 is the predicted class id.
  predicted_ids = np.argmax(logits, axis=1)
  return eval_metric.compute(references=labels, predictions=predicted_ids)

In [None]:
# Training configuration for the Trainer.
training_arguments = TrainingArguments(
    output_dir = "checkpoints",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.05,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch", # checkpoints are saved on the same schedule as evaluation
    save_total_limit=2,
    learning_rate=2e-5,
    report_to="none",

    # Select the best checkpoint by validation loss rather than accuracy: accuracy is
    # already 1.0 after epoch 1 in the recorded run, so it cannot separate checkpoints
    # and the first (highest-loss) one ended up being reloaded.
    load_best_model_at_end=True,
    metric_for_best_model = "eval_loss",
    greater_is_better = False,

    logging_dir = "logs", # log directory (nothing is reported while report_to="none")
    warmup_ratio = 0.1, # learning-rate warmup over the first 10% of training steps
    gradient_accumulation_steps = 2, # doubles the effective batch size
    fp16 = torch.cuda.is_available(), # mixed precision needs a CUDA device; fall back to fp32 on CPU
  )

In [None]:
# Wire the model, training configuration, tokenized splits and metric into one Trainer.
trainer = Trainer(
    model=model,
    args=training_arguments,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer, # used instead of the `tokenizer` argument
    train_dataset=tokenized_train,
    eval_dataset=tokenizer_test, # the tokenized held-out split
)

In [None]:
#Fine-tune the model; load_best_model_at_end=True reloads the best checkpoint when training finishes
trainer.train()

#Evaluate the loaded best model on the eval dataset given to the Trainer
metrics = trainer.evaluate()
print("Metrics of the loaded best model:", metrics)

#Save the best model permanently under the Drive path so it outlives the local checkpoints directory
trainer.save_model("/content/drive/MyDrive/best_model")



Epoch,Training Loss,Validation Loss,Accuracy
1,0.4132,0.000826,1.0
2,0.0007,0.000155,1.0
3,0.0002,5.3e-05,1.0
4,0.0001,2.3e-05,1.0
5,0.0,1.6e-05,1.0


Metrics of the loaded best model: {'eval_loss': 0.0008255979046225548, 'eval_accuracy': 1.0, 'eval_runtime': 8.5418, 'eval_samples_per_second': 1932.972, 'eval_steps_per_second': 120.818, 'epoch': 5.0}


# Load Model for Prediction

In [None]:
#Load the label encoder that was saved with joblib earlier in this notebook
encoder = joblib.load("/content/drive/MyDrive/encoder.pkl")

In [None]:
#Reload the tokenizer and the fine-tuned classifier from the saved model directory
best_model_dir = "/content/drive/MyDrive/best_model"
tokenizer = AutoTokenizer.from_pretrained(best_model_dir)
best_model = AutoModelForSequenceClassification.from_pretrained(best_model_dir)

# Prediction Function

In [None]:
#define prediction function
def predict(texts):
  """Predict the disease name for one description or a batch of descriptions.

  Args:
    texts: A single description string, or an iterable (list, tuple, ...) of
      description strings.

  Returns:
    A single label string when `texts` is a string. Otherwise a list of label
    strings, one per input and in input order — including for a one-element
    batch — and an empty list for an empty batch.
  """
  single_input = isinstance(texts, str)
  texts = [texts] if single_input else list(texts) # normalise to a list of strings
  if not texts: # nothing to classify; avoid sending an empty batch through the tokenizer and model
    return []

  inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True) #tokenize the texts
  inputs = inputs.to(best_model.device) # keep the inputs on the same device as the model weights
  with torch.no_grad(): # inference only, no gradients needed
    outputs = best_model(**inputs)
  predicted_class_id = torch.argmax(outputs.logits, dim=-1).cpu().numpy() #get predicted class index

  predicted_label = [str(label) for label in encoder.inverse_transform(predicted_class_id)] #np_str to python_str
  return predicted_label[0] if single_input else predicted_label #single input=single string, batch input=list of strings

In [None]:
#Run the classifier on three sample descriptions (passed as a tuple) and print the labels
texts = (
    "An image of a tomato plant leaf with symptoms of Tomato Yellow Leaf Curl Virus for agricultural datasets.",
    "An image of a tomato leaf displaying symptoms of a two-spotted spider mite infestation, ideal for pest identification datasets.",
    "A tomato leaf showing small, circular brown spots with yellow halos, typical of Septoria leaf spot.",
)
print(predict(texts))

['Tomato YellowLeaf Curl Virus', 'Tomato Spider mites Two spotted spider mite', 'Tomato Septoria leaf spot']
