In [1]:
import numpy as np
import torch
import random

In [2]:
def seed(value=42):
    """Seed every random number generator used in this notebook.

    Python's ``random`` module, NumPy's global generator and PyTorch all
    receive the same seed. cuDNN is additionally switched to deterministic
    mode with its benchmark autotuner disabled.

    Args:
        value (int): Seed shared by all generators.
    """
    # The three seeding calls are independent of each other.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(value)

    # Request deterministic cuDNN algorithms and turn off benchmark autotuning.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

### Parameters

In [3]:
# Data Params

BATCH_SIZE = 64  # samples per batch for both the train and validation DataLoaders
# Per-channel statistics passed to transforms.Normalize. These values match the
# ImageNet statistics that torchvision documents for its pretrained models.
MEAN = [0.485, 0.456, 0.406]
STD = [0.229, 0.224, 0.225]

# Training Params

EPOCHS = 25  # upper bound on epochs; early stopping may end training sooner

# Early Stopping
PATIENCE_PERIOD = 5  # consecutive non-improving epochs tolerated before stopping
DELTA = 0 # minimum change to qualify as improvement

CKPT_PATH = 'best_checkpoint_vgg16_c0.pt'  # file the best model (by validation loss) is saved to

## Dataset

### Download Kaggle Dataset

Downloading the dataset:
- A Kaggle API token is required.
- See this guide for the setup steps: https://www.analyticsvidhya.com/blog/2021/06/how-to-load-kaggle-datasets-directly-into-google-colab/




In [4]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import numpy as np

np.random.seed(42)

# Dataset root: one sub-directory per split ('train', 'valid'), each holding
# one folder per class whose name is used as the class label.
PARENT_DIR = '../input/new-plant-diseases-dataset/New Plant Diseases Dataset(Augmented)/New Plant Diseases Dataset(Augmented)'
le = LabelEncoder()

# Write one CSV index per split with columns image_path, image_class, image_label.
for DIR in ['train', 'valid']:
    path = os.path.join(PARENT_DIR, DIR)
    image_paths = []
    image_class = []
    # sorted(): os.listdir returns entries in arbitrary order, which would make
    # the CSV row order differ between runs and machines.
    for folder in sorted(os.listdir(path)):
        class_dir = os.path.join(path, folder)
        # Only class folders contain images; skip stray scripts/CSVs and any
        # other non-directory entry (listing a plain file would raise).
        if folder.endswith(('.py', '.csv')) or not os.path.isdir(class_dir):
            continue
        for image_file in sorted(os.listdir(class_dir)):
            image_paths.append(os.path.join(class_dir, image_file))
            image_class.append(folder)
    # The label mapping is learned on the training split only and reused for
    # validation; an unseen validation class makes transform() raise.
    if DIR == 'train':
        image_labels = le.fit_transform(image_class)
    else:
        image_labels = le.transform(image_class)
    df = pd.DataFrame({"image_path": image_paths, "image_class": image_class, "image_label": image_labels})
    df.to_csv(f'{DIR}.csv', index=False)

In [5]:
# Class names ordered by encoded label: labels[i] is the name of label i.
# Read them from the fitted encoder instead of hard-coding the class count,
# so the list stays correct if the number of class folders changes.
labels = le.classes_.tolist()
labels

### Create PyTorch Dataset class


In [6]:
from torch.utils.data import Dataset, DataLoader

In [7]:
"""Plant Village Dataset"""
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from torchvision import transforms
import os

from PIL import Image

class PlantVillage(Dataset):
    """Image-classification dataset backed by a CSV index file.

    The CSV must provide an ``image_path`` and an ``image_label`` column; any
    other columns are not used by this class. Relative image paths are
    resolved against the directory that contains the CSV.

    Args:
        data_csv (str): Path to the CSV index file.
        transform (callable, optional): Applied to each RGB PIL image before
            it is returned. ``None`` returns the PIL image unchanged.
    """

    def __init__(self, data_csv, transform=None):
        super().__init__()
        self.data_df = pd.read_csv(data_csv)
        # os.path.dirname keeps the root for a CSV stored directly under "/";
        # os.path.join ignores csv_dir when the stored path is already absolute.
        csv_dir = os.path.dirname(data_csv)
        self.data_df["image_path"] = self.data_df["image_path"].apply(
            lambda image_path: os.path.join(csv_dir, image_path)
        )
        self.transform = transform

    def __len__(self):
        """Return the number of rows (images) in the index."""
        return len(self.data_df)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            dict: ``{"image": <transformed image>, "label": <0-dim long tensor>}``.
        """
        row = self.data_df.iloc[idx]
        # Close the underlying file as soon as the pixel data has been
        # converted; otherwise the handle stays open until garbage collection.
        with Image.open(row["image_path"]) as image_file:
            image = image_file.convert('RGB')
        label = row["image_label"]

        if self.transform is not None:
            image = self.transform(image)
        return {"image": image, "label": torch.tensor(label, dtype=torch.long)}

Create train and val dataloaders

In [8]:
from torchvision.transforms import Normalize, ToTensor, Resize, CenterCrop
# Preprocessing shared by the train and validation datasets: resize, centre-crop
# to 224x224, convert to a tensor, then normalise each channel with MEAN / STD.
resnet_preprocess = transforms.Compose(
    [
        Resize(224),
        CenterCrop(224),
        ToTensor(),
        # TODO: Change mean and std for each model. Ensure this.
        Normalize(mean=MEAN, std=STD),
    ]
)

In [9]:
# Both splits read the CSV index files from the working directory and share
# the same preprocessing pipeline.
train_dataset = PlantVillage('train.csv', resnet_preprocess)
valid_dataset = PlantVillage('valid.csv', resnet_preprocess)

In [10]:
# Re-seed right before building the loaders. Only the training loader
# shuffles; validation batches keep the CSV row order.
seed(42)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [11]:
# batch = next(iter(train_loader))

## Model

In [12]:
import torch
import torch.nn as nn

# VGG
# Load VGG-16 through torch.hub; pretrained=True requests the pretrained weights.
model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg16', pretrained=True)
# classifier[6] is the final fully connected layer. Replace the layer itself so
# the network emits one logit per class; assigning a module to its
# `.in_features` attribute would leave the original output layer in place.
in_features = model.classifier[6].in_features
model.classifier[6] = nn.Linear(in_features, len(labels))
# ResNet (the classification head lives in `.fc` instead of `.classifier`)
# model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet50', pretrained=True)  # or another ResNet variant
# in_features = model.fc.in_features
# model.fc = nn.Linear(in_features, len(labels))

In [13]:
# !pip install torchinfo
# from torchinfo import summary
# summary(model, input_size = [32, 3, 224, 224])

## Training

- [x] Train Loop
- [x] Validation Loop
- [x] Precision/F1/Recall/Accuracy/Loss - History
- [x] EarlyStopping on Val Loss
- [x] Model Checkpointing based on Val Loss
- [x] Adam with 2e-5 LR
- [x] 25 epochs
- [x] 64 batch size (`BATCH_SIZE`)

In [14]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [15]:
from tqdm.auto import tqdm as tqdm

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [17]:
def calculate_metrics(np_labels: np.ndarray, np_preds: np.ndarray, average: str = "micro"):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Args:
        np_labels: Ground-truth class labels.
        np_preds: Predicted class labels, aligned with ``np_labels``.
        average: Averaging mode forwarded to the scikit-learn precision,
            recall and F1 scorers.

    Returns:
        tuple: ``(accuracy, precision, recall, f1)``.

    Note:
        For single-label multiclass data, micro-averaged precision, recall and
        F1 all coincide with accuracy (see the scikit-learn documentation).
    """
    accuracy = accuracy_score(np_labels, np_preds)
    # The three averaged scorers share one call signature.
    precision, recall, f1 = (
        scorer(np_labels, np_preds, average=average)
        for scorer in (precision_score, recall_score, f1_score)
    )
    return accuracy, precision, recall, f1


In [18]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

In [19]:
# Move the model to the selected device before the optimizer is created.
model = model.to(device)

In [20]:
# Adam over all model parameters (learning rate 2e-5) with a cross-entropy loss.
optimizer = optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

In [None]:
# --- Early-stopping / checkpointing state (tracked on validation loss) -------
val_loss_min = np.inf  # lowest validation loss so far (`np.Inf` was removed in NumPy 2.0)
early_stop = False
best_score = None      # negated best validation loss; None until the first epoch finishes
counter = 0            # consecutive epochs without improvement

# Per-epoch metric history; every list gains one entry per completed epoch.
history = {
    'train_loss': [],
    'train_acc': [],
    'train_p': [],
    'train_r': [],
    'train_f1': [],
    'val_loss': [],
    'val_acc': [],
    'val_p': [],
    'val_r': [],
    'val_f1': []
}

for epoch in tqdm(range(EPOCHS)):
    # ---- Training pass -------------------------------------------------------
    model.train()
    avg_loss = 0
    # Collect per-batch arrays and concatenate once; np.append inside the loop
    # re-copies everything accumulated so far on every batch.
    train_pred_batches, train_label_batches = [], []
    for batch in tqdm(train_loader, total=len(train_loader)):
        # `targets` rather than `labels`: the global `labels` list of class
        # names must survive training so the model cell can be re-run.
        images, targets = batch['image'].to(device), batch['label'].to(device)
        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Running mean of the batch losses over the epoch.
        avg_loss += loss.item() / len(train_loader)
        train_pred_batches.append(torch.argmax(outputs, dim=1).detach().cpu().numpy())
        train_label_batches.append(targets.detach().cpu().numpy())

    # ---- Validation pass -----------------------------------------------------
    model.eval()
    avg_val_loss = 0
    val_pred_batches, val_label_batches = [], []
    with torch.no_grad():  # entered once for the whole pass instead of per batch
        for val_batch in val_loader:
            images, targets = val_batch['image'].to(device), val_batch['label'].to(device)
            outputs = model(images)
            avg_val_loss += criterion(outputs, targets).item() / len(val_loader)
            val_pred_batches.append(torch.argmax(outputs, dim=1).cpu().numpy())
            val_label_batches.append(targets.cpu().numpy())

    train_preds = np.concatenate(train_pred_batches)
    train_labels = np.concatenate(train_label_batches)
    val_preds = np.concatenate(val_pred_batches)
    val_labels = np.concatenate(val_label_batches)

    train_acc, train_p, train_r, train_f1 = calculate_metrics(train_labels, train_preds)
    val_acc, val_p, val_r, val_f1 = calculate_metrics(val_labels, val_preds)

    # Record the epoch BEFORE the early-stopping check so the final epoch's
    # metrics are kept even when training stops here.
    history['train_loss'].append(avg_loss)
    history['train_acc'].append(train_acc)
    history['train_p'].append(train_p)
    history['train_r'].append(train_r)
    history['train_f1'].append(train_f1)

    history['val_loss'].append(avg_val_loss)
    history['val_acc'].append(val_acc)
    history['val_p'].append(val_p)
    history['val_r'].append(val_r)
    history['val_f1'].append(val_f1)

    # Early Stopping  + Model Checkpointing on Val Loss
    # Scores are negated losses, so "higher is better". A NaN loss fails the
    # comparison and is therefore never checkpointed as an improvement.
    score = -avg_val_loss
    if best_score is None or score >= best_score + DELTA:
        print(f'Validation loss decreased ({val_loss_min:.6f} --> {avg_val_loss:.6f}) at Epoch {epoch}.  Saving model ...')
        # Saves the whole pickled model object (not just a state_dict), so
        # loading the checkpoint needs the same model class to be importable.
        torch.save(model, CKPT_PATH)
        best_score = score
        val_loss_min = avg_val_loss  # keep the logged "previous best" up to date
        counter = 0
    else:
        counter += 1
        early_stop = counter >= PATIENCE_PERIOD

    if early_stop:
        print(f'Early Stopping at Epoch {epoch}.')
        break

In [None]:
import pandas as pd
# One row per recorded epoch, one column per tracked metric; persist as CSV.
history_df = pd.DataFrame(history)
history_df.to_csv('history_vgg16_c0.csv', index=False)

In [None]:
history_df

In [None]:
# Also store the history as JSON alongside the CSV.
hist_json_file = 'history_vgg16_c0.json' 
with open(hist_json_file, mode='w') as f:
    history_df.to_json(f)

In [None]:
cd ./output

In [None]:
!ls

In [None]:
import os 
# Switch to the Kaggle working directory (absolute path; only valid on Kaggle).
os.chdir(r'/kaggle/working') 

In [None]:
from IPython.display import FileLink 
# Render a download link for the saved checkpoint. Reusing CKPT_PATH keeps the
# link in sync with the file name the training loop writes to.
FileLink(CKPT_PATH)

In [None]:
!ls