03 — Baseline CNN (Training a Simple Convolutional Model From Scratch)

This notebook builds a simple Convolutional Neural Network to classify plant diseases from the PlantVillage dataset.
The purpose of this model is not to achieve high accuracy — instead, it helps us understand:
  - how images move through a CNN
  - how loss and backpropagation work
  - how training + validation loops work
  - how PyTorch updates model weights

This foundation prepares us for more advanced models like ResNet, which we will use in Notebook 04.

In [4]:
import sys
import os
# The notebook runs one level below the project root; expose the root on
# sys.path so project-local modules (such as `dataset`, imported below) resolve.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append so re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
  sys.path.append(project_root)
print("Project root added to Python path: ", project_root)
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

import matplotlib.pyplot as plt

from dataset import PlantVillageDataset

from torchvision import transforms

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print("Using device: ", device)

Project root added to Python path:  c:\workspace\plantDiseaseDetection
Using device:  cpu


In [None]:
# Number of images per mini-batch; shared by both DataLoaders below.
batch_size = 32

train_dir = "../data/PlantVillage/train"
val_dir = "../data/PlantVillage/val"

# Class names are taken from the (sorted) entries of the training directory.
# NOTE(review): assumes train_dir contains exactly one sub-folder per class and
# that PlantVillageDataset orders its labels the same way — confirm in dataset.py.
classes = sorted(os.listdir(train_dir))
train_transforms = transforms.Compose([
  # 224x224 is the input size BaselineCNN's classifier is sized for by default
  # (it is also the size ResNet expects)
  transforms.Resize((224,224)),
  transforms.ToTensor(), #converts image to tensor
  # images are made of pixels 0-255, converting to tensor turns it into values 0-1, floats
])

# Validation uses the same deterministic preprocessing as training.
val_transforms = transforms.Compose([
  transforms.Resize((224,224)),
  transforms.ToTensor(),
])
train_dataset = PlantVillageDataset(train_dir, transform=train_transforms)
val_dataset = PlantVillageDataset(val_dir, transform=val_transforms)

# Use the batch_size variable (previously a hard-coded 32) so the setting at the
# top of this cell actually controls both loaders.
# Training data is reshuffled every epoch; validation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
# Number of batches per epoch for each split.
len(train_loader), len(val_loader)

(1358, 340)

In [6]:
"""
Simple neural network model.
  - sets baseline performance (starting accuracy to beat)
  - confirms pipeline (data -> model -> loss -> training) is working

Two parts:
1. Feature extractor (conv.)
  - learns patterns from the image
2. Classifier
  - takes the learned patterns and decides 'which class is this?'  
  
"""

class BaselineCNN(nn.Module): #nn.Module is like the house blueprint and BaselineCNN is a custom house built from it
  """Small three-stage CNN used as a baseline image classifier.

  Args:
    num_classes: number of output classes (length of the final score vector).
    input_size: spatial size of the input images, either a single int for
      square images or a (height, width) pair. Defaults to 224, which gives
      the classifier the same 64 * 28 * 28 input width as before.

  Raises:
    ValueError: if the input is too small to survive the three 2x2 poolings.
  """

  def __init__(self, num_classes, input_size=224):
    super().__init__()

    if isinstance(input_size, int):
      height, width = input_size, input_size
    else:
      height, width = input_size

    # The padded 3x3 convolutions keep H and W unchanged; each MaxPool2d(2)
    # halves them (rounding down), so three poolings divide each side by 8.
    pooled_height, pooled_width = height // 8, width // 8
    if pooled_height < 1 or pooled_width < 1:
      raise ValueError(
        f"input_size {input_size!r} is too small: each side must be at least 8 pixels"
      )

    #Feature extractor
    self.features = nn.Sequential( #container that stacks layers in order (pytorch runs them in order)
      #Chose 3 layers only for this model to keep it simple

      #First conv layer: image -> low-level features
      #2D Convolution, extracts features like edges and textures, uses 3x3 filter, transforming the 3 initial color channels RGB into 16 different feature maps
      nn.Conv2d(3,16,kernel_size=3, padding=1),
      #Rectified Linear Unit -> activation function that sets neg vals in feature map to 0
      nn.ReLU(),
      #Max pooling layer that reduces HW of feature maps by keeping max val within 2x2 window
      nn.MaxPool2d(2),

      #Second conv layer: deeper patterns
      nn.Conv2d(16, 32, kernel_size=3,padding=1),
      nn.ReLU(),
      nn.MaxPool2d(2),

      #Third Conv layer
      nn.Conv2d(32,64, kernel_size=3,padding=1),
      nn.ReLU(),
      nn.MaxPool2d(2)
    )

    #classifier -> takes learned features and decides class
    self.classifier = nn.Sequential(
      nn.Flatten(),  #the last conv stage outputs 64 feature maps (28x28 each for 224x224 input = 50176 values); flatten them into one long vector per image
      nn.Linear(64 * pooled_height * pooled_width, 128), #compress the flattened features down to 128
      nn.ReLU(),
      nn.Linear(128, num_classes) #final prediction, given 128 pieces of evidence, how much does each class match
    )

  def forward(self, x):
    """Return one raw score per class for each image in the batch `x`."""
    #pass image through feature extractor
    x = self.features(x)

    #pass extracted features through classifier
    x = self.classifier(x)

    return x

# One output score per entry in `classes`.
num_classes = len(classes)

# Build the network, then move its parameters to the selected device.
model = BaselineCNN(num_classes)
model = model.to(device)
model

BaselineCNN(
  (features): Sequential(
    (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=50176, out_features=128, bias=True)
    (2): ReLU()
    (3): Linear(in_features=128, out_features=38, bias=True)
  )
)

In [7]:
# Loss: cross-entropy between the model's class scores and the true labels
# (measures how wrong the predictions are).
criterion = nn.CrossEntropyLoss()

# Optimizer: Adam over every model parameter; it applies the weight updates.
learning_rate = 0.001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [8]:
#training loop

def train_one_epoch(model,loader,optimizer, criterion):
  """Train `model` for one full pass over `loader`.

  Args:
    model: the network to train; batches are moved to the device its
      parameters live on (CPU if it has no parameters).
    loader: iterable yielding (images, labels) batches.
    optimizer: optimizer that updates the model's parameters.
    criterion: loss function called as criterion(outputs, labels).
      NOTE(review): the epoch loss is weighted by batch size, which assumes
      the criterion returns the batch mean (the nn.CrossEntropyLoss default).

  Returns:
    (average loss per sample, accuracy as a fraction in [0, 1]).

  Raises:
    ValueError: if `loader` yields no samples.
  """
  model.train() #training mode

  # Follow the model's own device instead of relying on a notebook-level global.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")

  loss_sum = 0.0
  correct = 0
  total = 0

  for images, labels in loader:
    images = images.to(target_device)  #sends one small batch at a time
    labels = labels.to(target_device)

    #1. reset gradients (clear old calc)
    optimizer.zero_grad()

    #2. forward pass: model makes predictions
    outputs = model(images)

    #3. compute loss
    loss = criterion(outputs, labels)

    #4. backward pass: calculate gradients (figure out how much each weight contributed to the error)
    loss.backward()

    #5. update weights
    optimizer.step()

    # weight each batch's mean loss by its size so a smaller final batch
    # does not count as much as a full one
    batch_count = labels.size(0)
    loss_sum += loss.item() * batch_count

    # calculate training accuracy
    _, preds = torch.max(outputs, 1) #index of the highest predicted score for each image
    correct += (preds == labels).sum().item() #compare preds to true labels
    total += batch_count

  if total == 0:
    raise ValueError("loader yielded no samples; cannot compute epoch loss/accuracy")

  #return loss and accuracy for the whole epoch
  return loss_sum / total, correct / total


In [9]:
def validate(model, loader, criterion):
  """Evaluate `model` on every batch of `loader` without updating weights.

  Args:
    model: the network to evaluate; batches are moved to the device its
      parameters live on (CPU if it has no parameters).
    loader: iterable yielding (images, labels) batches.
    criterion: loss function called as criterion(outputs, labels).
      NOTE(review): the reported loss is weighted by batch size, which assumes
      the criterion returns the batch mean (the nn.CrossEntropyLoss default).

  Returns:
    (average loss per sample, accuracy as a fraction in [0, 1]).

  Raises:
    ValueError: if `loader` yields no samples.
  """
  model.eval() # evaluation mode

  # Follow the model's own device instead of relying on a notebook-level global.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")

  loss_sum = 0.0
  correct = 0
  total = 0

  with torch.no_grad(): #no gradients needed: nothing is being trained here
    for images, labels in loader:
      images = images.to(target_device)
      labels = labels.to(target_device)

      outputs = model(images)
      loss = criterion(outputs, labels)

      # weight each batch's mean loss by its size so a smaller final batch
      # does not count as much as a full one
      batch_count = labels.size(0)
      loss_sum += loss.item() * batch_count

      _,preds = torch.max(outputs, 1) #index of the highest predicted score for each image
      correct += (preds == labels).sum().item()
      total += batch_count

  if total == 0:
    raise ValueError("loader yielded no samples; cannot compute validation loss/accuracy")

  return loss_sum / total, correct / total

In [10]:
# Number of full passes over the training set.
num_epochs = 5
# Divider printed after each epoch's metrics.
separator = "-" * 50

for epoch in range(num_epochs):
  # One optimisation pass over the training data, then a weight-frozen
  # evaluation on the validation data.
  train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion)
  val_loss, val_acc = validate(model, val_loader, criterion)

  # Report this epoch's metrics (epoch numbers are shown 1-based).
  print(f"Epoch {epoch+1}/{num_epochs}")
  print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
  print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
  print(separator)

Epoch 1/5
Train Loss: 0.9530, Train Acc: 0.7210
Val Loss: 0.4919, Val Acc: 0.8467
--------------------------------------------------
Epoch 2/5
Train Loss: 0.3412, Train Acc: 0.8923
Val Loss: 0.3769, Val Acc: 0.8814
--------------------------------------------------
Epoch 3/5
Train Loss: 0.1970, Train Acc: 0.9370
Val Loss: 0.3157, Val Acc: 0.9052
--------------------------------------------------
Epoch 4/5
Train Loss: 0.1327, Train Acc: 0.9569
Val Loss: 0.3067, Val Acc: 0.9058
--------------------------------------------------
Epoch 5/5
Train Loss: 0.0964, Train Acc: 0.9677
Val Loss: 0.3021, Val Acc: 0.9147
--------------------------------------------------


In [11]:
# Save only the model's state_dict to a file in the current working directory.
trained_weights = model.state_dict()
torch.save(trained_weights, "baseline_cnn.pth")