# Food Image Classification
We implement CNN to solve image classfication problem.

The images are collected from the food-11 dataset classified into 11 classes.

We try to use original CNN model and CNN with residual to evlaute the classifier.

Due to the usage limitation of CUDA, we don't include cross-validation and ensumble so far.

In [1]:
! wget https://www.dropbox.com/s/6l2vcvxl54b0b6w/food11.zip

--2024-05-19 00:06:22--  https://www.dropbox.com/s/6l2vcvxl54b0b6w/food11.zip
Resolving www.dropbox.com (www.dropbox.com)... 162.125.2.18, 2620:100:6017:18::a27d:212
Connecting to www.dropbox.com (www.dropbox.com)|162.125.2.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/6l2vcvxl54b0b6w/food11.zip [following]
--2024-05-19 00:06:23--  https://www.dropbox.com/s/raw/6l2vcvxl54b0b6w/food11.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc33ade0547b20d59b42208a30dd.dl.dropboxusercontent.com/cd/0/inline/CTI7q73dYs_kDOLAIHHuXNRu6mJ0PtiZJ7wQCVyUDG066Lvc1FQK2Cy8XAOsH-XjD6WZDmaqevLgx8mwVQ7kLQqDJo9uhsPAeyeMDBTF9QkKkyzFmw0hzEGrl4AbB2ZBsvdfKNDW3JfjTWQQa9V4HHNp/file# [following]
--2024-05-19 00:06:23--  https://uc33ade0547b20d59b42208a30dd.dl.dropboxusercontent.com/cd/0/inline/CTI7q73dYs_kDOLAIHHuXNRu6mJ0PtiZJ7wQCVyUDG066Lvc1FQK2Cy8XAOsH-XjD6WZDmaqevLgx8mwVQ7kLQqDJo9uhsPAeyeMDBTF9QkK

In [2]:
! unzip food11.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: food11/training/8_806.jpg  
  inflating: food11/training/8_807.jpg  
  inflating: food11/training/8_808.jpg  
  inflating: food11/training/8_809.jpg  
  inflating: food11/training/8_81.jpg  
  inflating: food11/training/8_810.jpg  
  inflating: food11/training/8_811.jpg  
  inflating: food11/training/8_812.jpg  
  inflating: food11/training/8_813.jpg  
  inflating: food11/training/8_814.jpg  
  inflating: food11/training/8_815.jpg  
  inflating: food11/training/8_816.jpg  
  inflating: food11/training/8_817.jpg  
  inflating: food11/training/8_818.jpg  
  inflating: food11/training/8_819.jpg  
  inflating: food11/training/8_82.jpg  
  inflating: food11/training/8_820.jpg  
  inflating: food11/training/8_821.jpg  
  inflating: food11/training/8_822.jpg  
  inflating: food11/training/8_823.jpg  
  inflating: food11/training/8_824.jpg  
  inflating: food11/training/8_825.jpg  
  inflating: food11/training/8_826.

# Training

In [3]:
_exp_name = "sample"

In [4]:
# Import necessary packages.
import numpy as np
import pandas as pd
import torch
import os
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image

from torch.utils.data import ConcatDataset, DataLoader, Subset, Dataset
from torchvision.datasets import DatasetFolder, VisionDataset

# for the progress bar.
from tqdm.auto import tqdm
import random

In [5]:
myseed = 4944  # set a random seed for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

## **Transforms**
Torchvision provides lots of useful utilities for image preprocessing, data wrapping as well as data augmentation.

In [6]:
test_tfm = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])

train_tfm = transforms.Compose([
    transforms.Resize((128, 128)), # Resize the image into a fixed shape (height = width = 128)
    transforms.RandomHorizontalFlip(p=0.5),   # Randomly flip images horizontally with a probability of 0.5
    transforms.RandomRotation(15),            # Randomly rotate images by up to 15 degrees
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),  # Randomly change the brightness, contrast, saturation, and hue
    transforms.RandomResizedCrop(128, scale=(0.8, 1.0)),  # Randomly crop the image and resize to 128x128
    transforms.ToTensor(),
])


## **Datasets**
The data is labelled by the name, so we load images and label while calling '__getitem__'

In [7]:
class FoodDataset(Dataset):

    def __init__(self,path,tfm=test_tfm,files = None):
        super(FoodDataset).__init__()
        self.path = path
        self.files = sorted([os.path.join(path,x) for x in os.listdir(path) if x.endswith(".jpg")])
        if files != None:
            self.files = files
        print(f"One {path} sample",self.files[0])
        self.transform = tfm

    def __len__(self):
        return len(self.files)

    def __getitem__(self,idx):
        fname = self.files[idx]
        im = Image.open(fname)
        im = self.transform(im)
        #im = self.data[idx]
        try:
            label = int(fname.split("/")[-1].split("_")[0])
        except:
            label = -1 # test has no label
        return im,label



In [None]:
# original model without single layer
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        # torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
        # torch.nn.MaxPool2d(kernel_size, stride, padding)
        # input dims [3, 128, 128]
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 64, 3, 1, 1),  # [64, 128, 128]
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),      # [64, 64, 64]

            nn.Conv2d(64, 128, 3, 1, 1), # [128, 64, 64]
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),      # [128, 32, 32]

            nn.Conv2d(128, 256, 3, 1, 1), # [256, 32, 32]
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),      # [256, 16, 16]

            nn.Conv2d(256, 512, 3, 1, 1), # [512, 16, 16]
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),       # [512, 8, 8]

            nn.Conv2d(512, 512, 3, 1, 1), # [512, 8, 8]
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),       # [512, 4, 4]
        )
        self.fc = nn.Sequential(
            nn.Linear(512*4*4, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 11)
        )

    def forward(self, x):
        out = self.cnn(x)
        out = out.view(out.size()[0], -1)
        return self.fc(out)

In [8]:
# modified model with 6 layers
class Residual_Network(nn.Module):
    def __init__(self):
        super(Residual_Network, self).__init__()

        self.cnn_layer1 = nn.Sequential(
            nn.Conv2d(3, 64, 3, 1, 1),
            nn.BatchNorm2d(64),
        )

        self.cnn_layer2 = nn.Sequential(
            nn.Conv2d(64, 64, 3, 1, 1),
            nn.BatchNorm2d(64),
        )

        self.cnn_layer3 = nn.Sequential(
            nn.Conv2d(64, 128, 3, 2, 1),
            nn.BatchNorm2d(128),
        )

        self.cnn_layer4 = nn.Sequential(
            nn.Conv2d(128, 128, 3, 1, 1),
            nn.BatchNorm2d(128),
        )
        self.cnn_layer5 = nn.Sequential(
            nn.Conv2d(128, 256, 3, 2, 1),
            nn.BatchNorm2d(256),
        )
        self.cnn_layer6 = nn.Sequential(
            nn.Conv2d(256, 256, 3, 1, 1),
            nn.BatchNorm2d(256),
        )
        self.fc_layer = nn.Sequential(
            nn.Linear(256* 32* 32, 256),
            nn.ReLU(),
            nn.Linear(256, 11)
        )
        self.relu = nn.ReLU()

    def forward(self, x):
        # input (x): [batch_size, 3, 128, 128]
        # output: [batch_size, 11]

        # Extract features by convolutional layers.
        x1 = self.cnn_layer1(x)
        x1 = self.relu(x1)

        x2 = self.cnn_layer2(x1)
        x2 = self.relu(x2)

        x3 = self.cnn_layer3(x2)
        x3 = self.relu(x3)

        x4 = self.cnn_layer4(x3)
        x4 = self.relu(x4)

        x5 = self.cnn_layer5(x4)
        x5 = self.relu(x5)

        x6 = self.cnn_layer6(x5)
        x6 = self.relu(x6)

        # The extracted feature map must be flatten before going to fully-connected layers.
        xout = x6.flatten(1)

        # The features are transformed by fully-connected layers to obtain the final logits.
        xout = self.fc_layer(xout)
        return xout

In [14]:
# modified model with 6 layers, add ReLU and pooling
class Residual_Network_pooling(nn.Module):
    def __init__(self):
        super(Residual_Network, self).__init__()

        self.cnn_layer1 = nn.Sequential(
            nn.Conv2d(3, 64, 3, 1, 1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),
        )

        self.cnn_layer2 = nn.Sequential(
            nn.Conv2d(64, 64, 3, 1, 1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),
        )

        self.cnn_layer3 = nn.Sequential(
            nn.Conv2d(64, 128, 3, 2, 1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),
        )

        self.cnn_layer4 = nn.Sequential(
            nn.Conv2d(128, 128, 3, 1, 1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),
        )
        self.cnn_layer5 = nn.Sequential(
            nn.Conv2d(128, 256, 3, 2, 1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),
        )
        self.cnn_layer6 = nn.Sequential(
            nn.Conv2d(256, 256, 3, 1, 1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),
        )
        self.fc_layer = nn.Sequential(
            nn.Linear(256* 32* 32, 256),
            nn.ReLU(),
            nn.Linear(256, 11)
        )
        self.relu = nn.ReLU()

    def forward(self, x):
        # input (x): [batch_size, 3, 128, 128]
        # output: [batch_size, 11]

        # Extract features by convolutional layers.
        x1 = self.cnn_layer1(x)
        x1 = self.relu(x1)

        x2 = self.cnn_layer2(x1)
        x2 = self.relu(x2)

        x3 = self.cnn_layer3(x2)
        x3 = self.relu(x3)

        x4 = self.cnn_layer4(x3)
        x4 = self.relu(x4)

        x5 = self.cnn_layer5(x4)
        x5 = self.relu(x5)

        x6 = self.cnn_layer6(x5)
        x6 = self.relu(x6)

        # The extracted feature map must be flatten before going to fully-connected layers.
        xout = x6.flatten(1)

        # The features are transformed by fully-connected layers to obtain the final logits.
        xout = self.fc_layer(xout)
        return xout

In [9]:
batch_size = 64
_dataset_dir = "./food11"
# Construct datasets.
# The argument "loader" tells how torchvision reads the data.
train_set = FoodDataset(os.path.join(_dataset_dir,"training"), tfm=train_tfm)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)
valid_set = FoodDataset(os.path.join(_dataset_dir,"validation"), tfm=test_tfm)
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)

One ./food11/training sample ./food11/training/0_0.jpg
One ./food11/validation sample ./food11/validation/0_0.jpg


In [11]:
# "cuda" only when GPUs are available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The number of training epochs and patience.
n_epochs = 60
patience = 300 # If no improvement in 'patience' epochs, early stop

# Initialize a model, and put it on the device specified.
# model = Classifier().to(device)
# model = Residual_Network().to(device)
model = Residual_Network_pooling

# For the classification task, we use cross-entropy as the measurement of performance.
criterion = nn.CrossEntropyLoss()

# Initialize optimizer, you may fine-tune some hyperparameters such as learning rate on your own.
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003, weight_decay=1e-5)

# Initialize trackers, these are not parameters and should not be changed
stale = 0
best_acc = 0

# ---------- Training ----------
for epoch in range(n_epochs):
    model.train()

    train_loss = []
    train_accs = []

    for batch in tqdm(train_loader):

        imgs, labels = batch
        #imgs = imgs.half()
        # print(imgs.shape,labels.shape)

        # Forward the data
        logits = model(imgs.to(device))

        loss = criterion(logits, labels.to(device))

        # Gradients stored in the parameters in the previous step should be cleared out first.
        optimizer.zero_grad()

        loss.backward()

        grad_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm=10)
        optimizer.step()

        # Compute the accuracy for current batch.
        acc = (logits.argmax(dim=-1) == labels.to(device)).float().mean()

        # Record the loss and accuracy.
        train_loss.append(loss.item())
        train_accs.append(acc)

    train_loss = sum(train_loss) / len(train_loss)
    train_acc = sum(train_accs) / len(train_accs)

    print(f"[ Train | {epoch + 1:03d}/{n_epochs:03d} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}")

    # ---------- Validation ----------
    model.eval()

    valid_loss = []
    valid_accs = []

    # Iterate the validation set by batches.
    for batch in tqdm(valid_loader):

        imgs, labels = batch
        #imgs = imgs.half()

        with torch.no_grad():
            logits = model(imgs.to(device))

        loss = criterion(logits, labels.to(device))

        acc = (logits.argmax(dim=-1) == labels.to(device)).float().mean()

        valid_loss.append(loss.item())
        valid_accs.append(acc)
        #break

    valid_loss = sum(valid_loss) / len(valid_loss)
    valid_acc = sum(valid_accs) / len(valid_accs)

    print(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")


    # update logs
    if valid_acc > best_acc:
        with open(f"./{_exp_name}_log.txt","a"):
            print(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f} -> best")
    else:
        with open(f"./{_exp_name}_log.txt","a"):
            print(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")


    # save models
    if valid_acc > best_acc:
        print(f"Best model found at epoch {epoch}, saving model")
        torch.save(model.state_dict(), f"{_exp_name}_best.ckpt")
        best_acc = valid_acc
        stale = 0
    else:
        stale += 1
        if stale > patience:
            print(f"No improvment {patience} consecutive epochs, early stopping")
            break

  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 001/020 ] loss = 4.37304, acc = 0.15942


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 001/020 ] loss = 2.23432, acc = 0.19719
[ Valid | 001/020 ] loss = 2.23432, acc = 0.19719 -> best
Best model found at epoch 0, saving model


  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 002/020 ] loss = 2.20494, acc = 0.21839


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 002/020 ] loss = 2.06284, acc = 0.27060
[ Valid | 002/020 ] loss = 2.06284, acc = 0.27060 -> best
Best model found at epoch 1, saving model


  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 003/020 ] loss = 2.10980, acc = 0.25260


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 003/020 ] loss = 2.01527, acc = 0.31622
[ Valid | 003/020 ] loss = 2.01527, acc = 0.31622 -> best
Best model found at epoch 2, saving model


  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 004/020 ] loss = 2.04411, acc = 0.28637


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 004/020 ] loss = 1.93958, acc = 0.33173
[ Valid | 004/020 ] loss = 1.93958, acc = 0.33173 -> best
Best model found at epoch 3, saving model


  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 005/020 ] loss = 1.96881, acc = 0.31841


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 005/020 ] loss = 1.86025, acc = 0.37182
[ Valid | 005/020 ] loss = 1.86025, acc = 0.37182 -> best
Best model found at epoch 4, saving model


  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 006/020 ] loss = 1.90497, acc = 0.34004


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 006/020 ] loss = 1.90153, acc = 0.34259
[ Valid | 006/020 ] loss = 1.90153, acc = 0.34259


  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 007/020 ] loss = 1.84532, acc = 0.36272


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 007/020 ] loss = 1.77246, acc = 0.38985
[ Valid | 007/020 ] loss = 1.77246, acc = 0.38985 -> best
Best model found at epoch 6, saving model


  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 008/020 ] loss = 1.79521, acc = 0.37770


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 008/020 ] loss = 1.96773, acc = 0.35889
[ Valid | 008/020 ] loss = 1.96773, acc = 0.35889


  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 009/020 ] loss = 1.75568, acc = 0.38835


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 009/020 ] loss = 1.77095, acc = 0.38723
[ Valid | 009/020 ] loss = 1.77095, acc = 0.38723


  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 010/020 ] loss = 1.71762, acc = 0.40212


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 010/020 ] loss = 1.73459, acc = 0.39592
[ Valid | 010/020 ] loss = 1.73459, acc = 0.39592 -> best
Best model found at epoch 9, saving model


  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 011/020 ] loss = 1.67018, acc = 0.41236


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 011/020 ] loss = 1.70420, acc = 0.42090
[ Valid | 011/020 ] loss = 1.70420, acc = 0.42090 -> best
Best model found at epoch 10, saving model


  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 012/020 ] loss = 1.63072, acc = 0.43367


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 012/020 ] loss = 1.64479, acc = 0.43228
[ Valid | 012/020 ] loss = 1.64479, acc = 0.43228 -> best
Best model found at epoch 11, saving model


  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 013/020 ] loss = 1.61311, acc = 0.43669


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 013/020 ] loss = 1.82528, acc = 0.38461
[ Valid | 013/020 ] loss = 1.82528, acc = 0.38461


  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 014/020 ] loss = 1.57485, acc = 0.44438


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 014/020 ] loss = 1.76601, acc = 0.41291
[ Valid | 014/020 ] loss = 1.76601, acc = 0.41291


  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 015/020 ] loss = 1.56158, acc = 0.45819


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 015/020 ] loss = 1.57278, acc = 0.45631
[ Valid | 015/020 ] loss = 1.57278, acc = 0.45631 -> best
Best model found at epoch 14, saving model


  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 016/020 ] loss = 1.53395, acc = 0.46518


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 016/020 ] loss = 1.57526, acc = 0.46409
[ Valid | 016/020 ] loss = 1.57526, acc = 0.46409 -> best
Best model found at epoch 15, saving model


  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 017/020 ] loss = 1.49179, acc = 0.47919


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 017/020 ] loss = 1.64966, acc = 0.43703
[ Valid | 017/020 ] loss = 1.64966, acc = 0.43703


  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 018/020 ] loss = 1.47498, acc = 0.48639


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 018/020 ] loss = 1.56829, acc = 0.47289
[ Valid | 018/020 ] loss = 1.56829, acc = 0.47289 -> best
Best model found at epoch 17, saving model


  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 019/020 ] loss = 1.45015, acc = 0.49147


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 019/020 ] loss = 1.69910, acc = 0.46179
[ Valid | 019/020 ] loss = 1.69910, acc = 0.46179


  0%|          | 0/155 [00:00<?, ?it/s]

[ Train | 020/020 ] loss = 1.42846, acc = 0.50863


  0%|          | 0/54 [00:00<?, ?it/s]

[ Valid | 020/020 ] loss = 1.75984, acc = 0.44850
[ Valid | 020/020 ] loss = 1.75984, acc = 0.44850


In [12]:
test_set = FoodDataset(os.path.join(_dataset_dir,"test"), tfm=test_tfm)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)

One ./food11/test sample ./food11/test/0001.jpg


# Testing and generate prediction CSV

In [16]:
# model_best = Classifier().to(device)
model_best = Residual_Network().to(device)

model_best.load_state_dict(torch.load(f"{_exp_name}_best.ckpt"))
model_best.eval()
prediction = []
with torch.no_grad():
    for data,_ in test_loader:
        test_pred = model_best(data.to(device))
        test_label = np.argmax(test_pred.cpu().data.numpy(), axis=1)
        prediction += test_label.squeeze().tolist()

In [17]:
#create test csv
def pad4(i):
    return "0"*(4-len(str(i)))+str(i)
df = pd.DataFrame()
df["Id"] = [pad4(i) for i in range(1,len(test_set)+1)]
df["Category"] = prediction
df.to_csv("submission.csv",index = False)