Install torchvision and Hugging Face's evaluate and transformers libraries. Make sure that the installed accelerate version is compatible with both transformers and torchvision.
We also install scikit-image, which is used to read the images.

In [None]:
!pip install --quiet scikit-image
!pip install --quiet matplotlib
!pip install --quiet torchvision
!pip install --quiet evaluate
!pip install --quiet transformers
!pip install --quiet accelerate>=0.21.0 -U

# Prepare Colab's session by downloading the data from drive
Since we are using Colab, the data is deleted whenever the session disconnects. We therefore download the data from Google Drive at the start of every session to prepare the notebook for running.

In [None]:
import gdown, os

DATASET_ROOT_DIR = "/content/data"

if not os.path.exists(DATASET_ROOT_DIR):

  file_id = "1Z5ZjEdrOvgcHdU6FSpzG53oQ0QlCGv3-"
  gdown.download(f"https://drive.google.com/uc?id={file_id}", output=f"/content/")
  !unzip /content/data.zip

In [None]:
import os
from itertools import chain
from torch.utils.data import Dataset
import torch
from skimage import io
import pandas as pd
import numpy as np

import numpy as np # for transformation
import torch # PyTorch package
import torchvision.transforms as transforms # transform data
import torch.nn as nn # basic building block for neural neteorks
import torch.nn.functional as F # import convolution functions like Relu
import torch.optim as optim # optimzer
from tqdm import tqdm
import evaluate

from transformers import TrainingArguments, Trainer, DefaultDataCollator

from sklearn.model_selection import train_test_split, StratifiedKFold

In [None]:
## Modeling Configurations

# Number of samples contributing to one optimizer step. In the training loop below this
# value is passed as `gradient_accumulation_steps`, while the per-device batch size is 1.
BATCH_SIZE = 8 # If you have less ram, use 4

# Dataset Definition

We use a custom dataset that loads the images on demand in order to save RAM. The data is represented as a lookup table that keeps only the image path and label in memory; each image is loaded from disk only when it is requested.

First, we prepare a table in pandas DataFrame format that contains the image's information:
1. Root directory: the folder containing the image (e.g. /data/cv)
2. Image file name: the name of the image file. All images are in `.tif` extension
3. text_label: the label of the image. It is one of: ` 'ad', 'cv', 'doc', 'email', 'other'`

In [None]:
def prepare_data(root_dir:str):
  """Build the image lookup table and the label <-> id mappings.

  Walks `root_dir` and records every file found in a directory below it. Files
  located directly inside `root_dir` are skipped. The text label of an image is
  the name of the directory that contains it.

  Arguments:
      root_dir: the dataset root directory (one sub-folder per document type).

  Returns:
      A tuple `(images_lookup, label2id, id2label)` where `images_lookup` is a
      DataFrame with the columns `parent_dir`, `img_path` and `text_label`,
      `label2id` maps each document type to an integer id and `id2label` is
      the inverse mapping.
  """
  records = []
  for walk_index, (parent_dir, _, file_names) in enumerate(os.walk(root_dir)):
    if walk_index == 0:
      # The first entry yielded by os.walk is the root itself; it holds no labelled images.
      continue
    records.extend({"parent_dir": parent_dir, "img_path": file_name} for file_name in file_names)

  # os.walk returns entries in arbitrary (filesystem) order. Sorting keeps the row
  # order stable across sessions, so seeded splits of the table are reproducible.
  records.sort(key=lambda record: (record["parent_dir"], record["img_path"]))
  print(f"[Dataset][INFO] Preparing paths for {len(records)} images")

  # Naming the columns explicitly keeps the table well-formed even when no image was found.
  images_lookup = pd.DataFrame(records, columns=["parent_dir", "img_path"])
  # basename/normpath instead of splitting on "/" so that the label extraction
  # does not depend on the path separator or on a trailing separator.
  images_lookup['text_label'] = images_lookup['parent_dir'].map(
      lambda path_: os.path.basename(os.path.normpath(path_.strip()))
  )

  label2id = { doc_type:id_   for id_, doc_type in enumerate([ 'ad', 'cv', 'doc', 'email', 'other' ]) }
  id2label = { id_:doc_type   for doc_type, id_ in label2id.items() }

  return images_lookup, label2id, id2label

Second, we define our custom dataset class `DocumentTypeDataset`.\
This class takes as input a dataframe containing the image information and a label-to-id mapping dictionary, which is used to assign a numerical id to each class.
It also accepts an image transformation pipeline.

In order to save RAM, this dataset loads the images on demand. We only keep the path to each image in memory, and load the images only when they are required.

In [None]:
class DocumentTypeDataset(Dataset):
  """Document Type Dataset.

  Keeps only a lookup table of image paths and text labels in memory and reads
  each image from disk when it is requested through `__getitem__`.
  """

  def __init__(self,data_lookup:pd.DataFrame, label2id:dict, img_transformation=None):
    """
    Arguments:
        data_lookup: the dataframe of image paths and corresponding labels. It must
            provide the columns `parent_dir`, `img_path` and `text_label`.
        label2id: dictionary to map the labels to their corresponding Ids
        img_transformation (callable, optional): Optional transformation to be applied
            on the image of every sample, if any.
    """
    self.img_transformation = img_transformation

    self.images_lookup = data_lookup
    self.label2id = label2id


  def __len__(self):
    """ Number of images in the lookup table. """
    return len(self.images_lookup)

  def __getitem__(self, idx):
    """ Load the image at position `idx` from disk and return it with its label id.

    Returns:
        A dictionary with the keys `image` (the loaded, optionally transformed image)
        and `label` (the integer id of the image's text label).
    """

    if torch.is_tensor(idx):
      idx = idx.tolist()

    # fetch the row once (positional lookup) instead of once per column
    record = self.images_lookup.iloc[idx]

    # construct the image path from the parent dir (e.g. `/data/cv`) and the image name (e.g.  `image_1.tif`)
    img_name = os.path.join(record['parent_dir'], record['img_path'])

    # read the image from disk using scikit-image. No greyscale conversion is requested here,
    # so the array has whatever channel layout the file itself stores.
    # NOTE(review): the model expects a single input channel - confirm the .tif files are greyscale.
    image = io.imread(img_name)
    # construct the data record <X, Y> where X is the image and Y is the corresponding true label
    sample = {'image': image, 'label': self.label2id[record['text_label']]}
    # If we need to apply any transformation to the image, apply them
    if self.img_transformation is not None:
      sample['image'] = self.img_transformation(sample['image'])

    return sample

We need to apply some transformations to the images to enhance the data quality and standardize them. The transformations we use are:
1. **ToTensor():** make sure that the image is in PyTorch tensor format
2. **Resize():** Images of different sizes can confuse the model and reduce its performance. To avoid different image sizes, we resize all images to a fixed size of 64*64.
3. **Normalize():** To facilitate learning the distribution of every class in the dataset, we normalize the images with mean μ=0.5 and standard deviation σ=0.5, i.e. every pixel value x becomes (x - 0.5) / 0.5.

In [None]:
# The image transformation pipeline: the steps below are applied in order to every loaded image.
transformation_pipeline = transforms.Compose([
    # 1. convert the loaded image into a PyTorch tensor
    transforms.ToTensor(),
    # 2. bring every image to the same fixed 64*64 size
    transforms.Resize(size=(64, 64)),
    # 3. shift by the mean (0.5) and scale by the standard deviation (0.5)
    transforms.Normalize(mean=0.5, std=0.5),
])


Prepare the image lookup for the entire dataset. This will be split into training and validation subsets in the K-fold cross-validation step later on.

In [None]:
# Build the lookup table of all images under the dataset root, together with the
# label -> id and id -> label mappings used by the dataset and for reading predictions.
images_lookup, label2id, id2label = prepare_data(DATASET_ROOT_DIR)

The data collator used by HuggingFace's model trainer to stack data.

In [None]:
# Collator that stacks the per-sample dictionaries into batches of PyTorch tensors.
# NOTE(review): the default collator is documented to expose a `label` key as `labels`,
# which is the argument name used by the model's forward pass - confirm against the
# installed transformers version.
data_collator = DefaultDataCollator(return_tensors="pt")

# Model Building

We use a Convolutional Neural Network to build our model and train it from scratch. Since we have a relatively moderate amount of data, we use a medium-sized CNN architecture. The model consists of:
1. The convolution blocks consisting of:
  1. First 2D Convolution layer consisting of 3*3 kernels and 16 output channels. We use this small sized kernel to help the model learn local image information at early stage.
  2. 3*3 2D-MaxPooling layer.
  3. Second 2D convolution layer consisting of 5*5 kernels and 5 output channels. We use a larger kernel than the first one to allow the model understand more general features of the images.
  4. 2*2 2D-MaxPooling Layer.

2. classification block consisting of:
  1. First Fully Connected Layer of size 128.
  2. Second Fully Connected Layer of size 64.   
  3. Output layer consisting of 5 neurons for 5 classes.

For all layers we use LeakyReLU Activation.

In [None]:
class CNNModel(nn.Module):
  ''' Simple Convolutional Neural Network for single-channel 64*64 images.

  The network is made of two convolution + max-pooling blocks followed by a
  three-layer fully connected classification head. LeakyReLU is used after
  every layer except the output layer.
  '''

  def __init__(self, num_classes=5):
    ''' initialize the network

    Arguments:
        num_classes: number of output classes (size of the output layer).
    '''
    super().__init__()

    self.convolution_blocks = nn.Sequential(
        nn.Conv2d(1, 16, 3), # a single input channel, 16 output channels, 3*3 convolution kernel
        nn.LeakyReLU(),
        nn.MaxPool2d(3,3),
        nn.Conv2d(16, 5, 5), # 16 input channels, 5 output channels, 5*5 convolution kernel
        nn.LeakyReLU(),
        nn.MaxPool2d(2,2)
    )

    # Number of features per image after the convolution blocks, for a 64*64 input:
    # 64 -> conv 3*3 -> 62 -> pool 3 -> 20 -> conv 5*5 -> 16 -> pool 2 -> 8, with 5 channels.
    self.flattening_length = 5*8*8

    self.classification_head = nn.Sequential(
        nn.Linear(self.flattening_length, 128),
        nn.LeakyReLU(),
        nn.Linear(128,64),
        nn.LeakyReLU(),
        nn.Linear(64, num_classes) # one output neuron per class
    )

    self.loss_fn = nn.CrossEntropyLoss()


  def forward(self, image, labels=None):
    ''' the forward propagation algorithm

    Arguments:
        image: batch of images of shape (batch, 1, 64, 64).
        labels: optional batch of integer class ids. When omitted (e.g. at
            inference time) no loss is computed.

    Returns:
        A dictionary with the key `output` (the raw class scores of shape
        (batch, num_classes)) and, only when `labels` is given, the key `loss`
        (the cross-entropy loss).
    '''
    x = self.convolution_blocks(image)
    # Flatten everything but the batch dimension. Unlike `view(-1, N)` this never moves
    # values across samples: an unexpected image size fails in the first linear layer
    # instead of silently producing a different batch size.
    x = torch.flatten(x, start_dim=1)
    x = self.classification_head(x)

    result = {'output': x}
    if labels is not None:
      # CrossEntropyLoss accepts class indices directly; this yields the same value
      # as passing one-hot encoded float targets.
      result['loss'] = self.loss_fn(x, labels)
    return result

# Instantiate the model once and print its layer structure as a sanity check.
# The training loop creates a fresh model for every fold.
net = CNNModel()
print(net)

## Training Utilities

Every training epoch, we need to evaluate the model's performance on the validation set. This helps us see if the model is improving and whether it starts to overfit on the training data or not.

Since we are solving a classification task, the most suitable metrics are the Accuracy and the F1-score. For the F1-score, we report the macro-average.

In [None]:
# Define Evaluation Methods:
# Load the metric implementations once; they are reused by `compute_metrics` at every evaluation.
accuracy = evaluate.load("accuracy")
f1_score = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute the accuracy and the macro-averaged F1-score for one evaluation run.

    Arguments:
        eval_pred: a pair `(predictions, labels)` holding the per-class scores
            and the true label ids.

    Returns:
        A dictionary with the keys `accuracy` and `f1`.
    """
    scores, references = eval_pred
    # the predicted class is the one with the highest score
    predicted_ids = np.argmax(scores, axis=1)

    accuracy_result = accuracy.compute(predictions=predicted_ids, references=references)
    f1_result = f1_score.compute(predictions=predicted_ids, references=references, average='macro')

    return {'accuracy': accuracy_result['accuracy'], 'f1': f1_result['f1']}

# Training Loop with K-folds Cross Validation

Finally, we use K-fold Cross Validation to assess the model's performance, with k=5. Since we have imbalanced classes, we use the Stratified version of the K-fold CV. This way each fold has the same distribution of classes as the overall dataset, hence giving more stable and meaningful results.

In [None]:
import inspect

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5331) # initialize the K-fold CV object.

X = images_lookup[['parent_dir', 'img_path']]
Y = images_lookup['text_label']

# Newer transformers releases name this argument `eval_strategy`, older ones
# `evaluation_strategy`. The dependency is installed unpinned, so pick whichever
# name the installed version accepts.
eval_strategy_kwarg = (
  "eval_strategy"
  if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
  else "evaluation_strategy"
)

# Validation metrics of the best model of every fold, collected for the final summary.
fold_metrics = []

for i, (train_index, test_index) in enumerate(skf.split(X, Y)):
  ## Every fold, construct the training and validation data subsets.

  print(f'{"-"*7} Fold #{i+1} { "-"*7 }')
  training_lookup, testing_lookup = images_lookup.iloc[train_index], images_lookup.iloc[test_index]
  training_data = DocumentTypeDataset(training_lookup, label2id, transformation_pipeline)
  validation_data = DocumentTypeDataset(testing_lookup, label2id, transformation_pipeline)
  # reset model every fold, so no weights leak from one fold into the next
  net = CNNModel()

  training_args = TrainingArguments(
    output_dir=f"/content/cnn_model/fold#{i}", # NOTE: directories are 0-based, the printed fold number is 1-based
    remove_unused_columns=False,
    save_strategy="epoch", # must match the evaluation strategy for `load_best_model_at_end`
    learning_rate=1e-3, # Use a relatively small learning rate to avoid overfitting on the data.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=BATCH_SIZE, # one optimizer step every BATCH_SIZE samples
    per_device_eval_batch_size=1,
    num_train_epochs=10, # this is the best number of epochs to train the model without allowing it to overfit.
    warmup_ratio=0.1,
    load_best_model_at_end=True,
    metric_for_best_model="f1", # The best model is the highest F1 score not the lowest.
    greater_is_better=True, # stated explicitly: a higher F1 means a better model
    push_to_hub=False,
    **{eval_strategy_kwarg: "epoch"}, # evaluate on the validation subset after every epoch
  )

  trainer = Trainer(
    model=net,
    args=training_args,
    data_collator=data_collator,
    train_dataset=training_data,
    eval_dataset=validation_data,
    compute_metrics=compute_metrics,
  )
  trainer.train()

  # The best checkpoint is loaded at the end of training, so this evaluates the best model of the fold.
  fold_metrics.append(trainer.evaluate())

# Summarize the cross-validation: average the validation metrics over all folds.
mean_accuracy = np.mean([metrics["eval_accuracy"] for metrics in fold_metrics])
mean_f1 = np.mean([metrics["eval_f1"] for metrics in fold_metrics])
print(f"[CV][INFO] Mean accuracy: {mean_accuracy:.4f} | Mean macro-F1: {mean_f1:.4f}")