In [1]:
# Install dependencies

! pip install -U transformers[torch] datasets wandb



In [2]:
import cv2
import wandb
import os
import numpy as np
import urllib.request as urllib
import pandas as pd
import random
import torch

from PIL import ImageDraw, ImageFont, Image
from datasets import load_dataset, load_metric
from google.colab import auth
from google.cloud import storage
from google.colab import drive
from transformers import ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer

In [3]:
# Use HF datasets to download data stored locally on Google drive

drive.mount('/content/drive')
IMGS_DATA_ROOT = 'drive/MyDrive/AML_final_proj/Yoga-82-imgs/'

ds = load_dataset("imagefolder", data_dir=IMGS_DATA_ROOT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Resolving data files:   0%|          | 0/11743 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/3351 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1772 [00:00<?, ?it/s]

In [4]:
ds["train"][54]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=240x160>,
 'label': 0}

In [5]:
# Define model name to pull from HF and get associated data processor for image preprocessing

model_name_or_path = 'google/vit-base-patch16-224-in21k'
processor = ViTImageProcessor.from_pretrained(model_name_or_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
processor(ds["train"][0]['image'], return_tensors='pt')

{'pixel_values': tensor([[[[-0.2078, -0.2078, -0.2549,  ..., -0.2235, -0.2078, -0.2078],
          [-0.2078, -0.2078, -0.2000,  ..., -0.2157, -0.2078, -0.2078],
          [-0.2078, -0.2000, -0.1608,  ..., -0.1765, -0.2000, -0.2078],
          ...,
          [-0.2471, -0.2392, -0.1059,  ..., -0.1294, -0.2000, -0.2078],
          [-0.2235, -0.2314, -0.1843,  ..., -0.2078, -0.2078, -0.2078],
          [-0.1922, -0.1922, -0.2000,  ..., -0.2078, -0.2078, -0.2078]],

         [[ 0.5843,  0.5922,  0.6000,  ...,  0.5922,  0.5922,  0.5922],
          [ 0.6000,  0.6000,  0.6000,  ...,  0.5843,  0.5922,  0.5922],
          [ 0.5843,  0.5843,  0.5765,  ...,  0.5765,  0.5922,  0.5922],
          ...,
          [ 0.6078,  0.6078,  0.5608,  ...,  0.5608,  0.5922,  0.5922],
          [ 0.6000,  0.5922,  0.5765,  ...,  0.5922,  0.5922,  0.5922],
          [ 0.5843,  0.5765,  0.5765,  ...,  0.5922,  0.5922,  0.5922]],

         [[ 0.9765,  0.9686,  0.9765,  ...,  0.9608,  0.9765,  0.9765],
          [ 0

In [7]:
processor(ds["train"][0]['image'], return_tensors='pt')['pixel_values'].shape

torch.Size([1, 3, 224, 224])

In [8]:
processor(ds["train"][0]['image'], return_tensors='pt')['pixel_values'].ndim

4

In [9]:
processor

ViTImageProcessor {
  "_valid_processor_keys": [
    "images",
    "do_resize",
    "size",
    "resample",
    "do_rescale",
    "rescale_factor",
    "do_normalize",
    "image_mean",
    "image_std",
    "return_tensors",
    "data_format",
    "input_data_format"
  ],
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "ViTImageProcessor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

In [10]:
def process_example(example):
    inputs = processor(example['image'], return_tensors='pt')
    inputs['labels'] = example['label']
    return inputs

In [11]:
process_example(ds['train'][0])

{'pixel_values': tensor([[[[-0.2078, -0.2078, -0.2549,  ..., -0.2235, -0.2078, -0.2078],
          [-0.2078, -0.2078, -0.2000,  ..., -0.2157, -0.2078, -0.2078],
          [-0.2078, -0.2000, -0.1608,  ..., -0.1765, -0.2000, -0.2078],
          ...,
          [-0.2471, -0.2392, -0.1059,  ..., -0.1294, -0.2000, -0.2078],
          [-0.2235, -0.2314, -0.1843,  ..., -0.2078, -0.2078, -0.2078],
          [-0.1922, -0.1922, -0.2000,  ..., -0.2078, -0.2078, -0.2078]],

         [[ 0.5843,  0.5922,  0.6000,  ...,  0.5922,  0.5922,  0.5922],
          [ 0.6000,  0.6000,  0.6000,  ...,  0.5843,  0.5922,  0.5922],
          [ 0.5843,  0.5843,  0.5765,  ...,  0.5765,  0.5922,  0.5922],
          ...,
          [ 0.6078,  0.6078,  0.5608,  ...,  0.5608,  0.5922,  0.5922],
          [ 0.6000,  0.5922,  0.5765,  ...,  0.5922,  0.5922,  0.5922],
          [ 0.5843,  0.5765,  0.5765,  ...,  0.5922,  0.5922,  0.5922]],

         [[ 0.9765,  0.9686,  0.9765,  ...,  0.9608,  0.9765,  0.9765],
          [ 0

In [12]:
process_example(ds['train'][0])['pixel_values'].shape

torch.Size([1, 3, 224, 224])

In [13]:
# Helper function to transform an entire batch of images in order to have them be ready to be fed into the model

def transform(example_batch):
    # Make sure all of the images are in 'RGB' mode with 3 channels
    rgb_inputs = [(lambda x: x.convert('RGB'))(item) for item in example_batch['image']]

    # Take a list of PIL images and turn them to pixel values
    inputs = processor([x for x in rgb_inputs], return_tensors='pt')

    # Include labels
    inputs['labels'] = example_batch['label']
    return inputs

In [14]:
transformed_ds = ds.with_transform(transform)

In [15]:
transformed_ds['train'][0:2]

{'pixel_values': tensor([[[[-0.2078, -0.2078, -0.2549,  ..., -0.2235, -0.2078, -0.2078],
          [-0.2078, -0.2078, -0.2000,  ..., -0.2157, -0.2078, -0.2078],
          [-0.2078, -0.2000, -0.1608,  ..., -0.1765, -0.2000, -0.2078],
          ...,
          [-0.2471, -0.2392, -0.1059,  ..., -0.1294, -0.2000, -0.2078],
          [-0.2235, -0.2314, -0.1843,  ..., -0.2078, -0.2078, -0.2078],
          [-0.1922, -0.1922, -0.2000,  ..., -0.2078, -0.2078, -0.2078]],

         [[ 0.5843,  0.5922,  0.6000,  ...,  0.5922,  0.5922,  0.5922],
          [ 0.6000,  0.6000,  0.6000,  ...,  0.5843,  0.5922,  0.5922],
          [ 0.5843,  0.5843,  0.5765,  ...,  0.5765,  0.5922,  0.5922],
          ...,
          [ 0.6078,  0.6078,  0.5608,  ...,  0.5608,  0.5922,  0.5922],
          [ 0.6000,  0.5922,  0.5765,  ...,  0.5922,  0.5922,  0.5922],
          [ 0.5843,  0.5765,  0.5765,  ...,  0.5922,  0.5922,  0.5922]],

         [[ 0.9765,  0.9686,  0.9765,  ...,  0.9608,  0.9765,  0.9765],
          [ 0

In [16]:
transformed_ds['train'][0:2]['pixel_values'].shape

torch.Size([2, 3, 224, 224])

In [17]:
# Custom collator function to stack samples in batch

def collate_func(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.tensor([x['labels'] for x in batch])
    }

In [18]:
# Metrics use for performance evaluation

def compute_metrics(p):
    metrics = dict()
    acc_metric = load_metric("accuracy", trust_remote_code=True)
    pre_metric = load_metric("precision", trust_remote_code=True)
    re_metric = load_metric("recall", trust_remote_code=True)
    f1_metric = load_metric("f1", trust_remote_code=True)
    roc_auc_metric = load_metric("roc_auc", "multiclass", trust_remote_code=True)

    logits = p.predictions
    labels = p.label_ids
    preds = np.argmax(logits, axis=-1)

    exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    softmax_output = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

    metrics.update(acc_metric.compute(predictions=preds, references=labels))
    metrics.update(pre_metric.compute(predictions=preds, references=labels, average='weighted'))
    metrics.update(re_metric.compute(predictions=preds, references=labels, average='weighted'))
    metrics.update(f1_metric.compute(predictions=preds, references=labels, average='weighted'))
    metrics.update(roc_auc_metric.compute(prediction_scores=softmax_output, references=labels, multi_class="ovo"))

    return metrics


def compute_metrics_no_rocauc(p):
    metrics = dict()
    acc_metric = load_metric("accuracy", trust_remote_code=True)
    pre_metric = load_metric("precision", trust_remote_code=True)
    re_metric = load_metric("recall", trust_remote_code=True)
    f1_metric = load_metric("f1", trust_remote_code=True)

    logits = p.predictions
    labels = p.label_ids
    preds = np.argmax(logits, axis=-1)

    metrics.update(acc_metric.compute(predictions=preds, references=labels))
    metrics.update(pre_metric.compute(predictions=preds, references=labels, average='weighted'))
    metrics.update(re_metric.compute(predictions=preds, references=labels, average='weighted'))
    metrics.update(f1_metric.compute(predictions=preds, references=labels, average='weighted'))

    return metrics


In [19]:
# Get the model from HF with the proper label mappings

labels = ds['train'].features['label'].names

model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)}
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Leverage GPUs in Colab and set up Weights and Biases to point to the right project

if torch.cuda.is_available():
    device = torch.device('cuda')

torch.cuda.empty_cache()
os.environ['WANDB__SERVICE_WAIT'] = "300"
wandb.init(entity="aml-experiments", project="vit-yoga-classifier", settings=wandb.Settings(start_method="fork"), reinit=True)
# wandb.init(entity="aml-experiments", project="vit-yoga-classifier", settings=wandb.Settings(start_method="fork"), reinit=True)

[34m[1mwandb[0m: Currently logged in as: [33mcharlesa_m-18[0m ([33maml-experiments[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [21]:
# Define the training arguments for the baseline model

training_args = TrainingArguments(
  output_dir="drive/MyDrive/AML_final_proj/vit-base-yoga82",
  per_device_train_batch_size=128,
  per_device_eval_batch_size=128,
  save_strategy='epoch',
  evaluation_strategy="epoch",
  logging_strategy='epoch',
  num_train_epochs=5,
  fp16=True,
  learning_rate=2e-4,
  remove_unused_columns=False,
  push_to_hub=False,
  report_to='wandb',
  load_best_model_at_end=True,
)

In [22]:
# Instantiate the HF trainer object for training

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_func,
    compute_metrics=compute_metrics,
    train_dataset=transformed_ds["train"],
    eval_dataset=transformed_ds["validation"],
    tokenizer=processor,
)

In [23]:
# Train the model and save model checkpoint

train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("Evaluation during training on validation split", train_results.metrics)
trainer.save_metrics("Evaluation during training on validation split", train_results.metrics)
trainer.save_state()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,3.3399,2.295976,0.618621,0.611905,0.618621,0.557122,0.987517
2,1.5362,1.218406,0.811996,0.806027,0.811996,0.792469,0.994406
3,0.6665,0.821685,0.853178,0.853692,0.853178,0.846948,0.995692
4,0.3243,0.665622,0.868099,0.870291,0.868099,0.865761,0.996511
5,0.2128,0.62728,0.874366,0.87537,0.874366,0.872266,0.996708


  acc_metric = load_metric("accuracy", trust_remote_code=True)


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/3.20k [00:00<?, ?B/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


***** Evaluation during training on validation split metrics *****
  epoch                    =          5.0
  total_flos               = 4240502180GF
  train_loss               =       1.2159
  train_runtime            =   1:52:52.65
  train_samples_per_second =        8.669
  train_steps_per_second   =        0.068


In [24]:
# Evaluate the trained model on the tes split

metrics = trainer.evaluate(transformed_ds['test'])
trainer.log_metrics("Evaluation on test split", metrics)
trainer.save_metrics("Evaluation on test split", metrics)



  _warn_prf(average, modifier, msg_start, len(result))


***** Evaluation on test split metrics *****
  epoch                   =        5.0
  eval_accuracy           =     0.8657
  eval_f1                 =      0.863
  eval_loss               =     0.6439
  eval_precision          =     0.8688
  eval_recall             =     0.8657
  eval_roc_auc            =     0.9962
  eval_runtime            = 0:10:56.72
  eval_samples_per_second =      2.698
  eval_steps_per_second   =      0.021


In [25]:
# Define sweepm config for hyperparameter search

sweep_config = {
    "name": "vit-hp-sweep",
    "method": "random",
    "metric": {"name": "accuracy", "goal": "maximize"},
    "parameters": {
        "learning_rate": {"min": 0.0001, "max": 0.1},
        "dropout": {"min": 0.1, "max": 0.3},
        "weight_decay": {"values": [0.1, 0.03]},
    },
}

In [26]:
# Create sweep run hooked to WANDB

sweep_id = wandb.sweep(sweep_config, entity="aml-experiments", project="vit-yoga-classifier")

Create sweep with ID: yincexld
Sweep URL: https://wandb.ai/aml-experiments/vit-yoga-classifier/sweeps/yincexld


In [27]:
# Model init function for each iteration of the sweep

def model_init(trial):
    labels = ds['train'].features['label'].names

    return ViTForImageClassification.from_pretrained(
        model_name_or_path,
        num_labels=len(labels),
        id2label={str(i): c for i, c in enumerate(labels)},
        label2id={c: str(i) for i, c in enumerate(labels)}
    )

In [28]:
# Define an iteration for hyperparameter search sweeps

def train(config=None):
  with wandb.init(config=config):
    # set sweep configuration
    config = wandb.config
    training_args = TrainingArguments(
      output_dir="drive/MyDrive/AML_final_proj/vit-base-yoga82",
      per_device_train_batch_size=128,
      per_device_eval_batch_size=128,
      save_strategy='epoch',
      evaluation_strategy="epoch",
      logging_strategy='epoch',
      num_train_epochs=5,
      learning_rate=config.learning_rate,
      weight_decay=config.weight_decay,
      fp16=True,
      remove_unused_columns=False,
      push_to_hub=False,
      report_to='wandb',
      load_best_model_at_end=True,
    )

    hp_search_trainer = Trainer(
        model=None,
        args=training_args,
        data_collator=collate_func,
        compute_metrics=compute_metrics_no_rocauc,
        train_dataset=transformed_ds["train"],
        eval_dataset=transformed_ds["validation"],
        tokenizer=processor,
        model_init=model_init
    )

    hp_search_trainer.train()


In [None]:
wandb.agent(sweep_id, train, count=15)

[34m[1mwandb[0m: Agent Starting Run: x1dn0vbr with config:
[34m[1mwandb[0m: 	dropout: 0.16642942183056766
[34m[1mwandb[0m: 	learning_rate: 0.07956314227946773
[34m[1mwandb[0m: 	weight_decay: 0.1


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Exception in thread ChkStopThr:
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/wandb_run.py", line 286, in check_stop_status
    self._loop_check_sta

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,9.0702,,0.005073,2.6e-05,0.005073,5.1e-05
2,0.0,,0.005073,2.6e-05,0.005073,5.1e-05
3,0.0,,0.005073,2.6e-05,0.005073,5.1e-05
4,0.0,,0.005073,2.6e-05,0.005073,5.1e-05
5,0.0,,0.005073,2.6e-05,0.005073,5.1e-05


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▁▁▁▁
eval/f1,▁▁▁▁▁
eval/precision,▁▁▁▁▁
eval/recall,▁▁▁▁▁
eval/runtime,█▄▆▃▁
eval/samples_per_second,▁▅▃▆█
eval/steps_per_second,▁▅▂▇█
train/epoch,▁▁▃▃▅▅▆▆███
train/global_step,▁▁▃▃▅▅▆▆███
train/learning_rate,▁▁▁▁▁

0,1
eval/accuracy,0.00507
eval/f1,5e-05
eval/loss,
eval/precision,3e-05
eval/recall,0.00507
eval/runtime,65.4718
eval/samples_per_second,51.182
eval/steps_per_second,0.412
total_flos,4.553204545621832e+18
train/epoch,5.0


[34m[1mwandb[0m: Agent Starting Run: gmvyfyxh with config:
[34m[1mwandb[0m: 	dropout: 0.2027861708398697
[34m[1mwandb[0m: 	learning_rate: 0.04693509432098646
[34m[1mwandb[0m: 	weight_decay: 0.1


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,5.0826,,0.005073,2.6e-05,0.005073,5.1e-05
2,0.0,,0.005073,2.6e-05,0.005073,5.1e-05
3,0.0,,0.005073,2.6e-05,0.005073,5.1e-05
4,0.0,,0.005073,2.6e-05,0.005073,5.1e-05
5,0.0,,0.005073,2.6e-05,0.005073,5.1e-05


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▁▁▁▁
eval/f1,▁▁▁▁▁
eval/precision,▁▁▁▁▁
eval/recall,▁▁▁▁▁
eval/runtime,▄▁█▂▅
eval/samples_per_second,▅█▁▇▄
eval/steps_per_second,▅█▁▇▅
train/epoch,▁▁▃▃▅▅▆▆███
train/global_step,▁▁▃▃▅▅▆▆███
train/learning_rate,▁▁▁▁▁

0,1
eval/accuracy,0.00507
eval/f1,5e-05
eval/loss,
eval/precision,3e-05
eval/recall,0.00507
eval/runtime,66.7432
eval/samples_per_second,50.207
eval/steps_per_second,0.405
total_flos,4.553204545621832e+18
train/epoch,5.0


[34m[1mwandb[0m: Agent Starting Run: 0nvyw2db with config:
[34m[1mwandb[0m: 	dropout: 0.1756146239648812
[34m[1mwandb[0m: 	learning_rate: 0.0734761829277811
[34m[1mwandb[0m: 	weight_decay: 0.03


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,8.8258,,0.005073,0.000202,0.005073,0.000356
2,0.0,,0.005073,0.000202,0.005073,0.000356
3,0.0,,0.005073,0.000202,0.005073,0.000356
4,0.0,,0.005073,0.000202,0.005073,0.000356


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
