In [1]:
import torch

# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print("There are %d GPU(s) available." % torch.cuda.device_count())
    print("We will use the GPU:", torch.cuda.get_device_name(0))
else:
    print("No GPU available, using the CPU instead.")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 2070


## Load data

In [2]:
from datasets import load_dataset

# Load the image folder with the "imagefolder" builder. In the recorded run this
# yields a DatasetDict whose only split is "train" (6978 rows, 'image' + 'label').
dataset = load_dataset("imagefolder", data_dir="./sample_dataset/dataset")

Resolving data files:   0%|          | 0/6978 [00:00<?, ?it/s]

In [3]:
# Show the split layout and row counts of what was just loaded.
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 6978
    })
})

In [4]:
from datasets import concatenate_datasets
import numpy as np

# Fixed seed so the resampled dataset is identical on every Restart & Run All.
SEED = 42
# Every class is resampled (with replacement) to this many rows.
SAMPLES_PER_CLASS = 1000

rng = np.random.default_rng(SEED)

# Work from the label column only: it gives the row indices of each class
# without iterating over (and decoding) every image once per class.
train_full = dataset["train"]
label_ids = np.asarray(train_full["label"])
# Take the class count from the dataset's own label feature instead of hard-coding it.
num_classes = train_full.features["label"].num_classes

# One sub-dataset per class, in label-id order.
dss = [train_full.select(np.flatnonzero(label_ids == i)) for i in range(num_classes)]

# Balance the classes by drawing SAMPLES_PER_CLASS rows from each, with replacement.
# NOTE(review): because rows are drawn with replacement *before* the train/test
# split, copies of the same image can land on both sides of that split. Splitting
# first and oversampling only the training part would avoid this.
dss_ = [d.select(rng.choice(len(d), SAMPLES_PER_CLASS, replace=True)) for d in dss]

dataset = concatenate_datasets(dss_)
dataset

Dataset({
    features: ['image', 'label'],
    num_rows: 15000
})

In [5]:
# Hold out 30% of the rows for evaluation. The fixed seed keeps the split
# identical between runs so reported metrics are comparable.
# NOTE(review): the rows being split were sampled with replacement, so a held-out
# image may also have copies in the training split.
dataset = dataset.train_test_split(test_size=0.3, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 10500
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 4500
    })
})

In [6]:
# Peek at one raw training example: a PIL image together with its integer label.
dataset["train"][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1024x1024>,
 'label': 4}

In [7]:
# NOTE: this bare expression is not the last statement of the cell, so the
# notebook displays nothing for it.
dataset["train"].features
# Keep a handle on the splits before any transform is attached; the top-k
# evaluation later reads the raw "image" entries through this name.
original_dataset = dataset.copy()

## Preprocess data

In [8]:
from transformers import AutoImageProcessor

# Checkpoint name shared by the image processor here and the model loaded later.
model_name = 'google/vit-base-patch16-224'
# The processor supplies the mean/std and target size used to build the
# torchvision transform pipeline.
processor = AutoImageProcessor.from_pretrained(model_name)

In [9]:
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor, Resize

# Normalise with the channel statistics reported by the checkpoint's processor.
normalize = Normalize(mean=processor.image_mean, std=processor.image_std)

# Always build an explicit (height, width) target. Passing a bare int to Resize
# only fixes the shorter edge and keeps the aspect ratio, so images of different
# shapes would produce tensors of different sizes that cannot be stacked into a
# batch. A processor that reports "shortest_edge" therefore maps to a square.
if "shortest_edge" in processor.size:
    size = (processor.size["shortest_edge"], processor.size["shortest_edge"])
else:
    size = (processor.size["height"], processor.size["width"])

# Deterministic preprocessing: resize, convert to tensor, normalise.
# RandomResizedCrop(size) can replace Resize(size) for training-time augmentation.
_transforms = Compose([Resize(size), ToTensor(), normalize])

In [10]:
def transforms(examples):
    """Turn a batch of images into model inputs.

    Each entry of ``examples["image"]`` is converted to RGB and passed through
    the module-level ``_transforms`` pipeline. The results are stored under
    ``"pixel_values"``, the ``"image"`` entry is removed, and the same
    (mutated) batch mapping is returned.
    """
    pixel_values = []
    for picture in examples["image"]:
        pixel_values.append(_transforms(picture.convert("RGB")))
    examples["pixel_values"] = pixel_values
    del examples["image"]
    return examples

In [11]:
# Attach `transforms` to the splits so that indexing a row yields "pixel_values"
# (and "label") instead of the raw image.
dataset = dataset.with_transform(transforms)

In [12]:
# Sanity check: the example now carries a normalised tensor under "pixel_values"
# and no longer has an "image" entry.
dataset["train"][0]

{'label': 4,
 'pixel_values': tensor([[[-0.3961, -0.5216, -0.6314,  ..., -0.5373, -0.4902, -0.4353],
          [-0.4118, -0.5294, -0.6314,  ..., -0.6941, -0.6314, -0.5765],
          [-0.4196, -0.5373, -0.6314,  ..., -0.8196, -0.7569, -0.6941],
          ...,
          [-0.4667, -0.3882, -0.3569,  ..., -0.3020, -0.0667,  0.0980],
          [-0.4902, -0.3647, -0.3176,  ..., -0.2863, -0.0510,  0.1059],
          [-0.5373, -0.3412, -0.2863,  ..., -0.2784, -0.0353,  0.1137]],
 
         [[-0.3961, -0.5216, -0.6314,  ..., -0.5373, -0.4902, -0.4353],
          [-0.4118, -0.5294, -0.6314,  ..., -0.6941, -0.6314, -0.5765],
          [-0.4196, -0.5373, -0.6314,  ..., -0.8196, -0.7569, -0.6941],
          ...,
          [-0.4667, -0.3882, -0.3569,  ..., -0.3020, -0.0667,  0.0980],
          [-0.4902, -0.3647, -0.3176,  ..., -0.2863, -0.0510,  0.1059],
          [-0.5373, -0.3412, -0.2863,  ..., -0.2784, -0.0353,  0.1137]],
 
         [[-0.3961, -0.5216, -0.6314,  ..., -0.5373, -0.4902, -0.4353],

## Data collator

In [13]:
from transformers import DefaultDataCollator

# Collator instance handed to the Trainer as `data_collator`.
data_collator = DefaultDataCollator()

In [14]:
import evaluate
import numpy as np

# Accuracy metric from the `evaluate` library; compute_metrics calls its compute().
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predicted class
    of each row is the arg-max of the predictions along axis 1; the return
    value is whatever ``accuracy.compute`` produces for those class ids.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [15]:
# Class names in label-id order, taken from the dataset's label feature.
labels = dataset["train"].features["label"].names

# name -> integer class id. The ids are stored as ints (not strings) so the
# mapping written into the model config has the documented Dict[str, int] shape.
label2id = {label: i for i, label in enumerate(labels)}

# string id -> name. Keys stay strings because top_k_accuracy looks labels up
# with id2label[str(...)].
id2label = {str(i): label for i, label in enumerate(labels)}

## Training

In [16]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer


# Start from the pretrained checkpoint but size the classification head for our
# labels. ignore_mismatched_sizes=True lets loading proceed although the
# checkpoint's classifier has a different number of outputs; per the recorded
# warning, classifier.weight / classifier.bias are newly initialised.
model = AutoModelForImageClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
).to(device)

# Disabled alternative: load from a local model directory instead of the hub
# checkpoint (presumably a previously saved run — confirm before enabling).
# model = AutoModelForImageClassification.from_pretrained(
#     "./model_0106_572",
#     num_labels=len(labels),
#     id2label=id2label,
#     label2id=label2id,
#     ignore_mismatched_sizes=True
# ).to(device)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([15]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([15, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best accuracy when training finishes.
training_args = TrainingArguments(
    output_dir="./sample_logs",
    # Presumably needed so the Trainer does not drop the raw "image" column that
    # the `transforms` function reads — confirm.
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    # 16 samples x 4 accumulation steps = 64 samples per optimizer step per device.
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    # "accuracy" is the metric produced by compute_metrics.
    metric_for_best_model="accuracy",
    disable_tqdm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    # The image processor is passed through the `tokenizer` argument
    # (presumably so it is saved together with the model — confirm).
    tokenizer=processor,
    compute_metrics=compute_metrics,
)

# Long-running cell: the recorded run reports train_runtime of about 4178 seconds.
trainer.train()

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=1640, training_loss=0.8045136031581135, metrics={'train_runtime': 4177.7992, 'train_samples_per_second': 25.133, 'train_steps_per_second': 0.393, 'total_flos': 8.126136800683647e+18, 'train_loss': 0.8045136031581135, 'epoch': 9.98})

In [18]:
# Evaluate on the split that was passed to the Trainer as eval_dataset.
trainer.evaluate()

{'eval_loss': 0.8449455499649048,
 'eval_accuracy': 0.7231111111111111,
 'eval_runtime': 95.7134,
 'eval_samples_per_second': 47.015,
 'eval_steps_per_second': 2.946,
 'epoch': 9.98}

In [19]:
import time

# Timestamped output directory, e.g. over_sample_model_epoch_10_01-06-24-13-45-10.
# Built only from the standard strftime directives (%m %d %y %H %M %S); this is
# the same string the "%D-%T" shorthand produced after replacing "/" and ":"
# with "-", but it does not rely on platform-specific directives.
model_path = 'over_sample_model_epoch_10_' + time.strftime("%m-%d-%y-%H-%M-%S")
# Persist the trained model under the timestamped directory name.
trainer.save_model(model_path)

# Reload the original, un-resampled image folder for a second evaluation.
# NOTE(review): this is the same data_dir the training data was sampled from,
# so these images are not unseen by the model.
ds = load_dataset("imagefolder", data_dir="./sample_dataset/dataset")
ds

Resolving data files:   0%|          | 0/6978 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 6978
    })
})

In [20]:
from transformers import pipeline

# Build an image-classification pipeline from the directory the model was saved to.
# NOTE(review): the image processor is supplied via `tokenizer=`; image pipelines
# normally take it as `image_processor=` — confirm against the installed
# transformers version before changing.
classifier = pipeline(
    "image-classification",
    model=model_path,
    tokenizer=processor,
    # Device index 0 when CUDA is available, otherwise -1.
    device=0 if torch.cuda.is_available() else -1,
)

In [21]:
def top_k_accuracy(data, ks=(1, 3, 5), predict=None, label_names=None):
    """Print top-k accuracy of a classifier over ``data``.

    Args:
        data: Sized iterable of examples; each example is indexed with
            ``"image"`` (the classifier input) and ``"label"`` (class id).
        ks: Cut-offs to report. Defaults to ``(1, 3, 5)``.
        predict: Callable ``predict(image, top_k=...)`` returning a ranked list
            of dicts with a ``"label"`` entry. Defaults to the module-level
            ``classifier``.
        label_names: Mapping from ``str(class_id)`` to label name. Defaults to
            the module-level ``id2label``.

    Returns:
        None. One line ``top <k>: <fraction>`` is printed per cut-off, in
        ascending order of k.

    Raises:
        ValueError: If ``data`` or ``ks`` is empty.
    """
    predict = classifier if predict is None else predict
    label_names = id2label if label_names is None else label_names

    cutoffs = sorted(set(ks))
    if not cutoffs:
        raise ValueError("top_k_accuracy needs at least one cut-off in ks")
    total = len(data)
    if total == 0:
        raise ValueError("top_k_accuracy needs at least one example")

    deepest = cutoffs[-1]
    hits = dict.fromkeys(cutoffs, 0)
    for example in data:
        # Ask only for as many candidates as the largest cut-off needs.
        ranked = predict(example["image"], top_k=deepest)
        expected = label_names[str(example["label"])]
        # Walk the candidates actually returned; a model with fewer classes than
        # the largest cut-off yields a shorter list, which must not be indexed blindly.
        for rank, candidate in enumerate(ranked[:deepest]):
            if candidate["label"] == expected:
                # A hit at this rank counts towards every cut-off that includes it.
                for k in cutoffs:
                    if rank < k:
                        hits[k] += 1
                break

    for k in cutoffs:
        print(f"top {k}: {hits[k] / total}")

In [22]:
# Top-k accuracy on the held-out split, using the raw (untransformed) images.
# NOTE(review): the split was made after sampling with replacement, so copies of
# a held-out image may also be in the training split — treat these numbers as
# optimistic.
top_k_accuracy(original_dataset["test"])



top 1: 0.7231111111111111
top 3: 0.938
top 5: 0.9686666666666667


In [23]:
# Top-k accuracy over the full original image folder.
# NOTE(review): the training data was sampled from this same folder, so this is
# not an estimate on unseen data.
top_k_accuracy(ds["train"])

top 1: 0.5911435941530524
top 3: 0.8306104901117799
top 5: 0.8998280309544282
