In [1]:
import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install transformers wandb



In [3]:
import numpy as np
from PIL import Image
import random

import wandb

import torch
from torchinfo import summary
import torchvision
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader, random_split

from transformers import ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer

2024-04-16 18:57:54.612603: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-16 18:57:54.612721: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-16 18:57:54.708443: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Weights & Biases

In [5]:
os.environ['WANDB_PROJECT'] = 'hotel-id-vit-v1'
os.environ['WANDB_LOG_MODEL'] = 'checkpoint'

In [6]:
torch.cuda.is_available()

True

In [7]:
DATA_PATH = '/kaggle/input/hotel-id-to-combat-human-trafficking-2022-fgvc9/'
IMG_PATH = os.path.join(DATA_PATH, 'train_images')

In [8]:
total_classes = len(os.listdir(IMG_PATH))

print(f'Different IDs: {total_classes}')

Different IDs: 3116


In [9]:
def set_seed(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)

In [10]:
set_seed(42)

In [11]:
model_name_or_path = 'google/vit-base-patch16-224-in21k'

processor = ViTImageProcessor.from_pretrained(model_name_or_path)
processor

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

ViTImageProcessor {
  "_valid_processor_keys": [
    "images",
    "do_resize",
    "size",
    "resample",
    "do_rescale",
    "rescale_factor",
    "do_normalize",
    "image_mean",
    "image_std",
    "return_tensors",
    "data_format",
    "input_data_format"
  ],
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "ViTImageProcessor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

In [12]:
class CustomImageFolderDataset(Dataset):
    def __init__(self, root, processor, transform=None):
        self.dataset = datasets.ImageFolder(root=root)
        self.processor = processor
        self.transform = transform
        self.id2label = {idx: label for idx, label in enumerate(self.dataset.classes)}
        self.label2id = {label: idx for idx, label in enumerate(self.dataset.classes)}
        
    def __len__(self):
        return len(self.dataset)
        
    def __getitem__(self, idx):
        image, label = self.dataset[idx]
        if self.transform is not None:
            image = self.transform(image)
        processed = self.processor(image, return_tensors='pt')
        return {
            'pixel_values': processed['pixel_values'].squeeze(0),  # Remove batch dimension
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [13]:
class RandomOcclusion:
    def __init__(self, p=0.5, min_size_ratio=0.05, max_size_ratio=0.50):
        self.p = p
        self.min_size_ratio = min_size_ratio
        self.max_size_ratio = max_size_ratio
        
    def __call__(self, image):
        if random.random() > self.p:
            return image
        
        width, height = image.size
        curr_min_width = int(width * self.min_size_ratio)
        curr_max_width = int(width * self.max_size_ratio)
        
        curr_min_height = int(height * self.min_size_ratio)
        curr_max_height = int(height * self.max_size_ratio)

        occlusion_width = random.randint(curr_min_width, curr_max_width)
        occlusion_height = random.randint(curr_min_height, curr_max_height)

        # Position
        x = random.randint(0, width - occlusion_width)
        y = random.randint(0, height - occlusion_height)

        mask = Image.new('RGB', (occlusion_width, occlusion_height), (0, 0, 0)) # Occlussion color
        image.paste(mask, (x, y))
        return image

In [14]:
transform = transforms.Compose([
    transforms.Resize((processor.size["height"], processor.size["width"])),
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    # RandomOcclusion(p=0.5, min_size_ratio=0.05, max_size_ratio=0.50),
    transforms.RandomRotation(degrees=(-30, 30)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    # transforms.Normalize(mean=processor.image_mean, std=processor.image_std)
])

In [15]:
h_dataset = CustomImageFolderDataset(root=IMG_PATH, processor=processor)

In [16]:
train_ratio = 0.7
val_ratio = 0.10
test_ratio = 0.20

In [17]:
total_size = len(h_dataset)
train_size = int(train_ratio * total_size)
val_size = int(val_ratio * total_size)
test_size = total_size - train_size - val_size

In [18]:
train_ds, val_ds, test_ds = random_split(h_dataset, [train_size, val_size, test_size])

In [19]:
train_ds.transform = transform
val_ds.transform = None
test_ds.transform = None

In [20]:
print(f'Train size: {len(train_ds)} | {train_size}')
print(f'Val size: {len(val_ds)} | {val_size}')
print(f'Test size: {len(test_ds)} | {test_size}')

Train size: 31291 | 31291
Val size: 4470 | 4470
Test size: 8941 | 8941


### Create and train model

In [21]:
def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.tensor([x['labels'] for x in batch])
    }

In [22]:
def compute_acc(prob_preds, labels, k):
    y = np.repeat([labels], repeats=k, axis=0).T
    # Sort and get top k preds
    preds = np.argsort(-np.array(prob_preds), axis=1)[:, :k]
    # Check if any of top k predictions are correct and calculate mean accuracy
    topk_acc = (preds == y).any(axis=1).mean()
    return topk_acc

def compute_map(prob_preds, labels, k):
    # Sort and get top k preds
    preds = np.argsort(-np.array(prob_preds), axis=1)[:, :k]
    # Mat where each row contains the correct label repeated k times
    # labels = np.array(labels)
    # y = np.repeat(labels[:, np.newaxis], k, axis=1)
    y = np.repeat([labels], repeats=k, axis=0).T
    # Get rank (1-indexed later)
    qry = np.where(preds == y)
    ranks_0b = qry[1]
    # Get precision only where the correct label appears in the preds top-k
    precisions = np.zeros(len(labels))
    if len(ranks_0b) > 0:
        # Calculate the precision for each instance where the correct label is found
        precisions[qry[0]] = 1 / (ranks_0b + 1)
    map_at_k = np.mean(precisions)
    return map_at_k


def compute_metrics(p):
    prob_preds = p.predictions
    return {
        'topk_acc_at_5': compute_acc(prob_preds, p.label_ids, k=5),
        'topk_acc_at_10': compute_acc(prob_preds, p.label_ids, k=10),
        'topk_acc_at_100': compute_acc(prob_preds, p.label_ids, k=100),
        'map_at_5': compute_map(prob_preds, p.label_ids, k=5),
        'map_at_50': compute_map(prob_preds, p.label_ids, k=50),
        'map_at_100': compute_map(prob_preds, p.label_ids, k=100),
    }

In [23]:
model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=total_classes,
    id2label=h_dataset.id2label,
    label2id=h_dataset.label2id
)

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
summary(model=model, input_size=(1, 3, 224, 224), col_names=['input_size', 'output_size', 'num_params', 'trainable'])

Layer (type:depth-idx)                                  Input Shape               Output Shape              Param #                   Trainable
ViTForImageClassification                               [1, 3, 224, 224]          [1, 3116]                 --                        True
├─ViTModel: 1-1                                         [1, 3, 224, 224]          [1, 197, 768]             --                        True
│    └─ViTEmbeddings: 2-1                               [1, 3, 224, 224]          [1, 197, 768]             152,064                   True
│    │    └─ViTPatchEmbeddings: 3-1                     [1, 3, 224, 224]          [1, 196, 768]             590,592                   True
│    │    └─Dropout: 3-2                                [1, 197, 768]             [1, 197, 768]             --                        --
│    └─ViTEncoder: 2-2                                  [1, 197, 768]             [1, 197, 768]             --                        True
│    │    └─ModuleList: 

In [25]:
training_args = TrainingArguments(
    output_dir='/kaggle/working/vit-hid',
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    evaluation_strategy='steps',
    num_train_epochs=8,
    fp16=True,
    save_steps=120,
    eval_steps=120,
    logging_steps=10,
    learning_rate=2e-3,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to='wandb',
    run_name='vit-run-kg-fl-04',
    load_best_model_at_end=True,
    metric_for_best_model='map_at_5',
    greater_is_better=True,
)

In [26]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=processor,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [27]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("Train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

[34m[1mwandb[0m: Currently logged in as: [33mart-av-ch-1[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.16.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.5
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240416_185840-cbhyzeym[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mvit-run-kg-fl-04[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/art-av-ch-1/hotel-id-vit-v1[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/art-av-ch-1/hotel-id-vit-v1/runs/cbhyzeym/workspace[0m


Step,Training Loss,Validation Loss,Topk Acc At 5,Topk Acc At 10,Topk Acc At 100,Map At 5,Map At 50,Map At 100
120,7.5466,7.579098,0.086353,0.123714,0.24094,0.049713,0.058627,0.059209
240,7.4778,7.509267,0.085906,0.125503,0.246532,0.049623,0.058417,0.059162
360,7.4266,7.532063,0.087248,0.128859,0.244519,0.049575,0.058678,0.059292
480,7.4891,7.474282,0.087248,0.12774,0.252125,0.050712,0.059706,0.060395
600,7.4353,7.453968,0.084564,0.131767,0.258613,0.049131,0.058916,0.059621
720,7.3286,7.405352,0.093736,0.134676,0.26085,0.052468,0.061627,0.062354
840,7.2518,7.382185,0.099553,0.139597,0.27047,0.055567,0.064282,0.065081
960,7.2117,7.363597,0.104474,0.137584,0.269128,0.063826,0.071604,0.072422
1080,7.1692,7.330328,0.106711,0.142729,0.277405,0.06204,0.07033,0.071127
1200,7.1094,7.251625,0.114541,0.144966,0.289038,0.06915,0.077166,0.077984


[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/vit-hid/checkpoint-120)... Done. 3.0s
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/vit-hid/checkpoint-240)... Done. 2.9s
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/vit-hid/checkpoint-360)... Done. 3.2s
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/vit-hid/checkpoint-480)... Done. 3.1s
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/vit-hid/checkpoint-600)... Done. 3.0s
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/vit-hid/checkpoint-720)... Done. 2.8s
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/vit-hid/checkpoint-840)... Done. 2.6s
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/vit-hid/checkpoint-960)... Done. 3.2s
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/vit-hid/checkpoint-1080)... Done. 3.1s
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/worki

***** Train metrics *****
  epoch                    =           8.0
  total_flos               = 18570405054GF
  train_loss               =        7.1339
  train_runtime            =    7:09:00.70
  train_samples_per_second =         9.725
  train_steps_per_second   =         0.076


In [28]:
metrics = trainer.evaluate(test_ds)
trainer.log_metrics("Eval", metrics)
trainer.save_metrics("eval", metrics)

***** Eval metrics *****
  epoch                   =        8.0
  eval_loss               =     6.9796
  eval_map_at_100         =     0.1074
  eval_map_at_5           =     0.0985
  eval_map_at_50          =     0.1064
  eval_runtime            = 0:12:08.46
  eval_samples_per_second =     12.274
  eval_steps_per_second   =      0.096
  eval_topk_acc_at_10     =      0.168
  eval_topk_acc_at_100    =     0.3367
  eval_topk_acc_at_5      =     0.1416


In [29]:
wandb.finish()

[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:               eval/loss █▇▇▇▇▆▆▅▅▄▄▃▃▂▂▂▁
[34m[1mwandb[0m:         eval/map_at_100 ▁▁▁▁▁▁▂▃▃▄▄▅▆▇███
[34m[1mwandb[0m:           eval/map_at_5 ▁▁▁▁▁▁▂▃▃▄▄▅▆▇███
[34m[1mwandb[0m:          eval/map_at_50 ▁▁▁▁▁▁▂▃▃▄▄▅▆▇███
[34m[1mwandb[0m:            eval/runtime ▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁█
[34m[1mwandb[0m: eval/samples_per_second ▃█▇▄▆▆█▇█▇▇███▇▇▁
[34m[1mwandb[0m:   eval/steps_per_second ▃█▇▄▆▆█▇██▇███▇▇▁
[34m[1mwandb[0m:     eval/topk_acc_at_10 ▁▁▂▂▂▃▄▃▄▄▅▆▆▇███
[34m[1mwandb[0m:    eval/topk_acc_at_100 ▁▁▁▂▂▂▃▃▄▅▅▆▆▇▇██
[34m[1mwandb[0m:      eval/topk_acc_at_5 ▁▁▁▁▁▂▃▃▄▅▅▆▆▇███
[34m[1mwandb[0m:             train/epoch ▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
[34m[1mwandb[0m:       train/global_step ▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
[34m[1mwandb[0m:         train/grad_norm ▄▃▁▃▂▁▁▂▁▁▂▁

In [30]:
#