In [1]:
import os

# Remember the directory the kernel started in. A bare os.chdir("../") is
# relative to the *current* directory, so every re-run of this cell would
# climb one more level; anchoring on NOTEBOOK_DIR makes the cell idempotent.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()

# Change the current directory to root directory
# (the parent of the directory the kernel started in).
new_directory = "../"
os.chdir(os.path.join(NOTEBOOK_DIR, new_directory))

# Verify the current directory has changed
updated_directory = os.getcwd()
print("Updated Directory:", updated_directory)

Updated Directory: /media/toma/2TB_30May2023/OBJECT_DETECTION_L/object_detection


In [7]:
### MAKE JSONS

from utils.imagenet_1k_classes  import IMAGENET2012_CLASSES
import json

# One display name per class, "<key> - <value>", in the mapping's iteration order.
class_names = [f"{key} - {value}" for key, value in IMAGENET2012_CLASSES.items()]

# Both directions use the positional index rendered as a string.
id2label = {}
label2id = {}
for index, name in enumerate(class_names):
    id2label[str(index)] = name
    label2id[name] = str(index)

# Persist each mapping to its own JSON file.
for mapping, path in (
    (id2label, "configs/datasets/imagenet-1k-id2label.json"),
    (label2id, "configs/datasets/imagenet-1k-label2id.json"),
):
    with open(path, "w") as json_file:
        json.dump(mapping, json_file)


### Pre-loaded data creation

In [None]:
from datasets import load_dataset

# Directory handed to the dataset script as `data_dir`.
IMAGENET_DIR = "data/imagenet_1k"

# Load through the repository's dataset script, requesting the validation
# split via the `splits` keyword and caching under ".cache".
imagenet_dataset = load_dataset(
    "utils/imagenet_1k_dataset_script.py",
    data_dir=IMAGENET_DIR,
    splits=["validation"],
    cache_dir=".cache",
)


In [None]:
# Spot-check a single record of the validation split loaded by the dataset script.
imagenet_dataset["validation"][3456]

In [None]:
# Create the folder that receives the Parquet export. `exist_ok=True` keeps the
# cell safe to re-run and avoids the check-then-create race of os.path.exists().
os.makedirs("preloaded_data_imagenet", exist_ok=True)

In [None]:
# Write the validation split to a Parquet file; the "Load pre-loaded data"
# cell reads this same path back with load_dataset("parquet", ...).
imagenet_dataset["validation"].to_parquet("preloaded_data_imagenet/validation.parquet")

#### Load pre-loaded data

In [4]:
from datasets import load_dataset

# Rebuild `imagenet_dataset` from the Parquet export (validation split only),
# using a separate cache directory from the script-based load.
imagenet_dataset = load_dataset(
    "parquet",
    data_files={"validation": "preloaded_data_imagenet/validation.parquet"},
    cache_dir=".cache2",
)

Found cached dataset parquet (/media/toma/2TB_30May2023/OBJECT_DETECTION_L/object_detection/.cache2/parquet/default-cf7323d42978dad3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 1/1 [00:00<00:00, 26.21it/s]


In [3]:
# Spot-check one record of the Parquet-backed split; the record holds an
# 'image' (PIL image) and an integer 'label'.
imagenet_dataset["validation"][3456]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=373x560>,
 'label': 72}

In [None]:
# Display the image of the same sample record.
imagenet_dataset["validation"][3456]["image"]

In [None]:
from transformers import AutoImageProcessor

# Checkpoint whose image processor is loaded; its image_mean / image_std / size
# settings are read when the torchvision transform pipeline is built.
checkpoint = "microsoft/focalnet-tiny"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

In [None]:
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor

# Normalise with the statistics stored on the image processor.
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)

# Crop size: the "shortest_edge" value when the processor defines one,
# otherwise an explicit (height, width) pair.
if "shortest_edge" in image_processor.size:
    size = image_processor.size["shortest_edge"]
else:
    size = (image_processor.size["height"], image_processor.size["width"])

# Per-image pipeline: random resized crop -> tensor -> normalisation.
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])


def transforms(examples):
    """Replace a batch's ``image`` column with ``pixel_values``.

    Each entry of ``examples["image"]`` is converted to RGB and passed through
    ``_transforms``; the results are stored under ``"pixel_values"`` and the
    ``"image"`` key is removed. The batch dict is modified in place and returned.
    """
    pixel_values = []
    for img in examples["image"]:
        pixel_values.append(_transforms(img.convert("RGB")))
    examples["pixel_values"] = pixel_values
    del examples["image"]
    return examples

# Register `transforms` on the dataset via with_transform; the original
# `imagenet_dataset` object is left as it is.
imagenet_dataset_transformed = imagenet_dataset.with_transform(transforms)

#### Augmentation

    # | Color Jitter Factor           | 0.4      | d
    # | Auto-augmentation             | rand-m9-mstd0.5-inc1 | d
    # | Random Erasing Probability    | 0.25     | d
    # | Random Erasing Mode           | Pixel    | d
    # | Mixup α                       | 0.8      |
    # | Cutmix α                      | 0.8      |
    # | Mixup Probability             | 1.0      |
    # | Mixup Switch Probability      | 0.5      |

In [None]:
import torchvision.transforms as transforms

transform = transforms.Compose([
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.4),
    transforms.RandomApply([
        transforms.AutoAugment(policy='rand-m9-mstd0.5-inc1')
    ], p=1.0),
    transforms.RandomErasing(p=0.25, scale=(0.02, 0.25), ratio=(0.3, 3.3), value='random', inplace=False),
    transforms.RandomApply([
        transforms.RandomErasing(p=1.0, scale=(0.02, 0.25), ratio=(0.3, 3.3), value='pixel', inplace=False)
    ], p=1.0),
    transforms.ToTensor(),
    transforms.RandomApply([
        transforms.RandomChoice([
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip()
        ])
    ], p=0.5),
    transforms.RandomApply([
        transforms.RandomChoice([
            transforms.RandomRotation(90),
            transforms.RandomRotation(180),
            transforms.RandomRotation(270)
        ])
    ], p=0.5),
    transforms.RandomApply([
        transforms.RandomResizedCrop(size=224, scale=(0.08, 1.0), ratio=(0.75, 1.333))
    ], p=0.5),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    transforms.RandomApply([
        transforms.RandomChoice([
            transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.4),
            transforms.Grayscale(num_output_channels=3)
        ])
    ], p=0.2),
    transforms.RandomApply([
        transforms.RandomChoice([
            transforms.GaussianBlur(kernel_size=3, sigma=(0.1, 2.0)),
            transforms.RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.8, 1.2), shear=10, resample=False, fillcolor=0)
        ])
    ], p=0.2),
    transforms.RandomApply([
        transforms.RandomPerspective(distortion_scale=0.5, p=1.0, interpolation=3)
    ], p=0.5),
    transforms.RandomApply([
        transforms.RandomErasing(p=1.0, scale=(0.02, 0.25), ratio=(0.3, 3.3), value='random', inplace=False)
    ], p=0.5),
    transforms.RandomApply([
        transforms.RandomErasing(p=1.0, scale=(0.02, 0.25), ratio=(0.3, 3.3), value='pixel', inplace=False)
    ], p=0.5),
    transforms.RandomApply([
        transforms.RandomChoice([
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip()
        ])
    ], p=0.5),
    transforms.RandomApply([
        transforms.RandomChoice([
            transforms.RandomRotation(90),
            transforms.RandomRotation(180),
            transforms.RandomRotation(270)
        ])
    ], p=0.5),
    transforms.RandomApply([
        transforms.RandomResizedCrop(size=224, scale=(0.08, 1.0), ratio=(0.75, 1.333))
    ], p=0.5),
    transforms.Normalize(mean=[0.485, 0.456, 0.406


In [None]:

    # | Mixup α                       | 0.8      |
    # | Cutmix α                      | 0.8      |
    # | Mixup Probability             | 1.0      |
    # | Mixup Switch Probability      | 0.5      |
# | Stochastic Drop Path Rate     | 0.2/0.3/0.5 |
# | Label Smoothing               | 0.1      |


# Imported under an alias so the module does not rebind the name of the
# `transforms(examples)` batch function defined earlier in the notebook.
import torchvision.transforms as T

# Color Jitter
color_jitter_factor = 0.4
color_jitter_transform = T.ColorJitter(
    brightness=color_jitter_factor,
    contrast=color_jitter_factor,
    saturation=color_jitter_factor,
    hue=color_jitter_factor,
)

# Auto-augmentation
# NOTE(review): the settings table asks for "rand-m9-mstd0.5-inc1"; this uses
# torchvision's ImageNet AutoAugment policy instead — confirm this is intended.
auto_augmentation_transform = T.AutoAugment(policy=T.AutoAugmentPolicy.IMAGENET)

# Random Erasing (operates on tensors, so it must come after ToTensor below)
random_erasing_probability = 0.25
random_erasing_transform = T.RandomErasing(p=random_erasing_probability, value='random', inplace=False)

# Mixup / Cutmix hyper-parameters.
# These mix pairs of samples *and their labels*, so they cannot be part of a
# per-image Compose; `torchvision.transforms` also has no RandomMixup /
# RandomCutmix classes. The values are kept here so they can be applied at
# batch level (e.g. in the collate step).
mixup_alpha = 0.8
mixup_probability = 1.0
mixup_switch_probability = 0.5
cutmix_alpha = 0.8

# Compose the per-image transforms: PIL-level augmentations first, then the
# tensor conversion, then the tensor-only random erasing.
composed_transforms = T.Compose([
    color_jitter_transform,
    auto_augmentation_transform,
    T.ToTensor(),
    random_erasing_transform,
])

# Apply the transforms to one sample image from the validation split.
transformed_data = composed_transforms(imagenet_dataset["validation"][3456]["image"].convert("RGB"))


In [None]:
### TRYING

In [None]:
# Sanity check: fetch one sample through the transform and show its pixel_values.
imagenet_dataset_transformed["validation"][3456]["pixel_values"]

In [None]:
## DATA COLLATOR

from transformers import DefaultDataCollator

# Collator instance handed to the Trainer through its `data_collator` argument.
data_collator = DefaultDataCollator()

## EVALUATION

import evaluate

# Accuracy metric object; `compute_metrics` calls its `.compute(...)` method.
accuracy = evaluate.load("accuracy")


import numpy as np


def compute_metrics(eval_pred):
    """Score a ``(predictions, labels)`` pair with the ``accuracy`` metric.

    The predictions are reduced to class indices with an argmax over axis 1
    before being compared against the reference labels.

    Returns whatever ``accuracy.compute`` returns.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [3]:
## BUILD MODEL

from transformers import AutoModelForImageClassification, TrainingArguments, Trainer, AutoConfig
import json

# Architecture settings come from the local FocalNet config file. The model is
# built from this config (from_config), not loaded from a trained checkpoint.
config = AutoConfig.from_pretrained('./configs/backbones/focalnet/config.json')

# Read the id2label and label2id mappings written by the "MAKE JSONS" cell.
with open("configs/datasets/imagenet-1k-id2label.json", 'r') as json_file:
    id2label = json.load(json_file)

with open("configs/datasets/imagenet-1k-label2id.json", 'r') as json_file:
    label2id = json.load(json_file)

# JSON gives string keys for id2label, and the label2id file stores the ids as
# strings. The config expects integer class ids (id2label: int -> str,
# label2id: str -> int), so convert before assigning; otherwise looking up a
# predicted class index in config.id2label fails with a KeyError.
config.id2label = {int(class_id): label for class_id, label in id2label.items()}
config.label2id = {label: int(class_id) for label, class_id in label2id.items()}
config.num_labels = len(label2id)

model = AutoModelForImageClassification.from_config(config)

In [8]:
# Number of classes in the label mapping.
len(label2id)

1000

In [4]:
# Display the module tree of the model built from the config.
model

FocalNetForImageClassification(
  (focalnet): FocalNetModel(
    (embeddings): FocalNetEmbeddings(
      (patch_embeddings): FocalNetPatchEmbeddings(
        (projection): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
      )
      (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): FocalNetEncoder(
      (stages): ModuleList(
        (0): FocalNetStage(
          (layers): ModuleList(
            (0): FocalNetLayer(
              (norm1): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
              (modulation): FocalNetModulation(
                (projection_in): Linear(in_features=96, out_features=195, bias=True)
                (projection_context): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1))
                (activation): GELU(approximate='none')
                (projection_out): Linear(in_features=96, out_features=96, bias=True)
                (projection_dropout): Dropout(p=0.0, inplace=F

In [None]:
import os

# Set CUDA_VISIBLE_DEVICES to "-1" for this process (CPU-only run; the training
# arguments in this notebook also pass no_cuda=True).
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [None]:
# Smoke-test configuration: tiny batches, 6 optimisation steps in total, a
# checkpoint every 3 steps, CPU only (no_cuda=True).
training_args = TrainingArguments(
    output_dir="my_awesome_food_model",
    remove_unused_columns=False,  # keep the "image" column that `transforms` reads
    evaluation_strategy="steps",
    save_strategy="steps",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=2,
    save_steps = 3,
    max_steps = 6,
    warmup_ratio=0.1,
    no_cuda=True,
    logging_steps=2,
    # load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # push_to_hub=True,
)

# NOTE(review): the validation split is used for both training and evaluation
# here; it is the only split loaded in this notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=imagenet_dataset_transformed["validation"],
    eval_dataset=imagenet_dataset_transformed["validation"],
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

trainer.train()

# Step checkpoints are written to "checkpoint-<step>" subfolders only. Save the
# final model (and the image processor passed as `tokenizer`) into output_dir
# itself so that pipeline(..., model="my_awesome_food_model") can load it.
trainer.save_model()

In [None]:
#### Inference

from transformers import pipeline

# NOTE(review): the sample image is taken from food101 while the classifier
# was built with ImageNet-1k labels — confirm this is the intended test image.
ds = load_dataset("food101", split="validation[:10]")

# Index the row first, then the column: ds["image"][0] would decode every
# image in the slice just to keep the first one.
image = ds[0]["image"]

# The config saved in "my_awesome_food_model" must already contain
# id2label / label2id for the predictions to carry readable class names.
classifier = pipeline("image-classification", model="my_awesome_food_model")
classifier(image)