In [None]:
import torch
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import transformers, datasets, evaluate, datasets, huggingface_hub
from torchvision.transforms import ColorJitter
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
from huggingface_hub import notebook_login
import accelerate
from accelerate import Accelerator
# Allow TF32 for CUDA matrix multiplications.
torch.backends.cuda.matmul.allow_tf32 = True
# Authenticate with the Hugging Face Hub; presumably required for the later
# `trainer.push_to_hub()` call.
notebook_login()


In [None]:
# Load the dataset from the Hugging Face Hub.
ds = load_dataset("aashraychegu/glacier_scopes")
# Hold out 10% of the "train" split for evaluation. The seed is fixed so that
# "Restart & Run All" always produces the same train/test partition; without
# it, evaluation numbers are not comparable between runs.
ds = ds["train"].train_test_split(test_size=0.1, seed=42)
train_ds = ds["train"]
test_ds = ds["test"]

# Segmentation classes, listed in class-index order (0, 1, 2).
id2label = dict(enumerate(("sky", "surface", "bed")))
# Reverse lookup: class name -> integer class index.
label2id = {name: index for index, name in id2label.items()}
num_labels = len(id2label)

In [None]:
from transformers import SegformerImageProcessor, SegformerModel, SegformerConfig, AutoImageProcessor, SegformerForSemanticSegmentation

checkpoint = "nvidia/segformer-b0-finetuned-ade-512-512"

# Reference model: the checkpoint exactly as published (original label set).
image_processor = SegformerImageProcessor.from_pretrained(checkpoint)
model = SegformerForSemanticSegmentation.from_pretrained(checkpoint).to("cuda:0")

# Model to fine-tune: start from the checkpoint's configuration and swap in
# this notebook's label maps (the config derives `num_labels` from `id2label`).
# `num_channels` is left at the checkpoint's value because the dataset
# transforms convert every image to RGB before it reaches the model.
test_config = SegformerConfig.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)
test_image_processor = SegformerImageProcessor.from_pretrained(checkpoint)
# `from_pretrained` is a classmethod that RETURNS a new model, so its result
# must be assigned; calling it on an existing instance and discarding the
# return value leaves that instance randomly initialised and on the CPU.
# `ignore_mismatched_sizes=True` lets the classification head be re-created
# for the new number of classes while the remaining pretrained weights load.
test_model = SegformerForSemanticSegmentation.from_pretrained(
    checkpoint,
    config=test_config,
    ignore_mismatched_sizes=True,
).to("cuda:0")

In [None]:
# Prefer the first CUDA device, but fall back to the CPU when CUDA is not
# available so that cells using `device` do not fail on CPU-only machines.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
# Mild random colour augmentation, applied to training images only.
jitter = ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1)


def train_transforms(example_batch):
    """Augment and preprocess a batch of training examples.

    Each entry of ``example_batch["image"]`` is converted to RGB and passed
    through ``jitter``; the augmented images and the entries of
    ``example_batch["label"]`` are then handed to ``test_image_processor``.

    Returns:
        Whatever ``test_image_processor`` returns for the batch.
    """
    augmented = []
    for img in example_batch["image"]:
        augmented.append(jitter(img.convert("RGB")))
    masks = list(example_batch["label"])
    return test_image_processor(augmented, masks)


def val_transforms(example_batch):
    """Preprocess a batch of evaluation examples without augmentation.

    Each entry of ``example_batch["image"]`` is converted to RGB; the images
    and the entries of ``example_batch["label"]`` are then handed to
    ``test_image_processor``.

    Returns:
        Whatever ``test_image_processor`` returns for the batch.
    """
    rgb_images = []
    for img in example_batch["image"]:
        rgb_images.append(img.convert("RGB"))
    masks = list(example_batch["label"])
    return test_image_processor(rgb_images, masks)


# Register the transforms on the two splits (this modifies the dataset objects
# in place): augmentation for training, plain preprocessing for evaluation.
train_ds.set_transform(train_transforms)
test_ds.set_transform(val_transforms)

In [None]:
# Mean intersection-over-union metric from the `evaluate` library.
metric = evaluate.load("mean_iou")


def compute_metrics(eval_pred):
    """Compute mean-IoU metrics for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is a NumPy array
            (presumably ``(batch, num_labels, h, w)`` -- it is upsampled over
            its last two axes and arg-maxed over axis 1); ``labels`` supplies
            the target spatial size through its last two dimensions.

    Returns:
        The dictionary produced by ``metric.compute``, with every NumPy array
        value converted to a plain Python list.
    """
    logits, labels = eval_pred
    with torch.no_grad():
        # The logits are upsampled to the label resolution before taking the
        # per-pixel arg-max over the class axis.
        upsampled = nn.functional.interpolate(
            torch.from_numpy(logits),
            size=labels.shape[-2:],
            mode="bilinear",
            align_corners=False,
        )
        pred_labels = upsampled.argmax(dim=1).detach().cpu().numpy()

        scores = metric.compute(
            predictions=pred_labels,
            references=labels,
            num_labels=num_labels,
            reduce_labels=False,
        )

    # Replace array-valued entries with lists; all other entries pass through.
    return {
        name: (score.tolist() if type(score) is np.ndarray else score)
        for name, score in scores.items()
    }

In [None]:
training_args = TrainingArguments(
    output_dir="segformer-test-1",
    # Optimisation schedule.
    num_train_epochs=50,
    learning_rate=6e-5,
    # Batching.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    auto_find_batch_size=True,
    # Numerical precision.
    fp16=True,
    tf32=True,
    # Evaluation, checkpointing and logging cadence (in optimisation steps).
    evaluation_strategy="steps",
    eval_steps=400,
    eval_accumulation_steps=5,
    save_strategy="steps",
    save_steps=400,
    save_total_limit=3,
    logging_steps=30,
    # Keep every dataset column so the on-the-fly transforms still receive
    # the "image" and "label" fields they read.
    remove_unused_columns=False,
)

trainer = Trainer(
    model=test_model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)

trainer.train()

Once training is completed, share your model to the Hub with the [push_to_hub()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.push_to_hub) method so everyone can use your model:

In [None]:
# Share the trained model on the Hugging Face Hub.
trainer.push_to_hub()

## Inference

Great, now that you've finetuned a model, you can use it for inference!

Load an image for inference:

In [None]:
# First training example after the dataset transform has been applied.
image = ds["train"][0]["pixel_values"]
# Move the leading axis last (presumably channels-first -> channels-last,
# which is what `imshow` expects -- TODO confirm) before plotting.
image_hwc = torch.tensor(image).permute(1, 2, 0)
plt.imshow(image_hwc.cpu().numpy())

In [None]:
from transformers import pipeline

segmenter = pipeline("image-segmentation", model=model, image_processor=image_processor)
# The pipeline expects an image, not the already-transformed dataset row
# (a dict of `pixel_values` / `labels`). `with_format(None)` returns a copy of
# the split without the registered transform, so the raw image can be read
# without disturbing `ds["train"]` itself.
# NOTE(review): `model` is the checkpoint as downloaded, not the model that
# was just trained -- confirm which one is meant to be demonstrated here.
raw_image = ds["train"].with_format(None)[0]["image"].convert("RGB")
segmenter(raw_image)

You can also manually replicate the results of the `pipeline` if you'd like. Process the image with an image processor and place the `pixel_values` on a GPU if one is available (otherwise on the CPU):

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available, otherwise use a CPU
# `image` holds `pixel_values` that the dataset transform has already
# processed, so feeding it to the image processor again would preprocess it
# twice. Start from the raw image instead; `with_format(None)` returns a copy
# of the split without the registered transform.
raw_image = ds["train"].with_format(None)[0]["image"].convert("RGB")
encoding = image_processor(raw_image, return_tensors="pt")
pixel_values = encoding.pixel_values.to(device)