In [12]:
import math
from functools import partial

import fastai
import numpy as np
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import pt_to_pil
from fastai.vision.all import ImageDataLoaders, Learner
from PIL import Image
from torch import nn

from dataloader import get_imagenette_dataloader
from ddpm import DDPMCB
from encoder import ViTImageEncoder
from preprocessing import clip_preprocess, conditioning_transform
from quantize import quantize_img, plot_imgs

# Device that the UNet and the ViT encoder are moved to via `.to(device)` below.
device = "cuda"

In [3]:
def method_helper(o):
    """Return the names listed by `dir(o)` that do not start with an underscore."""
    return [name for name in dir(o) if name[0] != "_"]

In [None]:
# NOTE(review): `unet` is only assigned in a later cell (`unet = stage_2.unet.to(device)`),
# so on a fresh kernel this cell raises NameError unless it is run after that cell.
# Standalone ViT encoder whose output width matches the UNet's `encoder_hid_dim`.
# The leading 7 is presumably the input channel count (batches from `dls` are
# [4, 7, 224, 224]) — TODO confirm against ViTImageEncoder's signature.
encoder = ViTImageEncoder(7, output_dim=unet.config.encoder_hid_dim).to(device)
# Keep a handle on the encoder's feature extractor.
encoder_preprocess = encoder.feature_extractor

In [4]:
# Stage 2 of DeepFloyd IF (resolved by diffusers to IFSuperResolutionPipeline),
# loaded from the fp16 weight variant in half precision.
# `text_encoder=None` asks diffusers not to load a text encoder for this pipeline.
# `class_labels` is not a pipeline argument (diffusers logs that it is ignored),
# so it is no longer passed here.
stage_2 = DiffusionPipeline.from_pretrained(
    "DeepFloyd/IF-II-L-v1.0",
    text_encoder=None,
    variant="fp16",
    torch_dtype=torch.float16,
)
# Optional, currently disabled:
# stage_2.enable_model_cpu_offload()


A mixture of fp16 and non-fp16 filenames will be loaded.
Loaded fp16 filenames:
[text_encoder/model.fp16-00002-of-00002.safetensors, text_encoder/model.fp16-00001-of-00002.safetensors, safety_checker/model.fp16.safetensors, unet/diffusion_pytorch_model.fp16.safetensors]
Loaded non-fp16 filenames:
[watermarker/diffusion_pytorch_model.safetensors
If this behavior is not expected, please check your folder structure.
Keyword arguments {'class_labels': None} are not expected by IFSuperResolutionPipeline and will be ignored.


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
# The denoising UNet of the stage-2 pipeline, moved onto `device`.
unet = stage_2.unet.to(device)

# The scheduler that belongs to the same pipeline.
scheduler = stage_2.scheduler

In [6]:
# Bind the loaded pipeline into the preprocessing function so it can be passed
# in `item_tfms` below.
# NOTE(review): this rebinds the name imported from `preprocessing`, so after this
# cell `clip_preprocess` refers to the partial, not the original function.
clip_preprocess = partial(clip_preprocess, stage_2=stage_2)

In [8]:
# Train/validation DataLoaders built from the Imagenette folder tree.
dls = ImageDataLoaders.from_folder(
    "/mnt/wd/datasets/imagenette2",
    item_tfms=[clip_preprocess, conditioning_transform],  # applied to each item, in this order
    valid_pct=0.1,  # fraction of items held out for validation
    bs=4,
    num_workers=16,
)

In [9]:
# Sanity check: shape of the first element of one batch.
dls.one_batch()[0].shape

torch.Size([4, 7, 224, 224])

In [None]:
# dls.one_batch()

In [None]:
# dls = ImageDataLoaders.from_folder("/mnt/wd/datasets/imagenette2", valid_pct=0.1, bs=4, item_tfms=Resize(224))

In [None]:
# def preprocess(frame)
#     return clip_processor(frame)["pixel_values"][0]

In [None]:
# clip_processor(dls.one_batch()[0], rescale=False)["pixel_values"][0].shape

In [None]:
# Keep only the first element of one batch.
# NOTE(review): a later cell rebinds `one_batch` to the whole batch
# (`one_batch = dls.one_batch()`), so the meaning of this name depends on cell order.
one_batch = dls.one_batch()[0]

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTImageProcessor {
  "do_convert_rgb": null,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "ViTImageProcessor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

In [None]:
# Smoke test: run the standalone encoder on the first element of a fresh batch.
encoder(dls.one_batch()[0])

In [None]:
class CTModel(nn.Module):
    """Stage-2 UNet with frozen weights, conditioned on the output of a ViT image encoder.

    The module wraps the notebook-level `unet` (not a copy) and creates its own
    `ViTImageEncoder`. Only the UNet's parameters are frozen here; the encoder's
    parameters are left as `ViTImageEncoder` created them.
    """

    def __init__(self, context_len=77):
        """Build the wrapper.

        Args:
            context_len: Size the encoder output is expanded to along dim 1 before
                it is handed to the UNet. Defaults to 77, the value that was
                previously hard-coded in `forward`.
        """
        super().__init__()
        self.context_len = context_len

        # This is the shared notebook-level UNet, so dropping its class embedding
        # and freezing it below is also visible through the global `unet`.
        self.unet = unet
        self.unet.class_embedding = None
        self.vit = ViTImageEncoder(7, output_dim=self.unet.config.encoder_hid_dim).to(device)

        # Freeze every UNet parameter.
        self.unet.requires_grad_(False)

    def forward(self, noisy_images, images, t):
        """Run the UNet on `noisy_images`, conditioned on the encoding of `images`.

        Args:
            noisy_images: Tensor passed to the UNet as its first argument (cast to fp16).
            images: Tensor passed to the ViT encoder.
            t: Timestep tensor passed to the UNet as its second argument (cast to fp16).

        Returns:
            Element 0 of the UNet's output.
        """
        # `expand` only broadcasts size-1 dims, so this assumes the encoder output
        # is 3-D with size 1 along dim 1 — TODO confirm against ViTImageEncoder.
        encoded = self.vit(images).expand(-1, self.context_len, -1).half()

        return self.unet(noisy_images.half(), t.half(), encoded)[0]

In [None]:
# Instantiate the wrapper; it picks up the notebook-level `unet` and `device`.
model = CTModel()

In [None]:
# Draw one batch; element 0 is the tensor used as model input below.
one_batch = dls.one_batch()
# Duplicate that tensor along dim 1 to build the tensor passed to the model
# as `noisy_images` in the smoke test.
images = torch.cat([one_batch[0], one_batch[0]], dim=1)

In [None]:
# Smoke-test forward pass; no gradients are needed.
with torch.no_grad():
    x = model(
        images,
        one_batch[0],
        # One timestep value (1.0) per sample, sized from the batch rather than a
        # literal 4 and placed on the notebook's `device`.
        torch.full((images.shape[0],), 1.0, dtype=torch.float16, device=device),
    )

In [None]:
# Size of element 0 of the smoke-test output.
x[0].size()

In [None]:
# Wrap the data, the model and the DDPM callback in a fastai Learner and switch
# it to mixed-precision training.
learn = Learner(
    dls,
    model,
    loss_func=torch.nn.MSELoss(),
    cbs=[DDPMCB(unet, scheduler)],
).to_fp16()

# Run the learning-rate finder.
learn.lr_find()

In [None]:
# Maximum learning rate for the one-cycle schedule.
lr = 1e-4
# Train for one epoch.
learn.fit_one_cycle(1, lr)

In [None]:
# NOTE(review): this cell is the indented tail of a function whose `def` line is not
# in the notebook (nor is whatever assigned `lr_max_suggested`, presumably a
# "Step 1" learning-rate finder call). As written it cannot run on its own:
# the code is indented and `return` appears at cell top level.
# If lr_max is not provided, use the suggested learning rate from the finder
    lr_max = lr_max or lr_max_suggested
    print(f"Using learning rate: {lr_max:.2e}")

    # 🚀 Step 2: Train the model with OneCycle policy
    learn.fit_one_cycle(epochs, lr_max)

    return learn  # Return trained Learner

In [None]:
# NOTE(review): scratch cell that overwrites its own inputs; it depends on `x` and
# `one_batch` left over from earlier cells.
# Broadcast `x` to size 77 along dim 1 (requires that dim to be 1 or already 77).
x = x.expand(-1, 77, -1)
# Doubles `one_batch` along dim 1 on every run. `torch.cat` needs tensors, so this
# presumably expects the tensor from `one_batch = dls.one_batch()[0]`, not the whole
# batch from `one_batch = dls.one_batch()` — confirm which one was intended.
one_batch = torch.cat([one_batch, one_batch], dim=1)