In [1]:
import numpy as np
import torch
import torch.nn as nn
import math
import fastai
from PIL import Image
from diffusers import DiffusionPipeline
from diffusers.utils import pt_to_pil
from dataloader import get_imagenette_dataloader
from quantize import quantize_img, plot_imgs
from ddpm import DDPMCB
from preprocessing import clip_preprocess, conditioning_transform
from functools import partial
from fastai.vision.all import (ImageDataLoaders, Resize, TensorImage, Learner, 
                               Callback, Normalize)
from encoder import ViTImageEncoder
import fastcore.all as fc

device = "cuda"

In [2]:
def method_helper(o): return list(filter(lambda x: x[0] != "_", dir(o)))

In [3]:
# stage g
stage_2 = DiffusionPipeline.from_pretrained(
    "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", 
     torch_dtype=torch.float16, class_labels=None 
)
# stage_2.enable_model_cpu_offload()


A mixture of fp16 and non-fp16 filenames will be loaded.
Loaded fp16 filenames:
[text_encoder/model.fp16-00002-of-00002.safetensors, safety_checker/model.fp16.safetensors, text_encoder/model.fp16-00001-of-00002.safetensors, unet/diffusion_pytorch_model.fp16.safetensors]
Loaded non-fp16 filenames:
[watermarker/diffusion_pytorch_model.safetensors
If this behavior is not expected, please check your folder structure.
Keyword arguments {'class_labels': None} are not expected by IFSuperResolutionPipeline and will be ignored.


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
scheduler = stage_2.scheduler
unet = stage_2.unet.to(device)

In [5]:
dls = ImageDataLoaders.from_folder( "/mnt/wd/datasets/imagenette2", valid_pct=0.1, bs=1,)
one_batch = dls.one_batch()[0]
one_batch.shape

torch.Size([1, 3, 375, 500])

In [6]:
encoder = ViTImageEncoder(7, output_dim=unet.config.encoder_hid_dim).to(device)
encoder_preprocess = encoder.feature_extractor
c_preprocess = partial(clip_preprocess, stage_2=stage_2)

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
img = c_preprocess(one_batch[0].cpu().numpy())

It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


In [8]:
cond_transform = partial(conditioning_transform, encode_preprocess=None)
# cond_transform(img).shape

In [9]:
def preprocessing(x):
    if not isinstance(x, fastai.vision.core.TensorCategory):
        # x = TensorImage(x).permute(2,1,0).numpy()
        x = Resize(224)(x)
        x = TensorImage(x).permute(2,1,0)
    x = cond_transform(x)
    x = x.to("cpu")
    # x = c_preprocess(x)
    return x

# preprocessing(one_batch[0].cpu().numpy()).shape

In [10]:
# dls = ImageDataLoaders.from_folder(
#     "/mnt/wd/datasets/imagenette2",
#     valid_pct=0.1,
#     item_tfms=[preprocessing],
# #     batch_tfms=[Normalize()],
#     bs=3000,
#     num_workers=16
# )
# xb, _ = dls.one_batch()
# mean = xb.mean(dim=[0,2,3])  # Compute per-channel mean
# std = xb.std(dim=[0,2,3])    # Compute per-channel std
# print("Auto-calculated Mean:", mean)
# print("Auto-calculated Std:", std)

In [11]:
mean = torch.tensor([ 4.6301e-01,  4.5852e-01,  4.3105e-01,  1.8062e-03,  1.7940e-03,
                    1.6879e-03,  6.3991e-04, -1.0427e-05, -8.0591e-08,  3.9216e-03])
std = torch.tensor([0.2826, 0.2781, 0.3003, 0.0011, 0.0011, 0.0012, 0.0007, 0.0015, 0.0014, 0.0000])

In [12]:
dls = ImageDataLoaders.from_folder(
    "/mnt/wd/datasets/imagenette2",
    valid_pct=0.1,
    item_tfms=[preprocessing],
    # batch_tfms=[Normalize.from_stats(mean, std)],
    batch_tfms=[Normalize()],
    bs=4,
    num_workers=16
)

In [13]:
dls.one_batch()[0].shape
dls.one_batch()[0][0,2,...].std()

TensorImage(0.7753, device='cuda:0')

In [14]:
class CTModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.unet = unet
        self.unet.class_embedding = None
        self.vit = ViTImageEncoder(7, output_dim=self.unet.config.encoder_hid_dim).to(device)

        for param in self.unet.parameters():
            param.requires_grad = False
        

    def forward(self, noisy_images, images, t):
        encoded = self.vit(images).expand(-1, 77, -1).half()

        return self.unet(noisy_images.half(), t.half(), encoded.half())[0]

In [15]:
model = CTModel()

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
one_batch = dls.one_batch()
one_batch[0].shape
images = one_batch[0]
# images = torch.cat([images, images], dim=1)

In [17]:
# Without DDPM callback it won't work
# with torch.no_grad():
#     x = model(images, one_batch[0], torch.tensor([1.0]*4, dtype=torch.float16, device="cuda"))
# x

In [42]:
learn = Learner(dls, model, loss_func=torch.nn.MSELoss(), cbs=[DDPMCB(unet,scheduler)]).to_fp16()

In [43]:
from fastai.callback.hook import ActivationStats

# Create a list of layers to track. You can add or remove layers based on what you want to observe.
layers_to_track = [
    learn.model.vit.vit.embeddings.patch_embeddings.projection,
    learn.model.vit.vit.encoder.layer[0].attention.attention.query,
    learn.model.vit.vit.encoder.layer[0].attention.attention.key,
    learn.model.vit.vit.encoder.layer[0].attention.attention.value,
    learn.model.vit.vit.encoder.layer[0].intermediate.dense,
    learn.model.vit.vit.encoder.layer[0].output.dense,
    learn.model.vit.vit.encoder.layer[0].layernorm_before,
    learn.model.vit.vit.encoder.layer[0].layernorm_after,
    learn.model.vit.vit.encoder.layer[6].attention.attention.query,
    learn.model.vit.vit.encoder.layer[6].attention.attention.key,
    learn.model.vit.vit.encoder.layer[6].attention.attention.value,
    learn.model.vit.vit.encoder.layer[6].intermediate.dense,
    learn.model.vit.vit.encoder.layer[6].output.dense,
    learn.model.vit.vit.encoder.layer[6].layernorm_before,
    learn.model.vit.vit.encoder.layer[6].layernorm_after,
    learn.model.vit.vit.layernorm,
    learn.model.vit.vit.pooler.dense,
]

# Add the ActivationStats callback
astats = ActivationStats(modules=layers_to_track)
learn.add_cb(astats)

<fastai.learner.Learner at 0x7e9f6004ea20>

In [37]:
# learn.lr_find()

ViTEncoder(
  (layer): ModuleList(
    (0-11): 12 x ViTLayer(
      (attention): ViTSdpaAttention(
        (attention): ViTSdpaSelfAttention(
          (query): Linear(in_features=768, out_features=768, bias=True)
          (key): Linear(in_features=768, out_features=768, bias=True)
          (value): Linear(in_features=768, out_features=768, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (output): ViTSelfOutput(
          (dense): Linear(in_features=768, out_features=768, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
      (intermediate): ViTIntermediate(
        (dense): Linear(in_features=768, out_features=3072, bias=True)
        (intermediate_act_fn): GELUActivation()
      )
      (output): ViTOutput(
        (dense): Linear(in_features=3072, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (l

In [44]:
lr = 10e-05
learn.fit_one_cycle(1, lr)
learn.save("ctransfer_epoch_1.pth")
# learn = learn.load("ctransfer_epoch_1.pth")

epoch,train_loss,valid_loss,time


KeyboardInterrupt: 

AttributeError: Exception occured in `ActivationStats` when calling event `before_fit`:
	'str' object has no attribute 'register_forward_hook'

In [24]:
learn2 = learn.add_cb(astats)
learn2.save("ctransfer_epoch_2.pth")

AttributeError: 'CTModel' object has no attribute 'keys'

[('unet',
  UNet2DConditionModel(
    (conv_in): Conv2d(6, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (time_proj): Timesteps()
    (time_embedding): TimestepEmbedding(
      (linear_1): Linear(in_features=160, out_features=1280, bias=True)
      (act): GELU(approximate='none')
      (linear_2): Linear(in_features=1280, out_features=1280, bias=True)
    )
    (encoder_hid_proj): Linear(in_features=4096, out_features=1280, bias=True)
    (class_embedding): None
    (add_embedding): TextTimeEmbedding(
      (norm1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
      (pool): AttentionPooling(
        (k_proj): Linear(in_features=4096, out_features=4096, bias=True)
        (q_proj): Linear(in_features=4096, out_features=4096, bias=True)
        (v_proj): Linear(in_features=4096, out_features=4096, bias=True)
      )
      (proj): Linear(in_features=4096, out_features=1280, bias=True)
      (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    )
    (

In [None]:
astats.color_dim()