In [30]:
import numpy as np
import os
import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler

from transformers import CLIPProcessor, CLIPModel, CLIPTextModel, CLIPVisionModel, CLIPVisionModelWithProjection
from PIL import Image
import cv2



# Check dimensionality of Text encoder in different pipelines

In [16]:
# Stable Diffusion Pipeline

# sd_encoder = CLIPTextModel.from_pretrained("cerspense/zeroscope_v2_XL/text_encoder")
# sd_encoder = CLIPTextModel.from_pretrained("cerspense/zeroscope_v2_576w/text_encoder")
pipe = DiffusionPipeline.from_pretrained("../../zeroscope_v2_576w", torch_dtype=torch.float16)
print(pipe.text_encoder.config)



Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

CLIPTextConfig {
  "_name_or_path": "../../zeroscope_v2_576w/text_encoder",
  "architectures": [
    "CLIPTextModel"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "dropout": 0.0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_size": 1024,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 77,
  "model_type": "clip_text_model",
  "num_attention_heads": 16,
  "num_hidden_layers": 23,
  "pad_token_id": 1,
  "projection_dim": 512,
  "torch_dtype": "float16",
  "transformers_version": "4.30.2",
  "vocab_size": 49408
}



In [17]:
pipe = DiffusionPipeline.from_pretrained("../../zeroscope_v2_XL", torch_dtype=torch.float16)
print(pipe.text_encoder.config)

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

CLIPTextConfig {
  "_name_or_path": "../../zeroscope_v2_XL/text_encoder",
  "architectures": [
    "CLIPTextModel"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "dropout": 0.0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_size": 1024,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 77,
  "model_type": "clip_text_model",
  "num_attention_heads": 16,
  "num_hidden_layers": 23,
  "pad_token_id": 1,
  "projection_dim": 512,
  "torch_dtype": "float16",
  "transformers_version": "4.30.2",
  "vocab_size": 49408
}



## Check embedding dot product between zeroscope enc and clip_vit_large vision encoder

In [36]:
zeroscope_model = DiffusionPipeline.from_pretrained("../../zeroscope_v2_576w", 
                                                    torch_dtype=torch.float16,
                                                    device="cuda:0")

zeroscope_model.to("cuda:0")

# Transform caption from BMD into embedding
caption = "A duck is swimming in a lake searching for food"
with torch.no_grad():
    text_emb = cond_vectors = zeroscope_model._encode_prompt(
                    caption,
                    "cuda:0",
                    num_images_per_prompt=1,
                    do_classifier_free_guidance=False,
                )

print(text_emb.shape)
print(zeroscope_model.text_encoder.projection(text_emb).shape)



Keyword arguments {'device': 'cuda:0'} are not expected by TextToVideoSDPipeline and will be ignored.


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

torch.Size([1, 77, 1024])


In [33]:

# Get vision embedding
vision_enc = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14").eval()
print(vision_enc.config)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
img = Image.open("../data/stimuli_bmd/frames/0001/001.png")
input = processor(images=img, return_tensors="pt")
print(input.keys())

# preproc = transforms.Compose([
#     transforms.Resize(size=self.clip_size[0], interpolation=transforms.InterpolationMode.BICUBIC),
#     transforms.CenterCrop(size=self.clip_size),
#     transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
# ])

outputs = vision_enc(**input)
print(type(outputs))
print(outputs[0].shape)
print(len(outputs))


Some weights of the model checkpoint at openai/clip-vit-large-patch14 were not used when initializing CLIPVisionModelWithProjection: ['text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_

CLIPVisionConfig {
  "_name_or_path": "openai/clip-vit-large-patch14",
  "attention_dropout": 0.0,
  "dropout": 0.0,
  "hidden_act": "quick_gelu",
  "hidden_size": 1024,
  "image_size": 224,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "model_type": "clip_vision_model",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 14,
  "projection_dim": 768,
  "transformers_version": "4.30.2"
}

dict_keys(['pixel_values'])
<class 'transformers.models.clip.modeling_clip.CLIPVisionModelOutput'>
torch.Size([1, 768])
2
