In [6]:
import os

import numpy as np
import torch
from torch import nn
from transformers import CLIPTokenizer, CLIPTextModel, CLIPVisionModel, CLIPFeatureExtractor

# Output directory (a relative path, so resolved against the notebook's working
# directory) where the conditioning prompts and embeddings are written.
path_results = '../results/loop_clip_sd/'

In [2]:
class AbstractEncoder(nn.Module):
    """Base ``nn.Module`` for encoders; concrete subclasses provide ``encode``."""

    def __init__(self):
        """Run the parent-class initialisation; no state of its own is added."""
        super(AbstractEncoder, self).__init__()

    def encode(self, *args, **kwargs):
        """Placeholder to be overridden by subclasses.

        Accepts any positional and keyword arguments.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError
        
class FrozenCLIPEmbedder(AbstractEncoder):
    """Uses the CLIP transformer encoder for text (from Hugging Face).

    The tokenizer and text model are loaded from ``version``; the model is put
    in eval mode and every parameter is frozen. Calling the module on ``text``
    returns the text model's ``last_hidden_state`` (one embedding per token
    position rather than a single pooled vector).

    Args:
        version: Hugging Face model identifier passed to ``from_pretrained``.
        device: Stored as ``self.device``. It does NOT move the model; it is
            only used as a fallback target for the token ids when the model
            exposes no parameters.
        max_length: Prompts are truncated / padded to exactly this many tokens.
    """
    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77):
        super().__init__()
        self.tokenizer = CLIPTokenizer.from_pretrained(version)
        self.transformer = CLIPTextModel.from_pretrained(version)
        # Kept for backward compatibility; forward() follows the weights' real device.
        self.device = device
        self.max_length = max_length
        self.freeze()

    def freeze(self):
        """Switch the text model to eval mode and disable gradients on all parameters."""
        self.transformer = self.transformer.eval()
        for param in self.parameters():
            param.requires_grad = False

    def _model_device(self):
        """Return the device the text model's weights currently live on."""
        first_param = next(self.transformer.parameters(), None)
        return self.device if first_param is None else first_param.device

    def forward(self, text):
        """Tokenize ``text`` and return the text model's ``last_hidden_state``.

        Args:
            text: Prompt(s) handed to the tokenizer (the notebook passes a list
                of strings).

        Returns:
            The ``last_hidden_state`` tensor produced by the text model.
        """
        batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
                                        return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
        # Send the ids to wherever the model actually is. The constructor's
        # `device` string goes stale as soon as the module is moved with
        # .cuda()/.cpu()/.to(), and its "cuda" default fails on CPU-only hosts.
        tokens = batch_encoding["input_ids"].to(self._model_device())
        outputs = self.transformer(input_ids=tokens)

        z = outputs.last_hidden_state
        return z

    def encode(self, text):
        """Alias for calling the module: ``encode(text)`` is ``self(text)``."""
        return self(text)

In [3]:
text = ['santa is coming tonight', 'a picture of a dog', 'a picture of a cat']
with open(f'{path_results}/conditioning_text.txt', 'w') as f:
    for item in text:
        f.write(f'{item}\n')
text_encoder = FrozenCLIPEmbedder().cuda()


for iter in range(1):

    # Create and save conditioning
    cond = text_encoder(text).cpu().numpy()
    cond_filename = f'{path_results}/conditioning_embedding_iter{iter}.npy'
    np.save(cond_filename, cond)

    # Generate
    !PYTHONPATH="$PYTHONPATH:./stable-diffusion-main" \
        python stable-diffusion-main/scripts/txt2img.py --plms \
        --config './stable-diffusion-main/configs/stable-diffusion/v1-inference.yaml' \
        --ckpt /gpfswork/rech/dcf/ulb98yg/DATA/sd-v1-4.ckpt \
        --outdir '/gpfswork/rech/dcf/ulb98yg/dream-domain/results/loop_clip_sd/iter{iter}' \
        --cond-from-file {cond_filename}

Global seed set to 42
Loading model from /gpfswork/rech/dcf/ulb98yg/DATA/sd-v1-4.ckpt
Global Step: 470000
LatentDiffusion: Running in eps-prediction mode
DiffusionWrapper has 859.52 M params.
making attention of type 'vanilla' with 512 in_channels
Working with z of shape (1, 4, 32, 32) = 4096 dimensions.
making attention of type 'vanilla' with 512 in_channels
Creating invisible watermark encoder (see https://github.com/ShieldMnt/invisible-watermark)...
Sampling:   0%|                                           | 0/2 [00:00<?, ?it/s]
data:   0%|                                               | 0/1 [00:00<?, ?it/s][A
Conditioning: torch.Size([3, 77, 768])

Data shape for PLMS sampling is (3, 4, 64, 64)
Running PLMS Sampling with 50 timesteps


PLMS Sampler:   0%|                                      | 0/50 [00:00<?, ?it/s][A[A

PLMS Sampler:   2%|▌                             | 1/50 [00:00<00:31,  1.54it/s][A[A

PLMS Sampler:   4%|█▏                            | 2/50 [00:00<00:19,  2.

In [7]:
processor = CLIPFeatureExtractor.from_pretrained("openai/clip-vit-large-patch14")
img_encoder = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14")

# A random tensor stands in for an image; only the output shape is of interest.
img = processor(images=torch.randn(3, 64, 64))['pixel_values'][0]
img = torch.tensor(img).unsqueeze(0)  # add the batch dimension

# Shape probe only: skip building an autograd graph for the forward pass.
with torch.no_grad():
    img_hidden = img_encoder(img)['last_hidden_state']

# NOTE(review): the output recorded for this cell is (1, 50, 768), next to a
# warning about instantiating a clip_vision_model from a clip_text_model config.
# ViT-L/14 would presumably give 257 tokens of width 1024 - confirm that the
# vision config and weights were really loaded before relying on this shape.
img_hidden.shape

You are using a model of type clip_text_model to instantiate a model of type clip_vision_model. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at openai/clip-vit-large-patch14 were not used when initializing CLIPVisionModel: ['text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.6.

torch.Size([1, 50, 768])

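# The text and image embeddings do not have the same size (text: [3, 77, 768]; image: [1, 50, 768]). Stable Diffusion is conditioned on the full per-token text embedding sequence, not on a single multimodal CLIP vector, so it is not possible to re-project an image through CLIP and use its embedding directly as the condition...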