In [1]:
!nvidia-smi

Sat Mar  9 11:18:07 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.07             Driver Version: 535.161.07   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX A4000               On  | 00000000:08:00.0 Off |                  Off |
| 41%   26C    P8              13W / 100W |     11MiB / 16376MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [19]:
from typing import Tuple, Union, List
import os

import numpy as np
from PIL import Image

import torch
from transformers import CLIPTextModel, CLIPTokenizer, DataCollatorWithPadding

In [3]:
# Load the CLIP text encoder from the `text_encoder` subfolder of the locally stored
# Stable Diffusion inpainting checkpoint (path is relative to the working directory).
text_encoder = CLIPTextModel.from_pretrained(
    "int_ch/models/runwayml--stable-diffusion-inpainting",
    subfolder="text_encoder")

In [5]:
# Put the encoder on the GPU in full precision (float32); the print is only a completion marker.
text_encoder.to('cuda', dtype=torch.float32)
print('moved to cuda')

moved to cuda


In [8]:
# Load the tokenizer from the `tokenizer` subfolder of the same local checkpoint directory
# used for the text encoder, so both come from one checkpoint.
tokenizer = CLIPTokenizer.from_pretrained(
    "int_ch/models/runwayml--stable-diffusion-inpainting",
    subfolder="tokenizer")

In [9]:
def tokenize_function(caption):
    """Tokenize ``caption`` with the notebook-level ``tokenizer``.

    Truncation is explicitly switched off, so no tokens are dropped however
    long the caption is. The tokenizer's result is returned unchanged.
    """
    encoded = tokenizer(caption, truncation=False)
    return encoded

In [20]:
# Collator used below to turn a list of individually tokenized prompts into one batch;
# it is handed the same tokenizer so it can pad with that tokenizer's settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [95]:
# Two deliberately long sample prompts used to exercise the long-prompt encoding paths below.
# Adjacent string literals inside parentheses are concatenated by Python, which (unlike a
# backslash continuation *inside* a literal) does not bake the next line's indentation into
# the prompt text. Every sentence is separated by exactly one space.
caption1 = (
    "Photorealistic interior design of a mid-century modern living room with a focus on warm wood tones and pops of color. "
    "The room features a large plush velvet sectional sofa in a rich emerald green, paired with a mid-century modern wood "
    "and glass coffee table. Ample natural light streams "
    "through large windows with white sheer curtains. A statement art piece with abstract expressionist elements hangs above "
    "a mid-century modern credenza."
)

caption2 = (
    "A Moroccan-style rug with geometric patterns adds texture to the floor. Ample natural light streams "
    "through large windows with white sheer curtains. A statement art piece with abstract expressionist elements hangs above "
    "a mid-century modern credenza. Lush Fiddle Leaf Fig and Monstera Deliciosa plants add a touch of nature to the space. "
    "The room features a large plush velvet sectional sofa in a rich emerald green, paired with a mid-century modern wood "
    "and glass coffee table."
)

In [96]:
# Tokenize each prompt on its own (no truncation), then let the collator merge the
# per-prompt encodings into a single batch.
prompt_lst = [caption1, caption2]
prompt_token_lst = [tokenize_function(prompt) for prompt in prompt_lst]
prompt_tensors = data_collator(prompt_token_lst)

In [97]:
prompt_tensors['input_ids'].size()

torch.Size([2, 95])

In [61]:
# NOTE(review): `collated_captions` is not assigned in any cell visible here, so this cell
# only works with leftover kernel state (the displayed output is a single-row batch).
# Define it explicitly or switch to `prompt_tensors` so Restart & Run All succeeds.
collated_captions['input_ids']

tensor([[49406,  1153, 16157,  7305,  1681,   539,   320,  4734,   268,  4275,
          4077,  2815,  1530,   593,   320,  4353,   525,  3616,  1704, 14744,
           537, 11705,   539,  3140,   269,   518,  1530,  4643,   320,  3638,
         18926, 11063, 21876, 15723,   530,   320,  4021, 16980,  1901,   267,
         12433,   593,   320,  4734,   268,  4275,  4077,  1704,   537,  3313,
          2453,  2175,   269, 49407]])

In [77]:
# Move the batched token ids to the GPU, where the text encoder was placed earlier.
cc = prompt_tensors['input_ids'].to('cuda')

In [78]:
# Exploratory forward pass to see what the encoder returns: a structured output object,
# not a bare tensor. NOTE(review): this call is not wrapped in torch.no_grad().
emb = text_encoder(cc)
type(emb)

transformers.modeling_outputs.BaseModelOutputWithPooling

In [79]:
# Keep only the pooled output: one vector per prompt (the size check below shows [2, 768]).
with torch.no_grad():  # Disable gradient calculation for efficiency
  text_embeddings = text_encoder(cc).pooler_output

In [80]:
text_embeddings.size()

torch.Size([2, 768])

In [6]:
def do_encode(inputs, text_encoder, device, max_seq_len=75):
    """Encode an already-tokenized, padded batch whose length may exceed the position table.

    Token embeddings are looked up in windows of ``max_seq_len`` tokens and summed with
    position embeddings. The windows are concatenated back into the full sequence, which
    is then run through ``text_encoder.text_model.encoder`` in a single pass.

    Position ids are absolute indices into the full sequence. Any index beyond the last
    row of the position-embedding table is clamped to that last row, so all overflow
    tokens share the final position embedding.

    The padding mask is converted to an *additive* float mask before it reaches the
    encoder: 0.0 where a token may be attended to, the most negative representable value
    where it is padding. Passing the raw 0/1 mask would merely add 1 to the scores of real
    tokens and leave padding effectively unmasked.

    NOTE(review): this calls the encoder stack directly, so anything the full
    ``text_encoder(...)`` forward applies around it (for CLIP, presumably a causal mask
    and a final layer norm) is not applied here. Confirm that this is intended.

    Args:
        inputs: Mapping with ``'input_ids'`` and ``'attention_mask'`` tensors, both of
            shape ``(batch, seq_len)``. Mask entries equal to 0 are treated as padding.
        text_encoder: Model exposing ``text_model.embeddings.token_embedding``,
            ``text_model.embeddings.position_embedding`` and ``text_model.encoder``.
            It is moved to ``device`` as a side effect.
        device: Device on which the computation runs.
        max_seq_len: Number of tokens embedded per window.

    Returns:
        The encoder's ``last_hidden_state`` for the whole sequence, i.e. one hidden
        vector per input token: ``(batch, seq_len, hidden)``.
    """
    text_encoder = text_encoder.to(device)
    tokens = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    embedding_layers = text_encoder.text_model.embeddings
    last_position = embedding_layers.position_embedding.num_embeddings - 1
    batch_size, seq_len = tokens.size(0), tokens.size(1)

    embeddings = []
    for start_idx in range(0, seq_len, max_seq_len):
        chunk_tokens = tokens[:, start_idx:start_idx + max_seq_len]
        chunk_size = chunk_tokens.size(1)

        # Build the position ids directly on the target device and clamp overflow
        # positions to the last available embedding row.
        position_ids = torch.arange(
            start_idx, start_idx + chunk_size, dtype=torch.long, device=tokens.device
        ).clamp(max=last_position)
        position_ids = position_ids.unsqueeze(0).expand(batch_size, chunk_size)

        # Out-of-place add: do not mutate the tensor returned by the embedding lookup.
        chunk_embeddings = (
            embedding_layers.token_embedding(chunk_tokens)
            + embedding_layers.position_embedding(position_ids)
        )
        embeddings.append(chunk_embeddings)

    concatenated_embeddings = torch.cat(embeddings, dim=1)

    # 0/1 padding mask -> additive mask in the embeddings' dtype.
    mask_dtype = concatenated_embeddings.dtype
    additive_mask = torch.zeros(
        attention_mask.shape, dtype=mask_dtype, device=attention_mask.device
    ).masked_fill(attention_mask == 0, torch.finfo(mask_dtype).min)

    # Tile to (batch, 1, seq_len, seq_len): every query row shares the same key mask.
    attention_mask_expanded = additive_mask.unsqueeze(1).unsqueeze(2).repeat(
        1, 1, attention_mask.shape[1], 1
    )

    encoder_outputs = text_encoder.text_model.encoder(
        concatenated_embeddings, attention_mask=attention_mask_expanded
    )
    return encoder_outputs.last_hidden_state

In [30]:
# NOTE(review): `collated_captions` is not assigned in any cell visible here; this call
# depends on leftover kernel state (the size shown below, [1, 119, 768], is a single
# 119-token caption). Pass an explicitly built batch so the notebook re-runs from scratch.
encoder_hidden_states = do_encode(collated_captions, text_encoder, 'cuda')

In [31]:
encoder_hidden_states.size()

torch.Size([1, 119, 768])

In [32]:
import os

In [35]:
os.getcwd()

'/'

In [None]:
! python 'int_ch/local_evaluation.py'

  torch.utils._pytree._register_pytree_node(
  return register_model(fn_wrapper)
  return register_model(fn_wrapper)
  return register_model(fn_wrapper)
  return register_model(fn_wrapper)
  return register_model(fn_wrapper)
  torch.utils._pytree._register_pytree_node(
The config attributes {'dropout': 0.0, 'sample_size': 32} were passed to ControlNetModel, but are not expected and will be ignored. Please verify your config.json configuration file.
Loading pipeline components...: 100%|█████████████| 6/6 [00:02<00:00,  2.01it/s]
You have disabled the safety checker for <class 'diffusers.pipelines.controlnet.pipeline_controlnet_inpaint.StableDiffusionControlNetInpaintPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumsta

In [51]:
text_encoder

CLIPTextModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e

In [53]:
text_encoder.text_model.encoder

CLIPEncoder(
  (layers): ModuleList(
    (0-11): 12 x CLIPEncoderLayer(
      (self_attn): CLIPAttention(
        (k_proj): Linear(in_features=768, out_features=768, bias=True)
        (v_proj): Linear(in_features=768, out_features=768, bias=True)
        (q_proj): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): CLIPMLP(
        (activation_fn): QuickGELUActivation()
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
      )
      (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
  )
)

In [100]:
def get_pipeline_embeds(prompt, negative_prompt, device):
    """ Get embeddings for a prompt pair that may exceed the tokenizer's max length.

    Both texts are tokenized without truncation and padded to the token length of the
    longer one. The common length is taken from the real token counts, not from a
    whitespace word count, so the two results always have the same sequence length.
    The ids are then encoded in windows of ``tokenizer.model_max_length`` tokens and the
    per-window hidden states are concatenated along the sequence axis.

    Uses the notebook-level ``tokenizer`` and ``text_encoder``. Gradient tracking is not
    disabled here; wrap the call in ``torch.no_grad()`` when only inference is needed.

    :param prompt: positive prompt text
    :param negative_prompt: negative prompt text
    :param device: device the token ids are moved to before encoding
    :return: tuple ``(prompt_embeds, negative_prompt_embeds)`` with identical shapes
    """
    max_length = tokenizer.model_max_length

    def _token_count(text):
        # Un-padded, un-truncated token length of `text`.
        return tokenizer(text, return_tensors="pt", truncation=False).input_ids.shape[-1]

    def _padded_ids(text, target_length):
        # Pad (never truncate) up to target_length; a text already that long is unchanged.
        return tokenizer(text, truncation=False, padding="max_length",
                         max_length=target_length, return_tensors="pt").input_ids.to(device)

    # The common sequence length is the token count of whichever text is really longer.
    shape_max_length = max(_token_count(prompt), _token_count(negative_prompt))
    input_ids = _padded_ids(prompt, shape_max_length)
    negative_ids = _padded_ids(negative_prompt, shape_max_length)

    print(f'shape_max_length : {shape_max_length} & max_length : {max_length}')
    concat_embeds = []
    neg_embeds = []
    for i in range(0, shape_max_length, max_length):
        # [0] selects the first element of the encoder output for this window.
        concat_embeds.append(text_encoder(input_ids[:, i: i + max_length])[0])
        neg_embeds.append(text_encoder(negative_ids[:, i: i + max_length])[0])

    return torch.cat(concat_embeds, dim=1), torch.cat(neg_embeds, dim=1)

In [101]:
# Smoke test: caption2 is passed in the negative-prompt slot here, and the final
# expression displays both embedding shapes so they can be compared.
p_e, n_e = get_pipeline_embeds(caption1, caption2, 'cuda')
p_e.size(), n_e.size()

shape_max_length : 95 & max_length : 77


(torch.Size([1, 95, 768]), torch.Size([1, 95, 768]))