Question about difference of training SDXL between kohya and diffusers #2534

SeungHwa92 · 2024-05-25T13:32:18Z

SeungHwa92
May 25, 2024

I am training an SDXL LoRA, so I have been reviewing both the kohya code and the diffusers code.
I checked whether kohya and diffusers produce the same text_encoder_2 outputs (hidden_states2, pool2).
The hidden_states2 values are the same, but the pool2 values are different.
I found that the two code bases compute pool2 in different ways.

The code that computes pool2 in kohya and in diffusers is shown below.

The difference starts here:
kohya uses "text_encoder_2_output['last_hidden_state']" (i.e. "text_encoder_2_output[1]") to compute pool2,
whereas diffusers uses "text_encoder_2_output['text_embeds']" (i.e. "text_encoder_2_output[0]") as pool2.

If you have any insight into why this is implemented differently,
could you please share it?

# diffusers prompt embedding function https://github.com/huggingface/diffusers/blob/5cd45c24bf616f09c818455184f3d1c3a3cebe00/examples/dreambooth/train_dreambooth_lora_sdxl.py#L934
def encode_prompt(text_encoders, tokenizers, prompt, text_input_ids_list=None):
    """Run every text encoder on the prompt and combine their outputs.

    Args:
        text_encoders: Sequence of callables with a ``device`` attribute. Each
            is called with ``output_hidden_states=True, return_dict=False``
            and must return an indexable result.
        tokenizers: Sequence parallel to ``text_encoders``, or ``None``. When
            given, ``tokenize_prompt(tokenizers[i], prompt)`` produces the ids
            for encoder ``i``.
        prompt: Passed through to ``tokenize_prompt``; unused when
            ``tokenizers`` is ``None``.
        text_input_ids_list: Pre-tokenized ids, one entry per encoder. Only
            read (and then required) when ``tokenizers`` is ``None``.

    Returns:
        ``(prompt_embeds, pooled_prompt_embeds)``. ``prompt_embeds`` is the
        concatenation along the last dimension of each encoder's
        ``output[-1][-2]``. ``pooled_prompt_embeds`` is ``output[0]`` of the
        LAST encoder only, flattened to ``(batch, -1)``.
    """
    per_encoder_embeds = []

    for idx, encoder in enumerate(text_encoders):
        if tokenizers is None:
            assert text_input_ids_list is not None
            token_ids = text_input_ids_list[idx]
        else:
            token_ids = tokenize_prompt(tokenizers[idx], prompt)

        outputs = encoder(token_ids.to(encoder.device), output_hidden_states=True, return_dict=False)

        # Rebound on every pass, so only the final encoder's first output
        # survives the loop and is returned as the pooled embedding.
        pooled_prompt_embeds = outputs[0]

        # Second-to-last entry of the last output element (presumably the
        # tuple of per-layer hidden states - confirm against the encoder).
        penultimate = outputs[-1][-2]
        bs_embed, seq_len, _ = penultimate.shape
        per_encoder_embeds.append(penultimate.view(bs_embed, seq_len, -1))

    prompt_embeds = torch.cat(per_encoder_embeds, dim=-1)
    return prompt_embeds, pooled_prompt_embeds.view(bs_embed, -1)



# kohya prompt embedding function https://github.com/kohya-ss/sd-scripts/blob/bfb352bc433326a77aca3124248331eb60c49e8c/library/train_util.py#L4505
def get_hidden_states_sdxl(
    max_token_length: int,
    input_ids1: torch.Tensor,
    input_ids2: torch.Tensor,
    tokenizer1: CLIPTokenizer,
    tokenizer2: CLIPTokenizer,
    text_encoder1: CLIPTextModel,
    text_encoder2: CLIPTextModelWithProjection,
    weight_dtype: Optional[str] = None,
    accelerator = None,
):
    """Encode token ids with both SDXL text encoders and return their states.

    Args:
        max_token_length: ``None`` to skip chunk stitching; otherwise the
            stitching stride bound, also used as ``max_token_length // 75``
            to subsample ``pool2``.
        input_ids1, input_ids2: Token ids for encoder 1 / encoder 2. They are
            reshaped to ``(-1, tokenizer.model_max_length)`` before encoding
            (original author's note: ``b, n, 77 -> b * n, 77``).
        tokenizer1, tokenizer2: Only ``model_max_length`` (both) and
            ``eos_token_id`` (tokenizer2) are read.
        text_encoder1, text_encoder2: Called with
            ``output_hidden_states=True, return_dict=True``.
        weight_dtype: When not ``None``, both hidden-state tensors are cast to
            it with ``.to``. ``pool2`` is NOT cast.
        accelerator: When not ``None``, ``accelerator.unwrap_model`` is applied
            to ``text_encoder2`` before it is handed to ``pool_workaround``.

    Returns:
        ``(hidden_states1, hidden_states2, pool2)``: entry ``[11]`` of encoder
        1's hidden states, entry ``[-2]`` (penultimate) of encoder 2's hidden
        states, and the pooled output produced by ``pool_workaround``.
    """
    batch = input_ids1.size()[0]

    # Fold any extra leading axes into the batch axis so each row is exactly
    # one tokenizer window.
    flat_ids1 = input_ids1.reshape((-1, tokenizer1.model_max_length))
    flat_ids2 = input_ids2.reshape((-1, tokenizer2.model_max_length))

    # Encoder 1: fixed hidden-state index 11.
    out1 = text_encoder1(flat_ids1, output_hidden_states=True, return_dict=True)
    states1 = out1["hidden_states"][11]

    # Encoder 2: penultimate hidden state.
    out2 = text_encoder2(flat_ids2, output_hidden_states=True, return_dict=True)
    states2 = out2["hidden_states"][-2]

    # The pooled output is recomputed by pool_workaround from last_hidden_state
    # instead of reading out2["text_embeds"].
    if accelerator is None:
        bare_encoder2 = text_encoder2
    else:
        bare_encoder2 = accelerator.unwrap_model(text_encoder2)
    pool2 = pool_workaround(bare_encoder2, out2["last_hidden_state"], flat_ids2, tokenizer2.eos_token_id)

    chunks_per_sample = 1 if max_token_length is None else max_token_length // 75

    # Original author's note: b*n, 77, 768 or 1280 -> b, n*77, 768 or 1280.
    states1 = states1.reshape((batch, -1, states1.shape[-1]))
    states2 = states2.reshape((batch, -1, states2.shape[-1]))

    if max_token_length is not None:

        def stitch(states, window):
            # Collapse the concatenated <BOS>...<EOS> windows back into one
            # <BOS>...<EOS> sequence: keep the very first position, the
            # interior of every window (after its <BOS>, before its last
            # position), and the very last position.
            pieces = [states[:, 0].unsqueeze(1)]
            pieces.extend(states[:, start : start + window - 2] for start in range(1, max_token_length, window))
            pieces.append(states[:, -1].unsqueeze(1))
            return torch.cat(pieces, dim=1)

        states1 = stitch(states1, tokenizer1.model_max_length)

        # Encoder 2 windows look like <BOS>...<EOS> <PAD>..., so the final
        # position is either <EOS> or <PAD>. The original author notes they
        # are honestly unsure this handling is right. An earlier attempt that
        # patched empty windows (<BOS> <EOS> <PAD> ...) by copying the next
        # <PAD> value in place was disabled because it raised:
        # RuntimeError: one of the variables needed for gradient computation
        # has been modified by an inplace operation
        states2 = stitch(states2, tokenizer2.model_max_length)

        # For the pooled output, keep only the first of every n entries.
        pool2 = pool2[::chunks_per_sample]

    if weight_dtype is not None:
        # this is required for additional network training
        states1 = states1.to(weight_dtype)
        states2 = states2.to(weight_dtype)

    return states1, states2, pool2

# kohya pooled-output workaround used by get_hidden_states_sdxl https://github.com/kohya-ss/sd-scripts/blob/bfb352bc433326a77aca3124248331eb60c49e8c/library/train_util.py#L4462C1-L4502C25
def pool_workaround(
    text_encoder: CLIPTextModelWithProjection, last_hidden_state: torch.Tensor, input_ids: torch.Tensor, eos_token_id: int
):
    r"""
    Compute the projected pooled output from the hidden state at the EOS token.

    Per the original author's note, CLIP's stock pooling returns the hidden
    state at the position of the largest token id rather than at the EOS
    token, which matters for Textual Inversion. The stock pooling quoted by
    the original author is:

    pooled_output = last_hidden_state[
        torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
        input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
    ]

    Args:
        text_encoder: Object exposing a callable ``text_projection`` with a
            ``weight`` tensor.
        last_hidden_state: Indexed as ``[row, position]``; one row per row of
            ``input_ids`` (original note: ``input_ids`` is ``b*n, 77``).
        input_ids: Token ids searched for ``eos_token_id`` along dim 1.
        eos_token_id: Id of the EOS token.

    Returns:
        ``text_projection`` applied to the selected hidden states, cast back
        to the dtype of ``last_hidden_state``.
    """
    target_device = last_hidden_state.device

    # 1 where the token is EOS, 0 elsewhere. torch.where(...)[1] was rejected
    # by the original author because it breaks when a row contains more than
    # one EOS token.
    is_eos = (input_ids == eos_token_id).int()

    # argmax over a 0/1 mask yields the FIRST EOS position in each row, and 0
    # for a row with no EOS at all (accepted by the original author).
    eos_positions = is_eos.argmax(dim=1).to(device=target_device)

    # Gather one hidden-state vector per row.
    row_ids = torch.arange(last_hidden_state.shape[0], device=target_device)
    eos_states = last_hidden_state[row_ids, eos_positions]

    # The projection weights may use a different dtype than the hidden
    # states: cast in, project, then cast back.
    projection = text_encoder.text_projection
    projected = projection(eos_states.to(projection.weight.dtype))
    return projected.to(last_hidden_state.dtype)



# Reproduction script: feeds one prompt through the kohya path
# (get_hidden_states_sdxl) and the diffusers path (encode_prompt), then
# compares the concatenated hidden states and the pooled output (pool2).
strings = ['hello world ! How are you ?']

tokenizers = load_tokenizers()

# load kohya models
load_stable_diffusion_format, kohya_text_encoder, kohya_text_encoder_2, kohya_vae, kohya_unet, logit_scale, ckpt_info = library.sdxl_train_util._load_target_model(CIVITAI_CKPT_PATH, None, 'v1', torch.float)

# load diffusers models
diffusers_pipeline = diffusers.StableDiffusionXLPipeline.from_pretrained(CONVERTED_PATH)
diffusers_text_encoder = diffusers_pipeline.text_encoder
diffusers_text_encoder_2 = diffusers_pipeline.text_encoder_2

# Transpose text_projection.weight for same output
# NOTE: this mutates the diffusers encoder in place, so every diffusers result
# computed below (encode_prompt and the direct call) uses the transposed weight.
diffusers_text_encoder_2.text_projection.weight.data = diffusers_text_encoder_2.text_projection.weight.data.T.contiguous() 

# tokenize
# Both tokenizers pad/truncate to 77 tokens and return PyTorch tensors.
input_ids1 = tokenizers[0](strings, padding="max_length", truncation=True, max_length=77, return_tensors="pt").input_ids
input_ids2 = tokenizers[1](strings, padding="max_length", truncation=True, max_length=77, return_tensors="pt").input_ids


# kohya path. With max_token_length=77, get_hidden_states_sdxl computes
# n_size = 77 // 75 = 1, so pool2[::n_size] keeps every row.
kohya_hidden_state1, kohya_hidden_state2, kohya_pool2 = get_hidden_states_sdxl(max_token_length=77,
                                                                                            input_ids1=input_ids1,
                                                                                            input_ids2=input_ids2,
                                                                                            tokenizer1=tokenizers[0],
                                                                                            tokenizer2=tokenizers[1],
                                                                                            text_encoder1=kohya_text_encoder,
                                                                                            text_encoder2=kohya_text_encoder_2,
                                                                                            weight_dtype=torch.float)
# Concatenate the two encoders' states along the feature axis, mirroring the
# torch.concat(..., dim=-1) done inside encode_prompt.
kohya_hidden_state = torch.cat([kohya_hidden_state1, kohya_hidden_state2], dim=2)

# diffusers path: encode_prompt tokenizes the prompt itself because tokenizers
# is passed.
diffusers_hidden_state, diffusers_pool2 = encode_prompt(text_encoders=[diffusers_text_encoder, diffusers_text_encoder_2],
                                                                        tokenizers=tokenizers,
                                                                        prompt=strings)

# The trailing True/False comments are the results reported by the author.
print('hidden_state are same :', torch.allclose(kohya_hidden_state , diffusers_hidden_state))  # True
print('pool2 are same :',torch.allclose(kohya_pool2 , diffusers_pool2 ))  # False


# Call both second text encoders directly on the same ids. The kohya call
# returns a dict-style output, the diffusers call a tuple (return_dict=False),
# so dict keys are compared against tuple positions 0 and 1.
kohya_text_encoder_2_output = kohya_text_encoder_2(input_ids2, output_hidden_states=True, return_dict=True)
diffusers_text_encoder_2_output = diffusers_text_encoder_2(input_ids2, output_hidden_states=True, return_dict=False)

print('check text_encoder_2 outputs are same')
print('text_embeds is index 0 in list:', torch.allclose(kohya_text_encoder_2_output['text_embeds'], diffusers_text_encoder_2_output[0]))  # True
print('last_hidden_state is index 1 in list:', torch.allclose(kohya_text_encoder_2_output['last_hidden_state'], diffusers_text_encoder_2_output[1]))  # True

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Question about difference of training SDXL between kohya and diffusers #2534

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{editor}}'s edit

{{editor}}'s edit

Uh oh!

Replies: 0 comments

Select a reply

Uh oh!

Uh oh!

Question about difference of training SDXL between kohya and diffusers #2534

Uh oh!

Uh oh!

SeungHwa92 May 25, 2024

Replies: 0 comments

SeungHwa92
May 25, 2024