<a href="https://colab.research.google.com/github/adrianpuiu/kandinsky/blob/main/Kandinsky_2_1_textual%20inversion%20inference_Batching_in_Google_Drive_%2B_Dynamic_Prompting_%2B_Symmetry_Option.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Check GPU
# List the GPUs visible to this runtime, one line per device.
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-b7c8d2fa-ce47-b694-4779-0e47551a3832)


In [1]:
#@title Mount Google Drive
from google.colab import drive

# Mount Drive at /content/drive; the model cache, the learned embeddings and
# the generated images used by later cells all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#@title Installation
!pip install 'git+https://github.com/ai-forever/Kandinsky-2.git'
!pip install git+https://github.com/openai/CLIP.git

from kandinsky2 import get_kandinsky2

In [None]:
#@title Downloads
# Build the Kandinsky 2.1 text2img model once, with cache_dir pointing at Drive
# (presumably so the downloaded weights persist between sessions - TODO confirm).
model = get_kandinsky2(
    device='cuda',
    task_type='text2img',
    cache_dir='/content/drive/MyDrive/tmp',
    model_version='2.1',
    use_flash_attention=False,
)

making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
Working with z of shape (1, 4, 32, 32) = 4096 dimensions.


In [3]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from copy import deepcopy

import torch
import torch.nn as nn
from matplotlib import pyplot as plt

from kandinsky2 import get_kandinsky2

In [4]:
def show_image(image, figsize=(5, 5), cmap=None, title='', xlabel=None, ylabel=None, axis=False):
    """Render a single image in its own matplotlib figure.

    Args:
        image: Anything accepted by ``imshow``.
        figsize: Figure size passed to matplotlib.
        cmap: Optional colormap forwarded to ``imshow``.
        title: Text placed above the image.
        xlabel: Label for the horizontal axis.
        ylabel: Label for the vertical axis.
        axis: Forwarded to ``Axes.axis``; the default ``False`` hides the axes.
    """
    # A new figure with one Axes, addressed explicitly instead of through
    # pyplot's "current axes" state.
    _, ax = plt.subplots(figsize=figsize)
    ax.imshow(image, cmap=cmap)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.axis(axis)
    plt.show()

def show_images(images, n_rows=1, title='', figsize=(5, 5), cmap=None, xlabel=None, ylabel=None, axis=False):
    """Display a sequence of images on a grid with ``n_rows`` rows.

    The number of columns is the smallest value that fits every image, so no
    image is dropped when ``len(images)`` is not a multiple of ``n_rows``;
    grid cells left without an image are hidden.

    Args:
        images: Sequence of images accepted by ``imshow``.
        n_rows: Number of grid rows (must be >= 1).
        title: Title applied to every subplot.
        figsize: Size of the whole figure.
        cmap: Optional colormap forwarded to ``imshow``.
        xlabel: Horizontal-axis label applied to every subplot.
        ylabel: Vertical-axis label applied to every subplot.
        axis: Forwarded to ``Axes.axis``; the default ``False`` hides the axes.

    Raises:
        ValueError: If ``images`` is empty or ``n_rows`` is smaller than 1.
    """
    n_images = len(images)
    if n_images == 0:
        raise ValueError("show_images() needs at least one image")
    if n_rows < 1:
        raise ValueError(f"n_rows must be >= 1, got {n_rows}")

    # Ceiling division: floor division would silently drop the trailing images
    # whenever the image count is not a multiple of n_rows.
    n_cols = -(-n_images // n_rows)

    if n_rows == n_cols == 1:
        show_image(images[0], title=title, figsize=figsize, cmap=cmap, xlabel=xlabel, ylabel=ylabel, axis=axis)
        return

    # squeeze=False keeps `axes` a 2-D array for every grid shape.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
    fig.tight_layout(pad=0.0)
    flat_axes = axes.flatten()

    for ax, img in zip(flat_axes, images):
        ax.imshow(img, cmap=cmap)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.axis(axis)

    # Hide the grid cells that did not receive an image.
    for ax in flat_axes[n_images:]:
        ax.axis(False)

    plt.show()
        
def add_new_embeds(model, placeholder_token, embeds_path):
    """Register ``placeholder_token`` with both tokenizers of ``model`` and load its learned embeddings.

    The file at ``embeds_path`` is expected to hold a mapping with the keys
    ``'t1'`` and ``'t2'``, each mapping ``placeholder_token`` to an embedding
    vector: ``'t1'`` is written into the text encoder's input embeddings and
    ``'t2'`` into ``model.clip_model.token_embedding``.

    Args:
        model: Model object exposing ``tokenizer1``, ``tokenizer2``,
            ``text_encoder`` and ``clip_model`` (as returned by
            ``get_kandinsky2`` in this notebook).
        placeholder_token: The new token string, e.g. ``'sks'``.
        embeds_path: Path to the saved embeddings file.

    Returns:
        The same ``model`` object, modified in place.
    """
    # NOTE(security): torch.load unpickles the file; only load embeddings that
    # come from a trusted source.
    # map_location='cpu' lets a file saved on a GPU be read on any machine; the
    # vectors are moved to the right device when they are copied in below.
    learned_token = torch.load(embeds_path, map_location='cpu')

    # --- first tokenizer / text encoder ------------------------------------
    model.tokenizer1.add_tokens(placeholder_token)
    placeholder_token_id = model.tokenizer1.convert_tokens_to_ids(placeholder_token)

    transformer = model.text_encoder.model.transformer
    transformer.resize_token_embeddings(len(model.tokenizer1))
    text_weight = transformer.get_input_embeddings().weight
    text_weight.data[placeholder_token_id] = learned_token['t1'][placeholder_token].to(
        device=text_weight.device, dtype=text_weight.dtype
    )

    # --- second tokenizer / CLIP token embedding ---------------------------
    # Append the token to the tokenizer's vocabulary; the cache entry maps the
    # placeholder to itself (presumably so it is kept as one piece - TODO confirm).
    t2p_index_to_add = len(model.tokenizer2.encoder)
    model.tokenizer2.encoder[placeholder_token] = t2p_index_to_add
    model.tokenizer2.decoder[t2p_index_to_add] = placeholder_token
    model.tokenizer2.cache[placeholder_token] = placeholder_token

    t2p_tok = model.tokenizer2.encode(placeholder_token)

    old_weight = model.clip_model.token_embedding.weight
    old_vocab_size, t2_embed_size = old_weight.shape

    # Build the enlarged table on the device/dtype of the table it replaces
    # instead of relying on a notebook-level `device` variable.
    new_embed = nn.Embedding(old_vocab_size + 1, t2_embed_size).to(
        device=old_weight.device, dtype=old_weight.dtype
    )
    new_embed.weight.data[:old_vocab_size, :] = old_weight.data.clone()
    new_embed.weight.data[t2p_tok[0], :] = learned_token['t2'][placeholder_token].to(
        device=old_weight.device, dtype=old_weight.dtype
    )

    model.clip_model.token_embedding = new_embed

    return model

In [9]:
# --- runtime / model configuration ---
device = 'cuda'
cache_dir = '/content/drive/MyDrive/tmp'
task_type = 'text2img'

# --- textual-inversion configuration ---
placeholder_token = 'sks' # your word here
embeds_path = '/content/drive/MyDrive/finetune/output/sks/learned_embeds.bin' # your path to embedding here 

# Load a fresh model, then patch the learned token into it.
model = get_kandinsky2(
    device=device,
    task_type=task_type,
    model_version='2.1',
    cache_dir=cache_dir,
    use_flash_attention=False,
)
model = add_new_embeds(model, placeholder_token, embeds_path)



making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
Working with z of shape (1, 4, 32, 32) = 4096 dimensions.


In [40]:
#@title Settings & Prompt 
# Colab form cell: the values below are read by the "Images Generation" cell.
# batch_name is used as the output folder name under /content/drive/MyDrive/.
batch_name = "Batch" #@param {type:"string"}
# [width, height] in pixels; unpacked into w and h at the bottom of this cell.
width_height = [1280, 540] #@param {type: 'raw'}
#@markdown - Dynamic Prompting: "a glass of { wine | beer | milk }" will randomly choose one of these words for each image:
text_prompt = " a masterpiece painting in the style of sks having extreme details and a perfect composition and very well lit scene" #@param {type:"string"}
num_steps = 50 #@param {type:"integer"}
# The generation cell produces this many images, one per generate_text2img call.
batch_size = 2 #@param {type:"integer"}
guidance_scale = 1 #@param {type:"number"}
sampler = 'p_sampler' #@param {type:"string"}
prior_cf_scale = 4 #@param {type:"number"}
# Kept as a string; it is passed unchanged to generate_text2img.
prior_steps = "5" #@param {type:"string"}

# Order matters: the first entry is the width, the second the height.
w, h = width_height

In [None]:
# Quick preview with fixed settings: two rounds of four images each, every
# round shown on a 2-row grid. The form values from the settings cell are not used here.
prompt = f'''a masterpiece painting in the style of {placeholder_token} having extreme details and a perfect composition and very well lit scene.  '''

sampling_options = {
    'num_steps': 50,
    'batch_size': 4,
    'guidance_scale': 7.5,
    'h': 768,
    'w': 768,
    'sampler': 'p_sampler',
    'prior_cf_scale': 2,
    'prior_steps': '4',
}

for _ in range(2):
    images = model.generate_text2img(prompt, **sampling_options)
    show_images(images, n_rows=2, figsize=(15, 15))

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
#@title Images Generation
import os
import random
from PIL import Image
from IPython.display import display, clear_output
import time

def generate_dynamic_prompt(prompt):
    """Resolve every ``{ a | b | c }`` group in ``prompt`` to one random option.

    Groups are processed left to right. Each is replaced by one of its
    ``|``-separated options, chosen with ``random.choice`` and stripped of
    surrounding whitespace, e.g. ``"a glass of { wine | beer }"`` becomes
    ``"a glass of wine"`` or ``"a glass of beer"``.

    Unmatched braces are left in the text untouched. In particular a ``}`` that
    appears before the first ``{`` no longer sends the function into an endless
    loop.

    Args:
        prompt: Prompt text, possibly containing option groups.

    Returns:
        The prompt with every complete group replaced by one of its options.
    """
    while True:
        start = prompt.find("{")
        if start == -1:
            break
        # Look for the closing brace only AFTER the opening one; pairing "{"
        # with an earlier "}" would duplicate text and never terminate.
        end = prompt.find("}", start + 1)
        if end == -1:
            break
        options = prompt[start + 1:end].split("|")
        choice = random.choice(options).strip()
        prompt = prompt[:start] + choice + prompt[end + 1:]
    return prompt

# Characters that are path separators or are rejected by common filesystems;
# a prompt containing any of them would otherwise produce an unusable file name.
UNSAFE_FILENAME_CHARS = str.maketrans({ch: "_" for ch in '/\\:*?"<>|\n\r\t'})
# Cap the prompt-derived part of the name to leave room for the "_<i>.png"
# suffix (assumes the usual 255-character file-name limit - TODO confirm for Drive).
MAX_FILENAME_STEM = 200

output_folder = f"/content/drive/MyDrive/{batch_name}/"
os.makedirs(output_folder, exist_ok=True)

num_images = batch_size  # The total number of images to generate

for i in range(num_images):
    # Each image gets its own random resolution of the dynamic prompt.
    dynamic_prompt = generate_dynamic_prompt(text_prompt)
    print(f"Generating image {i + 1} with prompt: {dynamic_prompt}")  # Print the generated prompt
    images = model.generate_text2img(dynamic_prompt, num_steps=num_steps,
                                     batch_size=1, guidance_scale=guidance_scale,
                                     h=h, w=w, sampler=sampler, prior_cf_scale=prior_cf_scale,
                                     prior_steps=prior_steps)

    # Derive the file name from the prompt: same commas/spaces handling as
    # before, plus replacement of unsafe characters and a length cap.
    filename_stem = dynamic_prompt.replace(',', '').replace(' ', '_')
    filename_stem = filename_stem.translate(UNSAFE_FILENAME_CHARS)[:MAX_FILENAME_STEM]
    img_filename = f"{filename_stem}_{i}.png"
    img_path = os.path.join(output_folder, img_filename)

    # Save generated image to Google Drive (the output is already a PIL Image)
    image = images[0]
    image.save(img_path)

    # Display image and clear output after every 10 images
    display(image)
    if (i + 1) % 10 == 0:
        time.sleep(1)
        clear_output(wait=True)

    print(f"Image {i + 1} saved to Google Drive at: {img_path}")


# New section