In [None]:
# Show the GPUs visible to this runtime before any model is loaded.
!nvidia-smi

## Import Dependencies

In [2]:
from IPython.display import Video
from deforum_kandinsky import KandinskyV22Img2ImgPipeline, DeforumKandinsky
from diffusers import KandinskyV22PriorPipeline
from transformers import CLIPVisionModelWithProjection
from diffusers.models import UNet2DConditionModel
import imageio.v2 as iio
from PIL import Image
import numpy as np
import torch
import datetime
from tqdm.notebook import tqdm
import ipywidgets as widgets
from IPython import display
import os

In [3]:
def frames2video(frames, output_path="video.mp4", fps=24, display=False):
    """Encode a sequence of frames into a video file.

    Args:
        frames: Iterable of frames. Each one is converted with ``np.array``
            before being appended, so anything ``np.array`` accepts works.
        output_path: Destination path handed to ``iio.get_writer``.
        fps: Frame rate forwarded to the writer.
        display: When true, embed the finished video in the notebook output.

    Returns:
        None.
    """
    writer = iio.get_writer(output_path, fps=fps)
    try:
        for frame in tqdm(frames):
            writer.append_data(np.array(frame))
    finally:
        # Release the writer even when a frame fails to encode.
        writer.close()
    if display:
        # The `display` flag shadows the notebook-level `IPython.display`
        # module inside this function, so the helpers are imported locally
        # under names that cannot collide with it.
        from IPython.display import Video as _Video, display as _show
        _show(_Video(url=output_path))

## Load Kandinsky 2.1 or 2.2 

In [None]:
from diffusers import KandinskyPriorPipeline, KandinskyV22PriorPipeline
from deforum_kandinsky import (
    DeforumKandinsky,
    KandinskyImg2ImgPipeline,
    KandinskyV22Img2ImgPipeline,
)

# load models
model_version = 2.2  # supported values: 2.1 and 2.2
device = "cuda"

if model_version == 2.2:
    # The image encoder and UNet are loaded separately, cast to fp16, and then
    # handed to the pipelines as ready-made components.
    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
        'kandinsky-community/kandinsky-2-2-prior',
        subfolder='image_encoder'
        ).to(torch.float16).to(device)

    unet = UNet2DConditionModel.from_pretrained(
        'kandinsky-community/kandinsky-2-2-decoder',
        subfolder='unet'
        ).to(torch.float16).to(device)

    prior = KandinskyV22PriorPipeline.from_pretrained(
        'kandinsky-community/kandinsky-2-2-prior',
        image_encoder=image_encoder,
        torch_dtype=torch.float16
        ).to(device)
    decoder = KandinskyV22Img2ImgPipeline.from_pretrained(
        'kandinsky-community/kandinsky-2-2-decoder',
        unet=unet,
        torch_dtype=torch.float16
        ).to(device)

elif model_version == 2.1:

    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
        "kandinsky-community/kandinsky-2-1-prior",
        subfolder='image_encoder',
        torch_dtype=torch.float16
        ).to(device)
    unet = UNet2DConditionModel.from_pretrained(
        "kandinsky-community/kandinsky-2-1",
        subfolder='unet',
        torch_dtype=torch.float16
        ).to(device)
    # NOTE(review): unlike the 2.2 branch, `image_encoder` is not passed to the
    # prior here, so the pipeline loads its own copy - confirm this is intended.
    prior = KandinskyPriorPipeline.from_pretrained(
        "kandinsky-community/kandinsky-2-1-prior",
        torch_dtype=torch.float16
        ).to(device)
    decoder = KandinskyImg2ImgPipeline.from_pretrained(
        'kandinsky-community/kandinsky-2-1',
        unet=unet,
        torch_dtype=torch.float16
        ).to(device)

else:
    # Fail here rather than with a NameError on `prior`/`decoder` in a later cell.
    raise ValueError(
        f"Unsupported Kandinsky model_version: {model_version!r} (expected 2.1 or 2.2)"
    )

## Create Animation Using GUI

In [5]:
def create_animation_widgets():
    """Build the input panel for a single animation segment.

    Returns:
        A ``widgets.VBox`` whose children are, in order: prompt text,
        negative-prompt text, duration slider, acceleration slider and the
        animation dropdown. ``create_start_button`` unpacks the children in
        exactly this order, so it must not change.
    """
    def full_width():
        # A fresh Layout per widget, as in the original construction.
        return widgets.Layout(width='80%')

    prompt = widgets.Text(
        description='Prompt:',
        layout=full_width()
    )
    negative_prompt = widgets.Text(
        description='Neg Prompt:',
        value="low quality, bad image, cropped, out of frame",
        layout=full_width()
    )
    duration = widgets.FloatSlider(
        description='Duration:',
        min=0.25, max=60,
        value=5, step=0.25,
        layout=full_width()
    )
    acceleration = widgets.FloatSlider(
        description='Acceleration:',
        min=0.1, max=5,
        value=1, step=0.1,
        layout=full_width()
    )
    animation = widgets.Dropdown(
        options=[
            'right', 'left',
            'up', 'down',
            'spin_clockwise', 'spin_counterclockwise',
            'zoomin', 'zoomout',
            'rotate_right', 'rotate_left',
            'rotate_up', 'rotate_down',
            'around_right', 'around_left',
            'zoomin_sinus_x', 'zoomout_sinus_y',
            'right_sinus_y', 'left_sinus_y',
            'live'
        ],
        value="right",
        # This selects the camera motion, so it is labelled accordingly.
        description='Animation:',
    )
    return widgets.VBox(children=(prompt, negative_prompt, duration, acceleration, animation))

def create_video_settings():
    """Build the "Video Settings" panel.

    Returns:
        A ``widgets.VBox`` whose children are, in order: an HTML heading, the
        width box, the height box, the FPS slider and the output-path field.
    """
    def dimension_box(label):
        # Width and height share every setting except their label.
        return widgets.BoundedIntText(
            description=label,
            value=640,
            min=64,
            max=1e6,
            step=64,
            disabled=False,
        )

    heading = widgets.HTML("<h2>Video Settings</h2>")
    width_box = dimension_box('Width:')
    height_box = dimension_box('Height:')
    fps_slider = widgets.IntSlider(description='FPS', value=24, min=1, max=48, step=1)
    path_field = widgets.Text(description='output path:', value="video.mp4")

    return widgets.VBox(children=[heading, width_box, height_box, fps_slider, path_field])

def create_animation_tabs():
    """Build the "Animations" section: a tab strip plus add/remove buttons.

    Each tab holds one panel from ``create_animation_widgets`` and is titled
    with the text of that panel's prompt field (its first child).

    Returns:
        A ``widgets.VBox`` containing, in order: an HTML heading, the ``Tab``
        widget and an ``HBox`` with the "Add Animation" / "Remove Animation"
        buttons. ``create_start_button`` reads the Tab via ``children[1]``.
    """
    an_widgets = widgets.Tab(layout=widgets.Layout(width='90%', height='100%'))
    an_widgets.children = [create_animation_widgets()]

    def sync_titles():
        # Re-derive every title from the prompts so titles stay attached to the
        # right tab after tabs are added or removed.
        for index, child in enumerate(an_widgets.children):
            an_widgets.set_title(index, child.children[0].value)

    def update(_button):
        panels = tuple(an_widgets.children)
        # Only append a new tab once the last one has a prompt; an empty strip
        # always gets a fresh tab instead of failing on `panels[-1]`.
        if not panels or panels[-1].children[0].value:
            an_widgets.children = panels + (create_animation_widgets(),)
        sync_titles()

    def clear(_button):
        panels = list(an_widgets.children)
        selected = an_widgets.selected_index
        # Nothing selected (or a stale index): there is nothing to remove.
        if selected is None or not 0 <= selected < len(panels):
            return
        del panels[selected]
        if not panels:
            # Keep one blank tab so the user can keep editing.
            panels.append(create_animation_widgets())
        an_widgets.children = tuple(panels)
        sync_titles()

    add_button = widgets.Button(
        description='Add Animation',
        layout=widgets.Layout(width='44.75%')
    )
    add_button.style.button_color = "blue"
    add_button.on_click(update)

    clear_button = widgets.Button(
        description='Remove Animation',
        layout=widgets.Layout(width='44.75%')
    )
    clear_button.style.button_color = "red"
    clear_button.on_click(clear)

    return widgets.VBox([
        widgets.HTML("<h2>Animations</h2>"),
        an_widgets,
        widgets.HBox([add_button, clear_button])
    ])


def create_start_button(animation_tabs, video_widgets, deforum, animation_display):
    """Create the "Start Rendering!" button and wire it to the renderer.

    Args:
        animation_tabs: Container whose ``children[1]`` holds one panel per
            animation; each panel's children are read in the order prompt,
            negative prompt, duration, acceleration, animation.
        video_widgets: Container whose ``children[1:-1]`` are the width,
            height and fps widgets and whose last child is the output path.
        deforum: Callable that returns an iterable of per-frame dicts (each
            with an ``"image"`` entry) and supports ``len()`` for the total.
        animation_display: Context manager that captures the live preview.

    Returns:
        The configured ``widgets.Button``.
    """
    def render_deforum(animation, animation_display, output_path, fps=24):
        """Consume the frame generator, show progress, then encode the video."""
        frames = []
        total_frames = len(deforum)
        with animation_display:
            start_time = datetime.datetime.now()
            progress = widgets.IntProgress(value=0, min=0, max=total_frames)
            for index, item in enumerate(animation):
                image = item.pop("image", None)
                frames.append(image)
                progress.value = index + 1
                display.clear_output(wait=True)
                display.display(image, progress)

                elapsed_time = datetime.datetime.now() - start_time
                # Project the total from the un-truncated elapsed time;
                # truncating first made the estimate 0 for sub-second frames.
                estimated_time = elapsed_time / (index + 1) * total_frames
                elapsed_time -= datetime.timedelta(microseconds=elapsed_time.microseconds)
                estimated_time -= datetime.timedelta(microseconds=estimated_time.microseconds)

                print(f"estimated_time: {elapsed_time}/{estimated_time}")
                for key, value in item.items():
                    print(f"{key}: {value}")

            progress.style.bar_color = 'green'

        # Fall back to the default file name unless an .mp4 path was given.
        if not (output_path and output_path.endswith(".mp4")):
            output_path = "video.mp4"
        # Encode at the frame rate the animation was rendered for.
        frames2video(frames, output_path, fps=fps)

    def parse_args(_):
        """Read the widget values, start the render and encode the result."""
        panels = animation_tabs.children[1].children
        prompts = []
        negative_prompts = []
        durations = []
        animations = []
        accelerations = []

        for child in panels:
            prompt, negative_prompt, duration, acceleration, animation = [x.value for x in child.children]
            # Tabs without a prompt are ignored.
            if prompt:
                prompts.append(prompt)
                negative_prompts.append(negative_prompt)
                durations.append(duration)
                accelerations.append(acceleration)
                animations.append(animation)

        if not prompts:
            with animation_display:
                print("Nothing to render: enter at least one prompt.")
            return

        width, height, fps = [int(x.value) for x in video_widgets.children[1:-1]]
        output_path = video_widgets.children[-1].value
        animation = deforum(
            prompts=prompts,
            negative_prompts=negative_prompts,
            animations=animations,
            prompt_durations=durations,
            accelerations=accelerations,
            H=height,
            W=width,
            fps=fps,
            sampler="euler"
        )
        animation = tqdm(animation, total=len(deforum))
        render_deforum(animation, animation_display, output_path, fps=fps)

    button = widgets.Button(
        description='Start Rendering!',
        layout=widgets.Layout(width='90%')
    )
    button.on_click(parse_args)
    return button

In [None]:
# define instance of Deforum
# define instance of Deforum
deforum = DeforumKandinsky(
    prior=prior,
    decoder_img2img=decoder,
    # Reuse the device chosen in the model-loading cell instead of repeating it.
    device=device
)

# Assemble the GUI: settings panel, animation tabs, a preview area and the
# button that starts rendering.
video_widgets = create_video_settings()
animation_tabs = create_animation_tabs()
animation_display = widgets.Output()
start_button = create_start_button(animation_tabs, video_widgets, deforum, animation_display)
display.display(animation_display, video_widgets, animation_tabs, start_button)

In [None]:
# Play the GUI result. The path is hard-coded to the GUI's default output
# name; change it here if a different "output path" was entered above.
display.Video(url="video.mp4")

## Generate Animation Using Wrapper

In [None]:
import os

# define instance of Deforum
deforum = DeforumKandinsky(
    prior=prior,
    decoder_img2img=decoder,
    # Reuse the device chosen in the model-loading cell instead of repeating it.
    device=device
)


prompts=[
    "a painting of a tiger with clouds in the background, a detailed painting, by Dan Mumford, unsplash, psychedelic art, a painting of a cat, iridescent smoke, casey weldon, fractal cloud, dark rainbow nimbus, colorfull sky, dreaming of electric sheep, style of tim hildebrandt, night time dark with neon colors, fenrir, swirling clouds",
]

# Defined once so the renderer and the encoder always agree on the frame rate,
# and the file that is written is the one that is played back.
fps = 24
output_path = "output_2_2.mp4"

animation = deforum(
    prompts=prompts,
    animations = ["left"]*len(prompts),
    prompt_durations=[4]*len(prompts),
    accelerations = [1]*len(prompts),
    H=640,
    W=640,
    fps=fps,
    save_samples=False,
    linear_transition=True,
    diffusion_cadence="2",
    strength_schedule="0:(0.1)",
    sampler="dpmpp_2m",
    prior_seed = 2,
    seed = 1,
)

frames = []

out = widgets.Output()
pbar = tqdm(animation, total=len(deforum))
display.display(out)

with out:
    for item in pbar:
        frame = item.pop('image', None)
        frames.append(frame)
        display.clear_output(wait=True)
        display.display(frame)
        for key, value in item.items():
            # Skip bulky array/image payloads; print only scalar-like metadata.
            if not isinstance(value, (np.ndarray, torch.Tensor, Image.Image)):
                print(f"{key}: {value}")


display.clear_output(wait=True)
frames2video(frames, output_path, fps=fps)
display.Video(url=output_path)

## Create per-frame Animations

In [None]:
def DeforumAnimArgs() -> dict:
    """Return the default animation settings as a dict.

    The result is ``locals()``, so every local variable assigned in this body
    becomes a key of the returned dict under its own name. Do not introduce
    helper variables here: they would leak into the settings.

    The ``#@markdown`` / ``#@param`` trailers follow the Colab form-annotation
    comment syntax and are ignored by Python itself.
    """

    #@markdown ####**Animation:**
    animation_mode = "3D" #@param ['None', '2D', '3D', 'Video Input', 'Interpolation'] {type:'string'}
    max_frames = 1 #@param {type:"number"}
    border = 'replicate' #@param ['wrap', 'replicate'] {type:'string'}

    #@markdown ####**Motion Parameters:**
    # Schedule values are strings of the form "frame:(value)".
    angle = "0:(0)"#@param {type:"string"}
    zoom = "0:(1.00)"#@param {type:"string"}
    translation_x = "0:(0)"#@param {type:"string"}
    translation_y = "0:(0)"#@param {type:"string"}
    translation_z = "0:(0)"#@param {type:"string"}
    rotation_3d_x = "0:(0)"#@param {type:"string"}
    rotation_3d_y = "0:(0)"#@param {type:"string"}
    rotation_3d_z = "0:(0)"#@param {type:"string"}
    flip_2d_perspective = True #@param {type:"boolean"}
    perspective_flip_theta = "0:(0)"#@param {type:"string"}
    perspective_flip_phi = "0:(0)"#@param {type:"string"}
    perspective_flip_gamma = "0:(0)"#@param {type:"string"}
    perspective_flip_fv = "0:(56)"#@param {type:"string"}
    noise_schedule = "0: (0.00)"#@param {type:"string"}
    strength_schedule = "0: (0.2)"#@param {type:"string"}
    contrast_schedule = "0: (1.0)"#@param {type:"string"}
    hybrid_comp_alpha_schedule = "0:(1)" #@param {type:"string"}
    hybrid_comp_mask_blend_alpha_schedule = "0:(0.5)" #@param {type:"string"}
    hybrid_comp_mask_contrast_schedule = "0:(1)" #@param {type:"string"}
    hybrid_comp_mask_auto_contrast_cutoff_high_schedule =  "0:(100)" #@param {type:"string"}
    hybrid_comp_mask_auto_contrast_cutoff_low_schedule =  "0:(0)" #@param {type:"string"}

    #@markdown ####**Sampler Scheduling:**
    enable_schedule_samplers = False #@param {type:"boolean"}
    sampler_schedule = "0:('euler'),10:('dpm2'),20:('dpm2_ancestral'),30:('heun'),40:('euler'),50:('euler_ancestral'),60:('dpm_fast'),70:('dpm_adaptive'),80:('dpmpp_2s_a'),90:('dpmpp_2m')" #@param {type:"string"}

    #@markdown ####**Unsharp mask (anti-blur) Parameters:**
    kernel_schedule = "0: (5)"#@param {type:"string"}
    sigma_schedule = "0: (1.0)"#@param {type:"string"}
    amount_schedule = "0: (0.2)"#@param {type:"string"}
    threshold_schedule = "0: (0.0)"#@param {type:"string"}

    #@markdown ####**Coherence:**
    color_coherence = 'None' #@param ['None', 'Match Frame 0 HSV', 'Match Frame 0 LAB', 'Match Frame 0 RGB', 'Video Input'] {type:'string'}
    color_coherence_video_every_N_frames = 1 #@param {type:"integer"}
    color_force_grayscale = False #@param {type:"boolean"}
    diffusion_cadence = '2' #@param ['1','2','3','4','5','6','7','8'] {type:'string'}

    #@markdown ####**3D Depth Warping:**
    use_depth_warping = True #@param {type:"boolean"}
    midas_weight = 0.3#@param {type:"number"}
    near_plane = 200
    far_plane = 10000
    fov = 40#@param {type:"number"}
    padding_mode = 'border'#@param ['border', 'reflection', 'zeros'] {type:'string'}
    sampling_mode = 'bicubic'#@param ['bicubic', 'bilinear', 'nearest'] {type:'string'}
    save_depth_maps = False #@param {type:"boolean"}

    #@markdown ####**Video Input:**
    video_init_path =None#@param {type:"string"}
    extract_nth_frame = 1#@param {type:"number"}
    overwrite_extracted_frames = True #@param {type:"boolean"}
    use_mask_video = False #@param {type:"boolean"}
    # NOTE(review): absolute Colab-style path; only relevant when use_mask_video is enabled - confirm.
    video_mask_path ='/content/video_in.mp4'#@param {type:"string"}

    #@markdown ####**Hybrid Video for 2D/3D Animation Mode:**
    hybrid_generate_inputframes = False #@param {type:"boolean"}
    hybrid_use_first_frame_as_init_image = True #@param {type:"boolean"}
    hybrid_motion = "None" #@param ['None','Optical Flow','Perspective','Affine']
    hybrid_motion_use_prev_img = False #@param {type:"boolean"}
    hybrid_flow_method = "DIS Medium" #@param ['DenseRLOF','DIS Medium','Farneback','SF']
    hybrid_composite = False #@param {type:"boolean"}
    hybrid_comp_mask_type = "None" #@param ['None', 'Depth', 'Video Depth', 'Blend', 'Difference']
    hybrid_comp_mask_inverse = False #@param {type:"boolean"}
    hybrid_comp_mask_equalize = "None" #@param  ['None','Before','After','Both']
    hybrid_comp_mask_auto_contrast = False #@param {type:"boolean"}
    hybrid_comp_save_extra_frames = False #@param {type:"boolean"}
    hybrid_use_video_as_mse_image = False #@param {type:"boolean"}

    #@markdown ####**Interpolation:**
    interpolate_key_frames = False #@param {type:"boolean"}
    interpolate_x_frames = 20 #@param {type:"number"}
    
    #@markdown ####**Resume Animation:**
    resume_from_timestring = False #@param {type:"boolean"}
    resume_timestring = "20220829210106" #@param {type:"string"}

    # Every local assigned above is exported as a setting.
    return locals()

In [None]:
def DeforumArgs() -> dict:
    """Return the default image/sampling settings as a dict.

    The result is ``locals()``, so every local variable assigned in this body
    becomes a key of the returned dict under its own name. Do not introduce
    helper variables here: they would leak into the settings.

    The ``#@markdown`` / ``#@param`` trailers follow the Colab form-annotation
    comment syntax and are ignored by Python itself.
    """
    #@markdown **Image Settings**
    W = 128 * 5#@param
    H = 128 * 5 #@param
    W, H = map(lambda x: x - x % 64, (W, H))  # round down to an integer multiple of 64
    bit_depth_output = 8 #@param [8, 16, 32] {type:"raw"}

    #@markdown **Sampling Settings**
    seed = -1 #@param
    prior_seed = 0
    
    sampler = 'euler_ancestral' #@param ["klms","dpm2","dpm2_ancestral","heun","euler","euler_ancestral","plms", "ddim", "dpm_fast", "dpm_adaptive", "dpmpp_2s_a", "dpmpp_2m"]
    steps = 100 #@param
    scale = 7 #@param
    ddim_eta = 0.0 #@param
    dynamic_threshold = None
    static_threshold = None   

    #@markdown **Save & Display Settings**
    save_samples = False #@param {type:"boolean"}
    save_settings = False #@param {type:"boolean"}
    save_sample_per_step = False #@param {type:"boolean"}
    
    verbose = True
    display_samples = True #@param {type:"boolean"}
    show_sample_per_step = True #@param {type:"boolean"}
    
    #@markdown **Batch Settings**
    n_batch = 1 #@param
    n_samples = 1 #@param
    filename_format = "{timestring}_{index}_{prompt}.png" #@param ["{timestring}_{index}_{seed}.png","{timestring}_{index}_{prompt}.png"]
    seed_behavior = "iter" #@param ["iter","fixed","random","ladder","alternate"]
    seed_iter_N = 1 #@param {type:'integer'}
    make_grid = False #@param {type:"boolean"}
    grid_rows = 2 #@param 
    outdir = "output"

    #@markdown **Init Settings**
    use_init = False #@param {type:"boolean"}
    strength = 0.2 #@param {type:"number"}
    strength_0_no_init = True # Set the strength to 0 automatically when no init image is used
    init_image = None
    # init_image = "https://cdn.pixabay.com/photo/2022/07/30/13/10/green-longhorn-beetle-7353749_1280.jpg" #@param {type:"string"}
    add_init_noise = False #@param {type:"boolean"}
    init_noise = 0.01 #@param
    # Whiter areas of the mask are areas that change more
    use_mask = False #@param {type:"boolean"}
    use_alpha_as_mask = False # use the alpha channel of the init image as the mask
    mask_file = "https://www.filterforge.com/wiki/images/archive/b/b7/20080927223728%21Polygonal_gradient_thumb.jpg" #@param {type:"string"}
    invert_mask = False #@param {type:"boolean"}
    # Adjust mask image, 1.0 is no adjustment. Should be positive numbers.
    mask_brightness_adjust = 1.0  #@param {type:"number"}
    mask_contrast_adjust = 1.0  #@param {type:"number"}
    # Overlay the masked image at the end of the generation so it does not get degraded by encoding and decoding
    overlay_mask = True  # {type:"boolean"}
    # Blur edges of final overlay mask, if used. Minimum = 0 (no blur)
    mask_overlay_blur = 5 # {type:"number"}

    #@markdown **Exposure/Contrast Conditional Settings**
    mean_scale = 0 #@param {type:"number"}
    var_scale = 0 #@param {type:"number"}
    exposure_scale = 0 #@param {type:"number"}
    exposure_target = 0.7 #@param {type:"number"}

    #@markdown **Color Match Conditional Settings**
    colormatch_scale = 0 #@param {type:"number"}
    colormatch_image = "https://www.saasdesign.io/wp-content/uploads/2021/02/palette-3-min-980x588.png" #@param {type:"string"}
    colormatch_n_colors = 4 #@param {type:"number"}
    ignore_sat_weight = 0 #@param {type:"number"}

    #@markdown **CLIP\Aesthetics Conditional Settings**
    clip_name = 'ViT-L/14' #@param ['ViT-L/14', 'ViT-L/14@336px', 'ViT-B/16', 'ViT-B/32']
    clip_scale = 0 #@param {type:"number"}
    aesthetics_scale = 0 #@param {type:"number"}
    cutn = 1 #@param {type:"number"}
    cut_pow = 0.0001 #@param {type:"number"}

    #@markdown **Other Conditional Settings**
    init_mse_scale = 0 #@param {type:"number"}
    init_mse_image = "https://cdn.pixabay.com/photo/2022/07/30/13/10/green-longhorn-beetle-7353749_1280.jpg" #@param {type:"string"}
    blue_scale = 0 #@param {type:"number"}
    
    #@markdown **Conditional Gradient Settings**
    gradient_wrt = 'x0_pred' #@param ["x", "x0_pred"]
    gradient_add_to = 'both' #@param ["cond", "uncond", "both"]
    decode_method = 'linear' #@param ["autoencoder","linear"]
    grad_threshold_type = 'dynamic' #@param ["dynamic", "static", "mean", "schedule"]
    clamp_grad_threshold = 0.2 #@param {type:"number"}
    clamp_start = 0.2 #@param
    clamp_stop = 0.01 #@param
    grad_inject_timing = list(range(1,10)) #@param

    #@markdown **Speed vs VRAM Settings**
    cond_uncond_sync = True #@param {type:"boolean"}
    precision = 'autocast' 
    C = 4
    f = 8

    # Placeholder entries with empty/None defaults; presumably populated by
    # the caller or the renderer at run time - TODO confirm.
    cond_prompt = ""
    cond_prompts = ""
    uncond_prompt = ""
    uncond_prompts = ""
    timestring = ""
    init_latent = None
    init_sample = None
    init_sample_raw = None
    mask_sample = None
    init_c = None
    seed_internal = 0

    # Every local assigned above is exported as a setting.
    return locals()

In [None]:
import gc, math, os, pathlib, subprocess, sys, time, random
from types import SimpleNamespace
from deforum_kandinsky import render_animation, render_image_batch, render_interpolation, render_input_video
from deforum_kandinsky.helpers.prompts import Prompts

# Keyframed prompts: the key is the frame index at which the prompt starts.
animation_prompts = {
    0: "a beautiful apple, trending on Artstation",
    24: "a beautiful banana, trending on Artstation",
    48: "a beautiful coconut, trending on Artstation",
    72: "a beautiful durian, trending on Artstation",
}
negative_prompts = {
    0: "bad image, cropped image"
}
cond, uncond = Prompts(prompt=animation_prompts, neg_prompt=negative_prompts).as_dict()
args = SimpleNamespace(**DeforumArgs())
anim_args = SimpleNamespace(**DeforumAnimArgs())
anim_args.max_frames = 96+1

# Per-run overrides. They are applied BEFORE the normalisation below because
# that normalisation branches on `animation_mode`; setting the mode afterwards
# left `args.use_init` unset for 'Video Input'.
anim_args.video_init_path = "video.mp4"
anim_args.animation_mode = 'Video Input'

args.timestring = time.strftime('%Y%m%d%H%M%S')
args.strength = max(0.0, min(1.0, args.strength))

# Normalise settings that depend on each other.
if args.seed == -1:
    args.seed = random.randint(0, 2**32 - 1)
if not args.use_init:
    args.init_image = None
if args.sampler == 'plms' and (args.use_init or anim_args.animation_mode != 'None'):
    print("Init images aren't supported with PLMS yet, switching to KLMS")
    args.sampler = 'klms'
if args.sampler != 'ddim':
    args.ddim_eta = 0

if anim_args.animation_mode == 'None':
    anim_args.max_frames = 1
elif anim_args.animation_mode == 'Video Input':
    args.use_init = True

# clean up unused memory
gc.collect()
torch.cuda.empty_cache()


# dispatch to appropriate renderer
if anim_args.animation_mode == '2D' or anim_args.animation_mode == '3D':
    animation = render_animation(deforum.root, anim_args, args, cond, uncond)
elif anim_args.animation_mode == 'Video Input':
    animation = render_input_video(deforum.root, anim_args, args, cond, uncond)
elif anim_args.animation_mode == 'Interpolation':
    animation = render_interpolation(deforum.root, anim_args, args, cond, uncond)
else:
    animation = render_image_batch(deforum.root, args, cond, uncond)

pbar = tqdm(animation, total=anim_args.max_frames-1)
frames = []
out =  widgets.Output()
display.display(out)
with out:
    for item in pbar:
        image = item.pop('image', None)
        frames.append(image)
        display.clear_output(wait=True)
        display.display(image)
        for key, value in item.items():
            # Same filter as the wrapper cell: skip bulky array/image payloads.
            if not isinstance(value, (np.ndarray, torch.Tensor, Image.Image)):
                print(f"{key}: {value}")