In [1]:
import os
import pandas as pd
import numpy as np
from google.colab import drive

# Mount Google Drive so the shared project folder is reachable from the Colab VM.
drive.mount('/content/gdrive')

# Workbook that drives the run: one sheet of prompts, one sheet of transitions.
code_folder = '/content/gdrive/MyDrive/AI Music Visuals Share/New Codes'
fp = os.path.join(code_folder, 'prompts_excel.xlsx')

# 'prompts' sheet: the first column becomes the index, so rows are addressed by prompt name.
df_prompt = pd.read_excel(fp, sheet_name='prompts', index_col=0)
# 'transitions' sheet: seeds are kept as strings so they can later be sliced for file names.
df_transitions = pd.read_excel(fp, sheet_name='transitions', dtype={'from_seed': str, 'to_seed': str})

# Keep only the transitions flagged with compute == 'y'. A boolean mask selects the
# rows directly; the previous where(...).dropna(how='all') first blanked every other
# row to NaN, which also upcasts numeric columns to float as a side effect.
df_transitions = df_transitions[df_transitions['compute'] == 'y']
df_transitions

Mounted at /content/gdrive


Unnamed: 0,from_name,from_seed,to_name,to_seed,compute,duration
16,fractal galaxies,5891113657437466,rainbow spiral wave2,1011141381797677,y,20.0
17,fractal galaxies,501321363070492,rainbow spiral wave2,1011141381797677,y,20.0


In [2]:
%%capture
# %%capture must stay on the first line of the cell; it hides the installer output.
# Install order matters here: diffusers first, then stable_diffusion_videos without its pins.
!pip install diffusers
!pip install transformers scipy ftfy accelerate

# --no-deps stops pip from swapping the diffusers build installed above for the older one
# that stable_diffusion_videos declares.
!pip install -U --no-deps stable_diffusion_videos # stable_diffusion_videos wants old version of diffusers which doesn't allow for text embeddings
# NOTE(review): presumably these are runtime requirements of stable_diffusion_videos that
# --no-deps skipped -- confirm against that package's requirements.
!pip install realesrgan av

# make_video_pyav is used by the transition loop to turn a folder of frames into an mp4.
from stable_diffusion_videos import make_video_pyav

In [3]:
import torch
from diffusers import StableDiffusionPipeline


# Load Stable Diffusion v1.4 with float16 weights and the safety checker disabled,
# then move the whole pipeline to the GPU in the same expression.
pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    torch_dtype=torch.float16,
    safety_checker=None,
).to("cuda")

Downloading (…)ain/model_index.json:   0%|          | 0.00/543 [00:00<?, ?B/s]

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

Downloading (…)_checker/config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/492M [00:00<?, ?B/s]

Downloading (…)nfig-checkpoint.json:   0%|          | 0.00/209 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Downloading (…)_encoder/config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

Downloading (…)tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading (…)cheduler_config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

Downloading (…)_pytorch_model.bin";:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

Downloading (…)b28/unet/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

Downloading (…)tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

Downloading (…)0b28/vae/config.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Downloading (…)_pytorch_model.bin";:   0%|          | 0.00/335M [00:00<?, ?B/s]



In [4]:
# Output image size in pixels.
width, height = 512, 512

# Latent-space size: one eighth of the pixel size along each axis.
latent_width, latent_height = width // 8, height // 8


def generate_latent(generator, seed, device='cuda'):
    """Draw the starting noise latent for a given seed.

    Args:
        generator: ``torch.Generator`` used for sampling. It is re-seeded in
            place, so its random state is changed by this call.
        seed: Seed value; anything ``int()`` accepts (e.g. a digit string).
        device: Device the latent tensor is created on. It should be the
            device the generator lives on.

    Returns:
        A tensor of shape ``(1, C, height // 8, width // 8)`` where ``C`` is
        the UNet's input channel count and ``height`` / ``width`` are the
        notebook-level image dimensions.
    """
    generator.manual_seed(int(seed))

    # Read the channel count from the model config: accessing it directly on
    # the UNet object (pipe.unet.in_channels) is deprecated in diffusers.
    latent_shape = (1, pipe.unet.config.in_channels, height // 8, width // 8)

    return torch.randn(latent_shape, generator=generator, device=device)

def generate_image(latent, prompt, guidance_scale, device='cuda'):
    """Render a single image from a text prompt and a starting latent.

    Args:
        latent: Initial latent tensor handed to the pipeline as ``latents``.
        prompt: Text prompt passed to the pipeline.
        guidance_scale: Guidance scale forwarded to the pipeline.
        device: Device type used for the autocast context.

    Returns:
        The first image of the pipeline result.
    """
    # Run the notebook-level pipeline under autocast for the requested device.
    with torch.autocast(device):
        result = pipe(prompt, guidance_scale=guidance_scale, latents=latent)

    # The result carries a list of images; only the first one is returned.
    return result.images[0]

def make_latent_steps(start_latent, stop_latent, steps):
    """Linearly interpolate from ``start_latent`` to ``stop_latent``.

    Works on any values that support subtraction, addition and multiplication
    by a float (tensors, arrays, plain floats).

    Args:
        start_latent: Value of the first step.
        stop_latent: Value of the last step.
        steps: Number of equal intervals between start and stop; must be >= 1.

    Returns:
        A list of ``steps + 1`` values. The first equals ``start_latent`` and
        the last equals ``stop_latent`` up to floating-point rounding.

    Raises:
        ValueError: If ``steps`` is smaller than 1 (the interval width would be
            undefined; previously this divided by zero).
    """
    if steps < 1:
        raise ValueError("steps must be at least 1, got {}".format(steps))

    span = stop_latent - start_latent

    # Scale the full span by i/steps instead of multiplying a pre-divided delta
    # by i: the rounding error of the division is then not amplified by i, so
    # the final step lands on the target much more closely (matters for fp16).
    return [start_latent + span * (i / steps) for i in range(steps + 1)]

def get_text_embed(prompt):
    """Encode a prompt with the pipeline's tokenizer and text encoder.

    The prompt is padded/truncated to the tokenizer's ``model_max_length`` so
    that embeddings of different prompts have the same shape and can be
    interpolated element-wise.

    Args:
        prompt: Text prompt to encode.

    Returns:
        The first element of the text encoder's output for the tokenized
        prompt, detached from autograd. The notebook passes this to the
        pipeline as ``prompt_embeds``.
    """
    text_input = pipe.tokenizer(
        prompt,
        padding="max_length",
        max_length=pipe.tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )

    # Follow the encoder's own device instead of assuming 'cuda'.
    input_ids = text_input.input_ids.to(pipe.text_encoder.device)

    # Inference only: without no_grad every embedding would keep an autograd
    # graph alive, wasting GPU memory for each prompt that is encoded.
    with torch.no_grad():
        embed = pipe.text_encoder(input_ids)[0]

    return embed


# Root folder on Drive that receives the rendered frames and videos.
code_folder = r'/content/gdrive/MyDrive/AI Music Visuals Share/New Codes'
output_basedir = os.path.join(code_folder, 'output_transitions')
# exist_ok=True creates the folder only when needed, without a separate
# exists() check that could race with another process creating it.
os.makedirs(output_basedir, exist_ok=True)

In [5]:
from IPython.display import clear_output

generator = torch.Generator(device="cuda")

max_seed_characters = 4 # Take the first few numbers of the seed for the name
num_interpolation_steps = 3  # number of intervals; num_interpolation_steps + 1 frames are rendered
num_inference_steps = 5      # denoising steps per frame

# Render one frame folder and one mp4 per transition row.
for transition_idx, row in df_transitions.iterrows():
  clear_output(wait=True)

  from_name, to_name = row['from_name'], row['to_name']

  # e.g. "<from_name>-<seed prefix> to <to_name>-<seed prefix>"
  output_name = "{}-{} to {}-{}".format(
      from_name,
      row['from_seed'][:max_seed_characters],
      to_name,
      row['to_seed'][:max_seed_characters]
      )

  output_folder = os.path.join(output_basedir, output_name)
  os.makedirs(output_folder, exist_ok=True)

  # Remove frames left over from an earlier run of this transition. The video
  # builder is handed the whole folder, so stale frames (e.g. from a run with
  # more steps or a different naming scheme) would otherwise be mixed in.
  for stale_name in os.listdir(output_folder):
    if stale_name.endswith('.png'):
      os.remove(os.path.join(output_folder, stale_name))

  # Prompt text and guidance scale are looked up by prompt name (df_prompt's index).
  prompts = [
      df_prompt.loc[from_name, 'prompt'],
      df_prompt.loc[to_name, 'prompt']
      ]

  guidance_scales = [
      float(df_prompt.loc[from_name, 'guidance_scale']),
      float(df_prompt.loc[to_name, 'guidance_scale'])
  ]

  seeds = [int(row['from_seed']), int(row['to_seed'])]

  duration = float(row['duration'])
  # NOTE(review): num_interpolation_steps + 1 frames are written, so at this rate the
  # clip lasts slightly longer than `duration`; fractional fps values may also not be
  # supported by the video writer -- confirm the intended behaviour.
  fps = num_interpolation_steps/duration

  print("fps: {}".format(fps))

  from_latent = generate_latent(generator, seeds[0])
  to_latent = generate_latent(generator, seeds[1])

  from_text_embed = get_text_embed(prompts[0])
  to_text_embed = get_text_embed(prompts[1])

  # Each interpolated sequence has num_interpolation_steps + 1 entries.
  latent_steps = make_latent_steps(from_latent, to_latent, num_interpolation_steps)
  embed_steps = make_latent_steps(from_text_embed, to_text_embed, num_interpolation_steps)
  guidance_steps = np.linspace(guidance_scales[0], guidance_scales[1], num_interpolation_steps + 1)

  # A dedicated frame index is used so the row index of the outer loop is not shadowed.
  frame_inputs = zip(embed_steps, guidance_steps, latent_steps)
  for frame_idx, (frame_embed, frame_guidance, frame_latent) in enumerate(frame_inputs):
    with torch.autocast('cuda'):
      images = pipe(
          prompt_embeds=frame_embed,
          guidance_scale=frame_guidance,
          latents=frame_latent,
          num_inference_steps=num_inference_steps
      )

    output_image = images.images[0]

    # Zero-padded names keep lexicographic order equal to frame order; with plain
    # "{}.png" names, "10.png" sorts before "2.png" once there are more than ten frames.
    # (make_video_pyav presumably reads the folder in sorted-name order -- TODO confirm.)
    output_image.save(os.path.join(output_folder, "{:06d}.png".format(frame_idx)))

  make_video_pyav(output_folder,
                  output_filepath=os.path.join(output_basedir, "{}.mp4".format(output_name)),
                  fps=fps
                  )

  

fps: 0.15


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
# with torch.autocast('cuda'):
#   images = pipe(
#       prompt_embeds=embed_steps[-1],
#       guidance_scale=guidance_scales[1],
#       latents = latent_steps[-1],
#   )

# images.images[0]