In [None]:
# Credit to Jonathan Whitaker

In [None]:
!pip install scikit-video

In [None]:
import torch
import numpy as np

# Ignore the warnings
from demo import load_model
import skvideo.io

from tqdm.notebook import tqdm
from IPython.display import display, HTML 
from base64 import b64encode

# Build the model once via the project's `demo` helper; the cells below render
# frames through its generator, `model.G`.
model = load_model()

In [None]:
# For inline output
def display_video(path):
    """Show an MP4 file inline in the notebook as an HTML5 video player.

    The whole file is read into memory and embedded in the cell output as a
    base64 ``data:`` URL, so the player keeps working without access to the
    file on disk (at the cost of a larger notebook output).

    Args:
        path: Path of the MP4 file to embed.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Context manager closes the handle deterministically; the bare
    # open(...).read() it replaces left that to the garbage collector.
    with open(path, 'rb') as video_file:
        mp4 = video_file.read()
    data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
    display(
      HTML(
      """
          <video width=512 controls>
                <source src="%s" type="video/mp4">
          </video>
      """ % data_url
           )
    )

# Latent walk

We can pick a few points in latent space and smoothly move between them. This gives quite a pleasing effect:

In [None]:
# Some parameters
n_points = 6 #@param
n_steps = 300 #@param
# One random latent vector per waypoint (256 is presumably the generator's latent size — TODO confirm)
latents = torch.randn(n_points, 256)

# Loop through generating the frames
frames = []
# Inference only: no_grad() stops autograd from recording a graph for every
# generated frame, which would otherwise waste memory and time.
with torch.no_grad():
  for i in tqdm(range(n_steps)):
    # Fractional position along the closed loop of waypoints, in [0, n_points)
    position = n_points * i / n_steps
    p1 = int(position)         # waypoint we are leaving
    p2 = (p1 + 1) % n_points   # waypoint we are heading to; wraps back to 0
    frac = position - p1       # progress from p1 towards p2, in [0, 1)
    # Linear interpolation between the two surrounding waypoints
    latent = latents[p1]*(1-frac) + latents[p2]*frac
    im = model.G(latent.unsqueeze(0)).clamp_(0., 1.)
    # Channels-first -> channels-last and [0, 1] floats -> 8-bit pixels
    # (assumes the generator returns a (1, C, H, W) image batch — TODO confirm)
    frame = (im[0].permute(1, 2, 0).cpu().numpy()*255).astype(np.uint8)
    frames.append(frame)
skvideo.io.vwrite("outputvideo.mp4", np.array(frames))

In [None]:
# Play the latent-walk clip written to outputvideo.mp4 inline in the notebook.
display_video("outputvideo.mp4")

In [None]:
!pip install librosa

# Audio-Reactive Interpolations

The next fun thing we can do is repeat the above, but use audio to shift our position in latent space. Here's my take on this; you could improve on it by, for example, filtering the audio into separate frequency bands.

In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
import librosa
import soundfile as sf
from scipy.signal import savgol_filter

# The driving audio file
audio_file = './sounds/bensound-cute.wav' #@param

# How many points in the base latent walk loop
n_points = 6 #@param

# Smooths the animation effect, smaller=jerkier, must be odd
filter_window_size=301 #@param 

# How much should we scale position based on music vs the base path?
chr_scale = 0.5 #@param
base_scale = 0.3 #@param

# Load the file
X, sample_rate = sf.read(audio_file, dtype='float32')

X= X[:int(len(X)*0.5)] # Only use the first half of the audio

# Remove percussive elements
harmonic = librosa.effects.harmonic(X[:,0])

# Get chroma_stft (power in different notes)
chroma = librosa.feature.chroma_stft(harmonic) # Just one channel

# Smooth these out
chroma = savgol_filter(chroma, filter_window_size, 3)

# Calculate how many frames we want
fps = 25
duration = X.shape[0] / sample_rate
print('Duration:', duration)
n_steps = int(fps * duration)
print('N frames:', n_steps, fps * duration)

latents = torch.randn(n_points, 256)*base_scale
chroma_latents = torch.randn(12, 256)*chr_scale

frames=[]
for i in tqdm(range(n_steps)):
  p1 = max(0, int(n_points*i/n_steps))
  p2 = min(n_points, int(n_points*i/n_steps)+1)%n_points # so it wraps back to 0
  frac = (i-(p1*(n_steps/n_points))) / (n_steps/n_points)
  l = latents[p1]*(1-frac) + latents[p2]*frac
  for c in range(12):
    scale_factor = chroma[c, int(i*chroma.shape[1]/n_steps)]
    l += chroma_latents[c]*chr_scale*scale_factor
  im = model.G(l.unsqueeze(0)).clamp_(0., 1.)
  frame=(im[0].permute(1, 2, 0).detach().cpu().numpy()*255).astype(np.uint8)
  frames.append(frame)

skvideo.io.vwrite("walk.mp4", np.array(frames))

# Merge in audio
!ffmpeg -y -v 0 -i walk.mp4 -i {audio_file} -c:v copy -c:a aac -shortest walk_with_music.mp4

# display
# display_video('walk_with_music.mp4')

Enjoy :)