## Generate videos by prompting with ground-truth captions
Oracle baseline for BOLDMoments videos

In [1]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
import json
import shutil
import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
from diffusers.utils import export_to_video
import os
from utils import transform_vids_to_gifs, vid_to_gif, frames_to_vid

%load_ext autoreload
%autoreload 2




### Load captions 

In [2]:
# Load captions
# annots maps a video id string (e.g. '0001') to a metadata dict; the captions
# for a video are in annots[<id>]['text_descriptions'].
# The context manager closes the file handle as soon as parsing is done.
with open('data/annotations.json', 'r', encoding='utf-8') as f:
    annots = json.load(f)
print("annots['0001']:",annots['0001'])

annots['0001']: {'bmd_matrixfilename': 'vid_idx0001', 'MiT_url': 'https://data.csail.mit.edu/soundnet/actions3/wetting/0-0-1-6-7-2-8-0-17500167280.mp4', 'MiT_filename': 'wetting/0-0-1-6-7-2-8-0-17500167280.mp4', 'set': 'train', 'objects': ['red-breasted merganser', 'duck', 'American coot', 'goose', 'killer whale'], 'scenes': ['swimming hole', 'natural lake', 'watering hole', 'pond', 'ice floe'], 'actions': ['swimming', 'swimming', 'paddling', 'eating/feeding', 'swimming'], 'text_descriptions': ['A duck is swimming in a lake searching for food', 'A duck is floating atop the blue sparkly looking water.', 'A duck swims along in the water and pecks at the water.', 'A mallard is in the water alone swimming around and putting its beak in.', 'A duck swims in the daytime while pecking at the water.'], 'spoken_transcription': 'in a large mostly still body of water we see a duck swimming and pecking at the surface with his beak', 'memorability_score': 0.8147719988084737, 'memorability_decay': -0

### Functions to generate videos

In [None]:

def generate_videos_from_annots_with_gradio_api(annots, start_from=0):
    """Generate one video per annotation entry through the hosted Gradio Space.

    For each entry, the first caption in ``text_descriptions`` is sent as the
    prompt to the ``/zrscp`` endpoint. The returned file is moved to
    ``./oracle_gens/<id>_captionnumber<c>_<prompt>.mp4`` and a ``.gif`` with
    the same stem is produced next to it via ``vid_to_gif``.

    Args:
        annots: Mapping from a video id (a string convertible with ``int``)
            to a dict that holds a ``'text_descriptions'`` list of captions.
        start_from: Entries whose integer id is smaller than this value are
            skipped, which allows resuming an interrupted run.
    """
    from gradio_client import Client

    output_dir = './oracle_gens'
    # shutil.move raises if the destination folder does not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    client = Client("https://fffiloni-zeroscope--x84m2.hf.space/")
    for i, a in annots.items():
        if int(i) < start_from:
            continue
        for c, prompt in enumerate(a['text_descriptions']):
            print(prompt)
            # presumably returns the path of a locally downloaded video file,
            # since it is handed to shutil.move below — verify against the Space
            result = client.predict(
                            prompt,	# str in 'Prompt' Textbox component
                            api_name="/zrscp"
            )

            # A '/' inside the caption would be read as a path separator and
            # point into a non-existent sub-folder, so replace it.
            video_stem = f'{output_dir}/{i}_captionnumber{c}_{prompt.replace("/", "-")}'

            # Move video to correct folder
            shutil.move(result, f'{video_stem}.mp4')

            # Make gif
            vid_to_gif(f'{video_stem}.mp4', f'{video_stem}.gif')

            # Only the first caption of each entry is used.
            break

def generate_videos_from_annots_with_local(annots, start_from=0):
    """Generate one video per annotation entry with a locally stored pipeline.

    Loads the pipeline from ``../zeroscope_v2_576w`` in float16, moves it to
    ``cuda:0`` and, for each entry, prompts it with the first caption in
    ``text_descriptions``. Three artefacts are written per video:

    * ``./oracle_gens/frames/<video_name>/001.png`` ... (one PNG per frame)
    * ``./oracle_gens/mp4/<video_name>.mp4``
    * ``./oracle_gens/gif/<video_name>.gif``

    where ``<video_name>`` is ``<id>_captionnumber<c>_<prompt>`` with ``/`` and
    spaces in the prompt replaced by ``-``.

    Args:
        annots: Mapping from a video id (a string convertible with ``int``)
            to a dict that holds a ``'text_descriptions'`` list of captions.
        start_from: Entries whose integer id is smaller than this value are
            skipped, which allows resuming an interrupted run.
    """
    pipe = DiffusionPipeline.from_pretrained("../zeroscope_v2_576w", torch_dtype=torch.float16)
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
    # pipe.enable_model_cpu_offload()
    pipe.to("cuda:0")

    frames_root = './oracle_gens/frames'
    mp4_dir = './oracle_gens/mp4'
    gif_dir = './oracle_gens/gif'
    # frames_to_vid / vid_to_gif are defined elsewhere; create their target
    # folders here so the run does not depend on them already existing.
    os.makedirs(mp4_dir, exist_ok=True)
    os.makedirs(gif_dir, exist_ok=True)

    for i, a in annots.items():
        if int(i) < start_from:
            continue
        for c, prompt in enumerate(a['text_descriptions']):
            print("Generating video for prompt:", prompt)
            # NOTE(review): some diffusers releases return frames with a leading
            # batch dimension (requiring `.frames[0]`) — confirm against the
            # installed version.
            video_frames = pipe(prompt,
                                num_inference_steps=20,
                                height=320,
                                width=320,
                                num_frames=24).frames

            # '/' would act as a path separator; spaces are replaced as well
            # to keep the file names shell-friendly.
            video_name = f'{i}_captionnumber{c}_{prompt.replace("/","-").replace(" ", "-")}'

            # Save frames
            os.makedirs(f'{frames_root}/{video_name}', exist_ok=True)
            for k, frame in enumerate(video_frames):
                plt.imsave(f'{frames_root}/{video_name}/{(k+1):03d}.png', frame) # We save frames starting with index 1 to match original stimuli

            # Make video and save
            frames_to_vid(video_frames, f'{mp4_dir}/{video_name}.mp4', fps=8)

            # Make gif and save
            vid_to_gif(f'{mp4_dir}/{video_name}.mp4', f'{gif_dir}/{video_name}.gif')

            # Only the first caption of each entry is used.
            break


# Run the local pipeline over the loaded annotations; start_from=0 skips no
# entries (ids are compared as integers). Raise it to resume a partial run.
generate_videos_from_annots_with_local(annots, start_from=0)