In [3]:
import torch

# Allow up to 200 characters per printed line so wide tensor rows
# (e.g. the similarity matrices below) are not wrapped.
PRINT_LINE_WIDTH = 200
torch.set_printoptions(linewidth=PRINT_LINE_WIDTH)

In [4]:
from PIL import Image
from glob import glob
import requests

from transformers import CLIPProcessor, CLIPModel

# The model and its processor are loaded from the same pretrained checkpoint.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

# Open "Picture 1.png" .. "Picture 22.png" from ../faces, in numeric order,
# so that list index k corresponds to picture number k + 1.
picture_paths = [f"../faces/Picture {number}.png" for number in range(1, 23)]
images = [Image.open(path) for path in picture_paths]


In [19]:
# Candidate captions; CLIP scores every caption against every picture.
texts = [
    "sad pikachu",
    "excited pikachu",
    "pikachu likes this",
    "surprised pikachu",
    "anxious pikachu",
    "pikachu in love",
    "scared pikachu",
    "tired pikachu",
    "confused pikachu",
    "blushing pikachu",
    "angry pikachu",
    "fainted pikachu",
]

# Tokenize the captions (padded to a common length) and preprocess the
# pictures into a single batch of PyTorch tensors.
inputs = processor(text=texts, images=images, return_tensors="pt", padding=True)

# Inference only: disable autograd so no graph is built or kept alive
# for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.keys())

# Caption-to-picture similarity scores: one row per caption, one column per picture.
logits_per_text = outputs.logits_per_text
# Softmax along dim=1 -> for each caption, a distribution over the pictures.
probs_per_text = logits_per_text.softmax(dim=1)

# Picture-to-caption similarity scores: one row per picture, one column per caption.
logits_per_image = outputs.logits_per_image
# Softmax along dim=1 -> for each picture, a distribution over the captions.
probs_per_image = logits_per_image.softmax(dim=1)

odict_keys(['logits_per_image', 'logits_per_text', 'text_embeds', 'image_embeds', 'text_model_output', 'vision_model_output'])


In [23]:
# Best caption for each picture: row-wise argmax over the caption axis,
# converted to plain Python ints so they can index `texts` directly.
max_idx_per_image = probs_per_image.argmax(dim=1).tolist()
matched_text = [texts[text_idx] for text_idx in max_idx_per_image]

# Best picture for each caption: row-wise argmax over the picture axis.
max_idx_per_text = probs_per_text.argmax(dim=1).tolist()

# Picture files are numbered from 1, hence the +1 on the zero-based index.
print(
    "\n".join(
        f"{caption} - Picture: {image_idx + 1}"
        for caption, image_idx in zip(texts, max_idx_per_text)
    )
)

sad pikachu - Picture: 21
excited pikachu - Picture: 16
pikachu likes this - Picture: 22
surprised pikachu - Picture: 2
anxious pikachu - Picture: 21
pikachu in love - Picture: 10
scared pikachu - Picture: 2
tired pikachu - Picture: 20
confused pikachu - Picture: 9
blushing pikachu - Picture: 16
angry pikachu - Picture: 2
fainted pikachu - Picture: 20


## Text to image and text to video

In [9]:
import torch

# Pick the device for the generation cells below. "mps" is still chosen
# whenever it is available; otherwise fall back to CUDA, then to the CPU,
# instead of failing on machines without Apple's Metal backend.
# NOTE(review): the text-to-video cell loads its weights as float16 -
# confirm that is acceptable if this ever resolves to "cpu".
if torch.backends.mps.is_available():
    DEVICE = "mps"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [10]:
from PIL import Image
import torch
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
from diffusers import UniPCMultistepScheduler

# --- Load the individual Stable Diffusion v1-4 components -------------------
vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae", use_safetensors=True)
tokenizer = CLIPTokenizer.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(
    "CompVis/stable-diffusion-v1-4", subfolder="text_encoder", use_safetensors=True
)
unet = UNet2DConditionModel.from_pretrained(
    "CompVis/stable-diffusion-v1-4", subfolder="unet", use_safetensors=True
)
scheduler = UniPCMultistepScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")

# The token ids and latents below are placed on DEVICE, so the modules that
# consume them must live on the same device.
vae.to(DEVICE)
text_encoder.to(DEVICE)
unet.to(DEVICE)

# --- Generation settings -----------------------------------------------------
prompt = ["a photograph of an astronaut riding a horse"]
height = 512  # default height of Stable Diffusion
width = 512  # default width of Stable Diffusion
num_inference_steps = 25  # Number of denoising steps
guidance_scale = 7.5  # Scale for classifier-free guidance
generator = torch.manual_seed(0)  # Seed generator to create the initial latent noise
batch_size = len(prompt)

# --- Text conditioning -------------------------------------------------------
text_input = tokenizer(
    prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt"
)
# The empty prompt is padded to the same length so both embeddings can be concatenated.
max_length = text_input.input_ids.shape[-1]
uncond_input = tokenizer([""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt")

# Both encoder passes are inference only, so neither needs autograd.
with torch.no_grad():
    text_embeddings = text_encoder(text_input.input_ids.to(DEVICE))[0]
    uncond_embeddings = text_encoder(uncond_input.input_ids.to(DEVICE))[0]

# Unconditional first, prompt second: the guidance step below relies on this
# order when it splits the prediction with chunk(2).
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

# --- Initial latent noise ----------------------------------------------------
# torch.manual_seed returns the default CPU generator, which cannot be used to
# sample directly on an accelerator; draw the noise on the CPU and move it.
latents = torch.randn(
    (batch_size, unet.config.in_channels, height // 8, width // 8),
    generator=generator,
).to(DEVICE)
latents = latents * scheduler.init_noise_sigma

# --- Denoising loop ----------------------------------------------------------
scheduler.set_timesteps(num_inference_steps)

for t in tqdm(scheduler.timesteps):
    # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
    latent_model_input = torch.cat([latents] * 2)

    latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)

    # predict the noise residual
    with torch.no_grad():
        noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

    # perform guidance
    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

    # compute the previous noisy sample x_t -> x_t-1
    latents = scheduler.step(noise_pred, t, latents).prev_sample

# --- Decode the latents into an image ---------------------------------------
# scale and decode the image latents with vae
latents = 1 / 0.18215 * latents
with torch.no_grad():
    decoded = vae.decode(latents).sample

# Map the decoder output from [-1, 1] to [0, 1], drop the batch dimension
# (batch_size is 1), and reorder channels-first -> channels-last for PIL.
decoded = (decoded / 2 + 0.5).clamp(0, 1).squeeze()
# Scale to 0-255 exactly once and round before the uint8 cast so values are
# not truncated. The result is kept out of the notebook-level `images` list,
# which the CLIP cells above still need.
image_array = (decoded.permute(1, 2, 0) * 255).round().to(torch.uint8).cpu().numpy()
image = Image.fromarray(image_array)
image

vae/config.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

text_encoder/config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

KeyboardInterrupt: 

#### Text to video (TextToVideoZero pipeline)

In [23]:
import torch
import imageio
from diffusers import TextToVideoZeroPipeline

# Checkpoint whose Stable Diffusion weights back the pipeline.
model_id = "dreamlike-art/dreamlike-photoreal-2.0"

# Build the TextToVideoZero pipeline with float16 weights and with the safety
# checker and feature extractor disabled, then move it to DEVICE.
pipe = TextToVideoZeroPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    safety_checker=None,
    feature_extractor=None,
)
pipe = pipe.to(DEVICE)


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

In [29]:
# Text prompt for the clip (an earlier attempt used "Pikachu smiling").
prompt = "pikachu's face, smiling, minimal"

# Run the pipeline at 512x512 and take the generated frames.
video_frames = pipe(prompt=prompt, height=512, width=512).images

# Scale each frame by 255 and cast to uint8 for the video writer
# (presumably the frames arrive as floats in [0, 1] - TODO confirm).
result = [(frame * 255).astype("uint8") for frame in video_frames]

# Write the frames out as an H.264 video at 4 frames per second.
imageio.mimsave("video.mp4", result, fps=4, codec='libx264')

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]