In [2]:
## Perception Encoder Demo

In [1]:
import torch, os
import decord
from core.vision_encoder.factory import create_model_and_transforms, get_tokenizer
from PIL import Image

In [2]:
# Name of the Perception Encoder configuration to load; the same name is used
# below to fetch the matching tokenizer.
model_name = 'PE-Core-G14-448'
# The factory returns three values; the middle one is not needed here and is
# discarded. `preprocess` is the transform applied to PIL images later on.
model, _, preprocess = create_model_and_transforms(model_name)
# Move the model to the GPU; every input tensor below is moved with .cuda() too.
model = model.cuda()
tokenizer = get_tokenizer(model_name)

{'embed_dim': 1280, 'quick_gelu': False, 'vision_cfg': {'image_size': 448, 'patch_size': 14, 'layers': 50, 'width': 1536, 'output_dim': 1280, 'head_width': 96, 'heads': 16, 'mlp_ratio': 5.833333334, 'global_layers': -1, 'relative_pos_embed_type': 'rope_2d', 'pos_embed_type': 'learnable', 'pool_type': 'attn', 'embed_cls_token': False}, 'text_cfg': {'context_length': 72, 'vocab_size': 49408, 'width': 1280, 'output_dim': 1280, 'heads': 20, 'layers': 24}}


  checkpoint = torch.load(checkpoint_path, map_location=map_location)


_IncompatibleKeys(missing_keys=[], unexpected_keys=['logit_scale'])
Submodule: visual
Number of parameters:
1.88 Billion
Submodule: transformer
Number of parameters:
0.47 Billion
Submodule: token_embedding
Number of parameters:
0.06 Billion
Submodule: ln_final
Number of parameters:
0.00 Billion


In [4]:
# Work from the PE app directory so the relative "./docs/..." paths used in this
# notebook resolve. Set the PE_APP_DIR environment variable to point at your own
# checkout; the fallback is the path the notebook was originally run from.
os.chdir(os.environ.get("PE_APP_DIR", '/home/berniehuang/git/perception_models_dev/apps/pe/'))
# Preprocess one image and add a leading batch dimension before moving it to the GPU.
image = preprocess(Image.open("./docs/cat.png")).unsqueeze(0).cuda()
# Candidate captions for zero-shot classification.
text = tokenizer(["a diagram", "a dog", "a cat"]).cuda()
# Inference only: no gradients, and let autocast choose reduced precision on CUDA.
with torch.no_grad(), torch.autocast("cuda"):
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    # L2-normalize both embeddings so the dot product is a cosine similarity.
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    # Scale the similarities by 100 and softmax over the captions.
    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)  # prints approximately: [[0.0, 0.0, 1.0]]

Label probs: tensor([[2.1108e-05, 2.3231e-04, 9.9975e-01]], device='cuda:0')


In [5]:
def preprocess_video(video_path, num_frames=8, transform=None):
    """
    Uniformly samples a specified number of frames from a video and preprocesses them.
    Parameters:
    - video_path: str, path to the video file.
    - num_frames: int, number of frames to sample (must be >= 1). Defaults to 8.
      If the video has fewer frames than requested, some frames are repeated.
    - transform: callable applied to each sampled frame (as a PIL image); it must
      return a tensor. Required, despite the None default kept for compatibility.
    Returns:
    - Video Tensor: the per-frame outputs of `transform` stacked along a new
      leading dimension, e.g. (num_frames, 3, H, W) for a (3, H, W) transform.
    Raises:
    - ValueError: if num_frames is not positive or the video has no frames.
    - TypeError: if transform is not callable.
    """
    # Validate arguments up front so failures are explicit instead of surfacing
    # later as an opaque error from torch.stack or the video reader.
    if num_frames < 1:
        raise ValueError(f"num_frames must be a positive integer, got {num_frames}")
    if not callable(transform):
        raise TypeError(
            "transform must be a callable that maps a PIL image to a tensor, "
            f"got {type(transform).__name__}"
        )
    # Load the video
    vr = decord.VideoReader(video_path)
    total_frames = len(vr)
    if total_frames == 0:
        raise ValueError(f"video contains no frames: {video_path}")
    # Uniformly sample frame indices. Integer arithmetic gives exactly
    # floor(i * total_frames / num_frames); the float form can round just below
    # an integer and then truncate to the previous frame.
    frame_indices = [(i * total_frames) // num_frames for i in range(num_frames)]
    frames = vr.get_batch(frame_indices).asnumpy()
    # Preprocess frames
    preprocessed_frames = [transform(Image.fromarray(frame)) for frame in frames]
    return torch.stack(preprocessed_frames, dim=0)

# Sample 8 frames, apply the image transform to each, then add a leading batch
# dimension and move the clip to the GPU.
video = preprocess_video("./docs/dog.mp4", 8, transform=preprocess).unsqueeze(0).cuda()
# Same candidate captions as in the image example.
text = tokenizer(["a diagram", "a dog", "a cat"]).cuda()
print(video.shape)

torch.Size([1, 8, 3, 448, 448])


In [6]:
# Inference only: no gradients, and let autocast choose reduced precision on CUDA.
with torch.no_grad(), torch.autocast("cuda"):
    # NOTE(review): encode_video2 is defined on the model, not in this notebook;
    # presumably it maps the frame batch to one embedding per clip — confirm.
    # The result is stored under the name `image_features` although it holds
    # the video embedding.
    image_features = model.encode_video2(video)
    text_features = model.encode_text(text)
    # L2-normalize both embeddings so the dot product is a cosine similarity.
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    # Scale the similarities by 100 and softmax over the captions.
    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print("Label probs:", text_probs)  # prints approximately: [[0.0, 1.0, 0.0]]

Label probs: tensor([[5.1739e-05, 9.9790e-01, 2.0506e-03]], device='cuda:0')
