Ref: https://github.com/X-PLUG/mPLUG-Owl/tree/main/mPLUG-Owl3

In [None]:
import os 
import torch

In [None]:
# Show which conda environment the kernel runs in (prints None when the variable is unset).
print(os.environ.get("CONDA_DEFAULT_ENV"))

In [None]:
from transformers import AutoConfig, AutoModel
from transformers import AutoTokenizer, AutoProcessor

In [None]:
from PIL import Image
from decord import VideoReader, cpu

### Model Loading

In [None]:
# Model identifier passed to every from_pretrained() call below.
model_path = "mPLUG/mPLUG-Owl3-7B-240728"

In [None]:
# Load config and half-precision weights; trust_remote_code=True allows the
# model code shipped with the checkpoint to be executed.
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.float16,  # torch.half is an alias of torch.float16
    trust_remote_code=True,
    config=config,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# The processor is created by the model itself from the tokenizer.
processor = model.init_processor(tokenizer)

In [None]:
# Switch to inference mode, then move the weights to the GPU.
# The result is bound to `_` so the notebook does not echo the module repr.
model.eval()
_ = model.cuda()

### Prepare Chat Messages

In [None]:
# Single-turn chat: the user prompt carries one <|video|> placeholder, and the
# assistant turn is left empty for the model to fill in.
messages = [
    {"role": "user", "content": "<|video|> Describe this video."},
    {"role": "assistant", "content": ""},
]

### Load Test Video

In [None]:
# Paths of the videos to describe (one entry per video).
videos = [
    "/home/aritrad/MSR-Project/samples/4min-video.mp4",
]

In [None]:
# Upper bound on the number of frames kept per video by encode_video().
MAX_NUM_FRAMES = 16

In [None]:
def encode_video(video_path, max_num_frames=None):
    """Decode a video into a list of PIL images for the model.

    Frames are first picked at a stride of one second's worth of frames
    (the rounded average FPS). If that yields more than the frame cap, the
    candidates are thinned out to exactly that many, taken at evenly spaced
    positions.

    Args:
        video_path: Path of the video file, passed to decord's VideoReader.
        max_num_frames: Maximum number of frames to return. Must be
            positive. Defaults to the module-level MAX_NUM_FRAMES.

    Returns:
        A list of PIL.Image objects, one per selected frame.
    """
    if max_num_frames is None:
        max_num_frames = MAX_NUM_FRAMES

    def uniform_sample(l, n):
        # Take the element at the middle of each of n equal-width buckets.
        gap = len(l) / n
        idxs = [int(i * gap + gap / 2) for i in range(n)]
        return [l[i] for i in idxs]

    vr = VideoReader(video_path, ctx=cpu(0))
    # Stride of roughly one second. Clamp to 1: an average FPS below 0.5
    # rounds to 0, and range() raises ValueError for a zero step.
    sample_fps = max(1, round(vr.get_avg_fps()))
    frame_idx = list(range(0, len(vr), sample_fps))
    if len(frame_idx) > max_num_frames:
        frame_idx = uniform_sample(frame_idx, max_num_frames)
    frames = vr.get_batch(frame_idx).asnumpy()
    frames = [Image.fromarray(v.astype('uint8')) for v in frames]
    print('num frames:', len(frames))
    return frames

In [None]:
# Decode every video into its list of sampled frames, then build the model
# inputs; the prompt is video-only, so no still images are passed.
video_frames = [encode_video(video_path) for video_path in videos]
inputs = processor(messages, images=None, videos=video_frames)

In [None]:
# Move the processed inputs to the GPU. The return value is discarded, so this
# relies on .to() updating `inputs` in place.
inputs.to("cuda")
# Extra entries that are forwarded to model.generate() along with the inputs.
inputs.update(
    dict(
        tokenizer=tokenizer,
        max_new_tokens=256,
        decode_text=True,
    )
)

In [None]:
# Run generation with the prepared inputs (including the tokenizer,
# max_new_tokens and decode_text entries added above) and show the result.
# NOTE(review): with decode_text=True the output is presumably already decoded
# text rather than token ids — confirm against the model's generate().
g = model.generate(**inputs)
print(g)