Ref: https://github.com/X-PLUG/mPLUG-Owl/tree/main/mPLUG-Owl3

Run this in the `vid_env` environment. This model works with transformers versions 4.37.2 to 4.47.1.

In [None]:
import os 
import torch

In [None]:
# Sanity check: show which conda environment this kernel is running in.
print(os.environ.get("CONDA_DEFAULT_ENV"))

In [None]:
import warnings
# Silence every warning category for the rest of the session to keep the
# notebook output readable; remove this line when debugging library issues.
warnings.filterwarnings("ignore")

In [None]:
from transformers import AutoConfig, AutoModel
from transformers import AutoTokenizer, AutoProcessor

In [None]:
from PIL import Image
from decord import VideoReader, cpu

### Model Loading

In [None]:
# Model identifier handed to every `from_pretrained` call below.
model_path = "mPLUG/mPLUG-Owl3-7B-240728"

In [None]:
# Configuration object for the checkpoint (not referenced again in the cells
# visible here).
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

# Weights are loaded in half precision. `trust_remote_code=True` is passed so
# that transformers may use the modelling code shipped with the checkpoint.
model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

# The processor is built by the model itself from the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_path)
processor = model.init_processor(tokenizer)

In [None]:
# Put the model in evaluation mode and move it to CUDA. The result is bound to
# `_` so the notebook does not echo the returned object as cell output.
_ = model.eval().cuda()

In [None]:
# model

### Variable System Prompt

In [None]:
# System prompt for the answer-generation pass: restricts the model to visible
# evidence and fixes the exact sentence to use when no answer can be given.
generation_prompt = "\n".join([
    "You are a vision-language model answering questions about lecture videos. You are given:",
    "1. A question",
    "2. The video frames themselves",
    "",
    "Answer the question using ONLY the clearly visible visual evidence. "
    "If the answer cannot be determined, say exactly: "
    '"The answer cannot be determined from the video."',
])

In [None]:
# System prompt for the review pass: asks for a YES/NO grounding verdict
# followed by feedback, in a fixed two-section format.
feedback_prompt = "\n".join([
    "You are reviewing an answer generated by a vision-language model.",
    "",
    "Your task:",
    "1. Check whether the answer is STRICTLY grounded in the provided visible visual evidence.",
    "2. Do NOT use outside knowledge.",
    "3. Identify specific unsupported claims if present.",
    "",
    "Respond in the following format:",
    "",
    "[GROUNDING_CHECK]",
    "Grounded: YES or NO",
    "",
    "[FEEDBACK]",
    "If NO:",
    "- Explain precisely which parts are not supported by the evidence.",
    "If YES:",
    '- Say "The answer is fully grounded."',
])

In [None]:
# "You are a vision-language model analyzing lecture videos. "
# "You must rely strictly on visual evidence from the video frames.\n\n"
# "Follow these steps:\n"
# "1. Carefully examine the video frames.\n"
# "2. Perform OCR on any readable text appearing on slides, blackboards, or written material.\n"
# "3. Explicitly list the extracted text before reasoning.\n"
# "4. Answer the question using ONLY the extracted visual information.\n"
# "5. If the required information is not visible or readable, say "
# "\"The answer cannot be determined from the video.\" Do NOT guess.\n"

### Prepare Chat Messages

In [None]:
def create_chat_message(system_prompt, user_prompt):
    """Build the three-turn chat list passed to the processor.

    Args:
        system_prompt: Text placed in the "system" turn.
        user_prompt: Text of the "user" turn; it is prefixed with the
            `<|video|>` placeholder and a space.

    Returns:
        A list of three `{"role", "content"}` dicts in the order system,
        user, assistant. The assistant turn has empty content.
    """
    system_turn = dict(role="system", content=system_prompt)
    user_turn = dict(role="user", content="<|video|> {}".format(user_prompt))
    assistant_turn = dict(role="assistant", content="")
    return [system_turn, user_turn, assistant_turn]


### Hallucination-Inducing Questions for Random Testing:

From chemistry.mp4

question = 'What was the capacity of the beaker they used in the experiment?'

question = 'What was the melting point shown in the experiment?'

question = 'Did they use Potassium chlorate in the experiment?'

question = 'Did they use water in the whole experiment?'

question = 'What are the compounds being used here, list them'

------------------------------------------------------------------------------------------------------

From insertion_sort.mp4

question = 'Which sorting algorithm is being performed in the video?'

question = 'How many elements are present in the array shown?'

question = 'What is the runtime of the algorithm as explained by the professor?'

question = 'Write the elements present in the array the professor is explaining?'

------------------------------------------------------------------------------------------------------

From: 4min-video.mp4

question = 'Who are the target audience of this course as per the slides shown in the video?'

question = 'How many disciplines does electrical engineering overlap with, as shown in the slides in the video?'

question = 'The course is divided into how many modules?'

### Load Test Video

In [None]:
# Paths of the test videos; every entry is decoded by `encode_video` below.
videos = [
    "/home/aritrad/MSR-Project/random/insertion_sort.mp4",
]

In [None]:
# Upper bound on the number of frames sampled from a single video.
MAX_NUM_FRAMES = 32

In [None]:
def encode_video(video_path, max_num_frames=None):
    """Decode a video into a list of PIL images sampled about once per second.

    Frame indices are taken every `round(avg_fps)` frames. If that yields more
    indices than the frame cap, they are thinned to exactly the cap by taking
    the index at the midpoint of each of the cap's equal-width segments.

    Args:
        video_path: Path of the video file, passed to decord's `VideoReader`.
        max_num_frames: Optional cap on the number of returned frames.
            Defaults to the module-level `MAX_NUM_FRAMES`.

    Returns:
        A list of `PIL.Image` objects in temporal order.
    """
    frame_cap = MAX_NUM_FRAMES if max_num_frames is None else max_num_frames

    def uniform_sample(l, n):
        # Pick the element at the centre of each of n equal-width segments.
        gap = len(l) / n
        idxs = [int(i * gap + gap / 2) for i in range(n)]
        return [l[i] for i in idxs]

    vr = VideoReader(video_path, ctx=cpu(0))
    # round() gives 0 for streams averaging under 0.5 fps, and range() rejects
    # a zero step, so clamp the stride to at least one frame.
    sample_fps = max(1, round(vr.get_avg_fps()))
    frame_idx = list(range(0, len(vr), sample_fps))
    if len(frame_idx) > frame_cap:
        frame_idx = uniform_sample(frame_idx, frame_cap)
    frames = vr.get_batch(frame_idx).asnumpy()
    frames = [Image.fromarray(v.astype('uint8')) for v in frames]
    print('num frames:', len(frames))
    return frames

In [None]:
# One list of sampled frames per entry in `videos`.
video_frames = [encode_video(video_path) for video_path in videos]

In [None]:
# Preview one sampled frame of the first video as the cell output.
# NOTE(review): index 23 is hard-coded; this raises IndexError when fewer than
# 24 frames were sampled from the video.
video_frames[0][23]

### Generate Answer

In [None]:
# Question for the answer-generation pass.
question = 'What algorithm is the professor explaining in the video?'

# Wrap it with the generation system prompt, then let the processor combine
# the chat turns with the sampled video frames (no still images are passed).
message = create_chat_message(system_prompt=generation_prompt, user_prompt=question)
inputs = processor(message, videos=video_frames, images=None)

In [None]:
# Move the processed inputs to the GPU. The return value is discarded, so this
# relies on `.to()` updating `inputs` in place.
inputs.to('cuda')

# Add the generation settings so everything can be splatted into
# `model.generate(**inputs)`.
inputs.update(dict(tokenizer=tokenizer, max_new_tokens=512, decode_text=True))


In [None]:
%%time
# Run generation with the prepared inputs and print the first returned item
# (presumably the decoded answer text, since `decode_text` was set to True).
answer = model.generate(**inputs)
print(answer[0])

### Generate Feedback

In [None]:
# Feed the original question and the first generated answer back to the model,
# this time under the reviewing system prompt.
fb_question = f'Given Question: {question}, Generated Answer: {answer[0]}'
fb_message = create_chat_message(system_prompt=feedback_prompt, user_prompt=fb_question)

In [None]:
# Re-run the processor on the feedback chat with the same video frames; this
# replaces the `inputs` object used for the answer-generation pass.
inputs = processor(fb_message, videos=video_frames, images=None)

In [None]:
# Move the feedback inputs to the GPU. The return value is discarded, so this
# relies on `.to()` updating `inputs` in place.
inputs.to('cuda')

# Same generation settings as the answer pass.
inputs.update(dict(tokenizer=tokenizer, max_new_tokens=512, decode_text=True))

In [None]:
%%time
# Run the feedback pass and print the first returned item.
# NOTE(review): this rebinds `answer`, overwriting the answer from the first
# pass. Re-running the cell that builds `fb_question` afterwards would embed
# this feedback text instead of the original answer.
answer = model.generate(**inputs)
print(answer[0])