In [None]:
import torch
import numpy as np
from decord import VideoReader, cpu
from PIL import Image
import torchvision.transforms as T
from transformers import AutoTokenizer, AutoModel
from torchvision.transforms.functional import InterpolationMode

# Basic image preprocessing configuration.
# Per-channel (R, G, B) mean and standard deviation handed to T.Normalize
# in build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Build the per-frame preprocessing pipeline.

    The returned transform converts a PIL image to RGB when needed, resizes
    it to ``input_size`` x ``input_size`` with bicubic interpolation, turns
    it into a tensor and normalizes it with ``IMAGENET_MEAN`` /
    ``IMAGENET_STD``.

    Args:
        input_size: Edge length, in pixels, of the square output image.

    Returns:
        A ``T.Compose`` object applying the steps above in order.
    """
    def _ensure_rgb(image):
        # Leave RGB images untouched; convert every other mode.
        if image.mode == 'RGB':
            return image
        return image.convert('RGB')

    target_size = (input_size, input_size)
    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize(target_size, interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

# Select the frame indices to sample from a video
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
    """Pick ``num_segments`` evenly spaced frame indices from a video.

    The sampled range is divided into ``num_segments`` equal segments and
    one frame near the middle of each segment is selected.

    Args:
        bound: Optional ``(start, end)`` pair in seconds restricting the
            sampled range. A falsy value (e.g. ``None``) samples the whole
            video.
        fps: Frames per second, used to convert seconds to frame indices.
        max_frame: Index of the last valid frame.
        first_idx: Lowest frame index that may be returned.
        num_segments: Number of indices to return.

    Returns:
        A 1-D integer ``numpy`` array of length ``num_segments`` whose
        entries all lie in ``[first_idx, max_frame]``.

    Raises:
        ValueError: If ``num_segments`` is smaller than 1 or if there is no
            valid frame (``max_frame < first_idx``).
    """
    if num_segments < 1:
        raise ValueError(f"num_segments must be >= 1, got {num_segments}")
    if max_frame < first_idx:
        raise ValueError(
            f"no frames to sample: max_frame={max_frame} < first_idx={first_idx}"
        )

    if bound:
        start, end = bound[0], bound[1]
    else:
        # Sentinels far outside any real video; clamped to the valid range below.
        start, end = -100000, 100000

    start_idx = max(first_idx, round(start * fps))
    end_idx = min(round(end * fps), max_frame)
    # A bound lying (partly) outside the video, or an inverted bound, would
    # otherwise give a negative segment size and out-of-range indices.
    # Collapse such ranges onto the nearest valid frame instead.
    end_idx = max(end_idx, first_idx)
    start_idx = min(start_idx, end_idx)

    seg_size = float(end_idx - start_idx) / num_segments
    offsets = np.round(seg_size * np.arange(num_segments))
    # astype(int) truncates toward zero, like int() on the non-negative values here.
    frame_indices = (start_idx + (seg_size / 2) + offsets).astype(int)
    # Final safety net against rounding at the edges of the range.
    return np.clip(frame_indices, start_idx, end_idx)

# Load a video and preprocess the sampled frames
def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
    """Decode evenly spaced frames from a video and preprocess them.

    Args:
        video_path: Path of the video file opened with ``VideoReader``.
        bound: Optional ``(start, end)`` pair in seconds, forwarded to
            ``get_index`` to restrict the sampled range.
        input_size: Edge length of the square frames produced by
            ``build_transform``.
        max_num: Unused by this implementation; kept in the signature so
            existing callers keep working.
        num_segments: Number of frames to sample.

    Returns:
        A tensor built with ``torch.stack`` from the transformed frames,
        one entry per sampled frame along dimension 0.

    Raises:
        ValueError: If the video contains no frames.
    """
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    if len(vr) == 0:
        raise ValueError(f"video has no frames: {video_path!r}")
    max_frame = len(vr) - 1
    fps = float(vr.get_avg_fps())

    transform = build_transform(input_size=input_size)
    frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
    frames = [
        transform(Image.fromarray(vr[frame_index].asnumpy()).convert('RGB'))
        for frame_index in frame_indices
    ]
    return torch.stack(frames)

# Load the model
# Hugging Face Hub repository id of the checkpoint.
path = "OpenGVLab/InternVL2_5-8B-MPO"
# NOTE(review): trust_remote_code=True runs the modeling code downloaded from
# the Hub repository; the download log of this cell recommends pinning a
# revision so that new versions of those files are not fetched silently.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,  # load the weights in bfloat16
    low_cpu_mem_usage=True,
    use_flash_attn=False,
    trust_remote_code=True
).eval().cuda()  # inference mode, then move the model to the GPU

# Tokenizer from the same repository (slow tokenizer implementation).
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# Process a video and run the model on it
def analyze_video(video_path, question, num_segments=8, generation_config=None):
    """Answer ``question`` about the video at ``video_path``.

    Frames are sampled with ``load_video``, each frame is announced to the
    model as ``FrameN: <image>`` and the question is appended after the
    frame list. The response is printed and returned.

    Args:
        video_path: Path of the video file.
        question: Text prompt appended after the per-frame placeholders.
        num_segments: Number of frames sampled from the video.
        generation_config: Optional dict of generation arguments handed to
            ``model.chat``. Defaults to greedy decoding with at most 512
            new tokens.

    Returns:
        The response string produced by ``model.chat``.
    """
    pixel_values = load_video(video_path, num_segments=num_segments)
    pixel_values = pixel_values.to(torch.bfloat16).cuda()
    num_frames = pixel_values.size(0)

    # Build the generation settings for the answer
    if generation_config is None:
        # Greedy decoding. top_k / top_p / temperature are only used when
        # do_sample=True, so they are not set here.
        generation_config = {"max_new_tokens": 512, "do_sample": False}
    else:
        # Work on a copy in case model.chat adds keys to the dict it receives.
        generation_config = dict(generation_config)

    video_prefix = ''.join(f'Frame{i+1}: <image>\n' for i in range(num_frames))
    query = video_prefix + question

    # load_video yields exactly one tile per frame. Without num_patches_list,
    # chat treats all frames as tiles of a single image and expands only the
    # first <image> placeholder, leaving the remaining ones as literal text.
    num_patches_list = [1] * num_frames

    response, _history = model.chat(
        tokenizer,
        pixel_values,
        query,
        generation_config,
        num_patches_list=num_patches_list,
        history=None,
        return_history=True,
    )
    print(response)
    return response

# Question to ask about the video
video_path = "/data/ephemeral/home/MovieclipsAkeelah and the Bee (59) Movie CLIP - Big Words Come From Little Words (2006) HD-_UZxXUwQX84.mp4"
question = (
    "Describe the current scene in detail, including the following information: "
    "1. Describe the people present in the scene, including their physical appearance, clothing, and expressions. "
    "2. Describe the actions or movements of the people, and what they are doing. "
    "3. Provide context for the situation: Where does the scene take place? What is happening overall? "
    "4. Mention any relevant objects, interactions, or environmental details that add meaning to the scene. "
    "Be as specific and descriptive as possible."
)

# analyze_video prints the response itself, so it is not printed a second
# time here (doing so emitted the answer twice).
response = analyze_video(video_path, question)


config.json:   0%|          | 0.00/4.05k [00:00<?, ?B/s]

configuration_internvl_chat.py:   0%|          | 0.00/3.94k [00:00<?, ?B/s]

configuration_intern_vit.py:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2_5-8B-MPO:
- configuration_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


configuration_internlm2.py:   0%|          | 0.00/7.00k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2_5-8B-MPO:
- configuration_internlm2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2_5-8B-MPO:
- configuration_internvl_chat.py
- configuration_intern_vit.py
- configuration_internlm2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_internvl_chat.py:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

modeling_intern_vit.py:   0%|          | 0.00/18.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2_5-8B-MPO:
- modeling_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_internlm2.py:   0%|          | 0.00/61.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2_5-8B-MPO:
- modeling_internlm2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


conversation.py:   0%|          | 0.00/15.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2_5-8B-MPO:
- conversation.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2_5-8B-MPO:
- modeling_internvl_chat.py
- modeling_intern_vit.py
- modeling_internlm2.py
- conversation.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/51.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.38G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.00k [00:00<?, ?B/s]

tokenization_internlm2.py:   0%|          | 0.00/8.79k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2_5-8B-MPO:
- tokenization_internlm2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer.model:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


The scene features a young girl with glasses and a yellow shirt paired with blue overalls. She has curly or wavy shoulder-length hair and her expression is serious. The man interacting with her is wearing glasses, a light blue vest, and a white shirt. He has facial hair and is wearing a watch. The woman's hands are folded, and she appears to be listening attentively. The man in the vest looks slightly concerned or inquisitive. They're in an office environment with a bookshelf, computer, and desk lamp, suggesting a formal interaction, possibly a meeting or a serious conversation. The presence of "MOVIECLIPS.COM" hints that this is a clip from a movie.
The scene features a young girl with glasses and a yellow shirt paired with blue overalls. She has curly or wavy shoulder-length hair and her expression is serious. The man interacting with her is wearing glasses, a light blue vest, and a white shirt. He has facial hair and is wearing a watch. The woman's hands are folded, and she appears 

: 