In [3]:
import os
import av
import torch
import numpy as np
import torch.nn.functional as F
from transformers import AutoImageProcessor, TimesformerForVideoClassification
from collections import defaultdict
from tqdm.notebook import tqdm

# Load the image processor and the video classification model from their
# pretrained checkpoints.
image_processor = AutoImageProcessor.from_pretrained(
    "MCG-NJU/videomae-base-finetuned-kinetics"
)
model = TimesformerForVideoClassification.from_pretrained(
    "facebook/timesformer-base-finetuned-k400"
)

# Read the validation list (one "<video name> <label>" pair per line) and
# group the video names by their integer class label.
video_labels = defaultdict(list)
with open("archive/kinetics400_val_list_videos.txt", "r") as f:
    for line in f:
        # split() with no arguments already ignores leading/trailing
        # whitespace, so no separate strip() is needed.
        fields = line.split()
        if not fields:
            # A blank line (e.g. a trailing newline at EOF) holds no entry;
            # unpacking it into two names would raise ValueError.
            continue
        name, label = fields
        video_labels[int(label)].append(name)

def split_video_into_segments(container, n_segments=8, frames_per_segment=16):
    """Decode a video and cut it into equally sized lists of frames.

    Every frame of video stream 0 is decoded and converted with
    ``frame.to_image()``. The frames are divided into ``n_segments``
    contiguous chunks of ``total_frames // n_segments`` frames each (the last
    chunk also receives the remainder). Each chunk contributes its first
    ``frames_per_segment`` frames; a chunk that is too short is padded by
    repeating its last frame.

    When the video has fewer frames than ``n_segments`` the chunks would be
    empty, so instead segment ``i`` is built from the single frame at index
    ``i * total_frames // n_segments``, repeated ``frames_per_segment`` times.

    Args:
        container: Object whose ``decode(video=0)`` yields frames that expose
            ``to_image()`` (the caller in this file passes the result of
            ``av.open``).
        n_segments: Number of segments to produce.
        frames_per_segment: Number of frames in every returned segment.

    Returns:
        A list of ``n_segments`` lists, each holding ``frames_per_segment``
        converted frames.

    Raises:
        ValueError: If no frame could be decoded from the container.
    """
    # Decode the whole stream up front; segment boundaries depend on the
    # total frame count.
    frame_list = [frame.to_image() for frame in container.decode(video=0)]
    total_frames = len(frame_list)
    if total_frames == 0:
        # Without at least one frame there is nothing to pad segments with.
        raise ValueError("video stream contains no decodable frames")

    segment_length = total_frames // n_segments

    segments = []
    for i in range(n_segments):
        if segment_length == 0:
            # Fewer frames than segments: spread the available frames over
            # the segments instead of indexing into an empty slice.
            segment_frames = [frame_list[i * total_frames // n_segments]]
        else:
            start_index = i * segment_length
            if i == n_segments - 1:
                # The last segment absorbs the frames left over by the
                # integer division.
                end_index = total_frames
            else:
                end_index = start_index + segment_length
            segment_frames = frame_list[start_index:end_index]

        # Pad short segments by repeating their last frame.
        shortfall = frames_per_segment - len(segment_frames)
        if shortfall > 0:
            segment_frames.extend([segment_frames[-1]] * shortfall)

        # Keep only the leading frames so every segment has the same length.
        segments.append(segment_frames[:frames_per_segment])

    return segments

def predict_segment(model, image_processor, video_segment, true_label):
    """Classify one video segment and report the scores of interest.

    Args:
        model: Classification model; called with the processor output as
            keyword arguments and expected to return an object with a
            ``logits`` attribute. Must have at least one parameter, which is
            used to find the device the model lives on.
        image_processor: Callable that turns a list of frames into a mapping
            of tensors when called with ``return_tensors="pt"``.
        video_segment: Iterable of frames making up the segment.
        true_label: Class index whose probability should be reported.

    Returns:
        A tuple ``(predicted_label, prediction_score, true_label_score)``:
        the arg-max class index, its softmax probability, and the softmax
        probability assigned to ``true_label``.
    """
    inputs = image_processor(list(video_segment), return_tensors="pt")

    # Send the inputs to wherever the model actually is rather than
    # hard-coding 'cuda', so the function also works with a CPU model or on
    # a host without a GPU.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits
        probabilities = F.softmax(logits, dim=-1)
        # .item() requires a single prediction, i.e. a batch of one segment.
        predicted_label = logits.argmax(-1).item()
        prediction_score = probabilities[0, predicted_label].item()
        # Probability the model assigns to the ground-truth class.
        true_label_score = probabilities[0, true_label].item()

    return predicted_label, prediction_score, true_label_score


def predict_labels(sampled_files, true_labels):
    """Run segment-wise prediction for every sampled video.

    Uses the module-level ``model`` and ``image_processor``. The model is
    moved to the CUDA device before processing starts.

    Args:
        sampled_files: Video file names, resolved relative to
            ``archive/videos_val``.
        true_labels: Ground-truth class index for each entry of
            ``sampled_files``, in the same order.

    Returns:
        A list with one entry per video; each entry is the list of
        ``predict_segment`` results for that video's segments.
    """
    model.to('cuda')

    predictions = []
    progress = tqdm(
        zip(sampled_files, true_labels),
        desc="Processing videos",
        total=len(sampled_files),
        unit="video",
    )
    for video_file, true_label in progress:
        file_path = os.path.join("archive/videos_val", video_file)
        # Open the container in a with-block so it is closed for every video
        # instead of being leaked. All frames are decoded inside
        # split_video_into_segments, so the container is not needed afterwards.
        with av.open(file_path) as container:
            video_segments = split_video_into_segments(container)

        video_predictions = [
            predict_segment(model, image_processor, segment, true_label)
            for segment in video_segments
        ]
        predictions.append(video_predictions)

    return predictions


# Prepare the video sample. Only the first class found in video_labels is
# sampled here.
num_samples_per_class = 5  # Kept small while testing to speed things up.
first_label, first_class_files = next(iter(video_labels.items()))
sampled_files = first_class_files[:num_samples_per_class]
# One label per file actually sampled, so the two lists stay the same length
# even when the class holds fewer than num_samples_per_class videos.
true_labels = [first_label] * len(sampled_files)

# Run the model on the sampled videos.
predictions = predict_labels(sampled_files, true_labels)

# Print the prediction of every segment of every video.
for video_file, video_preds, true_label in zip(sampled_files, predictions, true_labels):
    print(f"Video: {video_file}, True Label: {true_label}")
    for i, (pred_label, pred_score, true_score) in enumerate(video_preds):
        print(f"  Segment {i+1}: Predicted Label = {pred_label}, Prediction Score = {pred_score:.4f}, True Label Score = {true_score:.4f}")




Processing videos:   0%|          | 0/5 [00:00<?, ?video/s]

Video: jf7RDuUTrsQ.mp4, True Label: 325
  Segment 1: Predicted Label = 81, Prediction Score = 0.4557, True Label Score = 0.0000
  Segment 2: Predicted Label = 171, Prediction Score = 0.6322, True Label Score = 0.0004
  Segment 3: Predicted Label = 1, Prediction Score = 0.5045, True Label Score = 0.0000
  Segment 4: Predicted Label = 289, Prediction Score = 0.4299, True Label Score = 0.0000
  Segment 5: Predicted Label = 1, Prediction Score = 0.4968, True Label Score = 0.0054
  Segment 6: Predicted Label = 127, Prediction Score = 0.4740, True Label Score = 0.0000
  Segment 7: Predicted Label = 57, Prediction Score = 0.3964, True Label Score = 0.0000
  Segment 8: Predicted Label = 81, Prediction Score = 0.4700, True Label Score = 0.0022
Video: EhRxb8-cNzQ.mp4, True Label: 325
  Segment 1: Predicted Label = 230, Prediction Score = 0.5502, True Label Score = 0.0000
  Segment 2: Predicted Label = 230, Prediction Score = 0.6382, True Label Score = 0.0000
  Segment 3: Predicted Label = 106, P