In [13]:
import os
import av
import torch
import numpy as np
import torch.nn.functional as F
from transformers import AutoImageProcessor, TimesformerForVideoClassification
from collections import defaultdict
from tqdm.notebook import tqdm

# Load the pretrained TimeSformer video classifier (Kinetics-400 checkpoint).
model = TimesformerForVideoClassification.from_pretrained(
    "facebook/timesformer-base-finetuned-k400"
)
# The frame preprocessor is taken from the VideoMAE Kinetics checkpoint.
image_processor = AutoImageProcessor.from_pretrained(
    "MCG-NJU/videomae-base-finetuned-kinetics"
)
# Put the model on the GPU straight away.
model = model.to('cuda')

# Read the video list and group the file names by their integer class label.
# Each non-empty line is expected to look like "<file name> <label>".
video_labels = defaultdict(list)
with open("archive/kinetics400_val_list_videos.txt", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (e.g. trailing newlines) carry no entry; skip them
            # instead of failing on the tuple unpacking below.
            continue
        # Split on the last whitespace run only, so that a file name that
        # itself contains spaces is kept intact.
        name, label = line.rsplit(maxsplit=1)
        video_labels[int(label)].append(name)

def split_video_into_segments(container, n_segments=8, frames_per_segment=16):
    """Decode a video and cut it into fixed-size frame segments.

    All frames of the first video stream are decoded into images via
    ``frame.to_image()``. The frame list is divided into ``n_segments``
    equally long, consecutive chunks (leftover frames at the end are not
    used). From every chunk the first ``frames_per_segment`` frames are
    kept; a chunk that is shorter is padded by repeating its last frame.

    Videos with fewer frames than ``n_segments`` cannot be chunked that way,
    so each segment is then built from the single frame nearest to its
    position in the video, repeated ``frames_per_segment`` times.

    Args:
        container: Object exposing ``decode(video=0)`` that yields frames
            with a ``to_image()`` method (e.g. an opened PyAV container).
        n_segments: Number of segments to produce.
        frames_per_segment: Number of frames in every returned segment.

    Returns:
        A list of ``n_segments`` lists, each holding ``frames_per_segment``
        images.

    Raises:
        ValueError: If no frame could be decoded from the video.
    """
    frame_list = [frame.to_image() for frame in container.decode(video=0)]
    total_frames = len(frame_list)
    if total_frames == 0:
        raise ValueError("video contains no decodable frames")

    segment_length = total_frames // n_segments

    segments = []
    for i in range(n_segments):
        if segment_length > 0:
            start_index = i * segment_length
            segment_frames = frame_list[start_index:start_index + segment_length]
        else:
            # Fewer frames than segments: fall back to the frame closest to
            # this segment's relative position instead of an empty slice.
            segment_frames = [frame_list[i * total_frames // n_segments]]

        # Truncate long chunks, then pad short ones with their last frame.
        segment_frames = segment_frames[:frames_per_segment]
        padding = frames_per_segment - len(segment_frames)
        if padding > 0:
            segment_frames.extend([segment_frames[-1]] * padding)

        segments.append(segment_frames)

    return segments

def predict_video_and_segments(model, image_processor, container, true_label):
    """Classify a video segment by segment and average the segment results.

    The video is cut with ``split_video_into_segments``; each segment is
    preprocessed by ``image_processor`` and run through ``model``. The
    per-segment softmax probabilities are averaged to obtain the video-level
    prediction.

    Args:
        model: Classification model whose output exposes ``.logits``.
        image_processor: Callable turning a list of frames into a dict of
            tensors (called with ``return_tensors="pt"``).
        container: Opened video container handed to
            ``split_video_into_segments``.
        true_label: Class index whose averaged probability is reported.

    Returns:
        A tuple ``(video_pred_label, video_pred_score, video_true_score,
        segment_outputs)`` where ``segment_outputs`` is a list of
        ``(pred_label, pred_score, probabilities)`` per segment.
        ``probabilities`` is the softmax tensor of that segment, indexed as
        ``probabilities[0, class_index]`` (one clip per forward pass).
    """
    video_segments = split_video_into_segments(container)
    # Send the inputs to wherever the model's weights live rather than
    # assuming a GPU, so the function also works with a CPU-resident model.
    device = next(model.parameters()).device
    segment_outputs = []

    with torch.no_grad():
        # Score every segment independently.
        for segment in video_segments:
            inputs = image_processor(segment, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}
            logits = model(**inputs).logits
            probabilities = F.softmax(logits, dim=-1)
            pred_label = logits.argmax(-1).item()
            pred_score = probabilities[0, pred_label].item()
            segment_outputs.append((pred_label, pred_score, probabilities))

        # Video-level prediction: mean of the per-segment probabilities.
        video_probs = torch.stack([probs for _, _, probs in segment_outputs]).mean(dim=0)
        video_pred_label = video_probs.argmax(dim=-1).item()
        video_pred_score = video_probs[0, video_pred_label].item()
        video_true_score = video_probs[0, true_label].item()

    return video_pred_label, video_pred_score, video_true_score, segment_outputs

def process_videos(sampled_files, true_labels):
    """Run the segment-wise prediction on a list of video files.

    Each file name is resolved relative to ``archive/videos_val``, opened
    with PyAV and passed to ``predict_video_and_segments`` together with the
    module-level ``model`` and ``image_processor``.

    Args:
        sampled_files: Video file names, relative to ``archive/videos_val``.
        true_labels: Ground-truth class index for each file, in the same
            order as ``sampled_files``.

    Returns:
        A list with one tuple per processed video:
        ``(video_file, video_pred_label, video_pred_score, video_true_score,
        segment_outputs)``.
    """
    predictions = []
    for video_file, true_label in tqdm(zip(sampled_files, true_labels), desc="Processing videos", total=len(sampled_files), unit="video"):
        file_path = os.path.join("archive/videos_val", video_file)
        # Close the container as soon as the video has been processed so
        # file handles do not pile up over a long run.
        with av.open(file_path) as container:
            video_pred_label, video_pred_score, video_true_score, segment_outputs = predict_video_and_segments(model, image_processor, container, true_label)
        predictions.append((video_file, video_pred_label, video_pred_score, video_true_score, segment_outputs))

    return predictions

# Prepare the video sample: up to `num_samples_per_class` videos of the first
# class found in the list file.
num_samples_per_class = 5
sampled_files = list(video_labels.values())[0][:num_samples_per_class]
# One label per sampled file; the class may hold fewer videos than requested.
true_labels = [list(video_labels)[0]] * len(sampled_files)

# Run the predictions.
video_data = process_videos(sampled_files, true_labels)

# Print the overall result of every video followed by its per-segment results.
# Each result is paired with its own ground-truth label so the report stays
# correct if the sample ever spans more than one class.
for (video_file, video_pred_label, video_pred_score, video_true_score, segment_outputs), true_label in zip(video_data, true_labels):
    print(f"Video: {video_file}, Overall Predicted Label = {video_pred_label}, Overall Prediction Score = {video_pred_score:.4f}, True Label = {true_label}, True Label Score = {video_true_score:.4f}")
    for i, (segment_label, segment_score, probabilities) in enumerate(segment_outputs):
        # Probability this segment assigns to the video-level prediction and
        # to the ground-truth class, respectively.
        segment_video_score = probabilities[0, video_pred_label].item()
        segment_true_score = probabilities[0, true_label].item()
        print(f"  Segment {i+1}: Predicted Label = {segment_label}, Prediction Score = {segment_score:.4f}, Segment Video Label Score = {segment_video_score:.4f}, Segment True Label Score = {segment_true_score:.4f}")




Processing videos:   0%|          | 0/5 [00:00<?, ?video/s]

Video: jf7RDuUTrsQ.mp4, Overall Predicted Label = 1, Overall Prediction Score = 0.2028, True Label = 325, True Label Score = 0.0010
  Segment 1: Predicted Label = 81, Prediction Score = 0.4557, Segment Video Label Score = 0.0373, Segment True Label Score = 0.0000
  Segment 2: Predicted Label = 171, Prediction Score = 0.6322, Segment Video Label Score = 0.0035, Segment True Label Score = 0.0004
  Segment 3: Predicted Label = 1, Prediction Score = 0.5045, Segment Video Label Score = 0.5045, Segment True Label Score = 0.0000
  Segment 4: Predicted Label = 289, Prediction Score = 0.4299, Segment Video Label Score = 0.1295, Segment True Label Score = 0.0000
  Segment 5: Predicted Label = 1, Prediction Score = 0.4968, Segment Video Label Score = 0.4968, Segment True Label Score = 0.0054
  Segment 6: Predicted Label = 127, Prediction Score = 0.4740, Segment Video Label Score = 0.4092, Segment True Label Score = 0.0000
  Segment 7: Predicted Label = 57, Prediction Score = 0.3964, Segment Video