In [4]:
import os
import av
import torch
import numpy as np
from transformers import AutoImageProcessor, TimesformerForVideoClassification
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import defaultdict
import random
from tqdm.notebook import tqdm  # 导入 tqdm 的 notebook 版本

# 加载模型和图像处理器
model = TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400")
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")

# 读取视频列表和标签，并按类别组织
video_labels = defaultdict(list)
with open("archive/kinetics400_val_list_videos.txt", "r") as f:
    for line in f:
        name, label = line.strip().split()
        video_labels[int(label)].append(name)

def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    converted_len = int(clip_len * frame_sample_rate)
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    return np.clip(indices, start_idx, end_idx - 1).astype(np.int64)

def read_video_pyav(container, indices):
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame.to_ndarray(format="rgb24"))
    return np.stack(frames)

def balanced_sample_videos(num_samples_per_class, num_classes=None):
    sampled_files = []
    labels = []

    # 随机选择指定数量的类别
    available_classes = list(video_labels.keys())
    if num_classes is not None and num_classes < len(available_classes):
        chosen_classes = random.sample(available_classes, num_classes)
    else:
        chosen_classes = available_classes

    for label in chosen_classes:
        files = video_labels[label]
        if len(files) >= num_samples_per_class:
            sampled = random.sample(files, num_samples_per_class)
        else:
            sampled = files
        sampled_files.extend(sampled)
        labels.extend([label] * len(sampled))
    
    return sampled_files, labels

def predict_labels(sampled_files, true_labels):
    predicted_labels = []
    model.to('cuda')  # 确保模型在 GPU 上
    for video_file in tqdm(sampled_files, desc="Processing videos", unit="video"):
        file_path = os.path.join("archive/videos_val", video_file)
        container = av.open(file_path)
        indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
        video = read_video_pyav(container, indices)
        inputs = image_processor(list(video), return_tensors="pt")
        inputs = {k: v.to('cuda') for k, v in inputs.items()}  # 移动输入数据到 GPU

        with torch.no_grad():
            outputs = model(**inputs)
            predicted_label = outputs.logits.argmax(-1).item()
            predicted_labels.append(predicted_label)

    return predicted_labels, true_labels

# 指定每类样本数量和随机选择的类别数量
num_samples_per_class = 8  # 每类样本数
num_classes = 5  # 随机选择的类别数
sampled_files, true_labels = balanced_sample_videos(num_samples_per_class, num_classes)

# 使用采样结果进行预测
predicted_labels_index, true_labels_index = predict_labels(sampled_files, true_labels)

# 计算 precision, recall, 和 F1 分数
precision = precision_score(true_labels_index, predicted_labels_index, average='macro')
recall = recall_score(true_labels_index, predicted_labels_index, average='macro')
f1 = f1_score(true_labels_index, predicted_labels_index, average='macro')

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)




Processing videos:   0%|          | 0/40 [00:00<?, ?video/s]

Precision: 0.3333333333333333
Recall: 0.19166666666666668
F1 Score: 0.23367743367743368


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
