In [2]:
import os
import av
import torch
import numpy as np
from transformers import AutoImageProcessor, TimesformerForVideoClassification
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the image processor and the pretrained video-classification model.
# NOTE(review): the processor comes from a VideoMAE checkpoint while the model
# is a TimeSformer checkpoint -- confirm the preprocessing is the intended one.
image_processor = AutoImageProcessor.from_pretrained(
    "MCG-NJU/videomae-base-finetuned-kinetics"
)
model = TimesformerForVideoClassification.from_pretrained(
    "facebook/timesformer-base-finetuned-k400"
)

# Read the video list: each non-empty line is "<video file name> <integer label>".
video_labels = {}
with open("archive/kinetics400_val_list_videos.txt", "r") as f:
    for line in f:
        entry = line.strip()
        if not entry:
            # A blank (e.g. trailing) line has nothing to unpack; skip it
            # instead of failing with a ValueError.
            continue
        # Split off only the last field so the label is always the final token.
        name, label = entry.rsplit(maxsplit=1)
        video_labels[name] = int(label)

def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    """Pick ``clip_len`` frame positions from a segment of ``seg_len`` frames.

    A window of ``int(clip_len * frame_sample_rate)`` frames is placed at a
    random position (drawn from NumPy's global random state) and ``clip_len``
    evenly spaced positions are taken inside it. When the segment is not longer
    than the window, the whole segment is used instead, so positions may repeat.

    Args:
        clip_len: Number of frame positions to return.
        frame_sample_rate: Window length expressed as a multiple of ``clip_len``.
        seg_len: Number of frames available in the segment.

    Returns:
        A non-decreasing ``np.int64`` array of ``clip_len`` positions, each in
        ``[0, seg_len)``.

    Raises:
        ValueError: If ``seg_len`` is not positive (for example when the caller
            could not determine the frame count).
    """
    if seg_len <= 0:
        raise ValueError(f"seg_len must be positive, got {seg_len}")

    converted_len = int(clip_len * frame_sample_rate)
    if converted_len < seg_len:
        end_idx = np.random.randint(converted_len, seg_len)
        start_idx = end_idx - converted_len
    else:
        # Segment too short for a random window: np.random.randint would raise
        # "low >= high", so spread the samples over the whole segment.
        start_idx, end_idx = 0, seg_len

    indices = np.linspace(start_idx, end_idx, num=clip_len)
    # end_idx is exclusive, so clamp the last sample onto a valid frame.
    return np.clip(indices, start_idx, end_idx - 1).astype(np.int64)

def read_video_pyav(container, indices):
    """Decode the frames at the requested positions of the first video stream.

    Args:
        container: An opened container exposing ``seek`` and
            ``decode(video=0)``; decoding restarts from position 0.
        indices: Frame positions to keep. The first and last entries bound the
            range that is scanned, so the positions are expected in ascending
            order. A position that appears several times yields that many
            copies of the frame.

    Returns:
        The selected frames stacked along a new leading axis, each converted
        with ``to_ndarray(format="rgb24")``, in decode order.

    Raises:
        ValueError: If none of the requested positions could be decoded.
    """
    # Map each wanted position to how often it was requested: gives O(1)
    # membership tests and keeps one output frame per requested index.
    unique_idx, counts = np.unique(np.asarray(indices), return_counts=True)
    repeats = dict(zip(unique_idx.tolist(), counts.tolist()))

    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            # Everything requested has been passed; stop decoding early.
            break
        if i >= start_index and i in repeats:
            frames.extend([frame.to_ndarray(format="rgb24")] * repeats[i])

    if not frames:
        raise ValueError(
            f"no frames decoded for indices in [{start_index}, {end_index}]"
        )
    return np.stack(frames)

def predict_labels(start_index, end_index):
    """Classify a contiguous slice of the labelled videos.

    Video names are the keys of the module-level ``video_labels`` mapping,
    taken in sorted order; the positions ``start_index`` through ``end_index``
    (inclusive) are evaluated one video at a time with 16 sampled frames each.

    Args:
        start_index: Position of the first video in the sorted name list.
        end_index: Position of the last video in the sorted name list
            (inclusive).

    Returns:
        A ``(predicted_labels, true_labels)`` tuple of equally long lists: the
        arg-max class id predicted by ``model`` and the label stored in
        ``video_labels`` for each evaluated video.
    """
    predicted_labels = []
    true_labels = []
    video_files = sorted(video_labels)[start_index:end_index + 1]

    for video_file in video_files:
        file_path = os.path.join("archive/videos_val", video_file)
        # Close the container once the frames are decoded so file handles do
        # not pile up over a long evaluation run.
        with av.open(file_path) as container:
            indices = sample_frame_indices(
                clip_len=16,
                frame_sample_rate=1,
                seg_len=container.streams.video[0].frames,
            )
            video = read_video_pyav(container, indices)

        inputs = image_processor(list(video), return_tensors="pt")
        # Keep the inputs on whatever device the model lives on (no-op on CPU).
        inputs = inputs.to(model.device)

        with torch.no_grad():
            outputs = model(**inputs)
        predicted_labels.append(outputs.logits.argmax(-1).item())
        true_labels.append(video_labels[video_file])

    return predicted_labels, true_labels

# Get predicted and true labels for the videos at sorted positions 0-19 (inclusive)
predicted_labels_index, true_labels_index = predict_labels(0, 19)

# Compute macro-averaged precision, recall and F1.
# zero_division=0 scores a class with no predicted (or no true) samples as 0,
# which is the value the default already uses, but without emitting an
# undefined-metric warning for every call.
precision = precision_score(true_labels_index, predicted_labels_index, average='macro', zero_division=0)
recall = recall_score(true_labels_index, predicted_labels_index, average='macro', zero_division=0)
f1 = f1_score(true_labels_index, predicted_labels_index, average='macro', zero_division=0)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)




Precision: 0.375
Recall: 0.39285714285714285
F1 Score: 0.38095238095238093


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [3]:
import torch

# Report the PyTorch build and the CUDA devices visible to it.
print(torch.__version__)
print("Is CUDA available: ", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
print("Number of GPUs:", torch.cuda.device_count())
if torch.cuda.is_available():
    current_device = torch.cuda.current_device()
    print("Current CUDA Device:", current_device)
    print("Current CUDA Device name:", torch.cuda.get_device_name(current_device))
else:
    # Querying the current device raises when CUDA is unavailable, so report
    # its absence instead of crashing the cell.
    print("Current CUDA Device: none (CUDA is not available)")


2.3.0+cu121
Is CUDA available:  True
CUDA version: 12.1
Number of GPUs: 1
Current CUDA Device: 0
Current CUDA Device name: NVIDIA GeForce RTX 4090
