CPU Adversarial Pertubation

In [5]:
import os
import cv2
import numpy as np
from PIL import Image
from tqdm import tqdm
from skimage.util import random_noise
from skimage.filters import gaussian
from skimage.transform import rescale, AffineTransform, warp
from scipy.ndimage import zoom as scizoom

def add_gaussian_noise(image):
    noisy_image = random_noise(image, mode='gaussian', mean=0, var=0.01)
    return (noisy_image * 255).astype(np.uint8)

def add_motion_blur(image):
    size = 5  # Size of the kernel
    kernel_motion_blur = np.zeros((size, size))
    kernel_motion_blur[int((size-1)/2), :] = np.ones(size)
    kernel_motion_blur = kernel_motion_blur / size
    output = cv2.filter2D(image, -1, kernel_motion_blur)
    return output

def add_shot_noise(image):
    image = image / 255.0
    noisy_image = np.random.poisson(image * 255) / 255.0
    noisy_image = np.clip(noisy_image, 0, 1) * 255
    return noisy_image.astype(np.uint8)

def add_zoom_blur(image, zoom_factor=2):
    h, w = image.shape[:2]
    # Zoom in to the image and then crop to original size
    zoomed = scizoom(image, (zoom_factor, zoom_factor, 1))
    start_h = (zoomed.shape[0] - h) // 2
    start_w = (zoomed.shape[1] - w) // 2
    zoomed_cropped = zoomed[start_h:start_h+h, start_w:start_w+w]
    return zoomed_cropped.astype(np.uint8)

def process_videos(input_folder, output_folder, perturbation_type='gaussian_noise'):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    for dirname, _, filenames in os.walk(input_folder):
        for filename in tqdm(filenames):
            if filename.endswith('.mp4'):
                cap = cv2.VideoCapture(os.path.join(dirname, filename))
                
                # Get properties from input to use on output
                frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                fps = cap.get(cv2.CAP_PROP_FPS)
                
                fourcc = cv2.VideoWriter_fourcc(*'mp4v')
                out_path = os.path.join(output_folder, filename)
                out = cv2.VideoWriter(out_path, fourcc, fps, (frame_width, frame_height))
                
                while True:
                    ret, frame = cap.read()
                    if not ret:
                        break
                    
                    if perturbation_type == 'gaussian_noise':
                        frame = add_gaussian_noise(frame)
                    elif perturbation_type == 'motion_blur':
                        frame = add_motion_blur(frame)
                    elif perturbation_type == 'shot_noise':
                        frame = add_shot_noise(frame)
                    elif perturbation_type == 'zoom_blur':
                        frame = add_zoom_blur(frame)
                    
                    out.write(frame)
                
                cap.release()
                out.release()

# Example usage:
input_folder = 'archive/videos_val'
output_folder = 'archive/gaussian_noise_perturbated'
perturbation_type = 'gaussian_noise'  # Can be 'gaussian_noise', 'shot_noise', or 'motion_blur'

process_videos(input_folder, output_folder, perturbation_type)


  0%|          | 0/19796 [00:00<?, ?it/s]

  0%|          | 20/19796 [01:24<23:13:07,  4.23s/it]


KeyboardInterrupt: 

GPU Adversarial Pertubation

In [2]:
import os
import cv2
import numpy as np
from PIL import Image
import torch
import torchvision.transforms.functional as TF
from tqdm import tqdm
from skimage.util import random_noise
from skimage.filters import gaussian
from scipy.ndimage import zoom as scizoom

# 设置使用的设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 添加高斯噪声
def add_gaussian_noise(image, std=0.1):
    noise = torch.randn(image.size(), device=device) * std
    noisy_image = image + noise
    return noisy_image.clamp(0, 1)

# 添加射击噪声
def add_shot_noise(image, scale=50):
    noisy_image = torch.poisson(image * scale) / scale
    return noisy_image.clamp(0, 1)

# 添加运动模糊
def add_motion_blur(image, kernel_size=5):
    kernel_motion_blur = torch.zeros((3, 1, kernel_size, kernel_size), device=device)
    kernel_motion_blur[:, 0, kernel_size//2, :] = torch.ones(kernel_size)
    kernel_motion_blur /= kernel_size
    padded_image = torch.nn.functional.pad(image, (kernel_size//2, kernel_size//2, kernel_size//2, kernel_size//2), mode='replicate')
    image_blurred = torch.nn.functional.conv2d(padded_image.unsqueeze(0), kernel_motion_blur, groups=3)
    return image_blurred.squeeze(0)

# 添加缩放模糊
def add_zoom_blur(image, zoom_factor=1.1):
    h, w = image.shape[1:]
    image_zoomed = TF.resize(image, int(zoom_factor * max(h, w)))
    center_crop = TF.center_crop(image_zoomed, [h, w])
    return center_crop

# 视频处理函数
def process_videos(input_folder, output_folder, perturbation_type='gaussian_noise', param=0.1):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    for filename in tqdm(os.listdir(input_folder)):
        if filename.endswith('.mp4'):
            cap = cv2.VideoCapture(os.path.join(input_folder, filename))
            frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            fps = cap.get(cv2.CAP_PROP_FPS)
            
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out_path = os.path.join(output_folder, filename)
            out = cv2.VideoWriter(out_path, fourcc, fps, (frame_width, frame_height))
            
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                
                frame = TF.to_tensor(frame).to(device)
                
                if perturbation_type == 'gaussian_noise':
                    frame = add_gaussian_noise(frame, std=param)
                elif perturbation_type == 'shot_noise':
                    frame = add_shot_noise(frame, scale=param)
                elif perturbation_type == 'motion_blur':
                    frame = add_motion_blur(frame, kernel_size=int(param))
                elif perturbation_type == 'zoom_blur':
                    frame = add_zoom_blur(frame, zoom_factor=param)
                
                frame = TF.to_pil_image(frame.cpu())
                frame = np.array(frame)
                out.write(frame)
            
            cap.release()
            out.release()

# Example usage
input_folder = 'archive/videos_val'
output_folder = 'archive/zoom_blur'
perturbation_type = 'zoom_blur'  # Can be 'gaussian_noise', 'shot_noise', 'motion_blur', 'zoom_blur'
param = 1.1  # Example parameters: 0.1 for gaussian_noise, 3000 for shot_noise, 5 for motion_blur, 1.1 for zoom_blur

process_videos(input_folder, output_folder, perturbation_type, param)


  0%|          | 0/19796 [00:00<?, ?it/s]

100%|██████████| 19796/19796 [8:04:16<00:00,  1.47s/it]  


Select class Run 1 n_segments=8, frames_per_segment=16

In [1]:
import os
import av
import torch
import numpy as np
import torch.nn.functional as F
import random
import json
from transformers import AutoImageProcessor, TimesformerForVideoClassification
from collections import defaultdict
from tqdm.notebook import tqdm
from itertools import combinations

class VideoProcessor:
    def __init__(self, model_name, image_processor_name, device='cuda'):
        self.model = self.load_model(model_name)
        self.image_processor = AutoImageProcessor.from_pretrained(image_processor_name)
        self.device = device
        self.model.to(device)

    def load_model(self, model_name):
        if "timesformer" in model_name.lower():
            return TimesformerForVideoClassification.from_pretrained(model_name)
        else:
            raise ValueError(f"Unsupported model name: {model_name}")

    def split_video_into_segments(self, container, n_segments=8, frames_per_segment=16):
        frame_list = [frame.to_image() for frame in container.decode(video=0)]
        total_frames = len(frame_list)
        segment_length = total_frames // n_segments
        segments = []
        for i in range(n_segments):
            start = i * segment_length
            end = min(start + segment_length, total_frames)
            segment_frames = frame_list[start:end] if end - start == segment_length else frame_list[start:] + [frame_list[-1]] * (segment_length - (end - start))
            segments.append(segment_frames[:frames_per_segment])
        return segments

    def predict_video_and_segments(self, container, true_label):
        video_segments = self.split_video_into_segments(container)
        segment_outputs = []
        with torch.no_grad():
            for segment in video_segments:
                inputs = self.image_processor(list(segment), return_tensors="pt")
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                try:
                    outputs = self.model(**inputs)
                    logits = outputs.logits
                    probabilities = F.softmax(logits, dim=-1)
                    pred_label = logits.argmax(-1).item()
                    pred_score = probabilities[0, pred_label].item()
                    segment_outputs.append((pred_label, pred_score, probabilities))
                except RuntimeError as e:
                    print(f"Error processing segment: {e}")
                    continue
        return segment_outputs

class TemporalShap:
    def __init__(self, num_samples=100):
        self.num_samples = num_samples
    
    def approximate_shapley_values(self, segment_outputs, label_index):
        n = len(segment_outputs)
        shapley_values = [0] * n
        for _ in range(self.num_samples):
            random_subset = sorted(range(n), key=lambda _: random.random())
            subset_prob = torch.zeros_like(segment_outputs[0][2])
            for i, index in enumerate(random_subset):
                old_contribution = subset_prob[0, label_index].item()
                subset_prob += segment_outputs[index][2]
                subset_prob /= (i + 1)
                new_contribution = subset_prob[0, label_index].item()
                shapley_values[index] += new_contribution - old_contribution
        return [val / self.num_samples for val in shapley_values]

    def exact_shapley_values(self, segment_outputs, label_index):
        n = len(segment_outputs)
        shapley_values = [0] * n
        all_indices = list(range(n))
        for i in all_indices:
            marginal_contributions = []
            for subset_size in range(n):
                subsets = list(combinations([x for x in all_indices if x != i], subset_size))
                for subset in subsets:
                    subset_prob = torch.zeros_like(segment_outputs[0][2])
                    if subset:
                        subset_prob = torch.mean(torch.stack([segment_outputs[j][2] for j in subset]), dim=0)
                    with_i_prob = (subset_prob * len(subset) + segment_outputs[i][2]) / (len(subset) + 1)
                    marginal_contributions.append(with_i_prob[0, label_index].item() - subset_prob[0, label_index].item())
            shapley_values[i] = np.mean(marginal_contributions)
        return shapley_values

def process_videos(video_processor, shap_calculator, sampled_files, true_labels, use_exact=False):
    predictions = []
    for video_file, true_label in tqdm(zip(sampled_files, true_labels), desc="Processing videos", total=len(sampled_files), unit="video"):
        file_path = os.path.join(config["video_directory"], video_file)
        container = av.open(file_path)
        try:
            segment_outputs = video_processor.predict_video_and_segments(container, true_label)
            if not segment_outputs:
                print(f"Skipping video {video_file} due to empty segment outputs.")
                continue
            video_probs = torch.mean(torch.stack([output[2] for output in segment_outputs]), dim=0)
            video_pred_label = video_probs.argmax().item()
            video_pred_score = video_probs[0, video_pred_label].item()
            video_true_score = video_probs[0, true_label].item()
            
            if use_exact:
                sv_true_label = shap_calculator.exact_shapley_values(segment_outputs, true_label)
                sv_video_pred = shap_calculator.exact_shapley_values(segment_outputs, video_pred_label)
            else:
                sv_true_label = shap_calculator.approximate_shapley_values(segment_outputs, true_label)
                sv_video_pred = shap_calculator.approximate_shapley_values(segment_outputs, video_pred_label)
            
            predictions.append((video_file, video_pred_label, video_pred_score, video_true_score, true_label, segment_outputs, sv_true_label, sv_video_pred))
        except Exception as e:
            print(f"Error processing video {video_file}: {e}")
            continue
    return predictions

def save_results(predictions, filename="results.json"):
    results = []
    for video_file, video_pred_label, video_pred_score, video_true_score, video_true_label, segment_outputs, sv_true_label, sv_video_pred in predictions:
        video_result = {
            "video_file": video_file,
            "video_pred_label": video_pred_label,
            "video_pred_score": video_pred_score,
            "video_true_score": video_true_score,
            "video_true_label": video_true_label,
            "segments": []
        }
        for i, (segment_label, segment_score, probabilities) in enumerate(segment_outputs):
            segment_video_label_score = probabilities[0, video_pred_label].item()
            segment_true_label_score = probabilities[0, video_true_label].item()
            video_result["segments"].append({
                "segment_index": i + 1,
                "segment_label": segment_label,
                "segment_score": segment_score,
                "segment_video_label_score": segment_video_label_score,
                "segment_true_label_score": segment_true_label_score,
                "sv_true_label": sv_true_label[i],
                "sv_video_pred": sv_video_pred[i]
            })
        results.append(video_result)
    
    with open(filename, "w") as f:
        json.dump(results, f, indent=4)

# 配置
config = {
    "model_name": "facebook/timesformer-base-finetuned-k400",  # 用户可以在这里更换模型名称，例如 "huggingface/vivit"
    "image_processor_name": "MCG-NJU/videomae-base-finetuned-kinetics",
    "num_samples": 100,  # 近似 Shapley Value 计算的采样次数
    "num_classes": 10,    # 要测试的类的数量
    "num_samples_per_class": 3,  # 每个类的样本数量
    "video_list_path": "archive/kinetics400_val_list_videos.txt",
    "video_directory": "archive/videos_val",
    "use_exact": True  # 设置为 True 以使用精确 Shapley Value 计算
}

# 初始化处理器
video_processor = VideoProcessor(config["model_name"], config["image_processor_name"])
shap_calculator = TemporalShap(num_samples=config["num_samples"])

# 读取视频列表和标签，并按类别组织
video_labels = defaultdict(list)
with open(config["video_list_path"], "r") as f:
    for line in f:
        name, label = line.strip().split()
        video_labels[int(label)].append(name)

# 准备视频样本
sampled_files = []
true_labels = []
selected_classes = random.sample(list(video_labels.keys()), config["num_classes"])  # 随机选择指定数量的类别
for cls in selected_classes:
    sampled_files.extend(random.sample(video_labels[cls], config["num_samples_per_class"]))  # 从每个类别中随机选择指定数量的样本
    true_labels.extend([cls] * config["num_samples_per_class"])

# 获取数据和预测
video_data = process_videos(video_processor, shap_calculator, sampled_files, true_labels, use_exact=config["use_exact"])

# 打印并保存结果
save_results(video_data)

# 打印结果以便查看
for video_file, video_pred_label, video_pred_score, video_true_score, true_label, segment_outputs, sv_true_label, sv_video_pred in video_data:
    print(f"Video: {video_file}, Overall Predicted Label = {video_pred_label}, Overall Prediction Score = {video_pred_score:.4f}, True Label = {true_label}, True Label Score = {video_true_score:.4f}")
    for i, (segment_label, segment_score, probabilities) in enumerate(segment_outputs):
        segment_video_label_score = probabilities[0, video_pred_label].item()
        segment_true_label_score = probabilities[0, true_label].item()
        print(f"  Segment {i+1}: Predicted Label = {segment_label}, Prediction Score = {segment_score:.4f}, Segment Video Label Score = {segment_video_label_score:.4f}, Segment True Label Score = {segment_true_label_score:.4f}, SV True Label = {sv_true_label[i]:.4f}, SV Predicted Label = {sv_video_pred[i]:.4f}")




Processing videos:   0%|          | 0/30 [00:00<?, ?video/s]

  return torch.tensor(value)


Video: zSCSoHcKJg0.mp4, Overall Predicted Label = 362, Overall Prediction Score = 0.4941, True Label = 362, True Label Score = 0.4941
  Segment 1: Predicted Label = 135, Prediction Score = 0.6605, Segment Video Label Score = 0.0289, Segment True Label Score = 0.0289, SV True Label = -0.1280, SV Predicted Label = -0.1280
  Segment 2: Predicted Label = 135, Prediction Score = 0.6573, Segment Video Label Score = 0.0480, Segment True Label Score = 0.0480, SV True Label = -0.1226, SV Predicted Label = -0.1226
  Segment 3: Predicted Label = 135, Prediction Score = 0.6638, Segment Video Label Score = 0.0768, Segment True Label Score = 0.0768, SV True Label = -0.1145, SV Predicted Label = -0.1145
  Segment 4: Predicted Label = 135, Prediction Score = 0.7144, Segment Video Label Score = 0.1425, Segment True Label Score = 0.1425, SV True Label = -0.0958, SV Predicted Label = -0.0958
  Segment 5: Predicted Label = 362, Prediction Score = 0.8103, Segment Video Label Score = 0.8103, Segment True La

All dataset include

In [11]:
import os
import av
import torch
import numpy as np
import torch.nn.functional as F
import random
import json
from transformers import AutoImageProcessor, TimesformerForVideoClassification
from collections import defaultdict
from tqdm.notebook import tqdm
from itertools import combinations

class VideoProcessor:
    def __init__(self, model_name, image_processor_name, device='cuda'):
        self.model = self.load_model(model_name)
        self.image_processor = AutoImageProcessor.from_pretrained(image_processor_name)
        self.device = device
        self.model.to(device)

    def load_model(self, model_name):
        if "timesformer" in model_name.lower():
            return TimesformerForVideoClassification.from_pretrained(model_name)
        else:
            raise ValueError(f"Unsupported model name: {model_name}")

    def split_video_into_segments(self, container, n_segments=8, frames_per_segment=16):
        frame_list = [frame.to_image() for frame in container.decode(video=0)]
        total_frames = len(frame_list)
        segment_length = total_frames // n_segments
        segments = []
        for i in range(n_segments):
            start = i * segment_length
            end = min(start + segment_length, total_frames)
            segment_frames = frame_list[start:end] if end - start == segment_length else frame_list[start:] + [frame_list[-1]] * (segment_length - (end - start))
            segments.append(segment_frames[:frames_per_segment])
        return segments

    def predict_video_and_segments(self, container, true_label):
        video_segments = self.split_video_into_segments(container)
        segment_outputs = []
        with torch.no_grad():
            for segment in video_segments:
                inputs = self.image_processor(list(segment), return_tensors="pt")
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                try:
                    outputs = self.model(**inputs)
                    logits = outputs.logits
                    probabilities = F.softmax(logits, dim=-1)
                    pred_label = logits.argmax(-1).item()
                    pred_score = probabilities[0, pred_label].item()
                    segment_outputs.append((pred_label, pred_score, probabilities))
                except RuntimeError as e:
                    print(f"Error processing segment: {e}")
                    continue
        return segment_outputs

class TemporalShap:
    def __init__(self, num_samples=100):
        self.num_samples = num_samples
    
    def approximate_shapley_values(self, segment_outputs, label_index):
        n = len(segment_outputs)
        shapley_values = [0] * n
        for _ in range(self.num_samples):
            random_subset = sorted(range(n), key=lambda _: random.random())
            subset_prob = torch.zeros_like(segment_outputs[0][2])
            for i, index in enumerate(random_subset):
                old_contribution = subset_prob[0, label_index].item()
                subset_prob += segment_outputs[index][2]
                subset_prob /= (i + 1)
                new_contribution = subset_prob[0, label_index].item()
                shapley_values[index] += new_contribution - old_contribution
        return [val / self.num_samples for val in shapley_values]

    def exact_shapley_values(self, segment_outputs, label_index):
        n = len(segment_outputs)
        shapley_values = [0] * n
        all_indices = list(range(n))
        for i in all_indices:
            marginal_contributions = []
            for subset_size in range(n):
                subsets = list(combinations([x for x in all_indices if x != i], subset_size))
                for subset in subsets:
                    subset_prob = torch.zeros_like(segment_outputs[0][2])
                    if subset:
                        subset_prob = torch.mean(torch.stack([segment_outputs[j][2] for j in subset]), dim=0)
                    with_i_prob = (subset_prob * len(subset) + segment_outputs[i][2]) / (len(subset) + 1)
                    marginal_contributions.append(with_i_prob[0, label_index].item() - subset_prob[0, label_index].item())
            shapley_values[i] = np.mean(marginal_contributions)
        return shapley_values

def process_videos(video_processor, shap_calculator, sampled_files, true_labels, use_exact=False):
    predictions = []
    for video_file, true_label in tqdm(zip(sampled_files, true_labels), desc="Processing videos", total=len(sampled_files), unit="video"):
        file_path = os.path.join(config["video_directory"], video_file)
        container = av.open(file_path)
        try:
            segment_outputs = video_processor.predict_video_and_segments(container, true_label)
            if not segment_outputs:
                print(f"Skipping video {video_file} due to empty segment outputs.")
                continue
            video_probs = torch.mean(torch.stack([output[2] for output in segment_outputs]), dim=0)
            video_pred_label = video_probs.argmax().item()
            video_pred_score = video_probs[0, video_pred_label].item()
            video_true_score = video_probs[0, true_label].item()
            
            if use_exact:
                sv_true_label = shap_calculator.exact_shapley_values(segment_outputs, true_label)
                sv_video_pred = shap_calculator.exact_shapley_values(segment_outputs, video_pred_label)
            else:
                sv_true_label = shap_calculator.approximate_shapley_values(segment_outputs, true_label)
                sv_video_pred = shap_calculator.approximate_shapley_values(segment_outputs, video_pred_label)
            
            predictions.append((video_file, video_pred_label, video_pred_score, video_true_score, true_label, segment_outputs, sv_true_label, sv_video_pred))
        except Exception as e:
            print(f"Error processing video {video_file}: {e}")
            continue
    return predictions

def save_results(predictions, filename="results.json"):
    results = []
    for video_file, video_pred_label, video_pred_score, video_true_score, video_true_label, segment_outputs, sv_true_label, sv_video_pred in predictions:
        video_result = {
            "video_file": video_file,
            "video_pred_label": video_pred_label,
            "video_pred_score": video_pred_score,
            "video_true_score": video_true_score,
            "video_true_label": video_true_label,
            "segments": []
        }
        for i, (segment_label, segment_score, probabilities) in enumerate(segment_outputs):
            segment_video_label_score = probabilities[0, video_pred_label].item()
            segment_true_label_score = probabilities[0, video_true_label].item()
            video_result["segments"].append({
                "segment_index": i + 1,
                "segment_label": segment_label,
                "segment_score": segment_score,
                "segment_video_label_score": segment_video_label_score,
                "segment_true_label_score": segment_true_label_score,
                "sv_true_label": sv_true_label[i],
                "sv_video_pred": sv_video_pred[i]
            })
        results.append(video_result)
    
    with open(filename, "w") as f:
        json.dump(results, f, indent=4)

# 配置
config = {
    "model_name": "facebook/timesformer-base-finetuned-k400",  # 用户可以在这里更换模型名称，例如 "huggingface/vivit"
    "image_processor_name": "MCG-NJU/videomae-base-finetuned-kinetics",
    "num_samples": 100,  # 近似 Shapley Value 计算的采样次数
    "video_list_path": "archive/kinetics400_val_list_videos.txt",
    "video_directory": "archive/videos_val",
    "use_exact": True  # 设置为 True 以使用精确 Shapley Value 计算
}

# 初始化处理器
video_processor = VideoProcessor(config["model_name"], config["image_processor_name"])
shap_calculator = TemporalShap(num_samples=config["num_samples"])

# 读取视频列表和标签
video_files = []
true_labels = []
with open(config["video_list_path"], "r") as f:
    for line in f:
        name, label = line.strip().split()
        video_files.append(name)
        true_labels.append(int(label))

# 获取数据和预测
video_data = process_videos(video_processor, shap_calculator, video_files, true_labels, use_exact=config["use_exact"])

# 打印并保存结果
save_results(video_data)

# 打印结果以便查看
for video_file, video_pred_label, video_pred_score, video_true_score, true_label, segment_outputs, sv_true_label, sv_video_pred in video_data:
    print(f"Video: {video_file}, Overall Predicted Label = {video_pred_label}, Overall Prediction Score = {video_pred_score:.4f}, True Label = {true_label}, True Label Score = {video_true_score:.4f}")
    for i, (segment_label, segment_score, probabilities) in enumerate(segment_outputs):
        segment_video_label_score = probabilities[0, video_pred_label].item()
        segment_true_label_score = probabilities[0, true_label].item()
        print(f"  Segment {i+1}: Predicted Label = {segment_label}, Prediction Score = {segment_score:.4f}, Segment Video Label Score = {segment_video_label_score:.4f}, Segment True Label Score = {segment_true_label_score:.4f}, SV True Label = {sv_true_label[i]:.4f}, SV Predicted Label = {sv_video_pred[i]:.4f}")




Processing videos:   0%|          | 0/19796 [00:00<?, ?video/s]

Error processing video jf7RDuUTrsQ.mp4: Unable to create tensor, you should probably activate padding with 'padding=True' to have batched tensors with the same length.


添加断电续传 & performance

In [3]:
import os
import av
import torch
import numpy as np
import torch.nn.functional as F
import random
import json
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoImageProcessor, TimesformerForVideoClassification
from collections import defaultdict
from tqdm.notebook import tqdm
from itertools import combinations

class VideoProcessor:
    def __init__(self, model_name, image_processor_name, device='cuda'):
        self.model = self.load_model(model_name)
        self.image_processor = AutoImageProcessor.from_pretrained(image_processor_name)
        self.device = device
        self.model.to(device)

    def load_model(self, model_name):
        if "timesformer" in model_name.lower():
            return TimesformerForVideoClassification.from_pretrained(model_name)
        else:
            raise ValueError(f"Unsupported model name: {model_name}")

    def split_video_into_segments(self, container, n_segments=8, frames_per_segment=16):
        frame_list = [frame.to_image() for frame in container.decode(video=0)]
        total_frames = len(frame_list)
        segment_length = total_frames // n_segments
        segments = []
        for i in range(n_segments):
            start = i * segment_length
            end = min(start + segment_length, total_frames)
            segment_frames = frame_list[start:end] if end - start == segment_length else frame_list[start:] + [frame_list[-1]] * (segment_length - (end - start))
            segments.append(segment_frames[:frames_per_segment])
        return segments

    def predict_video_and_segments(self, container, true_label):
        video_segments = self.split_video_into_segments(container)
        segment_outputs = []
        with torch.no_grad():
            for segment in video_segments:
                inputs = self.image_processor(list(segment), return_tensors="pt")
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                try:
                    outputs = self.model(**inputs)
                    logits = outputs.logits
                    probabilities = F.softmax(logits, dim=-1)
                    pred_label = logits.argmax(-1).item()
                    pred_score = probabilities[0, pred_label].item()
                    segment_outputs.append((pred_label, pred_score, probabilities))
                except RuntimeError as e:
                    print(f"Error processing segment: {e}")
                    continue
        return segment_outputs

class TemporalShap:
    def __init__(self, num_samples=100):
        self.num_samples = num_samples
    
    def approximate_shapley_values(self, segment_outputs, label_index):
        n = len(segment_outputs)
        shapley_values = [0] * n
        for _ in range(self.num_samples):
            random_subset = sorted(range(n), key=lambda _: random.random())
            subset_prob = torch.zeros_like(segment_outputs[0][2])
            for i, index in enumerate(random_subset):
                old_contribution = subset_prob[0, label_index].item()
                subset_prob += segment_outputs[index][2]
                subset_prob /= (i + 1)
                new_contribution = subset_prob[0, label_index].item()
                shapley_values[index] += new_contribution - old_contribution
        return [val / self.num_samples for val in shapley_values]

    def exact_shapley_values(self, segment_outputs, label_index):
        n = len(segment_outputs)
        shapley_values = [0] * n
        all_indices = list(range(n))
        for i in all_indices:
            marginal_contributions = []
            for subset_size in range(n):
                subsets = list(combinations([x for x in all_indices if x != i], subset_size))
                for subset in subsets:
                    subset_prob = torch.zeros_like(segment_outputs[0][2])
                    if subset:
                        subset_prob = torch.mean(torch.stack([segment_outputs[j][2] for j in subset]), dim=0)
                    with_i_prob = (subset_prob * len(subset) + segment_outputs[i][2]) / (len(subset) + 1)
                    marginal_contributions.append(with_i_prob[0, label_index].item() - subset_prob[0, label_index].item())
            shapley_values[i] = np.mean(marginal_contributions)
        return shapley_values

def process_videos(video_processor, shap_calculator, sampled_files, true_labels, use_exact=False, start_index=0):
    predictions = []
    for idx, (video_file, true_label) in tqdm(enumerate(zip(sampled_files, true_labels)), desc="Processing videos", total=len(sampled_files), initial=start_index, unit="video"):
        if idx < start_index:
            continue
        file_path = os.path.join(config["video_directory"], video_file)
        container = av.open(file_path)
        try:
            segment_outputs = video_processor.predict_video_and_segments(container, true_label)
            if not segment_outputs:
                print(f"Skipping video {video_file} due to empty segment outputs.")
                continue
            video_probs = torch.mean(torch.stack([output[2] for output in segment_outputs]), dim=0)
            video_pred_label = video_probs.argmax().item()
            video_pred_score = video_probs[0, video_pred_label].item()
            video_true_score = video_probs[0, true_label].item()
            
            if use_exact:
                sv_true_label = shap_calculator.exact_shapley_values(segment_outputs, true_label)
                sv_video_pred = shap_calculator.exact_shapley_values(segment_outputs, video_pred_label)
            else:
                sv_true_label = shap_calculator.approximate_shapley_values(segment_outputs, true_label)
                sv_video_pred = shap_calculator.approximate_shapley_values(segment_outputs, video_pred_label)
            
            prediction = (video_file, video_pred_label, video_pred_score, video_true_score, true_label, segment_outputs, sv_true_label, sv_video_pred)
            predictions.append(prediction)
            save_partial_results(prediction, "results.json")
        except Exception as e:
            print(f"Error processing video {video_file}: {e}")
            continue
    return predictions

def save_partial_results(prediction, filename):
    video_file, video_pred_label, video_pred_score, video_true_score, video_true_label, segment_outputs, sv_true_label, sv_video_pred = prediction
    video_result = {
        "video_file": video_file,
        "video_pred_label": video_pred_label,
        "video_pred_score": video_pred_score,
        "video_true_score": video_true_score,
        "video_true_label": video_true_label,
        "segments": []
    }
    for i, (segment_label, segment_score, probabilities) in enumerate(segment_outputs):
        segment_video_label_score = probabilities[0, video_pred_label].item()
        segment_true_label_score = probabilities[0, video_true_label].item()
        video_result["segments"].append({
            "segment_index": i + 1,
            "segment_label": segment_label,
            "segment_score": segment_score,
            "segment_video_label_score": segment_video_label_score,
            "segment_true_label_score": segment_true_label_score,
            "sv_true_label": sv_true_label[i],
            "sv_video_pred": sv_video_pred[i]
        })
    
    if os.path.exists(filename):
        with open(filename, "r") as f:
            results = json.load(f)
    else:
        results = []
    
    results.append(video_result)
    
    with open(filename, "w") as f:
        json.dump(results, f, indent=4)

def load_existing_results(filename):
    if os.path.exists(filename):
        with open(filename, "r") as f:
            results = json.load(f)
            processed_files = {result["video_file"] for result in results}
            return results, processed_files
    return [], set()

def save_results(predictions, filename="results.json"):
    results = []
    for video_file, video_pred_label, video_pred_score, video_true_score, video_true_label, segment_outputs, sv_true_label, sv_video_pred in predictions:
        video_result = {
            "video_file": video_file,
            "video_pred_label": video_pred_label,
            "video_pred_score": video_pred_score,
            "video_true_score": video_true_score,
            "video_true_label": video_true_label,
            "segments": []
        }
        for i, (segment_label, segment_score, probabilities) in enumerate(segment_outputs):
            segment_video_label_score = probabilities[0, video_pred_label].item()
            segment_true_label_score = probabilities[0, video_true_label].item()
            video_result["segments"].append({
                "segment_index": i + 1,
                "segment_label": segment_label,
                "segment_score": segment_score,
                "segment_video_label_score": segment_video_label_score,
                "segment_true_label_score": segment_true_label_score,
                "sv_true_label": sv_true_label[i],
                "sv_video_pred": sv_video_pred[i]
            })
        results.append(video_result)
    
    with open(filename, "w") as f:
        json.dump(results, f, indent=4)

def compute_metrics(predictions):
    true_labels = [pred[4] for pred in predictions]
    pred_labels = [pred[1] for pred in predictions]
    
    accuracy = accuracy_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels, average='weighted')
    recall = recall_score(true_labels, pred_labels, average='weighted')
    f1 = f1_score(true_labels, pred_labels, average='weighted')
    
    return accuracy, precision, recall, f1

def save_performance_metrics(accuracy, precision, recall, f1, filename="performance.json"):
    performance = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    }
    
    with open(filename, "w") as f:
        json.dump(performance, f, indent=4)

# 配置
config = {
    "model_name": "facebook/timesformer-base-finetuned-k400",  # 用户可以在这里更换模型名称，例如 "huggingface/vivit"
    "image_processor_name": "MCG-NJU/videomae-base-finetuned-kinetics",
    "num_samples": 100,  # 近似 Shapley Value 计算的采样次数
    "video_list_path": "archive/kinetics400_val_list_videos.txt",
    "video_directory": "archive/videos_val",
    "use_exact": True  # 设置为 True 以使用精确 Shapley Value 计算
}

# 初始化处理器
video_processor = VideoProcessor(config["model_name"], config["image_processor_name"])
shap_calculator = TemporalShap(num_samples=config["num_samples"])

# 读取视频列表和标签
video_files = []
true_labels = []
with open(config["video_list_path"], "r") as f:
    for line in f:
        name, label = line.strip().split()
        video_files.append(name)
        true_labels.append(int(label))

# 加载现有结果和已处理文件
existing_results, processed_files = load_existing_results("results.json")

# 筛选未处理文件
unprocessed_files = [f for f in video_files if f not in processed_files]
unprocessed_labels = [true_labels[video_files.index(f)] for f in unprocessed_files]

# 获取数据和预测
video_data = process_videos(video_processor, shap_calculator, unprocessed_files, unprocessed_labels, use_exact=config["use_exact"], start_index=len(existing_results))

# 合并现有结果与新结果
all_results = existing_results + video_data

# 打印并保存结果
save_results(all_results)

# 计算并输出指标
accuracy, precision, recall, f1 = compute_metrics(all_results)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# 保存性能指标
save_performance_metrics(accuracy, precision, recall, f1, filename="performance.json")

# 打印详细结果
for video_file, video_pred_label, video_pred_score, video_true_score, true_label, segment_outputs, sv_true_label, sv_video_pred in video_data:
    print(f"Video: {video_file}, Overall Predicted Label = {video_pred_label}, Overall Prediction Score = {video_pred_score:.4f}, True Label = {true_label}, True Label Score = {video_true_score:.4f}")
    for i, (segment_label, segment_score, probabilities) in enumerate(segment_outputs):
        segment_video_label_score = probabilities[0, video_pred_label].item()
        segment_true_label_score = probabilities[0, true_label].item()
        print(f"  Segment {i+1}: Predicted Label = {segment_label}, Prediction Score = {segment_score:.4f}, Segment Video Label Score = {segment_video_label_score:.4f}, Segment True Label Score = {segment_true_label_score:.4f}, SV True Label = {sv_true_label[i]:.4f}, SV Predicted Label = {sv_video_pred[i]:.4f}")




Processing videos:   4%|3         | 750/19046 [00:00<?, ?video/s]

  return torch.tensor(value)


KeyboardInterrupt: 

添加能耗记录

In [4]:
import os
import av
import torch
import numpy as np
import torch.nn.functional as F
import random
import json
import time
import subprocess
import psutil
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoImageProcessor, TimesformerForVideoClassification
from collections import defaultdict
from tqdm.notebook import tqdm
from itertools import combinations

class VideoProcessor:
    def __init__(self, model_name, image_processor_name, device='cuda'):
        self.model = self.load_model(model_name)
        self.image_processor = AutoImageProcessor.from_pretrained(image_processor_name)
        self.device = device
        self.model.to(device)

    def load_model(self, model_name):
        if "timesformer" in model_name.lower():
            return TimesformerForVideoClassification.from_pretrained(model_name)
        else:
            raise ValueError(f"Unsupported model name: {model_name}")

    def split_video_into_segments(self, container, n_segments=8, frames_per_segment=16):
        frame_list = [frame.to_image() for frame in container.decode(video=0)]
        total_frames = len(frame_list)
        segment_length = total_frames // n_segments
        segments = []
        for i in range(n_segments):
            start = i * segment_length
            end = min(start + segment_length, total_frames)
            segment_frames = frame_list[start:end] if end - start == segment_length else frame_list[start:] + [frame_list[-1]] * (segment_length - (end - start))
            segments.append(segment_frames[:frames_per_segment])
        return segments

    def predict_video_and_segments(self, container, true_label):
        video_segments = self.split_video_into_segments(container)
        segment_outputs = []
        with torch.no_grad():
            for segment in video_segments:
                inputs = self.image_processor(list(segment), return_tensors="pt")
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                try:
                    outputs = self.model(**inputs)
                    logits = outputs.logits
                    probabilities = F.softmax(logits, dim=-1)
                    pred_label = logits.argmax(-1).item()
                    pred_score = probabilities[0, pred_label].item()
                    segment_outputs.append((pred_label, pred_score, probabilities))
                except RuntimeError as e:
                    print(f"Error processing segment: {e}")
                    continue
        return segment_outputs

class TemporalShap:
    def __init__(self, num_samples=100):
        self.num_samples = num_samples
    
    def approximate_shapley_values(self, segment_outputs, label_index):
        n = len(segment_outputs)
        shapley_values = [0] * n
        for _ in range(self.num_samples):
            random_subset = sorted(range(n), key=lambda _: random.random())
            subset_prob = torch.zeros_like(segment_outputs[0][2])
            for i, index in enumerate(random_subset):
                old_contribution = subset_prob[0, label_index].item()
                subset_prob += segment_outputs[index][2]
                subset_prob /= (i + 1)
                new_contribution = subset_prob[0, label_index].item()
                shapley_values[index] += new_contribution - old_contribution
        return [val / self.num_samples for val in shapley_values]

    def exact_shapley_values(self, segment_outputs, label_index):
        n = len(segment_outputs)
        shapley_values = [0] * n
        all_indices = list(range(n))
        for i in all_indices:
            marginal_contributions = []
            for subset_size in range(n):
                subsets = list(combinations([x for x in all_indices if x != i], subset_size))
                for subset in subsets:
                    subset_prob = torch.zeros_like(segment_outputs[0][2])
                    if subset:
                        subset_prob = torch.mean(torch.stack([segment_outputs[j][2] for j in subset]), dim=0)
                    with_i_prob = (subset_prob * len(subset) + segment_outputs[i][2]) / (len(subset) + 1)
                    marginal_contributions.append(with_i_prob[0, label_index].item() - subset_prob[0, label_index].item())
            shapley_values[i] = np.mean(marginal_contributions)
        return shapley_values

def get_gpu_energy():
    try:
        result = subprocess.check_output(['nvidia-smi', '--query-gpu=energy.draw', '--format=csv,noheader,nounits'])
        return float(result.strip().split()[0])
    except Exception as e:
        print(f"Error reading GPU energy: {e}")
        return 0.0

def get_cpu_energy():
    try:
        energy = 0.0
        for domain in psutil.sensors_battery():
            energy += domain.power * domain.energy
        return energy
    except Exception as e:
        print(f"Error reading CPU energy: {e}")
        return 0.0

def process_videos(video_processor, shap_calculator, sampled_files, true_labels, use_exact=False, start_index=0):
    predictions = []
    for idx, (video_file, true_label) in tqdm(enumerate(zip(sampled_files, true_labels)), desc="Processing videos", total=len(sampled_files), initial=start_index, unit="video"):
        if idx < start_index:
            continue
        file_path = os.path.join(config["video_directory"], video_file)
        container = av.open(file_path)
        try:
            segment_outputs = video_processor.predict_video_and_segments(container, true_label)
            if not segment_outputs:
                print(f"Skipping video {video_file} due to empty segment outputs.")
                continue
            video_probs = torch.mean(torch.stack([output[2] for output in segment_outputs]), dim=0)
            video_pred_label = video_probs.argmax().item()
            video_pred_score = video_probs[0, video_pred_label].item()
            video_true_score = video_probs[0, true_label].item()
            
            if use_exact:
                sv_true_label = shap_calculator.exact_shapley_values(segment_outputs, true_label)
                sv_video_pred = shap_calculator.exact_shapley_values(segment_outputs, video_pred_label)
            else:
                sv_true_label = shap_calculator.approximate_shapley_values(segment_outputs, true_label)
                sv_video_pred = shap_calculator.approximate_shapley_values(segment_outputs, video_pred_label)
            
            prediction = (video_file, video_pred_label, video_pred_score, video_true_score, true_label, segment_outputs, sv_true_label, sv_video_pred)
            predictions.append(prediction)
            save_partial_results(prediction, "results.json")
        except Exception as e:
            print(f"Error processing video {video_file}: {e}")
            continue
    return predictions

def save_partial_results(prediction, filename):
    video_file, video_pred_label, video_pred_score, video_true_score, video_true_label, segment_outputs, sv_true_label, sv_video_pred = prediction
    video_result = {
        "video_file": video_file,
        "video_pred_label": video_pred_label,
        "video_pred_score": video_pred_score,
        "video_true_score": video_true_score,
        "video_true_label": video_true_label,
        "segments": []
    }
    for i, (segment_label, segment_score, probabilities) in enumerate(segment_outputs):
        segment_video_label_score = probabilities[0, video_pred_label].item()
        segment_true_label_score = probabilities[0, video_true_label].item()
        video_result["segments"].append({
            "segment_index": i + 1,
            "segment_label": segment_label,
            "segment_score": segment_score,
            "segment_video_label_score": segment_video_label_score,
            "segment_true_label_score": segment_true_label_score,
            "sv_true_label": sv_true_label[i],
            "sv_video_pred": sv_video_pred[i]
        })
    
    if os.path.exists(filename):
        with open(filename, "r") as f:
            results = json.load(f)
    else:
        results = []
    
    results.append(video_result)
    
    with open(filename, "w") as f:
        json.dump(results, f, indent=4)

def load_existing_results(filename):
    if os.path.exists(filename):
        with open(filename, "r") as f:
            results = json.load(f)
            processed_files = {result["video_file"] for result in results}
            return results, processed_files
    return [], set()

def save_results(predictions, filename="results.json"):
    results = []
    for video_file, video_pred_label, video_pred_score, video_true_score, video_true_label, segment_outputs, sv_true_label, sv_video_pred in predictions:
        video_result = {
            "video_file": video_file,
            "video_pred_label": video_pred_label,
            "video_pred_score": video_pred_score,
            "video_true_score": video_true_score,
            "video_true_label": video_true_label,
            "segments": []
        }
        for i, (segment_label, segment_score, probabilities) in enumerate(segment_outputs):
            segment_video_label_score = probabilities[0, video_pred_label].item()
            segment_true_label_score = probabilities[0, video_true_label].item()
            video_result["segments"].append({
                "segment_index": i + 1,
                "segment_label": segment_label,
                "segment_score": segment_score,
                "segment_true_label_score": segment_true_label_score,
                "sv_true_label": sv_true_label[i],
                "sv_video_pred": sv_video_pred[i]
            })
            results.append(video_result)
        
        with open(filename, "w") as f:
            json.dump(results, f, indent=4)

def compute_metrics(predictions):
    true_labels = [pred[4] for pred in predictions]
    pred_labels = [pred[1] for pred in predictions]
    
    accuracy = accuracy_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels, average='weighted')
    recall = recall_score(true_labels, pred_labels, average='weighted')
    f1 = f1_score(true_labels, pred_labels, average='weighted')
    
    return accuracy, precision, recall, f1

def save_performance_metrics(accuracy, precision, recall, f1, time_consumed, cpu_energy, gpu_energy, filename="performance.json"):
    performance = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "time_consumed": time_consumed,
        "cpu_energy": cpu_energy,
        "gpu_energy": gpu_energy
    }
    
    with open(filename, "w") as f:
        json.dump(performance, f, indent=4)

# 配置
config = {
    "model_name": "facebook/timesformer-base-finetuned-k400",  # 用户可以在这里更换模型名称，例如 "huggingface/vivit"
    "image_processor_name": "MCG-NJU/videomae-base-finetuned-kinetics",
    "num_samples": 100,  # 近似 Shapley Value 计算的采样次数
    "video_list_path": "archive/kinetics400_val_list_videos.txt",
    "video_directory": "archive/videos_val",
    "use_exact": True  # 设置为 True 以使用精确 Shapley Value 计算
}

# 初始化处理器
video_processor = VideoProcessor(config["model_name"], config["image_processor_name"])
shap_calculator = TemporalShap(num_samples=config["num_samples"])

# 读取视频列表和标签
video_files = []
true_labels = []
with open(config["video_list_path"], "r") as f:
    for line in f:
        name, label = line.strip().split()
        video_files.append(name)
        true_labels.append(int(label))

# 加载现有结果和已处理文件
existing_results, processed_files = load_existing_results("results.json")

# 筛选未处理文件
unprocessed_files = [f for f in video_files if f not in processed_files]
unprocessed_labels = [true_labels[video_files.index(f)] for f in unprocessed_files]

# 记录开始时间和能量
start_time = time.time()
initial_cpu_energy = get_cpu_energy()
initial_gpu_energy = get_gpu_energy()

# 获取数据和预测
video_data = process_videos(video_processor, shap_calculator, unprocessed_files, unprocessed_labels, use_exact=config["use_exact"], start_index=len(existing_results))

# 记录结束时间和能量
end_time = time.time()
final_cpu_energy = get_cpu_energy()
final_gpu_energy = get_gpu_energy()
time_consumed = end_time - start_time
cpu_energy_consumed = final_cpu_energy - initial_cpu_energy
gpu_energy_consumed = final_gpu_energy - initial_gpu_energy

# 合并现有结果与新结果
all_results = existing_results + video_data

# 打印并保存结果
save_results(all_results)

# 计算并输出指标
accuracy, precision, recall, f1 = compute_metrics(all_results)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# 保存性能指标
save_performance_metrics(accuracy, precision, recall, f1, time_consumed, cpu_energy_consumed, gpu_energy_consumed, filename="performance.json")

# 打印详细结果
for video_file, video_pred_label, video_pred_score, video_true_score, true_label, segment_outputs, sv_true_label, sv_video_pred in video_data:
    print(f"Video: {video_file}, Overall Predicted Label = {video_pred_label}, Overall Prediction Score = {video_pred_score:.4f}, True Label = {true_label}, True Label Score = {video_true_score:.4f}")
    for i, (segment_label, segment_score, probabilities) in enumerate(segment_outputs):
        segment_video_label_score = probabilities[0, video_pred_label].item()
        segment_true_label_score = probabilities[0, true_label].item()
        print(f"  Segment {i+1}: Predicted Label = {segment_label}, Prediction Score = {segment_score:.4f}, Segment Video Label Score = {segment_video_label_score:.4f}, Segment True Label Score = {segment_true_label_score:.4f}, SV True Label = {sv_true_label[i]:.4f}, SV Predicted Label = {sv_video_pred[i]:.4f}")




Error reading CPU energy: 'int' object has no attribute 'power'
Error reading GPU energy: Command '['nvidia-smi', '--query-gpu=energy.draw', '--format=csv,noheader,nounits']' returned non-zero exit status 2.


Processing videos:   0%|          | 0/19796 [00:00<?, ?video/s]

  return torch.tensor(value)


KeyboardInterrupt: 

指定Class数据集的完整功能

In [2]:
import os
import av
import torch
import numpy as np
import torch.nn.functional as F
import random
import json
import time
import subprocess
import psutil
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoImageProcessor, TimesformerForVideoClassification
from collections import defaultdict
from tqdm.notebook import tqdm
from itertools import combinations

class VideoProcessor:
    def __init__(self, model_name, image_processor_name, device='cuda'):
        self.model = self.load_model(model_name)
        self.image_processor = AutoImageProcessor.from_pretrained(image_processor_name)
        self.device = device
        self.model.to(device)

    def load_model(self, model_name):
        if "timesformer" in model_name.lower():
            return TimesformerForVideoClassification.from_pretrained(model_name)
        else:
            raise ValueError(f"Unsupported model name: {model_name}")

    def split_video_into_segments(self, container, n_segments=8, frames_per_segment=16):
        frame_list = [frame.to_image() for frame in container.decode(video=0)]
        total_frames = len(frame_list)
        segment_length = total_frames // n_segments
        segments = []
        for i in range(n_segments):
            start = i * segment_length
            end = min(start + segment_length, total_frames)
            segment_frames = frame_list[start:end] if end - start == segment_length else frame_list[start:] + [frame_list[-1]] * (segment_length - (end - start))
            segments.append(segment_frames[:frames_per_segment])
        return segments

    def predict_video_and_segments(self, container, true_label):
        video_segments = self.split_video_into_segments(container)
        segment_outputs = []
        with torch.no_grad():
            for segment in video_segments:
                inputs = self.image_processor(list(segment), return_tensors="pt")
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                try:
                    outputs = self.model(**inputs)
                    logits = outputs.logits
                    probabilities = F.softmax(logits, dim=-1)
                    pred_label = logits.argmax(-1).item()
                    pred_score = probabilities[0, pred_label].item()
                    segment_outputs.append((pred_label, pred_score, probabilities))
                except RuntimeError as e:
                    print(f"Error processing segment: {e}")
                    continue
        return segment_outputs

class TemporalShap:
    def __init__(self, num_samples=100):
        self.num_samples = num_samples

    def approximate_shapley_values(self, segment_outputs, label_index):
        n = len(segment_outputs)
        shapley_values = [0] * n
        for _ in range(self.num_samples):
            random_subset = sorted(range(n), key=lambda _: random.random())
            subset_prob = torch.zeros_like(segment_outputs[0][2])
            for i, index in enumerate(random_subset):
                old_contribution = subset_prob[0, label_index].item()
                subset_prob += segment_outputs[index][2]
                subset_prob /= (i + 1)
                new_contribution = subset_prob[0, label_index].item()
                shapley_values[index] += new_contribution - old_contribution
        return [val / self.num_samples for val in shapley_values]

    def exact_shapley_values(self, segment_outputs, label_index):
        n = len(segment_outputs)
        shapley_values = [0] * n
        all_indices = list(range(n))
        for i in all_indices:
            marginal_contributions = []
            for subset_size in range(n):
                subsets = list(combinations([x for x in all_indices if x != i], subset_size))
                for subset in subsets:
                    subset_prob = torch.zeros_like(segment_outputs[0][2])
                    if subset:
                        subset_prob = torch.mean(torch.stack([segment_outputs[j][2] for j in subset]), dim=0)
                    with_i_prob = (subset_prob * len(subset) + segment_outputs[i][2]) / (len(subset) + 1)
                    marginal_contributions.append(with_i_prob[0, label_index].item() - subset_prob[0, label_index].item())
            shapley_values[i] = np.mean(marginal_contributions)
        return shapley_values

def get_gpu_energy():
    try:
        result = subprocess.check_output(['nvidia-smi', '--query-gpu=energy.draw', '--format=csv,noheader,nounits'])
        return float(result.strip().split()[0])
    except Exception as e:
        print(f"Error reading GPU energy: {e}")
        return 0.0

def get_cpu_energy():
    try:
        energy = 0.0
        for domain in psutil.sensors_battery():
            energy += domain.power * domain.energy
        return energy
    except Exception as e:
        print(f"Error reading CPU energy: {e}")
        return 0.0
    
import subprocess
import time

def get_gpu_power_draw():
    try:
        # This command retrieves the current power usage in watts.
        result = subprocess.check_output(['nvidia-smi', '--query-gpu=power.draw', '--format=csv,noheader,nounits'], text=True)
        return float(result.strip())
    except Exception as e:
        print(f"Error reading GPU power: {e}")
        return 0.0

def get_system_time():
    return time.time()  # Return the current system time in seconds

# Example usage in your processing function
start_time = get_system_time()
initial_gpu_power = get_gpu_power_draw()

# Execute your long-running process here
time.sleep(10)  # Simulating a delay

end_time = get_system_time()
final_gpu_power = get_gpu_power_draw()

time_consumed = end_time - start_time
average_gpu_power = (initial_gpu_power + final_gpu_power) / 2
energy_consumed = average_gpu_power * (time_consumed / 3600)  # Convert power usage in watts to kilowatt-hours if needed

def process_videos(video_processor, shap_calculator, sampled_files, true_labels, use_exact=False, start_index=0):
    predictions = []
    for idx, (video_file, true_label) in tqdm(enumerate(zip(sampled_files, true_labels)), desc="Processing videos", total=len(sampled_files), initial=start_index, unit="video"):
        if idx < start_index:
            continue
        file_path = os.path.join(config["video_directory"], video_file)
        container = av.open(file_path)
        try:
            segment_outputs = video_processor.predict_video_and_segments(container, true_label)
            if not segment_outputs:
                print(f"Skipping video {video_file} due to empty segment outputs.")
                continue
            video_probs = torch.mean(torch.stack([output[2] for output in segment_outputs]), dim=0)
            video_pred_label = video_probs.argmax().item()
            video_pred_score = video_probs[0, video_pred_label].item()
            video_true_score = video_probs[0, true_label].item()
            
            if use_exact:
                sv_true_label = shap_calculator.exact_shapley_values(segment_outputs, true_label)
                sv_video_pred = shap_calculator.exact_shapley_values(segment_outputs, video_pred_label)
            else:
                sv_true_label = shap_calculator.approximate_shapley_values(segment_outputs, true_label)
                sv_video_pred = shap_calculator.approximate_shapley_values(segment_outputs, video_pred_label)
            
            prediction = (video_file, video_pred_label, video_pred_score, video_true_score, true_label, segment_outputs, sv_true_label, sv_video_pred)
            predictions.append(prediction)
            save_partial_results(prediction, "results.json")
        except Exception as e:
            print(f"Error processing video {video_file}: {e}")
            continue
    return predictions

def save_partial_results(prediction, filename):
    video_file, video_pred_label, video_pred_score, video_true_score, video_true_label, segment_outputs, sv_true_label, sv_video_pred = prediction
    video_result = {
        "video_file": video_file,
        "video_pred_label": video_pred_label,
        "video_pred_score": video_pred_score,
        "video_true_score": video_true_score,
        "video_true_label": video_true_label,
        "segments": []
    }
    for i, (segment_label, segment_score, probabilities) in enumerate(segment_outputs):
        segment_video_label_score = probabilities[0, video_pred_label].item()
        segment_true_label_score = probabilities[0, video_true_label].item()
        video_result["segments"].append({
            "segment_index": i + 1,
            "segment_label": segment_label,
            "segment_score": segment_score,
            "segment_video_label_score": segment_video_label_score,
            "segment_true_label_score": segment_true_label_score,
            "sv_true_label": sv_true_label[i],
            "sv_video_pred": sv_video_pred[i]
        })
    
    if os.path.exists(filename):
        with open(filename, "r+") as f:
            results = json.load(f)
            results.append(video_result)
            f.seek(0)
            json.dump(results, f, indent=4)
    else:
        with open(filename, "w") as f:
            json.dump([video_result], f, indent=4)

def load_existing_results(filename):
    if os.path.exists(filename):
        with open(filename, "r") as f:
            results = json.load(f)
            processed_files = {result["video_file"] for result in results}
            return results, processed_files
    return [], set()

def save_results(predictions, filename="results.json"):
    results = []
    for video_file, video_pred_label, video_pred_score, video_true_score, video_true_label, segment_outputs, sv_true_label, sv_video_pred in predictions:
        video_result = {
            "video_file": video_file,
            "video_pred_label": video_pred_label,
            "video_pred_score": video_pred_score,
            "video_true_score": video_true_score,
            "video_true_label": video_true_label,
            "segments": []
        }
        for i, (segment_label, segment_score, probabilities) in enumerate(segment_outputs):
            segment_video_label_score = probabilities[0, video_pred_label].item()
            segment_true_label_score = probabilities[0, video_true_label].item()
            video_result["segments"].append({
                "segment_index": i + 1,
                "segment_label": segment_label,
                "segment_score": segment_score,
                "segment_video_label_score": segment_video_label_score,
                "segment_true_label_score": segment_true_label_score,
                "sv_true_label": sv_true_label[i],
                "sv_video_pred": sv_video_pred[i]
            })
        results.append(video_result)

    with open(filename, "w") as f:
        json.dump(results, f, indent=4)

def compute_metrics(predictions):
    true_labels = [pred[4] for pred in predictions]
    pred_labels = [pred[1] for pred in predictions]
    
    accuracy = accuracy_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels, average='weighted')
    recall = recall_score(true_labels, pred_labels, average='weighted')
    f1 = f1_score(true_labels, pred_labels, average='weighted')
    
    return accuracy, precision, recall, f1

def save_performance_metrics(accuracy, precision, recall, f1, time_consumed, cpu_energy, gpu_energy, filename="performance.json"):
    performance = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "time_consumed": time_consumed,
        "cpu_energy": cpu_energy,
        "gpu_energy": gpu_energy
    }
    
    with open(filename, "w") as f:
        json.dump(performance, f, indent=4)

# Configuration
config = {
    "model_name": "facebook/timesformer-base-finetuned-k400",
    "image_processor_name": "MCG-NJU/videomae-base-finetuned-kinetics",
    "num_samples": 10,
    "num_classes": 4,  # For flexible dataset input
    "num_samples_per_class": 5,  # For flexible dataset input
    "video_list_path": "archive/kinetics400_val_list_videos.txt",
    "video_directory": "archive/zoom_blur",
    "use_exact": True
}

# Initialize processors
video_processor = VideoProcessor(config["model_name"], config["image_processor_name"])
shap_calculator = TemporalShap(num_samples=config["num_samples"])

# Load existing results
existing_results, processed_files = load_existing_results("results.json")

# Read video list and organize by categories if necessary
video_labels = defaultdict(list)
with open(config["video_list_path"], "r") as f:
    for line in f:
        name, label = line.strip().split()
        video_labels[int(label)].append(name)

# Prepare video samples for the configured number of classes and samples
sampled_files = []
true_labels = []
selected_classes = random.sample(list(video_labels.keys()), config["num_classes"])
for cls in selected_classes:
    sampled_files.extend(random.sample(video_labels[cls], config["num_samples_per_class"]))
    true_labels.extend([cls] * config["num_samples_per_class"])

# Filter unprocessed files
unprocessed_files = [f for f in sampled_files if f not in processed_files]
unprocessed_labels = [true_labels[sampled_files.index(f)] for f in unprocessed_files]

# Record start time and energy consumption
start_time = time.time()
initial_cpu_energy = get_cpu_energy()
initial_gpu_energy = get_gpu_energy()

# Process videos
video_data = process_videos(video_processor, shap_calculator, unprocessed_files, unprocessed_labels, use_exact=config["use_exact"], start_index=len(existing_results))

# Record end time and energy consumption
end_time = time.time()
final_cpu_energy = get_cpu_energy()
final_gpu_energy = get_gpu_energy()
time_consumed = end_time - start_time
cpu_energy_consumed = final_cpu_energy - initial_cpu_energy
gpu_energy_consumed = final_gpu_energy - initial_gpu_energy

# Combine existing results with new data
all_results = existing_results + video_data

# Save results
save_results(all_results)

# Compute and output metrics
accuracy, precision, recall, f1 = compute_metrics(all_results)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Save performance metrics
save_performance_metrics(accuracy, precision, recall, f1, time_consumed, cpu_energy_consumed, gpu_energy_consumed, filename="performance.json")


# Print detailed results
for video_file, video_pred_label, video_pred_score, video_true_score, true_label, segment_outputs, sv_true_label, sv_video_pred in all_results:
    print(f"Video: {video_file}, Overall Predicted Label = {video_pred_label}, Overall Prediction Score = {video_pred_score:.4f}, True Label = {true_label}, True Label Score = {video_true_score:.4f}")
    for i, (segment_label, segment_score, probabilities) in enumerate(segment_outputs):
        segment_video_label_score = probabilities[0, video_pred_label].item()
        segment_true_label_score = probabilities[0, true_label].item()
        print(f"  Segment {i+1}: Predicted Label = {segment_label}, Prediction Score = {segment_score:.4f}, Segment Video Label Score = {segment_video_label_score:.4f}, Segment True Label Score = {segment_true_label_score:.4f}, SV True Label = {sv_true_label[i]:.4f}, SV Predicted Label = {sv_video_pred[i]:.4f}")




Error reading CPU energy: 'int' object has no attribute 'power'
Error reading GPU energy: Command '['nvidia-smi', '--query-gpu=energy.draw', '--format=csv,noheader,nounits']' returned non-zero exit status 2.


Processing videos: 100%|##########| 20/20 [00:00<?, ?video/s]

Error reading CPU energy: 'int' object has no attribute 'power'
Error reading GPU energy: Command '['nvidia-smi', '--query-gpu=energy.draw', '--format=csv,noheader,nounits']' returned non-zero exit status 2.


ValueError: not enough values to unpack (expected 8, got 6)

In [16]:
import os
import av
import torch
from transformers import ViTForImageClassification, ViTFeatureExtractor, ViTConfig
from PIL import Image
import numpy as np
from tqdm import tqdm
from lime import lime_image
from skimage.segmentation import mark_boundaries
import matplotlib.pyplot as plt
import random

# Configuration
config = {
    "model_config": "google/vit-base-patch16-224",
    "model_path": "finetuned_vit_model.pth",
    "feature_extractor_name": "google/vit-base-patch16-224",
    "video_directory": "archive/videos_val",
    "results_folder": "Results",
    "num_classes": 400,
    "num_videos_to_process": 10  # Number of videos to process
}

# Ensure the results directory exists
os.makedirs(config["results_folder"], exist_ok=True)

# Load the model
model_config = ViTConfig.from_pretrained(config["model_config"], num_labels=config["num_classes"])
model = ViTForImageClassification(model_config)
model.load_state_dict(torch.load(config["model_path"]), strict=False)  # Use strict=False to ignore non-matching keys
model.eval()

# Load the feature extractor
feature_extractor = ViTFeatureExtractor.from_pretrained(config["feature_extractor_name"])

# Function to process and explain a video file
def process_and_explain(video_path, model, feature_extractor):
    container = av.open(video_path)
    frame = next(container.decode(video=0)).to_image()
    inputs = feature_extractor(images=frame, return_tensors="pt")
    outputs = model(**inputs)
    preds = torch.nn.functional.softmax(outputs.logits, dim=-1)
    top_pred = preds.argmax().item()

    # Prepare frame for LIME
    def batch_predict(images):
        inputs = feature_extractor(images=[Image.fromarray(img.astype('uint8')) for img in images], return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        return probs.detach().cpu().numpy()

    # Explain with LIME
    explainer = lime_image.LimeImageExplainer()
    explanation = explainer.explain_instance(np.array(frame), batch_predict, top_labels=5, hide_color=0, num_samples=1000)
    temp, mask = explanation.get_image_and_mask(top_pred, positive_only=False, num_features=10, hide_rest=False)
    img_boundary = mark_boundaries(np.array(frame) / 255.0, mask)

    # Plot and save the results
    plt.figure(figsize=(10, 10))
    plt.imshow(img_boundary)
    plt.axis('off')
    plt_path = os.path.join(config["results_folder"], f"{os.path.basename(video_path)}_explanation.png")
    plt.savefig(plt_path)
    plt.close()

    return top_pred, plt_path

# Select and process videos
all_video_files = os.listdir(config["video_directory"])
selected_video_files = random.sample(all_video_files, min(config["num_videos_to_process"], len(all_video_files)))

for video_file in tqdm(selected_video_files, desc="Processing videos"):
    video_path = os.path.join(config["video_directory"], video_file)
    prediction, explanation_path = process_and_explain(video_path, model, feature_extractor)
    print(f"Processed {video_file}: Top prediction index = {prediction}, Explanation saved to {explanation_path}")


Processing videos:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Processing videos:  10%|█         | 1/10 [01:25<12:50, 85.57s/it]

Processed WwZv2UcULCo.mp4: Top prediction index = 229, Explanation saved to Results/WwZv2UcULCo.mp4_explanation.png


  0%|          | 0/1000 [00:00<?, ?it/s]

Processing videos:  20%|██        | 2/10 [02:37<10:22, 77.81s/it]

Processed GuDFD9qmXkY.mp4: Top prediction index = 209, Explanation saved to Results/GuDFD9qmXkY.mp4_explanation.png


  0%|          | 0/1000 [00:00<?, ?it/s]

Processing videos:  30%|███       | 3/10 [03:50<08:48, 75.46s/it]

Processed 5wzGMlYQzDU.mp4: Top prediction index = 280, Explanation saved to Results/5wzGMlYQzDU.mp4_explanation.png


  0%|          | 0/1000 [00:00<?, ?it/s]

Processing videos:  40%|████      | 4/10 [05:00<07:18, 73.13s/it]

Processed zlWSKUJl76M.mp4: Top prediction index = 98, Explanation saved to Results/zlWSKUJl76M.mp4_explanation.png


  0%|          | 0/1000 [00:00<?, ?it/s]

Processing videos:  50%|█████     | 5/10 [06:14<06:08, 73.65s/it]

Processed -5-dvLrzE78.mp4: Top prediction index = 167, Explanation saved to Results/-5-dvLrzE78.mp4_explanation.png


  0%|          | 0/1000 [00:00<?, ?it/s]

Processing videos:  60%|██████    | 6/10 [07:27<04:54, 73.50s/it]

Processed -nTtZiY1He0.mp4: Top prediction index = 214, Explanation saved to Results/-nTtZiY1He0.mp4_explanation.png


  0%|          | 0/1000 [00:00<?, ?it/s]

Processing videos:  70%|███████   | 7/10 [08:41<03:40, 73.39s/it]

Processed fT8BdRZNfTg.mp4: Top prediction index = 349, Explanation saved to Results/fT8BdRZNfTg.mp4_explanation.png


  0%|          | 0/1000 [00:00<?, ?it/s]

Processing videos:  80%|████████  | 8/10 [09:50<02:24, 72.12s/it]

Processed yorLJaaroLs.mp4: Top prediction index = 280, Explanation saved to Results/yorLJaaroLs.mp4_explanation.png


  0%|          | 0/1000 [00:00<?, ?it/s]

Processing videos:  90%|█████████ | 9/10 [11:04<01:12, 72.62s/it]

Processed ZLlPJuLCHi8.mp4: Top prediction index = 280, Explanation saved to Results/ZLlPJuLCHi8.mp4_explanation.png


  0%|          | 0/1000 [00:00<?, ?it/s]

Processing videos: 100%|██████████| 10/10 [12:18<00:00, 73.80s/it]

Processed gOxEEuOVXEw.mp4: Top prediction index = 46, Explanation saved to Results/gOxEEuOVXEw.mp4_explanation.png



