STAA 

In [1]:
import numpy as np
import torch
import av
from tqdm import tqdm
import time
import os
from transformers import TimesformerForVideoClassification, AutoImageProcessor
from scipy.stats import kendalltau
import json

class AttentionExtractor:
    def __init__(self, model_name, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.model = TimesformerForVideoClassification.from_pretrained(model_name)
        self.model.to(device)
        self.device = device
        self.image_processor = AutoImageProcessor.from_pretrained(model_name)

    def extract_attention(self, frames):
        # 确保帧数据的形状正确
        if frames.ndim == 3:
            frames = frames[np.newaxis, ...]
        inputs = self.image_processor(list(frames), return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs, output_attentions=True)
        last_layer_attention = outputs.attentions[-1]
        spatial_attention = last_layer_attention.mean(1)
        return outputs.logits.cpu().numpy(), spatial_attention.cpu().numpy()

def load_video(video_path, num_frames=8):
    container = av.open(video_path)
    video_stream = container.streams.video[0]
    
    frames = []
    for frame in container.decode(video=0):
        frames.append(frame.to_rgb().to_ndarray())
        if len(frames) == num_frames:
            break
    
    # 如果视频帧数少于 num_frames，则重复最后一帧
    while len(frames) < num_frames:
        frames.append(frames[-1])
    
    # 确保帧数组的形状正确
    frames = np.stack(frames)
    
    return frames

def calculate_faithfulness(extractor, video_path):
    frames = load_video(video_path)
    original_prediction, attention = extractor.extract_attention(frames)
    
    flat_attention = attention.flatten()
    k = int(0.5 * len(flat_attention))
    top_k_indices = np.argsort(flat_attention)[-k:]
    
    masked_frames = frames.copy()
    masked_frames.flat[top_k_indices] = 0
    
    masked_prediction, _ = extractor.extract_attention(masked_frames)
    
    faithfulness = 1 - np.abs(original_prediction - masked_prediction).mean()
    return faithfulness

def calculate_monotonicity(extractor, video_path):
    frames = load_video(video_path)
    original_prediction, attention = extractor.extract_attention(frames)
    
    flat_attention = attention.flatten()
    percentages = np.arange(0.1, 1.0, 0.1)
    diffs = []
    
    for p in percentages:
        k = int(p * len(flat_attention))
        top_k_indices = np.argsort(flat_attention)[-k:]
        
        masked_frames = frames.copy()
        masked_frames.flat[top_k_indices] = 0
        
        masked_prediction, _ = extractor.extract_attention(masked_frames)
        diff = np.abs(original_prediction - masked_prediction).mean()
        diffs.append(diff)
    
    tau, _ = kendalltau(percentages, diffs)
    return tau

def load_video_labels(label_file):
    video_labels = {}
    with open(label_file, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) == 2:
                video_name, label = parts
                video_labels[video_name.split('.')[0]] = int(label)
    return video_labels

def evaluate_staa(config):
    extractor = AttentionExtractor(config['model_name'])
    video_labels = load_video_labels(config['label_file'])
    
    results = {}
    video_files = [f for f in os.listdir(config['video_directory']) if f.endswith('.mp4')]
    
    for video_file in tqdm(video_files, desc="Processing videos"):
        video_name = video_file.split('.')[0]
        video_path = os.path.join(config['video_directory'], video_file)
        
        if video_name not in video_labels:
            print(f"Warning: No label found for {video_name}. Skipping.")
            continue
        
        true_label = video_labels[video_name]
        
        # Calculate faithfulness
        faithfulness = calculate_faithfulness(extractor, video_path)
        
        # Calculate monotonicity using Kendall's Tau
        monotonicity = calculate_monotonicity(extractor, video_path)
        
        # Calculate computation time
        start_time = time.time()
        _ = extractor.extract_attention(load_video(video_path))
        computation_time = time.time() - start_time
        
        results[video_name] = {
            "true_label": true_label,
            "faithfulness": faithfulness,
            "monotonicity": monotonicity,
            "computation_time": computation_time
        }
    
    # Calculate overall statistics
    faithfulness_scores = [r['faithfulness'] for r in results.values()]
    monotonicity_scores = [r['monotonicity'] for r in results.values()]
    computation_times = [r['computation_time'] for r in results.values()]
    
    print(f"Overall Faithfulness: {np.mean(faithfulness_scores):.2f} ± {np.std(faithfulness_scores):.2f}")
    print(f"Overall Monotonicity (Kendall's Tau): {np.mean(monotonicity_scores):.2f} ± {np.std(monotonicity_scores):.2f}")
    print(f"Average Computation Time: {np.mean(computation_times):.2f} seconds")
    
    # Save results to a JSON file
    with open(os.path.join(config['output_directory'], 'evaluation_results.json'), 'w') as f:
        json.dump(results, f, indent=4)

if __name__ == "__main__":
    config = {
        'model_name': 'facebook/timesformer-base-finetuned-k400',
        'video_directory': 'archive/videos_test',
        'output_directory': 'archive/videos_testRS',
        'label_file': 'archive/kinetics400_val_list_videos.txt',
    }
    
    evaluate_staa(config)

  return torch.tensor(value)
Processing videos:   3%|▎         | 37/1200 [01:25<44:47,  2.31s/it]


KeyboardInterrupt: 

修改STAA

In [None]:
import numpy as np
import torch
import av
from tqdm import tqdm
import time
import os
from transformers import TimesformerForVideoClassification, AutoImageProcessor
from scipy.stats import kendalltau
import json

class AttentionExtractor:
    def __init__(self, model_name, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.model = TimesformerForVideoClassification.from_pretrained(model_name)
        self.model.to(device)
        self.device = device
        self.image_processor = AutoImageProcessor.from_pretrained(model_name)

    def extract_attention(self, frames):
        # 确保帧数据的形状正确
        if frames.ndim == 3:
            frames = frames[np.newaxis, ...]
        inputs = self.image_processor(list(frames), return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs, output_attentions=True)
        last_layer_attention = outputs.attentions[-1]
        spatial_attention = last_layer_attention.mean(1)
        return outputs.logits.cpu().numpy(), spatial_attention.cpu().numpy()

def load_video(video_path, num_frames=8):
    container = av.open(video_path)
    video_stream = container.streams.video[0]
    
    frames = []
    for frame in container.decode(video=0):
        frames.append(frame.to_rgb().to_ndarray())
        if len(frames) == num_frames:
            break
    
    # 如果视频帧数少于 num_frames，则重复最后一帧
    while len(frames) < num_frames:
        frames.append(frames[-1])
    
    # 确保帧数组的形状正确
    frames = np.stack(frames)
    
    return frames


def calculate_faithfulness(extractor, video_path):
    frames = load_video(video_path)
    original_prediction, attention = extractor.extract_attention(frames)
    
    print("Frames shape:", frames.shape)
    print("Attention shape:", attention.shape)
    
    # 获取帧的形状
    frame_shape = frames.shape
    attention_shape = attention.shape
    
    # 计算要保留的像素数量
    k = int(0.5 * attention.size)
    
    # 获取最重要的50%的索引
    flat_attention = attention.flatten()
    top_k_indices = np.argsort(flat_attention)[-k:]
    
    # 创建掩码
    mask = np.zeros_like(flat_attention)
    mask[top_k_indices] = 1
    mask = mask.reshape(attention_shape)
    
    # 将mask调整为与帧相同的大小
    mask_resized = np.zeros(frame_shape[:-1])  # 不包括颜色通道
    for i in range(frame_shape[0]):  # 对每一帧
        mask_resized[i] = cv2.resize(mask[i], (frame_shape[2], frame_shape[1]))
    
    # 应用掩码到原始帧
    masked_frames = frames.copy()
    for i in range(frame_shape[-1]):  # 对每个颜色通道应用掩码
        masked_frames[..., i] = frames[..., i] * mask_resized
    
    masked_prediction, _ = extractor.extract_attention(masked_frames)
    
    # 计算faithfulness
    faithfulness = 1 - np.abs(original_prediction - masked_prediction).mean()
    return faithfulness

def calculate_monotonicity(extractor, video_path):
    frames = load_video(video_path)
    original_prediction, attention = extractor.extract_attention(frames)
    
    frame_shape = frames.shape
    attention_shape = attention.shape
    
    flat_attention = attention.flatten()
    percentages = np.arange(0.1, 1.0, 0.1)
    diffs = []
    
    for p in percentages:
        k = int(p * flat_attention.size)
        top_k_indices = np.argsort(flat_attention)[-k:]
        
        # 创建掩码
        mask = np.zeros_like(flat_attention)
        mask[top_k_indices] = 1
        mask = mask.reshape(attention_shape)
        
        # 将mask调整为与帧相同的大小
        mask_resized = np.zeros(frame_shape[:-1])  # 不包括颜色通道
        for i in range(frame_shape[0]):  # 对每一帧
            mask_resized[i] = cv2.resize(mask[i], (frame_shape[2], frame_shape[1]))
        
        # 应用掩码到原始帧
        masked_frames = frames.copy()
        for i in range(frame_shape[-1]):  # 对每个颜色通道应用掩码
            masked_frames[..., i] = frames[..., i] * mask_resized
        
        masked_prediction, _ = extractor.extract_attention(masked_frames)
        diff = np.abs(original_prediction - masked_prediction).mean()
        diffs.append(diff)
    
    tau, _ = kendalltau(percentages, diffs)
    return tau

def load_video_labels(label_file):
    video_labels = {}
    with open(label_file, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) == 2:
                video_name, label = parts
                video_labels[video_name.split('.')[0]] = int(label)
    return video_labels

def evaluate_staa(config):
    extractor = AttentionExtractor(config['model_name'])
    video_labels = load_video_labels(config['label_file'])
    
    results = {}
    video_files = [f for f in os.listdir(config['video_directory']) if f.endswith('.mp4')]
    
    for video_file in tqdm(video_files, desc="Processing videos"):
        video_name = video_file.split('.')[0]
        video_path = os.path.join(config['video_directory'], video_file)
        
        if video_name not in video_labels:
            print(f"Warning: No label found for {video_name}. Skipping.")
            continue
        
        true_label = video_labels[video_name]
        
        # Calculate faithfulness
        faithfulness = calculate_faithfulness(extractor, video_path)
        
        # Calculate monotonicity using Kendall's Tau
        monotonicity = calculate_monotonicity(extractor, video_path)
        
        # Calculate computation time
        start_time = time.time()
        _ = extractor.extract_attention(load_video(video_path))
        computation_time = time.time() - start_time
        
        results[video_name] = {
            "true_label": true_label,
            "faithfulness": faithfulness,
            "monotonicity": monotonicity,
            "computation_time": computation_time
        }
    
    # Calculate overall statistics
    faithfulness_scores = [r['faithfulness'] for r in results.values()]
    monotonicity_scores = [r['monotonicity'] for r in results.values()]
    computation_times = [r['computation_time'] for r in results.values()]
    
    print(f"Overall Faithfulness: {np.mean(faithfulness_scores):.2f} ± {np.std(faithfulness_scores):.2f}")
    print(f"Overall Monotonicity (Kendall's Tau): {np.mean(monotonicity_scores):.2f} ± {np.std(monotonicity_scores):.2f}")
    print(f"Average Computation Time: {np.mean(computation_times):.2f} seconds")
    
    # Save results to a JSON file
    with open(os.path.join(config['output_directory'], 'evaluation_results.json'), 'w') as f:
        json.dump(results, f, indent=4)

if __name__ == "__main__":
    config = {
        'model_name': 'facebook/timesformer-base-finetuned-k400',
        'video_directory': 'archive/videos_test',
        'output_directory': 'archive/videos_testRS2',
        'label_file': 'archive/kinetics400_val_list_videos.txt',
    }
    
    evaluate_staa(config)

Processing videos:   0%|          | 0/4 [00:00<?, ?it/s]

Frames shape: (8, 320, 570, 3)
Attention shape: (8, 197, 197)


Processing videos:  25%|██▌       | 1/4 [00:02<00:07,  2.34s/it]

Frames shape: (8, 320, 426, 3)
Attention shape: (8, 197, 197)


Processing videos:  50%|█████     | 2/4 [00:04<00:04,  2.29s/it]

Frames shape: (8, 320, 426, 3)
Attention shape: (8, 197, 197)


Processing videos:  75%|███████▌  | 3/4 [00:06<00:02,  2.28s/it]

Frames shape: (8, 320, 426, 3)
Attention shape: (8, 197, 197)


Processing videos: 100%|██████████| 4/4 [00:09<00:00,  2.27s/it]

Overall Faithfulness: -1.15 ± 0.15
Overall Monotonicity (Kendall's Tau): -0.89 ± 0.04
Average Computation Time: 0.16 seconds





SHAP

保留重要的段

In [None]:
import os
import av
import torch
import numpy as np
import torch.nn.functional as F
from transformers import AutoImageProcessor, TimesformerForVideoClassification
from tqdm import tqdm
from scipy.stats import kendalltau
import time
import json
import random
from collections import defaultdict
from itertools import combinations 

class VideoProcessor:
    def __init__(self, model_name, image_processor_name, device='cuda'):
        self.model = TimesformerForVideoClassification.from_pretrained(model_name)
        self.image_processor = AutoImageProcessor.from_pretrained(image_processor_name)
        self.device = device
        self.model.to(device)

    def split_video_into_segments(self, container, n_segments=8, frames_per_segment=16):
        frame_list = [frame.to_image() for frame in container.decode(video=0)]
        total_frames = len(frame_list)
        segment_length = total_frames // n_segments
        segments = []
        for i in range(n_segments):
            start = i * segment_length
            end = min(start + segment_length, total_frames)
            segment_frames = frame_list[start:end] if end - start == segment_length else frame_list[start:] + [frame_list[-1]] * (segment_length - (end - start))
            segments.append(segment_frames[:frames_per_segment])
        return segments

    def predict_video_and_segments(self, container, true_label):
        video_segments = self.split_video_into_segments(container)
        segment_outputs = []
        with torch.no_grad():
            for segment in video_segments:
                inputs = self.image_processor(list(segment), return_tensors="pt")
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                try:
                    outputs = self.model(**inputs)
                    logits = outputs.logits
                    probabilities = F.softmax(logits, dim=-1)
                    pred_label = logits.argmax(-1).item()
                    pred_score = probabilities[0, pred_label].item()
                    segment_outputs.append((pred_label, pred_score, probabilities))
                except RuntimeError as e:
                    print(f"Error processing segment: {e}")
                    continue
        return segment_outputs

class TemporalShap:
    def __init__(self, num_samples=100):
        self.num_samples = num_samples

    def exact_shapley_values(self, segment_outputs, label_index):
        n = len(segment_outputs)
        shapley_values = [0] * n
        all_indices = list(range(n))
        for i in all_indices:
            marginal_contributions = []
            for subset_size in range(n):
                subsets = list(combinations([x for x in all_indices if x != i], subset_size))
                for subset in subsets:
                    subset_prob = torch.zeros_like(segment_outputs[0][2])
                    if subset:
                        subset_prob = torch.mean(torch.stack([segment_outputs[j][2] for j in subset]), dim=0)
                    with_i_prob = (subset_prob * len(subset) + segment_outputs[i][2]) / (len(subset) + 1)
                    marginal_contributions.append(with_i_prob[0, label_index].item() - subset_prob[0, label_index].item())
            shapley_values[i] = np.mean(marginal_contributions)
        return shapley_values

class ShapEvaluator:
    def __init__(self, video_processor, shap_calculator):
        self.video_processor = video_processor
        self.shap_calculator = shap_calculator

    def calculate_faithfulness(self, video_path, true_label):
        container = av.open(video_path)
        segment_outputs = self.video_processor.predict_video_and_segments(container, true_label)
        if not segment_outputs:
            return None

        video_probs = torch.mean(torch.stack([output[2] for output in segment_outputs]), dim=0)
        original_prediction = video_probs[0, true_label].item()

        shapley_values = self.shap_calculator.exact_shapley_values(segment_outputs, true_label)
        
        # 选择 Shapley 值最高的 50% 的段（最重要的段）
        k = len(shapley_values) // 2
        most_important_indices = np.argsort(shapley_values)[-k:]
        
        # 只保留重要的段
        important_probs = [output[2] for i, output in enumerate(segment_outputs) if i in most_important_indices]
        
        if important_probs:
            important_video_probs = torch.mean(torch.stack(important_probs), dim=0)
            important_prediction = important_video_probs[0, true_label].item()
        else:
            # 如果没有重要的段，则假设预测是随机的
            important_prediction = 1.0 / video_probs.shape[1]  # 假设是均匀分布

        faithfulness = 1 - abs(original_prediction - important_prediction)
        return faithfulness

    def calculate_monotonicity(self, video_path, true_label):
        container = av.open(video_path)
        segment_outputs = self.video_processor.predict_video_and_segments(container, true_label)
        if not segment_outputs:
            return None

        shapley_values = self.shap_calculator.exact_shapley_values(segment_outputs, true_label)
        
        percentages = np.arange(0.1, 1.0, 0.1)
        diffs = []

        original_probs = torch.mean(torch.stack([output[2] for output in segment_outputs]), dim=0)
        original_prediction = original_probs[0, true_label].item()

        for p in percentages:
            k = int(p * len(shapley_values))
            most_important_indices = np.argsort(shapley_values)[-k:]
            
            important_probs = [output[2] for i, output in enumerate(segment_outputs) if i in most_important_indices]
            
            if important_probs:
                important_video_probs = torch.mean(torch.stack(important_probs), dim=0)
                important_prediction = important_video_probs[0, true_label].item()
            else:
                important_prediction = 1.0 / original_probs.shape[1]  # 假设是均匀分布
            
            diff = abs(original_prediction - important_prediction)
            diffs.append(diff)

        tau, _ = kendalltau(percentages, diffs)
        return tau

def load_video_labels(label_file):
    video_labels = {}
    with open(label_file, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) == 2:
                video_name, label = parts
                video_labels[video_name.split('.')[0]] = int(label)
    return video_labels

def evaluate_shap(config):
    video_processor = VideoProcessor(config["model_name"], config["image_processor_name"])
    shap_calculator = TemporalShap(num_samples=config["num_samples"])
    evaluator = ShapEvaluator(video_processor, shap_calculator)

    video_labels = load_video_labels(config["video_list_path"])
    
    faithfulness_scores = []
    monotonicity_scores = []
    computation_times = []

    for video_name, true_label in tqdm(list(video_labels.items())[:config["num_eval_videos"]], desc="Evaluating videos"):
        video_path = os.path.join(config["video_directory"], video_name + '.mp4')
        
        start_time = time.time()
        
        faithfulness = evaluator.calculate_faithfulness(video_path, true_label)
        monotonicity = evaluator.calculate_monotonicity(video_path, true_label)
        
        end_time = time.time()
        computation_time = end_time - start_time

        if faithfulness is not None and monotonicity is not None:
            faithfulness_scores.append(faithfulness)
            monotonicity_scores.append(monotonicity)
            computation_times.append(computation_time)

    results = {
        "faithfulness": {
            "mean": np.mean(faithfulness_scores),
            "std": np.std(faithfulness_scores)
        },
        "monotonicity": {
            "mean": np.mean(monotonicity_scores),
            "std": np.std(monotonicity_scores)
        },
        "computation_time": {
            "mean": np.mean(computation_times),
            "std": np.std(computation_times)
        }
    }

    with open("shap_evaluation_results.json", "w") as f:
        json.dump(results, f, indent=4)

    print(f"Faithfulness: {results['faithfulness']['mean']:.4f} ± {results['faithfulness']['std']:.4f}")
    print(f"Monotonicity: {results['monotonicity']['mean']:.4f} ± {results['monotonicity']['std']:.4f}")
    print(f"Computation Time: {results['computation_time']['mean']:.2f} ± {results['computation_time']['std']:.2f} seconds")

if __name__ == "__main__":
    config = {
        "model_name": "facebook/timesformer-base-finetuned-k400",
        "image_processor_name": "MCG-NJU/videomae-base-finetuned-kinetics",
        "num_samples": 100,
        "video_list_path": "archive/kinetics400_val_list_videos.txt",
        "video_directory": "archive/videos_test",
        "num_eval_videos": 1200  # 设置要评估的视频数量
    }
    
    evaluate_shap(config)

Evaluating videos: 100%|██████████| 10/10 [00:57<00:00,  5.74s/it]

Faithfulness: 0.8515 ± 0.1543
Monotonicity: -0.5353 ± 0.0000
Computation Time: 5.74 ± 0.13 seconds





LIME


In [None]:
import os
import av
import torch
from transformers import ViTForImageClassification, ViTFeatureExtractor, ViTConfig
from PIL import Image
import numpy as np
from tqdm import tqdm
from lime import lime_image
import matplotlib.pyplot as plt
import json
import logging
from torch.cuda.amp import autocast
from scipy.stats import kendalltau
import time

# 设置日志
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Configuration
config = {
    "model_config": "google/vit-base-patch16-224",
    "model_path": "finetuned_vit_model_20.pth",
    "feature_extractor_name": "google/vit-base-patch16-224",
    "video_directory": "archive/videos_test",
    "results_folder": "archive/videos_testRSLIME",
    "num_classes": 400,
    "num_frames_per_video": 8,
    "lime_num_samples": 300,
    "video_list_path": "archive/kinetics400_val_list_videos.txt",
    "num_eval_videos": 1200  # 设置要评估的视频数量，如果想处理所有视频，可以设置为一个很大的数
}

# 确保结果目录存在
os.makedirs(config["results_folder"], exist_ok=True)

# 设置设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.info(f"Using device: {device}")

# 加载模型和特征提取器
try:
    model_config = ViTConfig.from_pretrained(config["model_config"], num_labels=config["num_classes"])
    model = ViTForImageClassification(model_config)
    model.load_state_dict(torch.load(config["model_path"], map_location=device))
    model.to(device)
    model.eval()
    feature_extractor = ViTFeatureExtractor.from_pretrained(config["feature_extractor_name"])
    logging.info("Model and feature extractor loaded successfully")
except Exception as e:
    logging.error(f"Error loading model or feature extractor: {e}")
    raise

def load_video_labels(label_file):
    video_labels = {}
    with open(label_file, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) == 2:
                video_name, label = parts
                video_labels[video_name.split('.')[0]] = int(label)
    return video_labels

def extract_frames(video_path, num_frames):
    frames = []
    try:
        with av.open(video_path) as container:
            stream = container.streams.video[0]
            duration = stream.duration * stream.time_base
            for i in range(num_frames):
                target_ts = duration * (i + 1) / (num_frames + 1)
                container.seek(int(target_ts / stream.time_base))
                for frame in container.decode(video=0):
                    frames.append(frame.to_image())
                    break
    except Exception as e:
        logging.error(f"Error extracting frames from {video_path}: {e}")
    return frames

def calculate_faithfulness(original_prediction, masked_prediction):
    return 1 - abs(original_prediction - masked_prediction)

def calculate_monotonicity(percentages, diffs):
    if len(percentages) != len(diffs):
        logging.warning(f"Mismatch in lengths: percentages ({len(percentages)}) and diffs ({len(diffs)})")
        return None
    
    if len(set(diffs)) == 1:  # 如果所有的差异值都相同
        logging.warning("All difference values are the same, monotonicity is undefined")
        return None
    
    if np.isnan(diffs).any() or np.isinf(diffs).any():
        logging.warning("NaN or Inf values found in diffs")
        return None
    
    try:
        tau, p_value = kendalltau(percentages, diffs)
        if np.isnan(tau):
            logging.warning("Kendall's tau is NaN")
            return None
        return tau
    except Exception as e:
        logging.error(f"Error in calculating Kendall's tau: {e}")
        return None

def process_and_explain(video_path, model, feature_extractor, num_frames):
    frames = extract_frames(video_path, num_frames)
    if not frames:
        logging.error(f"No frames extracted from {video_path}")
        return None

    results = []
    for i, frame in enumerate(frames):
        try:
            inputs = feature_extractor(images=frame, return_tensors="pt").to(device)
            with autocast():
                with torch.no_grad():
                    outputs = model(**inputs)
            preds = torch.nn.functional.softmax(outputs.logits, dim=-1)
            top_pred = preds.argmax().item()
            original_prediction = preds[0, top_pred].item()

            def batch_predict(images):
                batch_inputs = feature_extractor(images=[Image.fromarray(img.astype('uint8')) for img in images], return_tensors="pt").to(device)
                with autocast():
                    with torch.no_grad():
                        batch_outputs = model(**batch_inputs)
                return torch.nn.functional.softmax(batch_outputs.logits, dim=-1).cpu().numpy()

            explainer = lime_image.LimeImageExplainer()
            explanation = explainer.explain_instance(np.array(frame), 
                                                     batch_predict, 
                                                     top_labels=5, 
                                                     hide_color=0, 
                                                     num_samples=config["lime_num_samples"])
            
            saliency_map = explanation.get_image_and_mask(top_pred, positive_only=True, num_features=10, hide_rest=False)[1]
            saliency_map = (saliency_map - saliency_map.min()) / (saliency_map.max() - saliency_map.min())

            # Calculate faithfulness
            mask = saliency_map > 0.5
            masked_frame = np.array(frame) * mask[..., np.newaxis]
            masked_inputs = feature_extractor(images=Image.fromarray(masked_frame.astype('uint8')), return_tensors="pt").to(device)
            with autocast():
                with torch.no_grad():
                    masked_outputs = model(**masked_inputs)
            masked_preds = torch.nn.functional.softmax(masked_outputs.logits, dim=-1)
            masked_prediction = masked_preds[0, top_pred].item()
            faithfulness = calculate_faithfulness(original_prediction, masked_prediction)

            # Calculate monotonicity
            percentages = np.linspace(0.1, 1.0, 10)
            diffs = []
            for p in percentages:
                threshold = np.percentile(saliency_map, 100 * (1 - p))
                temp_mask = saliency_map > threshold
                temp_masked_frame = np.array(frame) * temp_mask[..., np.newaxis]
                temp_inputs = feature_extractor(images=Image.fromarray(temp_masked_frame.astype('uint8')), return_tensors="pt").to(device)
                with autocast():
                    with torch.no_grad():
                        temp_outputs = model(**temp_inputs)
                temp_preds = torch.nn.functional.softmax(temp_outputs.logits, dim=-1)
                temp_prediction = temp_preds[0, top_pred].item()
                diffs.append(abs(original_prediction - temp_prediction))
            
            logging.debug(f"Percentages: {percentages}")
            logging.debug(f"Diffs: {diffs}")
            
            monotonicity = calculate_monotonicity(percentages, diffs)
            if monotonicity is None:
                logging.warning(f"Unable to calculate monotonicity for frame {i+1}")

            results.append({
                "frame_index": i,
                "top_prediction": top_pred,
                "prediction_score": original_prediction,
                "faithfulness": faithfulness,
                "monotonicity": monotonicity
            })

        except Exception as e:
            logging.error(f"Error processing frame {i+1} of {video_path}: {e}")

    return results

def evaluate_lime(config):
    video_labels = load_video_labels(config["video_list_path"])
    
    faithfulness_scores = []
    monotonicity_scores = []
    computation_times = []

    video_files = [f for f in os.listdir(config["video_directory"]) if f.endswith('.mp4')][:config["num_eval_videos"]]

    for video_file in tqdm(video_files, desc="Evaluating videos"):
        video_path = os.path.join(config["video_directory"], video_file)
        video_name = os.path.splitext(video_file)[0]
        true_label = video_labels.get(video_name)
        
        if true_label is None:
            logging.warning(f"Label not found for video: {video_file}")
            continue

        logging.info(f"Processing video: {video_path} (Label: {true_label})")
        
        start_time = time.time()
        results = process_and_explain(video_path, model, feature_extractor, config["num_frames_per_video"])
        end_time = time.time()
        
        if results:
            frame_faithfulness = [r['faithfulness'] for r in results]
            frame_monotonicity = [r['monotonicity'] for r in results if r['monotonicity'] is not None]
            faithfulness_scores.extend(frame_faithfulness)
            monotonicity_scores.extend(frame_monotonicity)
            computation_times.append(end_time - start_time)
            logging.info(f"Processed {video_file}")
            logging.info(f"  Faithfulness: {np.mean(frame_faithfulness):.4f} ± {np.std(frame_faithfulness):.4f}")
            if frame_monotonicity:
                logging.info(f"  Monotonicity: {np.mean(frame_monotonicity):.4f} ± {np.std(frame_monotonicity):.4f}")
            else:
                logging.warning(f"  No valid monotonicity scores for {video_file}")
        else:
            logging.warning(f"Failed to process {video_file}")

    # 计算结果时处理可能的空列表
    results = {
        "faithfulness": {
            "mean": np.mean(faithfulness_scores) if faithfulness_scores else "N/A",
            "std": np.std(faithfulness_scores) if faithfulness_scores else "N/A"
        },
        "monotonicity": {
            "mean": np.mean(monotonicity_scores) if monotonicity_scores else "N/A",
            "std": np.std(monotonicity_scores) if monotonicity_scores else "N/A"
        },
        "computation_time": {
            "mean": np.mean(computation_times) if computation_times else "N/A",
            "std": np.std(computation_times) if computation_times else "N/A"
        }
    }

    with open(os.path.join(config["results_folder"], "lime_evaluation_results.json"), "w") as f:
        json.dump(results, f, indent=4)

    print(f"Faithfulness: {results['faithfulness']['mean']:.4f} ± {results['faithfulness']['std']:.4f}")
    print(f"Monotonicity: {results['monotonicity']['mean']:.4f} ± {results['monotonicity']['std']:.4f}")
    print(f"Computation Time: {results['computation_time']['mean']:.2f} ± {results['computation_time']['std']:.2f} seconds")

if __name__ == "__main__":
    evaluate_lime(config)

2024-10-02 17:07:55,374 - INFO - Using device: cuda
2024-10-02 17:07:56,530 - INFO - Model and feature extractor loaded successfully
Evaluating videos:   0%|          | 0/4 [00:00<?, ?it/s]2024-10-02 17:07:56,541 - INFO - Processing video: archive/example/-8oPwToqArE.mp4 (Label: 265)


  0%|          | 0/1000 [00:00<?, ?it/s]



  0%|          | 0/1000 [00:00<?, ?it/s]



  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]



  0%|          | 0/1000 [00:00<?, ?it/s]



  0%|          | 0/1000 [00:00<?, ?it/s]



  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

2024-10-02 17:09:41,377 - INFO - Processed -8oPwToqArE.mp4
2024-10-02 17:09:41,377 - INFO -   Faithfulness: 0.0182 ± 0.0079
2024-10-02 17:09:41,377 - INFO -   Monotonicity: -0.4472 ± 0.0000
Evaluating videos:  25%|██▌       | 1/4 [01:44<05:14, 104.84s/it]2024-10-02 17:09:41,378 - INFO - Processing video: archive/example/-6wNVod8iag.mp4 (Label: 241)


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]



  0%|          | 0/1000 [00:00<?, ?it/s]

2024-10-02 17:11:00,496 - INFO - Processed -6wNVod8iag.mp4
2024-10-02 17:11:00,497 - INFO -   Faithfulness: 0.2128 ± 0.0017
2024-10-02 17:11:00,497 - INFO -   Monotonicity: -0.4472 ± 0.0000
Evaluating videos:  50%|█████     | 2/4 [03:03<02:59, 89.71s/it] 2024-10-02 17:11:00,498 - INFO - Processing video: archive/example/-9i4bm2OiZ4.mp4 (Label: 201)


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

2024-10-02 17:12:18,495 - INFO - Processed -9i4bm2OiZ4.mp4
2024-10-02 17:12:18,496 - INFO -   Faithfulness: 0.9589 ± 0.0051
2024-10-02 17:12:18,496 - INFO -   Monotonicity: -0.4472 ± 0.0000
Evaluating videos:  75%|███████▌  | 3/4 [04:21<01:24, 84.36s/it]2024-10-02 17:12:18,497 - INFO - Processing video: archive/example/-0ew-c0w7uc.mp4 (Label: 122)


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

2024-10-02 17:13:35,081 - INFO - Processed -0ew-c0w7uc.mp4
2024-10-02 17:13:35,083 - INFO -   Faithfulness: 0.3956 ± 0.0063
2024-10-02 17:13:35,083 - INFO -   Monotonicity: -0.5031 ± 0.0722
Evaluating videos: 100%|██████████| 4/4 [05:38<00:00, 84.64s/it]

Faithfulness: 0.3964 ± 0.3512
Monotonicity: -0.4651 ± 0.0484
Computation Time: 84.63 ± 11.70 seconds



