# V-BAD (Video-Based Adversarial Density): gradient-guided adversarial perturbation of videos for 3D-CNN classifiers

In [2]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.models.video as video_models
import numpy as np
from tqdm import tqdm
import os
import av
from PIL import Image
import cv2

class VBAD:
    """Gradient-based targeted adversarial perturbation for video classifiers.

    The class wraps one of three torchvision 3D-CNN video models, computes a
    per-pixel "density" map from input-gradient magnitudes, and produces a
    single-step perturbation that moves a video towards a target class.

    Config keys read by this class:
        ``model_type``      one of ``'r3d_18'``, ``'mc3_18'``, ``'r2plus1d_18'``
        ``num_classes``     size of the classification head
        ``epsilon``         L2 length of the gradient step
        ``density_weight``  weight of the density term in the attack loss
        ``true_label`` / ``target_label``  read by :meth:`apply_perturbation`
    """

    # Per-channel statistics handed to ``transforms.Normalize`` and used again
    # to undo the normalisation before frames are written out.
    # NOTE(review): these look like the statistics torchvision documents for
    # its video models - confirm against the torchvision docs.
    NORM_MEAN = (0.43216, 0.394666, 0.37645)
    NORM_STD = (0.22803, 0.22145, 0.216989)

    # Maximum number of frames fed to the model per density-map window.
    MAX_WINDOW = 16

    _SUPPORTED_MODELS = ('r3d_18', 'mc3_18', 'r2plus1d_18')

    def __init__(self, config):
        """Build the model and the per-frame preprocessing pipeline.

        Args:
            config: dict with at least ``model_type`` and ``num_classes``.
                The dict is stored by reference, so later updates made by the
                caller (e.g. labels) are visible to this instance.

        Raises:
            ValueError: if ``config['model_type']`` is not supported.
        """
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        model_type = config['model_type']
        if model_type not in self._SUPPORTED_MODELS:
            raise ValueError(f"Unsupported model type: {config['model_type']}")
        self.model = getattr(video_models, model_type)(pretrained=True)

        # Only swap the classification head when its size really differs.
        # Unconditionally replacing it would throw away the pretrained head
        # and leave a randomly initialised classifier.
        if self.model.fc.out_features != config['num_classes']:
            num_features = self.model.fc.in_features
            self.model.fc = nn.Linear(num_features, config['num_classes'])

        self.model.eval()
        # Only gradients w.r.t. the input are needed, never w.r.t. weights.
        for param in self.model.parameters():
            param.requires_grad_(False)
        self.model = self.model.to(self.device)

        self.transform = transforms.Compose([
            transforms.Resize((112, 112)),  # 3D CNNs typically use a small input size
            transforms.ToTensor(),
            transforms.Normalize(mean=list(self.NORM_MEAN),
                                 std=list(self.NORM_STD))
        ])

    def preprocess_video(self, frames):
        """Convert a sequence of RGB frames into a normalised model input.

        Args:
            frames: sequence of arrays accepted by ``PIL.Image.fromarray``.

        Returns:
            Tensor of shape ``[1, C, T, H, W]`` on ``self.device``.

        Raises:
            ValueError: if ``frames`` is empty.
        """
        if len(frames) == 0:
            raise ValueError("Cannot preprocess a video with no frames")

        processed_frames = [self.transform(Image.fromarray(frame)) for frame in frames]

        # Stack along a new time axis -> [C, T, H, W], then add the batch axis.
        video_tensor = torch.stack(processed_frames, dim=1).unsqueeze(0)
        return video_tensor.to(self.device)

    def generate_density_map(self, video_tensor):
        """Compute a normalised input-gradient magnitude map.

        The video is split into overlapping temporal windows; for each window
        the gradient of the mean model output w.r.t. the pixels is taken and
        its L2 norm over the channel axis is accumulated per frame. Frames
        covered by several windows are averaged, and the result is min-max
        scaled to ``[0, 1]``.

        Args:
            video_tensor: tensor of shape ``[1, C, T, H, W]``.

        Returns:
            Tensor of shape ``[T, H, W]`` that does not require grad.
        """
        with torch.enable_grad():
            T, H, W = video_tensor.shape[2:]
            device = video_tensor.device
            density_map = torch.zeros((T, H, W), device=device)
            if T == 0:
                return density_map
            coverage = torch.zeros((T, 1, 1), device=device)

            window_size = min(self.MAX_WINDOW, T)
            # A one-frame window would give stride 0, which range() rejects.
            stride = max(1, window_size // 2)

            starts = list(range(0, T - window_size + 1, stride))
            # Anchor one extra window at the end so trailing frames that do
            # not fall on a stride boundary are still scored.
            last_start = T - window_size
            if starts[-1] != last_start:
                starts.append(last_start)

            for t in starts:
                # detach() makes the clip a leaf tensor; otherwise its gradient
                # would flow into the caller's tensor instead of the clip.
                clip = video_tensor[:, :, t:t + window_size].detach().clone()
                clip.requires_grad_(True)

                outputs = self.model(clip)
                loss = outputs.mean()

                (clip_grad,) = torch.autograd.grad(loss, clip, allow_unused=True)
                if clip_grad is not None:
                    grad_norm = torch.norm(clip_grad, dim=1, p=2)
                    density_map[t:t + window_size] += grad_norm[0]
                coverage[t:t + window_size] += 1

            # Average instead of sum so overlapped frames are not over-weighted.
            density_map = density_map / coverage

            if torch.sum(density_map) > 0:
                low, high = density_map.min(), density_map.max()
                density_map = (density_map - low) / (high - low + 1e-8)

            return density_map

    def generate_adversarial_perturbation(self, video_tensor, true_label, target_label):
        """Compute a single-step targeted perturbation for ``video_tensor``.

        The loss is cross-entropy towards ``target_label`` plus
        ``density_weight`` times a density-weighted pixel-norm term. The
        perturbation is a step of L2 length ``epsilon`` along the negative
        gradient (descent lowers the target-class loss), clamped element-wise
        to ``[-0.1, 0.1]``.

        Args:
            video_tensor: normalised input of shape ``[1, C, T, H, W]``.
            true_label: accepted for interface compatibility; not used.
            target_label: class index the attack should move towards.

        Returns:
            ``(perturbation, density_map)``; the perturbation has the shape of
            ``video_tensor`` and the density map has shape ``[T, H, W]``.
        """
        with torch.enable_grad():
            video_tensor = video_tensor.detach().clone()
            video_tensor.requires_grad_(True)

            # The density map acts as a fixed weighting, not a differentiable term.
            density_map = self.generate_density_map(video_tensor).detach()

            target = torch.tensor([target_label], device=video_tensor.device)

            outputs = self.model(video_tensor)
            criterion = nn.CrossEntropyLoss()
            adv_loss = criterion(outputs, target)

            density_loss = torch.mean(density_map.unsqueeze(0) * torch.norm(video_tensor, dim=1))

            loss = adv_loss + self.config['density_weight'] * density_loss

            (grad,) = torch.autograd.grad(loss, video_tensor, allow_unused=True)

            if grad is not None:
                grad_norm = torch.norm(grad, p=2)
                # Targeted attack: step against the gradient to reduce the
                # loss towards the target class.
                direction = -grad / (grad_norm + 1e-8)

                perturbation = self.config['epsilon'] * direction
                perturbation = torch.clamp(perturbation, -0.1, 0.1)
            else:
                perturbation = torch.zeros_like(video_tensor)

            return perturbation.detach(), density_map

    def apply_perturbation(self, frames):
        """Perturb a video and return displayable frames plus the density map.

        Reads ``true_label`` and ``target_label`` from ``self.config``.

        Args:
            frames: sequence of RGB ``uint8`` arrays of shape ``[H, W, 3]``.

        Returns:
            ``(perturbed_frames, density_map)`` where ``perturbed_frames`` is a
            list of ``uint8`` arrays resized to the first input frame's size
            and ``density_map`` is a NumPy array of shape ``[T, h, w]``.
        """
        video_tensor = self.preprocess_video(frames)

        perturbation, density_map = self.generate_adversarial_perturbation(
            video_tensor,
            self.config['true_label'],
            self.config['target_label']
        )

        perturbed_video = (video_tensor + perturbation).detach()

        # Undo the normalisation first and clamp in pixel space. Clamping the
        # normalised tensor to [0, 1] would cut off most of the valid range.
        stats_shape = (1, -1, 1, 1, 1)
        mean = torch.tensor(self.NORM_MEAN, device=perturbed_video.device,
                            dtype=perturbed_video.dtype).view(stats_shape)
        std = torch.tensor(self.NORM_STD, device=perturbed_video.device,
                           dtype=perturbed_video.dtype).view(stats_shape)
        pixels = (perturbed_video * std + mean).clamp(0, 1)

        # [C, T, H, W] -> [T, H, W, C]; contiguous so OpenCV can consume it.
        frames_np = pixels[0].permute(1, 2, 3, 0).contiguous().cpu().numpy()

        out_height, out_width = frames[0].shape[:2]
        perturbed_frames = []
        for frame in frames_np:
            frame = cv2.resize(frame, (out_width, out_height),
                               interpolation=cv2.INTER_CUBIC)
            # Bicubic interpolation can overshoot [0, 1]; clip before the cast
            # so values do not wrap around in uint8.
            frame = np.clip(np.rint(frame * 255.0), 0, 255).astype(np.uint8)
            perturbed_frames.append(frame)

        return perturbed_frames, density_map.detach().cpu().numpy()


def load_video(video_path):
    """Decode every frame of the first video stream into RGB arrays.

    Args:
        video_path: path of the video file to open with PyAV.

    Returns:
        ``(frames, fps)`` where ``frames`` is a list of NumPy arrays produced
        by ``frame.to_rgb().to_ndarray()`` and ``fps`` is the stream's average
        frame rate as a float. When the container reports no average rate,
        30.0 is used, matching the default of ``save_video``.
    """
    # The context manager closes the container even if decoding fails.
    with av.open(video_path) as container:
        stream = container.streams.video[0]
        average_rate = stream.average_rate
        # average_rate can be None for some containers; float(None) would raise.
        fps = float(average_rate) if average_rate else 30.0

        frames = [frame.to_rgb().to_ndarray() for frame in container.decode(video=0)]

    return frames, fps

def save_video(frames, output_path, fps=30):
    """Encode RGB frames to an H.264 video file.

    Args:
        frames: non-empty sequence of ``uint8`` RGB arrays of shape
            ``[H, W, 3]``; the size of the first frame sets the stream size.
        output_path: destination file path, opened with PyAV in write mode.
        fps: frame rate as int, float or ``Fraction``.

    Raises:
        ValueError: if ``frames`` is empty (checked before any file is created).
    """
    from fractions import Fraction

    if len(frames) == 0:
        raise ValueError("Cannot save a video with no frames")

    # PyAV expects an int or Fraction rate. A raw float such as 29.97 expands
    # to a huge exact fraction, so snap it to a small denominator
    # (e.g. 30000/1001) first.
    rate = Fraction(fps).limit_denominator(1001)

    height, width = frames[0].shape[:2]

    # The context manager closes the container even if encoding fails.
    with av.open(output_path, mode='w') as container:
        stream = container.add_stream('h264', rate=rate)
        stream.width = width
        stream.height = height
        # NOTE(review): yuv420p generally needs even width/height - confirm
        # that inputs with odd dimensions are not expected here.
        stream.pix_fmt = 'yuv420p'

        for frame in frames:
            video_frame = av.VideoFrame.from_ndarray(frame, format='rgb24')
            container.mux(stream.encode(video_frame))

        # Passing None flushes the packets still buffered in the encoder.
        container.mux(stream.encode(None))

def process_videos(config):
    """Generate an adversarial copy of every video in a directory.

    Outputs are written to a sibling directory of ``video_directory`` named
    ``VBAD_<model_type>``. For each video a random target class different from
    the (currently fixed) true label is drawn and stored in ``config`` under
    ``true_label`` / ``target_label`` before the attack is run.

    Args:
        config: dict with ``video_directory``, ``model_type``, ``num_classes``
            and the keys required by ``VBAD``. ``save_density_map`` is
            optional and treated as False when absent. The dict is mutated.

    Raises:
        FileNotFoundError: if ``video_directory`` does not exist. This is
            checked before the model is built or any output directory is
            created.
    """
    video_directory = config['video_directory']
    if not os.path.isdir(video_directory):
        raise FileNotFoundError(
            f"Video directory does not exist: {video_directory}"
        )

    vbad = VBAD(config)

    # normpath drops a trailing slash so dirname yields the parent directory
    # rather than the input directory itself.
    adv_video_dir = os.path.join(
        os.path.dirname(os.path.normpath(video_directory)),
        f'VBAD_{config["model_type"]}'
    )
    os.makedirs(adv_video_dir, exist_ok=True)

    # Sorted for a reproducible processing order.
    for video_name in tqdm(sorted(os.listdir(video_directory))):
        # Match extensions case-insensitively so e.g. ".MP4" is not skipped.
        if not video_name.lower().endswith(('.mp4', '.avi', '.mov')):
            continue

        video_path = os.path.join(video_directory, video_name)

        # Per-video boundary: one broken file must not abort the whole batch.
        try:
            frames, fps = load_video(video_path)

            # TODO(review): the true label is hard-coded to 0;
            # config['label_file'] is never read.
            true_label = 0

            possible_targets = list(range(config['num_classes']))
            possible_targets.remove(true_label)
            target_label = int(np.random.choice(possible_targets))

            # VBAD holds a reference to this dict and reads the labels from it.
            config.update({
                'true_label': true_label,
                'target_label': target_label
            })

            perturbed_frames, density_map = vbad.apply_perturbation(frames)

            output_path = os.path.join(adv_video_dir, video_name)
            save_video(perturbed_frames, output_path, fps)

            if config.get('save_density_map', False):
                density_path = os.path.splitext(output_path)[0] + '_density.npy'
                np.save(density_path, density_map)

        except Exception as e:
            print(f"Error processing {video_name}: {str(e)}")
            continue

# Run configuration consumed by process_videos() and VBAD.
config = {
    # Input locations. 'label_file' is not read by the code in this file.
    'video_directory': '/home/z/Music/st/Kinetics-400/RQ1/videos_val',
    'label_file': '/home/z/Music/st/Kinetics-400/RQ1/kinetics400_val_list_videos.txt',

    # Model selection and size of the classification head.
    'model_type': 'r3d_18',
    'num_classes': 400,

    # Attack hyper-parameters: gradient step length and density-term weight.
    'epsilon': 0.01,
    'density_weight': 0.01,

    # Whether to store the density map next to each output video as .npy.
    'save_density_map': True,
}


# Guarded so importing this module does not start the whole pipeline.
if __name__ == "__main__":
    process_videos(config)

FileNotFoundError: [Errno 2] No such file or directory: '/home/z/Music/st/Kinetics-400/RQ1/videos_val'