Parsewise Attack with Optical Flow

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
import av
import numpy as np
from tqdm import tqdm
import os
from PIL import Image
import cv2


# Classifier whose gradients drive the attack: torchvision's ResNet-101 loaded
# with pretrained weights.
model = models.resnet101(pretrained=True)
# Put layers such as batch norm and dropout into inference behaviour.
model.eval()
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


# Preprocessing applied to every frame before it is fed to `model`:
# resize to 224x224, convert to a float tensor, then normalize each channel
# with the mean/std below (presumably the standard ImageNet statistics).
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                       std=[0.229, 0.224, 0.225])
])

def load_video_labels(label_file):
    """Read a label list file into a ``{video_name: label}`` dictionary.

    Every non-blank line must have the form ``<video_name> <integer_label>``.
    The label is the last whitespace-separated field, so a video name that
    itself contains spaces is still parsed correctly. Blank lines (for example
    a trailing newline at the end of the file) are skipped.

    Args:
        label_file: Path of the text file to read.

    Returns:
        dict mapping each video name (str) to its label (int). If a name
        appears more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank line has no separate label field or the
            label is not an integer.
    """
    video_labels = {}
    with open(label_file, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            entry = line.strip()
            if not entry:
                continue
            # Split once from the right: everything before the last run of
            # whitespace is the video name, the remainder is the label.
            fields = entry.rsplit(None, 1)
            if len(fields) != 2:
                raise ValueError(
                    f"{label_file}:{line_number}: expected '<video> <label>', got {entry!r}"
                )
            video_name, label = fields
            video_labels[video_name] = int(label)
    return video_labels

def generate_adversarial_frame(frame, true_label, target_label, config):
    """Craft a single-step targeted adversarial version of one RGB frame.

    The frame is preprocessed with the module-level ``transform``, a loss made
    of target-class cross-entropy plus total-variation and L2 regularizers is
    differentiated with respect to the input, and one step of size
    ``config['epsilon']`` is taken along the L2-normalized gradient in the
    direction that *decreases* that loss (towards ``target_label``).

    Args:
        frame: ``uint8`` RGB image array of shape (H, W, 3).
        true_label: Ground-truth label of the video. Not used by the attack;
            kept so existing callers do not need to change.
        target_label: Class index the perturbation should push the model to.
        config: Mapping providing ``'epsilon'``, ``'tv_weight'`` and
            ``'l2_weight'``.

    Returns:
        ``uint8`` RGB array with the same height and width as ``frame``.
    """
    original_height, original_width = frame.shape[:2]

    frame_pil = Image.fromarray(frame)
    image_tensor = transform(frame_pil).unsqueeze(0).to(device)
    image_tensor.requires_grad = True

    target = torch.tensor([target_label], device=device)
    output = model(image_tensor)

    classification_loss = nn.CrossEntropyLoss()(output, target)

    # Total variation: sum of absolute differences between horizontally and
    # vertically adjacent pixels.
    tv_loss = torch.sum(torch.abs(image_tensor[:, :, :, :-1] - image_tensor[:, :, :, 1:])) + \
              torch.sum(torch.abs(image_tensor[:, :, :-1, :] - image_tensor[:, :, 1:, :]))

    l2_loss = torch.norm(image_tensor, p=2)

    loss = classification_loss + config['tv_weight'] * tv_loss + config['l2_weight'] * l2_loss

    model.zero_grad()
    loss.backward()

    data_grad = image_tensor.grad.detach()
    data_grad = data_grad / (data_grad.norm() + 1e-10)

    # Targeted attack: the loss measures the distance to `target_label` (plus
    # regularizers), so step *against* the gradient to reduce it. Adding the
    # gradient would push the prediction away from the target instead.
    perturbed_image = image_tensor.detach() - config['epsilon'] * data_grad

    # `image_tensor` lives in normalized space, so undo the normalization
    # before clamping to the valid pixel range. Clamping the normalized tensor
    # to [0, 1] directly would discard most of the image content.
    # These statistics must match the Normalize step of `transform`.
    mean = torch.tensor([0.485, 0.456, 0.406], device=device).view(1, 3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225], device=device).view(1, 3, 1, 1)
    perturbed_pixels = torch.clamp(perturbed_image * std + mean, 0, 1)

    # (1, C, H, W) float tensor -> (H, W, C) uint8 array. Round instead of
    # truncating so float error does not darken pixels by one level.
    perturbed_frame = perturbed_pixels.squeeze(0).permute(1, 2, 0).cpu().numpy()
    perturbed_frame = np.clip(np.rint(perturbed_frame * 255), 0, 255).astype(np.uint8)

    # cv2.resize takes the target size as (width, height).
    perturbed_frame = cv2.resize(perturbed_frame, (original_width, original_height))

    return perturbed_frame

def select_keyframes(frames, interval):
    """Return the indices of every ``interval``-th frame, starting at index 0.

    Args:
        frames: Sized sequence of frames; only its length is used.
        interval: Spacing between consecutive keyframe indices.

    Returns:
        list of int indices ``0, interval, 2 * interval, ...`` that are
        smaller than ``len(frames)``.
    """
    return list(range(0, len(frames), interval))

def compute_optical_flow(frame1, frame2):
    """Compute dense Farneback optical flow from ``frame1`` to ``frame2``.

    Both frames are RGB images; they are converted to grayscale before the
    flow is estimated.

    Args:
        frame1: First (earlier) RGB frame.
        frame2: Second (later) RGB frame.

    Returns:
        The flow field produced by ``cv2.calcOpticalFlowFarneback``. Callers
        in this file read channel 0 as the x displacement and channel 1 as
        the y displacement.
    """
    prev_gray, next_gray = (
        cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) for image in (frame1, frame2)
    )
    return cv2.calcOpticalFlowFarneback(
        prev_gray,
        next_gray,
        None,
        pyr_scale=0.5,
        levels=3,
        winsize=15,
        iterations=3,
        poly_n=5,
        poly_sigma=1.2,
        flags=0,
    )

def warp_frame(frame, flow, progress):
    """Resample ``frame`` along a scaled optical-flow field.

    Output pixel ``(x, y)`` is sampled from ``frame`` at
    ``(x + progress * flow_x, y + progress * flow_y)`` with bilinear
    interpolation.

    Args:
        frame: Image to resample.
        flow: Flow field whose first two axes are (height, width); channel 0
            is the x displacement and channel 1 the y displacement.
        progress: Scalar factor applied to the flow before sampling.

    Returns:
        The remapped image produced by ``cv2.remap``.
    """
    rows, cols = flow.shape[:2]

    # Integer pixel grid: grid_x[i, j] == j and grid_y[i, j] == i.
    grid_x, grid_y = np.meshgrid(np.arange(cols), np.arange(rows))

    # cv2.remap needs float32 sampling coordinates.
    sample_x = (grid_x + progress * flow[..., 0]).astype(np.float32)
    sample_y = (grid_y + progress * flow[..., 1]).astype(np.float32)

    return cv2.remap(frame, sample_x, sample_y, cv2.INTER_LINEAR)

def temporal_propagation(frames, keyframe_indices, perturbed_keyframes, config):
    """Blend adversarial keyframes into a video and propagate them in between.

    Each keyframe is replaced by a weighted blend of the original frame and
    its adversarial version. Every frame strictly between two consecutive
    keyframes is then blended with the average of the two neighbouring
    (already blended) keyframes, each motion-compensated towards that frame
    with optical flow. Frames after the last keyframe are left unchanged.

    Args:
        frames: Video frames, indexable by frame number and supporting
            ``.copy()`` (e.g. a ``uint8`` array of shape (T, H, W, 3)).
        keyframe_indices: Increasing list of keyframe positions in ``frames``.
        perturbed_keyframes: Adversarial images, one per keyframe index and in
            the same order. Not modified by this function.
        config: Mapping providing ``'keyframe_weight'``,
            ``'propagation_weight'`` and ``'gaussian_sigma'``.

    Returns:
        A copy of ``frames`` with the perturbation blended in.
    """
    all_perturbed_frames = frames.copy()

    for idx, key_idx in enumerate(keyframe_indices):
        original = frames[key_idx]
        perturbed = perturbed_keyframes[idx]
        if perturbed.shape != original.shape:
            # Resize a local copy only, so the caller's list is not altered.
            perturbed = cv2.resize(perturbed, (original.shape[1], original.shape[0]))

        blended = cv2.addWeighted(
            original.astype(np.float32), 1 - config['keyframe_weight'],
            perturbed.astype(np.float32), config['keyframe_weight'],
            0
        )
        all_perturbed_frames[key_idx] = np.clip(blended, 0, 255).astype(np.uint8)

    for start_idx, end_idx in zip(keyframe_indices, keyframe_indices[1:]):
        if end_idx - start_idx < 2:
            # No frame lies strictly between the two keyframes; skip the
            # (expensive) flow estimation.
            continue

        flow_start_to_end = compute_optical_flow(frames[start_idx], frames[end_idx])
        flow_end_to_start = compute_optical_flow(frames[end_idx], frames[start_idx])

        for idx in range(start_idx + 1, end_idx):
            progress = (idx - start_idx) / (end_idx - start_idx)
            # Gaussian bump centred on the midpoint between the keyframes.
            weight = np.exp(-((progress - 0.5) ** 2) / config['gaussian_sigma'])

            # warp_frame samples its input at (x + s * flow). Farneback flow
            # A->B satisfies A(x) ~ B(x + flow(x)), i.e. it tells where to
            # *sample B* to reconstruct A. So moving the start keyframe
            # towards this frame needs the end->start flow, and moving the
            # end keyframe towards it needs the start->end flow.
            forward_warped = warp_frame(
                all_perturbed_frames[start_idx], flow_end_to_start, progress
            ).astype(np.float32)
            backward_warped = warp_frame(
                all_perturbed_frames[end_idx], flow_start_to_end, 1 - progress
            ).astype(np.float32)

            perturbation = ((forward_warped + backward_warped) / 2).astype(np.float32)
            frame = frames[idx].astype(np.float32)

            temporal_weight = config['propagation_weight'] * weight
            blended = cv2.addWeighted(
                frame, 1 - temporal_weight,
                perturbation, temporal_weight,
                0
            )
            all_perturbed_frames[idx] = np.clip(blended, 0, 255).astype(np.uint8)

    return all_perturbed_frames

def load_video(video_path):
    """Decode every frame of the first video stream into memory.

    Args:
        video_path: Path of the video file to open with PyAV.

    Returns:
        Tuple ``(frames, fps)``: ``frames`` is the stack of all decoded
        frames converted to RGB arrays, ``fps`` is the stream's
        ``average_rate`` as reported by PyAV.

    Raises:
        ValueError: If no frame could be decoded.
    """
    # The context manager closes the container even when decoding fails, so
    # file handles are not leaked while processing many videos.
    with av.open(video_path) as container:
        stream = container.streams.video[0]
        fps = stream.average_rate
        frames = [frame.to_rgb().to_ndarray() for frame in container.decode(video=0)]

    if not frames:
        raise ValueError(f"No video frames decoded from {video_path}")

    return np.stack(frames), fps

def save_video(frames, output_path, fps=30):
    """Encode RGB frames as an H.264 / yuv420p video file.

    Height and width are cropped down to the nearest even number before
    encoding.

    Args:
        frames: Array of RGB frames with shape (T, H, W, 3).
        output_path: Destination file path.
        fps: Frame rate of the output stream. A falsy value (for example
            ``None`` when the source stream reported no average rate) falls
            back to 30.
    """
    height = frames.shape[1] - (frames.shape[1] % 2)
    width = frames.shape[2] - (frames.shape[2] % 2)

    # The context manager closes the container even if encoding raises.
    with av.open(output_path, mode='w') as container:
        stream = container.add_stream('h264', rate=fps if fps else 30)
        stream.width = width
        stream.height = height
        stream.pix_fmt = 'yuv420p'

        for frame in frames:
            # Cropping a column makes the slice non-contiguous; hand PyAV a
            # contiguous buffer (this is a no-op when nothing was cropped).
            cropped = np.ascontiguousarray(frame[:height, :width])
            video_frame = av.VideoFrame.from_ndarray(cropped, format='rgb24')
            container.mux(stream.encode(video_frame))

        # Flush the frames still buffered inside the encoder.
        container.mux(stream.encode(None))

def process_videos(config):
    """Generate and save an adversarial version of every listed video.

    For each entry of the label file: load the video, pick keyframes, choose a
    random target class different from the true label, attack each keyframe,
    propagate the result to the remaining frames and write the video under a
    ``Sparse_Keyframe2`` directory next to ``config['video_directory']``.
    A failure on one video is printed and the loop moves on to the next one.

    Args:
        config: Mapping providing ``'label_file'``, ``'video_directory'``,
            ``'keyframe_interval'`` and ``'num_classes'``, plus the keys read
            by ``generate_adversarial_frame`` and ``temporal_propagation``.
    """
    labels_by_video = load_video_labels(config['label_file'])

    output_root = os.path.join(os.path.dirname(config['video_directory']), 'Sparse_Keyframe2')
    os.makedirs(output_root, exist_ok=True)

    for video_name, true_label in tqdm(labels_by_video.items(), desc="Processing videos"):
        video_path = os.path.join(config['video_directory'], video_name)

        try:
            # Mirror any sub-directory of the video name below the output root.
            relative_dir = os.path.dirname(video_name)
            if relative_dir:
                os.makedirs(os.path.join(output_root, relative_dir), exist_ok=True)

            frames, fps = load_video(video_path)
            keyframe_indices = select_keyframes(frames, config['keyframe_interval'])
            keyframes = [frames[position] for position in keyframe_indices]

            # Draw the target uniformly from every class except the true one.
            candidate_targets = list(range(config['num_classes']))
            candidate_targets.remove(true_label)
            target_label = np.random.choice(candidate_targets)

            perturbed_keyframes = [
                generate_adversarial_frame(keyframe, true_label, target_label, config)
                for keyframe in tqdm(
                    keyframes,
                    desc=f"Processing keyframes in {video_name}",
                    leave=False,
                )
            ]

            perturbed_frames = np.stack(
                temporal_propagation(frames, keyframe_indices, perturbed_keyframes, config)
            )

            save_video(perturbed_frames, os.path.join(output_root, video_name), fps)

        except Exception as e:
            print(f"Error processing {video_name}: {str(e)}")

# Script entry point: build the attack configuration and process every video.
if __name__ == "__main__":
    # Settings read by process_videos and the helpers it calls.
    config = {
        # Directory holding the input videos; outputs go to a sibling directory.
        'video_directory': '/home/z/Music/st/Kinetics-400/RQ1/videos_val',
        # Text file with one "<video_name> <label>" entry per line.
        'label_file': '/home/z/Music/st/Kinetics-400/RQ1/kinetics400_val_list_videos.txt',
        'num_classes': 400,  # target labels are drawn from range(num_classes)
        'epsilon': 0.03,  # scale of the step along the L2-normalized gradient
        'tv_weight': 0.1,  # weight of the total-variation term in the attack loss
        'l2_weight': 0.01,  # weight of the L2-norm term in the attack loss
        'keyframe_interval': 30,  # every 30th frame is attacked as a keyframe
        'keyframe_weight': 0.02,  # blend weight of the adversarial keyframe
        'propagation_weight': 0.15,  # peak blend weight for in-between frames
        'gaussian_sigma': 0.08,  # divisor in exp(-(progress - 0.5)**2 / sigma)
    }
    
    process_videos(config)

Processing videos:  31%|███       | 6167/19796 [5:18:36<12:12:04,  3.22s/it]

Demo Visualization of Optical Flow

In [8]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
import av
import numpy as np
from tqdm import tqdm
import os
from PIL import Image
import cv2

def load_video(video_path):
    """Decode every frame of the first video stream into memory.

    Args:
        video_path: Path of the video file to open with PyAV.

    Returns:
        Tuple ``(frames, fps)``: ``frames`` is the stack of all decoded
        frames converted to RGB arrays, ``fps`` is the stream's
        ``average_rate`` as reported by PyAV.

    Raises:
        ValueError: If no frame could be decoded.
    """
    # The context manager closes the container even when decoding fails.
    with av.open(video_path) as container:
        stream = container.streams.video[0]
        fps = stream.average_rate
        frames = [frame.to_rgb().to_ndarray() for frame in container.decode(video=0)]

    if not frames:
        raise ValueError(f"No video frames decoded from {video_path}")

    return np.stack(frames), fps

def compute_optical_flow(frame1, frame2):
    """Compute dense Farneback optical flow from ``frame1`` to ``frame2``.

    Args:
        frame1: First (earlier) RGB frame.
        frame2: Second (later) RGB frame.

    Returns:
        The flow field produced by ``cv2.calcOpticalFlowFarneback`` on the
        grayscale versions of the two frames.
    """
    gray_pair = [cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) for image in (frame1, frame2)]

    # Farneback settings, named in the positional order OpenCV expects.
    pyr_scale, levels, winsize = 0.5, 3, 15
    iterations, poly_n, poly_sigma, flags = 3, 5, 1.2, 0

    return cv2.calcOpticalFlowFarneback(
        gray_pair[0], gray_pair[1], None,
        pyr_scale, levels, winsize, iterations, poly_n, poly_sigma, flags,
    )

def draw_flow(img, flow, step=32):
    """Draw green arrows for sampled optical-flow vectors on a copy of ``img``.

    The flow is sampled on a regular grid with one point every ``step``
    pixels (the default was raised from 16 to 32 to make the arrows sparser).
    Each arrow starts at its sample point and is 1.5 times the flow vector
    long; arrows whose integer length is at most one pixel are not drawn.

    Args:
        img: Image to draw on; it is not modified.
        flow: Flow field indexed as ``flow[y, x]`` giving ``(dx, dy)``.
        step: Grid spacing in pixels between sampled flow vectors.

    Returns:
        Copy of ``img`` with the arrows drawn on it.
    """
    height, width = img.shape[:2]

    # Sample points at the centre of each step x step cell, flattened
    # row by row.
    grid = np.mgrid[step / 2:height:step, step / 2:width:step]
    ys, xs = grid.reshape(2, -1).astype(int)
    dx, dy = flow[ys, xs].T

    # Arrow endpoints as int32 (x, y) pairs; tips are truncated towards zero.
    starts = np.int32(np.stack([xs, ys], axis=1))
    tips = np.int32(np.stack([xs + dx * 1.5, ys + dy * 1.5], axis=1))

    # Keep only arrows longer than one pixel (squared length > 1).
    offsets = tips - starts
    long_enough = (offsets[:, 0] ** 2 + offsets[:, 1] ** 2) > 1

    canvas = img.copy()
    for start, tip in zip(starts[long_enough], tips[long_enough]):
        cv2.arrowedLine(canvas, tuple(start), tuple(tip), (0, 255, 0), 3, tipLength=0.4)

    return canvas

def visualize_optical_flow(video_path, output_path):
    """Write a video in which each frame is overlaid with optical-flow arrows.

    Flow is computed between every pair of consecutive frames and drawn on
    the later frame, so the output has one frame fewer than the input.
    Height and width are cropped down to the nearest even number, as needed
    for yuv420p H.264 encoding.

    Args:
        video_path: Path of the input video.
        output_path: Path of the video file to write.
    """
    frames, fps = load_video(video_path)

    height, width = frames[0].shape[:2]
    # yuv420p subsamples chroma 2x2, so both dimensions must be even.
    height -= height % 2
    width -= width % 2

    # The context manager closes the output even if a frame fails to encode.
    with av.open(output_path, mode='w') as container:
        stream = container.add_stream('h264', rate=fps)
        stream.width = width
        stream.height = height
        stream.pix_fmt = 'yuv420p'

        prev_frame = None
        for current_frame in tqdm(frames, desc="Processing frames"):
            if prev_frame is not None:
                flow = compute_optical_flow(prev_frame, current_frame)
                flow_vis = draw_flow(current_frame, flow)

                # Crop to the even stream size; contiguous copy for PyAV.
                flow_vis = np.ascontiguousarray(flow_vis[:height, :width])
                frame = av.VideoFrame.from_ndarray(flow_vis, format='rgb24')
                container.mux(stream.encode(frame))

            prev_frame = current_frame

        # Finish writing the video: flush frames buffered in the encoder.
        container.mux(stream.encode(None))

def visualize_flow_magnitude(video_path, output_path):
    """Write a video overlaying a heatmap of optical-flow magnitude.

    Flow is computed between every pair of consecutive frames; its per-pixel
    magnitude is scaled by 10, clipped to 0..255, colour-mapped with JET and
    blended (30 %) over the later frame (70 %). The output therefore has one
    frame fewer than the input. Height and width are cropped down to the
    nearest even number, as needed for yuv420p H.264 encoding.

    Args:
        video_path: Path of the input video.
        output_path: Path of the video file to write.
    """
    frames, fps = load_video(video_path)

    height, width = frames[0].shape[:2]
    # yuv420p subsamples chroma 2x2, so both dimensions must be even.
    height -= height % 2
    width -= width % 2

    # The context manager closes the output even if a frame fails to encode.
    with av.open(output_path, mode='w') as container:
        stream = container.add_stream('h264', rate=fps)
        stream.width = width
        stream.height = height
        stream.pix_fmt = 'yuv420p'

        prev_frame = None
        for current_frame in tqdm(frames, desc="Processing frames"):
            if prev_frame is not None:
                flow = compute_optical_flow(prev_frame, current_frame)

                magnitude = np.sqrt(flow[..., 0]**2 + flow[..., 1]**2)
                magnitude = np.clip(magnitude * 10, 0, 255).astype(np.uint8)

                # applyColorMap produces a BGR image, while the frames and the
                # encoder input are RGB; convert so that strong motion shows
                # as red rather than blue.
                heatmap = cv2.applyColorMap(magnitude, cv2.COLORMAP_JET)
                heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)

                overlay = cv2.addWeighted(current_frame, 0.7, heatmap, 0.3, 0)

                # Crop to the even stream size; contiguous copy for PyAV.
                overlay = np.ascontiguousarray(overlay[:height, :width])
                frame = av.VideoFrame.from_ndarray(overlay, format='rgb24')
                container.mux(stream.encode(frame))

            prev_frame = current_frame

        # Flush the frames still buffered inside the encoder.
        container.mux(stream.encode(None))

# Script entry point: render both optical-flow visualizations for one video.
if __name__ == "__main__":
    # NOTE: os is already imported at the top of this cell; this re-import is
    # harmless.
    import os
    
    input_video = "/home/z/Music/st/Kinetics-400/RQ3/example/select/selected/0H3dSeJ58Hc.mp4"
    output_dir = "/home/z/Music/st/Kinetics-400/RQ3/example/select/ooptical"
    output_video_arrows = os.path.join(output_dir, "flow_arrows.mp4")
    output_video_heatmap = os.path.join(output_dir, "flow_heatmap.mp4")
    
    # Make sure the output directory exists.
    os.makedirs(output_dir, exist_ok=True)
    
    # Generate the optical-flow visualization with arrows.
    print("Generating arrow visualization...")
    visualize_optical_flow(input_video, output_video_arrows)
    
    # Generate the heatmap visualization.
    print("Generating heatmap visualization...")
    visualize_flow_magnitude(input_video, output_video_heatmap)
    
    print("Done!")

Generating arrow visualization...


Processing frames: 100%|██████████| 300/300 [00:09<00:00, 30.15it/s]


Generating heatmap visualization...


Processing frames: 100%|██████████| 300/300 [00:10<00:00, 29.69it/s]


Done!
