In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import os
from PIL import Image
import cv2
from torchvision import transforms

### 1. Architecture of Dual-Stream networks

In [None]:
class ConvBlock(nn.Module):
    """Convolution -> batch norm -> ReLU, optionally followed by max pooling.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of filters produced by the convolution.
        kernel_size: Side length of the square kernel (int). Padding is
            ``kernel_size // 2``.
        stride: Stride of the convolution.
        use_pool: If True, a max-pooling layer is applied last.
        pool_size: Kernel size and stride of the max-pooling layer.
        use_relu: If True (default), apply ReLU after batch norm. Pass False
            to get the previous activation-free behaviour, e.g. to evaluate
            weights that were trained without the non-linearity.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride,
                 use_pool=True, pool_size=2, use_relu=True):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size,
                              stride=stride, padding=kernel_size // 2)
        self.bn = nn.BatchNorm2d(out_channels)
        self.use_relu = use_relu
        self.use_pool = use_pool
        if use_pool:
            self.pool = nn.MaxPool2d(kernel_size=pool_size, stride=pool_size)

    def forward(self, x):
        """Run the block on a ``(N, in_channels, H, W)`` tensor."""
        x = self.bn(self.conv(x))
        if self.use_relu:
            # Without a non-linearity, consecutive conv/BN layers collapse into
            # a single linear map. The functional form adds no parameters, so
            # state_dict keys are unchanged.
            x = F.relu(x)
        if self.use_pool:
            x = self.pool(x)
        return x

class DualStreamNetwork(nn.Module):
    """Two-stream classifier: one conv stack for RGB frames, one for optical flow.

    Each input runs through its own five-block convolutional stream, is
    flattened, and passes through ``fc6``/``fc7``. The two resulting 2048-d
    feature vectors are concatenated and classified by ``fc8``.

    Args:
        num_classes: Number of output classes.

    Note:
        ``fc6`` expects ``512 * 7 * 7`` flattened features, which is what the
        conv stack yields for 224x224 inputs (total downsampling factor 32).
    """

    def __init__(self, num_classes):
        super().__init__()

        # Spatial stream consumes 3-channel RGB frames.
        self.spatial_stream = self._make_stream(in_channels=3)

        # Temporal stream consumes 2-channel optical flow.
        self.temporal_stream = self._make_stream(in_channels=2)

        # Fully connected layers.
        # NOTE(review): fc6/fc7 are one set of weights used by BOTH streams
        # (see forward_single_stream); confirm the sharing is intended.
        self.fc6 = nn.Linear(512 * 7 * 7, 4096)
        self.fc7 = nn.Linear(4096, 2048)

        # Classification layer over the concatenated stream features.
        self.fc8 = nn.Linear(2048 * 2, num_classes)

        self.dropout = nn.Dropout(0.5)

    @staticmethod
    def _make_stream(in_channels):
        """Build the five-block conv stack shared by both stream types."""
        return nn.Sequential(
            # conv1: 7x7x96, stride 2, norm, pool 2x2
            ConvBlock(in_channels, 96, kernel_size=7, stride=2),
            # conv2: 5x5x256, stride 2, norm, pool 2x2
            ConvBlock(96, 256, kernel_size=5, stride=2),
            # conv3: 3x3x512, stride 1
            ConvBlock(256, 512, kernel_size=3, stride=1, use_pool=False),
            # conv4: 3x3x512, stride 1
            ConvBlock(512, 512, kernel_size=3, stride=1, use_pool=False),
            # conv5: 3x3x512, stride 1, pool 2x2
            ConvBlock(512, 512, kernel_size=3, stride=1),
        )

    def forward_single_stream(self, x, stream):
        """Return the 2048-d feature vector of ``x`` computed through ``stream``.

        Args:
            x: Input batch for the given stream.
            stream: Module applied to ``x`` before the fully connected layers.
        """
        x = stream(x)

        # Flatten everything but the batch dimension (also works for
        # non-contiguous tensors, unlike ``view``).
        x = torch.flatten(x, 1)

        x = self.dropout(F.relu(self.fc6(x)))
        x = self.dropout(F.relu(self.fc7(x)))
        return x

    def forward(self, spatial_input, temporal_input):
        """Classify a batch of (frame, flow) pairs.

        Args:
            spatial_input: ``(N, 3, H, W)`` RGB frames.
            temporal_input: ``(N, 2, H, W)`` optical-flow fields.

        Returns:
            ``(N, num_classes)`` tensor of raw logits. No softmax is applied:
            ``nn.CrossEntropyLoss`` performs log-softmax internally, so feeding
            it probabilities would squash the gradients. Apply
            ``F.softmax(logits, dim=1)`` at inference time if probabilities
            are needed; the arg-max class is the same either way.
        """
        spatial_features = self.forward_single_stream(spatial_input, self.spatial_stream)
        temporal_features = self.forward_single_stream(temporal_input, self.temporal_stream)

        combined_features = torch.cat([spatial_features, temporal_features], dim=1)
        return self.fc8(combined_features)

### 2. Load the input dataset

In [3]:
def load_optical_flow(flow_path, size=(224, 224)):
    """Load a pre-computed optical-flow array and return it as a tensor.

    The array is min-max normalised to [0, 1] over all of its values, moved to
    channel-first layout and, if needed, bilinearly resized so that it has the
    same spatial size as the frames and as the zero fallback.

    Args:
        flow_path: Path to a ``.npy`` file holding an ``(H, W, 2)`` array.
        size: Target ``(height, width)``. Pass ``None`` to keep the native
            resolution of the stored array.

    Returns:
        Float tensor of shape ``(2, height, width)``. On any loading error the
        problem is printed and an all-zero tensor is returned instead
        (224x224 when ``size`` is None).
    """
    fallback_size = tuple(size) if size is not None else (224, 224)
    try:
        flow = np.load(flow_path)
        # Ensure flow has correct dimensions (H, W, 2)
        if flow.ndim != 3 or flow.shape[-1] != 2:
            raise ValueError(f"Invalid flow shape: {flow.shape}")
        # Normalize flow values; the epsilon avoids 0/0 for constant fields
        flow = (flow - flow.min()) / (flow.max() - flow.min() + 1e-6)
        flow = torch.from_numpy(flow.transpose(2, 0, 1)).float()
        if size is not None and tuple(flow.shape[-2:]) != tuple(size):
            # interpolate() wants a batch dimension; add and drop it again
            flow = F.interpolate(flow.unsqueeze(0), size=tuple(size),
                                 mode='bilinear', align_corners=False).squeeze(0)
        return flow
    except Exception as e:
        print(f"Error loading optical flow from {flow_path}: {e}")
        # Return zero tensor as fallback
        return torch.zeros(2, *fallback_size)

def load_video_frame(frame_path, size=(224, 224)):
    """Load an image file as a normalised RGB tensor.

    Args:
        frame_path: Path to the image file.
        size: Target ``(height, width)`` the frame is resized to.

    Returns:
        Float tensor of shape ``(3, height, width)`` with values in [0, 1].
        On any loading error the problem is printed and an all-zero tensor of
        the same shape is returned instead.
    """
    height, width = size
    try:
        # The context manager releases the file handle even if decoding fails.
        with Image.open(frame_path) as img:
            # PIL's resize takes (width, height).
            rgb = img.convert('RGB').resize((width, height))
        pixels = np.array(rgb) / 255.0
        return torch.from_numpy(pixels.transpose(2, 0, 1)).float()
    except Exception as e:
        print(f"Error loading frame from {frame_path}: {e}")
        # Return zero tensor as fallback
        return torch.zeros(3, height, width)

In [None]:
class FlowVideoDataset(Dataset):
    """Dataset of (RGB frame, optical flow, label) triples listed in a split file.

    Files are looked up under ``root_dir``:

    * ``<split>list01.txt`` - one ``"<relative path> <int label>"`` per line
    * ``frames/<relative path>`` - the RGB frame
    * ``flows/<relative path with .jpg replaced by .npy>`` - the flow array

    Entries whose frame or flow file does not exist are skipped.

    Args:
        root_dir: Dataset root directory.
        split: Split name; selects the list file and the default transform.
        transform: Optional transform applied to the frame. When None, a
            default pipeline is built: augmentation plus normalisation for
            ``'train'``, normalisation only for any other split.
    """

    def __init__(self, root_dir, split='train', transform=None):
        self.root_dir = root_dir
        self.split = split

        self.frames_dir = os.path.join(root_dir, 'frames')
        self.flows_dir = os.path.join(root_dir, 'flows')

        # [(frame_path, flow_path, label), ...]
        self.samples = []
        self._load_dataset()

        # Honour a caller-supplied transform; only fall back to the defaults
        # when none was given.
        if transform is None:
            transform = self._default_transform(split)
        self.transform = transform

    @staticmethod
    def _default_transform(split):
        """Return the default frame transform for ``split``."""
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        if split == 'train':
            return transforms.Compose([
                transforms.RandomHorizontalFlip(),
                transforms.RandomRotation(15),
                transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
                normalize,
            ])
        return transforms.Compose([normalize])

    def _load_dataset(self):
        """Populate ``self.samples`` from the split file.

        Blank lines are ignored and malformed lines are reported and skipped
        individually, so one bad line does not discard the rest of the file.
        Failure to read the file itself is printed and leaves the dataset
        with whatever was loaded so far.
        """
        split_file = os.path.join(self.root_dir, f'{self.split}list01.txt')
        try:
            with open(split_file, 'r') as f:
                for line_no, line in enumerate(f, start=1):
                    fields = line.split()
                    if not fields:
                        continue
                    try:
                        video_path, label = fields
                        label = int(label)
                    except ValueError:
                        print(f"Skipping malformed line {line_no} in {split_file}: {line.strip()!r}")
                        continue
                    frame_path = os.path.join(self.frames_dir, video_path)
                    flow_path = os.path.join(self.flows_dir, video_path.replace('.jpg', '.npy'))
                    if os.path.exists(frame_path) and os.path.exists(flow_path):
                        self.samples.append((frame_path, flow_path, label))
        except Exception as e:
            print(f"Error loading dataset from {split_file}: {e}")

    def __len__(self):
        return len(self.samples)

    @staticmethod
    def _apply_paired(t, frame, flow):
        """Apply random transform ``t`` to frame and flow with the same draw.

        The RNG state is captured before transforming the frame and restored
        before transforming the flow, so both calls see identical random
        numbers. This relies on ``t`` taking its randomness from torch's
        global CPU generator, as torchvision's built-in random transforms do.
        """
        rng_state = torch.get_rng_state()
        frame = t(frame)
        torch.set_rng_state(rng_state)
        flow = t(flow)
        return frame, flow

    def __getitem__(self, idx):
        """Return ``(frame, flow, label)`` for sample ``idx``."""
        frame_path, flow_path, label = self.samples[idx]

        frame = load_video_frame(frame_path)
        flow = load_optical_flow(flow_path)

        if self.transform is None:
            return frame, flow, label

        if not isinstance(self.transform, transforms.Compose):
            # Opaque callable: geometric and photometric steps cannot be told
            # apart, so only the frame is transformed.
            return self.transform(frame), flow, label

        for t in self.transform.transforms:
            if isinstance(t, (transforms.RandomHorizontalFlip, transforms.RandomRotation)):
                # Geometric steps must hit frame and flow identically,
                # otherwise the two inputs no longer line up pixel-wise.
                # NOTE(review): this moves flow pixels but does not adjust the
                # flow vector components themselves - confirm that is wanted.
                frame, flow = self._apply_paired(t, frame, flow)
            else:
                # Colour jitter / normalisation only make sense for RGB.
                frame = t(frame)

        return frame, flow, label

### 3. Train-val-test functions

In [None]:
def train_epoch(model, dataloader, criterion, optimizer, device, log_interval=100):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Module called as ``model(frames, flows)``.
        dataloader: Iterable yielding ``(frames, flows, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.
        log_interval: Print the mean loss of the last ``log_interval`` batches
            and the running accuracy every that many batches; 0 or None
            disables the progress output.

    Returns:
        Fraction of correctly classified training samples in this epoch, or
        0.0 if the dataloader yielded no samples.
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for batch_idx, (frames, flows, labels) in enumerate(dataloader):
        frames, flows, labels = frames.to(device), flows.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(frames, flows)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

        if log_interval and (batch_idx + 1) % log_interval == 0:
            print(f'Batch: {batch_idx+1}, Loss: {running_loss/log_interval:.3f}, '
                  f'Acc: {100.*correct/total:.2f}%')
            # The printed loss is a windowed mean, so restart the window.
            running_loss = 0.0

    if total == 0:
        # Empty loader (e.g. the split file could not be read): report it
        # instead of failing with ZeroDivisionError.
        print('Warning: training dataloader yielded no samples')
        return 0.0

    return correct / total

def validate(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without gradient tracking.

    Args:
        model: Module called as ``model(frames, flows)``; put into eval mode.
        dataloader: Iterable yielding ``(frames, flows, labels)`` batches. It
            does not need to support ``len()``.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to.

    Returns:
        ``(accuracy, avg_loss)``: the fraction of correctly classified samples
        and the mean of the per-batch losses. If the dataloader yields no
        samples the result is ``(0.0, nan)`` rather than a ZeroDivisionError.
    """
    model.eval()
    loss_sum = 0.0
    num_batches = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for frames, flows, labels in dataloader:
            frames, flows, labels = frames.to(device), flows.to(device), labels.to(device)
            outputs = model(frames, flows)

            loss_sum += criterion(outputs, labels).item()
            # Count batches ourselves so plain iterables/generators work too.
            num_batches += 1

            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    if total == 0:
        print('Warning: validation dataloader yielded no samples')
        return 0.0, float('nan')

    return correct / total, loss_sum / num_batches

def save_checkpoint(model, optimizer, epoch, accuracy, filename):
    """Serialise the training state to ``filename`` with ``torch.save``.

    The stored dict has the keys ``'epoch'``, ``'model_state_dict'``,
    ``'optimizer_state_dict'`` and ``'accuracy'``.

    Args:
        model: Module whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        epoch: Epoch value recorded in the checkpoint.
        accuracy: Accuracy value recorded in the checkpoint.
        filename: Destination path.
    """
    state = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        accuracy=accuracy,
    )
    torch.save(state, filename)


### 4. Main

In [None]:
# Hyperparameters
num_epochs = 100
batch_size = 32
learning_rate = 0.001
num_classes = 101  # UCF101 dataset

# Setup device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize model
model = DualStreamNetwork(num_classes).to(device)

# Setup data
# The dataset location can be overridden with the DATA_ROOT environment
# variable; the previous hard-coded path remains the default.
data_root = os.environ.get('DATA_ROOT', '/dtu/datasets1/02516')
train_dataset = FlowVideoDataset(root_dir=data_root, split='train')
val_dataset = FlowVideoDataset(root_dir=data_root, split='val')

# Fail fast with a readable message: an empty split would otherwise surface
# later as an obscure sampler error or a division by zero.
if len(train_dataset) == 0 or len(val_dataset) == 0:
    raise RuntimeError(
        f"No samples found under {data_root!r} "
        f"(train: {len(train_dataset)}, val: {len(val_dataset)}); "
        "check the split files and the frames/flows directories."
    )

# Pinned host memory only helps for host-to-GPU copies.
pin_memory = device.type == 'cuda'
train_loader = DataLoader(train_dataset, batch_size=batch_size,
                          shuffle=True, num_workers=4, pin_memory=pin_memory)
val_loader = DataLoader(val_dataset, batch_size=batch_size,
                        shuffle=False, num_workers=4, pin_memory=pin_memory)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# mode='max' because the scheduler is stepped with validation accuracy.
# The scheduler's `verbose` flag is deprecated in recent PyTorch releases, so
# learning-rate changes are logged explicitly in the loop below instead.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max',
                                                 factor=0.1, patience=5)

# Training loop
best_acc = 0.0
checkpoint_dir = 'checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}/{num_epochs}')

    # Train
    train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    print(f'Training Accuracy: {train_acc*100:.2f}%')

    # Validate
    val_acc, val_loss = validate(model, val_loader, criterion, device)
    print(f'Validation Accuracy: {val_acc*100:.2f}%, Loss: {val_loss:.3f}')

    # Learning rate scheduling
    lr_before = optimizer.param_groups[0]['lr']
    scheduler.step(val_acc)
    lr_after = optimizer.param_groups[0]['lr']
    if lr_after != lr_before:
        print(f'Learning rate reduced: {lr_before:.2e} -> {lr_after:.2e}')

    # Save checkpoint
    if val_acc > best_acc:
        best_acc = val_acc
        save_checkpoint(model, optimizer, epoch, val_acc,
                        os.path.join(checkpoint_dir, 'best_model.pth'))

    # Save regular checkpoint every 5 epochs
    if (epoch + 1) % 5 == 0:
        save_checkpoint(model, optimizer, epoch, val_acc,
                        os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch+1}.pth'))

    print(f'Best accuracy: {best_acc*100:.2f}%')
    print('-' * 60)