In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import pandas as pd
import glob
import numpy as np
from scipy.ndimage import median_filter


In [2]:
# ==========================================
# CONFIGURATION 
# ==========================================
# Both splits sit under the same Kaggle dataset folder, so spell the root once.
_DATASET_ROOT = (
    '/kaggle/input/pixel-play-26/'
    'Avenue_Corrupted-20251221T112159Z-3-001/Avenue_Corrupted/Dataset'
)
TRAIN_DIR = f'{_DATASET_ROOT}/training_videos'
TEST_DIR = f'{_DATASET_ROOT}/testing_videos'
OUTPUT_PATH = '/kaggle/working/submission.csv'

# Hyperparameters
BATCH_SIZE = 96        # frames per mini-batch (training and inference)
IMAGE_SIZE = 96        # square side length the model is fed
EPOCHS = 8
LEARNING_RATE = 0.004
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

print(f"Using device: {DEVICE}")


Using device: cuda


In [3]:
# ==========================================
# DATASET
# ==========================================
class VideoFrameDataset(Dataset):
    """Flat dataset over every frame found in per-video sub-folders.

    Expected layout is ``root/<video number>/<anything>_<frame number>.<ext>``,
    e.g. ``root/01/frame_00001.jpg``. Each frame gets the row id
    ``"<video>_<frame>"`` with both parts parsed as integers, so the example
    above becomes ``"1_1"``.

    Args:
        root_dir: Directory containing one sub-folder per video.
        transform: Optional callable applied to the loaded RGB PIL image.
        return_id: When true, items are ``(image, row_id)`` pairs (used for
            inference); otherwise just the image (used for training).
    """

    def __init__(self, root_dir, transform=None, return_id=False):
        self.root_dir = root_dir
        self.transform = transform
        self.return_id = return_id
        self.samples = []

        for folder_name in sorted(os.listdir(root_dir)):
            folder_path = os.path.join(root_dir, folder_name)
            if not os.path.isdir(folder_path):
                continue  # stray files next to the video folders

            # '*.*' picks up every file that has an extension.
            for image_path in sorted(glob.glob(os.path.join(folder_path, '*.*'))):
                row_id = self._make_row_id(folder_name, image_path)
                if row_id is not None:
                    self.samples.append((image_path, row_id))

    @staticmethod
    def _make_row_id(folder_name, image_path):
        """Return ``"<video>_<frame>"`` or ``None`` when either part is not an integer."""
        try:
            video_id = int(folder_name)
            # 'frame_00001.jpg' -> '00001.jpg' -> '00001' -> 1
            frame_token = os.path.basename(image_path).split('_')[-1].split('.')[0]
            return f"{video_id}_{int(frame_token)}"
        except ValueError:
            return None

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        frame_path, row_id = self.samples[idx]
        frame = Image.open(frame_path).convert('RGB')

        if self.transform:
            frame = self.transform(frame)

        return (frame, row_id) if self.return_id else frame


In [4]:
# ==========================================
# MODEL 
# ==========================================
class MultiHeadSpatialAttention(nn.Module):
    """Spatial gate built from several 1x1-conv attention heads.

    Each head maps the feature map to a single-channel sigmoid map; the head
    maps are concatenated, mixed by another 1x1 conv and squashed by a second
    sigmoid into one gate that multiplies every input channel.

    Args:
        in_channels: Channel count of the incoming feature map.
        num_heads: Number of independent attention heads.
    """

    def __init__(self, in_channels, num_heads=3):
        super().__init__()
        self.num_heads = num_heads

        # One single-channel projection per head.
        head_convs = [nn.Conv2d(in_channels, 1, kernel_size=1) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_convs)

        # Learnable mixing of the per-head maps into one gate.
        self.fusion = nn.Conv2d(num_heads, 1, kernel_size=1)

    def forward(self, x):
        """Return ``(x * gate, gate)`` where ``gate`` has a single channel."""
        per_head = torch.cat([torch.sigmoid(conv(x)) for conv in self.heads], dim=1)
        gate = torch.sigmoid(self.fusion(per_head))
        return x * gate, gate

class MultiHeadBottleneck(nn.Module):
    """Convolutional autoencoder with an attention gate ahead of the bottleneck.

    The encoder is four stride-2 convolution stages followed by
    ``MultiHeadSpatialAttention`` and a fifth stride-2 convolution (the
    bottleneck). The decoder mirrors this with five stride-2 transposed
    convolutions and ends in a sigmoid.
    """

    def __init__(self):
        super().__init__()

        # Encoder stages; dropout eases off with depth and the first stage
        # carries no batch norm.
        self.enc1 = self._down(3, 32, dropout=0.45, batch_norm=False)
        self.enc2 = self._down(32, 64, dropout=0.45)
        self.enc3 = self._down(64, 128, dropout=0.40)
        self.enc4 = self._down(128, 128, dropout=0.35)

        # Multi-head attention over the deepest encoder features.
        self.attention = MultiHeadSpatialAttention(128, num_heads=3)

        # Final squeeze to 32 channels, no dropout.
        self.bottleneck = self._down(128, 32)

        # Decoder: four ConvTranspose/BatchNorm/LeakyReLU groups, then the
        # output layer back to 3 channels.
        up_layers = []
        for c_in, c_out in [(32, 128), (128, 128), (128, 64), (64, 32)]:
            up_layers.append(nn.ConvTranspose2d(c_in, c_out, 4, stride=2, padding=1))
            up_layers.append(nn.BatchNorm2d(c_out))
            up_layers.append(nn.LeakyReLU(0.2))
        up_layers.append(nn.ConvTranspose2d(32, 3, 4, stride=2, padding=1))
        up_layers.append(nn.Sigmoid())
        self.decoder = nn.Sequential(*up_layers)

    @staticmethod
    def _down(c_in, c_out, dropout=None, batch_norm=True):
        """Build Conv(3x3, stride 2) -> [BatchNorm] -> LeakyReLU(0.2) -> [Dropout2d]."""
        layers = [nn.Conv2d(c_in, c_out, 3, stride=2, padding=1)]
        if batch_norm:
            layers.append(nn.BatchNorm2d(c_out))
        layers.append(nn.LeakyReLU(0.2))
        if dropout is not None:
            layers.append(nn.Dropout2d(dropout))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return ``(reconstruction, bottleneck_code, attention_map)``."""
        features = self.enc4(self.enc3(self.enc2(self.enc1(x))))

        gated, attention_map = self.attention(features)

        encoded = self.bottleneck(gated)
        return self.decoder(encoded), encoded, attention_map


In [5]:
# ==========================================
# TRAINING
# ==========================================
def train_autoencoder():
    """Train a ``MultiHeadBottleneck`` autoencoder on the frames under ``TRAIN_DIR``.

    Frames are resized to ``IMAGE_SIZE`` and heavily augmented, then the model
    is optimised for ``EPOCHS`` epochs with SGD (momentum 0.9, weight decay
    1e-2) on a per-pixel L1 reconstruction loss weighted by
    ``1 + attention``. Progress is printed every 20 batches.

    Returns:
        The trained model, on ``DEVICE`` and still in training mode.

    Raises:
        ValueError: If the training loader yields no batches. Because
            ``drop_last=True``, this happens whenever fewer than
            ``BATCH_SIZE`` frames are found.
    """
    print("\n" + "=" * 60)
    print("TRAINING ")
    print("=" * 60)

    print()

    # With aggressive augmentation the model cannot memorise exact pixel
    # patterns, because they change on every pass.
    train_transform = transforms.Compose([
        transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
        transforms.RandomHorizontalFlip(0.5),
        transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.25, hue=0.1),
        transforms.RandomAffine(degrees=15, translate=(0.12, 0.12), scale=(0.88, 1.12)),
        transforms.RandomPerspective(distortion_scale=0.25, p=0.5),
        transforms.ToTensor(),
        transforms.RandomErasing(p=0.35, scale=(0.02, 0.12)),
    ])

    train_dataset = VideoFrameDataset(TRAIN_DIR, train_transform)
    train_loader = DataLoader(train_dataset, BATCH_SIZE, shuffle=True,
                              num_workers=2, pin_memory=True, drop_last=True)

    print(f"Samples: {len(train_dataset)}\n")

    # Fail early with a readable message instead of a ZeroDivisionError when
    # the epoch average is computed over zero batches.
    num_batches = len(train_loader)
    if num_batches == 0:
        raise ValueError(
            f"No training batches: found {len(train_dataset)} frame(s) under "
            f"{TRAIN_DIR!r}, but BATCH_SIZE={BATCH_SIZE} with drop_last=True "
            f"needs at least {BATCH_SIZE}."
        )

    model = MultiHeadBottleneck().to(DEVICE)

    # Keep the per-pixel loss so it can be re-weighted by the attention map.
    criterion = nn.L1Loss(reduction='none')
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE,
                                momentum=0.9, weight_decay=1e-2)

    # Defined up front so the final report still works when EPOCHS == 0.
    avg_loss = float('nan')

    model.train()
    for epoch in range(EPOCHS):
        total_loss = 0

        for batch_idx, images in enumerate(train_loader):
            images = images.to(DEVICE)
            recon, _, attention = model(images)

            pixel_loss = criterion(recon, images)

            # Upsample the attention map to the input resolution (bilinear)
            # so it can weight the per-pixel loss.
            # NOTE(review): the attention map is not detached, so this
            # weighting also back-propagates into the attention heads.
            # Confirm that is intended.
            attention_upsampled = F.interpolate(attention, size=images.shape[2:],
                                                mode='bilinear', align_corners=False)
            weighted_loss = (pixel_loss * (1 + attention_upsampled)).mean()

            optimizer.zero_grad()
            weighted_loss.backward()
            optimizer.step()

            total_loss += weighted_loss.item()

            if batch_idx % 20 == 0:
                print(f"[{epoch+1}/{EPOCHS}][{batch_idx}/{num_batches}] Loss: {weighted_loss.item():.4f}")

        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch+1} - Loss: {avg_loss:.4f}\n")

    print(f"Final: {avg_loss:.4f}")
    print()
    return model


In [6]:
# ==========================================
# INFERENCE
# ==========================================
def _frame_anomaly_scores(inputs, outputs, attention):
    """Blend several reconstruction-error statistics into one score per frame."""
    abs_err = torch.abs(inputs - outputs)

    # Core error metrics
    l1 = abs_err.mean(dim=[1, 2, 3])
    l2 = ((inputs - outputs) ** 2).mean(dim=[1, 2, 3])
    max_err = abs_err.amax(dim=[1, 2, 3])

    # Attention-weighted error: upsample the gate to image resolution first.
    attention_up = F.interpolate(attention, size=inputs.shape[2:],
                                 mode='bilinear', align_corners=False)
    attention_err = (abs_err * attention_up).mean(dim=[1, 2, 3])

    # Texture variation: difference in per-channel spatial std.
    std_in = inputs.std(dim=[2, 3]).mean(dim=1)
    std_out = outputs.std(dim=[2, 3]).mean(dim=1)
    std_err = torch.abs(std_in - std_out)

    # Frequency domain
    # NOTE(review): rfft2 runs with its default normalisation, so this term
    # may be on a different scale from the pixel-space terms. Confirm the
    # 0.06 weight accounts for that.
    inputs_freq = torch.fft.rfft2(inputs, dim=[2, 3])
    outputs_freq = torch.fft.rfft2(outputs, dim=[2, 3])
    freq_err = torch.abs(inputs_freq - outputs_freq).mean(dim=[1, 2, 3])

    # Peak signal: difference of per-channel maxima.
    peak_in = inputs.amax(dim=[2, 3]).mean(dim=1)
    peak_out = outputs.amax(dim=[2, 3]).mean(dim=1)
    peak_err = torch.abs(peak_in - peak_out)

    return (0.26 * l1 +
            0.22 * l2 +
            0.21 * max_err +
            0.15 * attention_err +
            0.08 * std_err +
            0.06 * freq_err +
            0.02 * peak_err)


def _smooth_video_scores(vid_scores):
    """Median-filter one video's scores with an odd window of 5, 7 or 9 frames."""
    window_size = min(9, max(5, len(vid_scores) // 15))
    if window_size % 2 == 0:
        window_size += 1
    return median_filter(vid_scores, size=window_size, mode='nearest')


def _min_max_normalize(scores):
    """Rescale to [0, 1]; a constant input maps to all zeros instead of NaN."""
    lowest = scores.min()
    span = scores.max() - lowest
    if span == 0:
        return np.zeros_like(scores, dtype=float)
    return (scores - lowest) / span


def run_inference(model):
    """Score every test frame and write the submission CSV to ``OUTPUT_PATH``.

    The frames under ``TEST_DIR`` are scored at several input scales (each
    resized back to ``IMAGE_SIZE`` for the model) and the per-frame scores
    are averaged. Scores are then median-filtered per video, clipped to the
    0.5-99.5th percentile range and min-max normalised.

    Args:
        model: Trained autoencoder returning ``(recon, encoded, attention)``.
            It is switched to eval mode and left that way.

    Returns:
        DataFrame with columns ``Id`` and ``Predicted``, sorted by ``Id``.

    Raises:
        ValueError: If no test frames are found under ``TEST_DIR``.
    """
    print("=" * 60)
    print("INFERENCE ")
    print("=" * 60)

    # Test at multiple scales and average (this improves robustness).
    scales = [80, 88, 96, 104, 112]
    print(f"Scales: {scales}\n")

    model.eval()
    all_predictions = []

    for scale in scales:
        print(f"Scale {scale}...")

        test_transform = transforms.Compose([
            transforms.Resize((scale, scale)),
            transforms.ToTensor(),
        ])

        test_dataset = VideoFrameDataset(TEST_DIR, test_transform, return_id=True)
        if len(test_dataset) == 0:
            # Otherwise the empty DataFrame below fails with a KeyError on
            # 'Predicted' that hides the real cause.
            raise ValueError(f"No test frames found under {TEST_DIR!r}")
        test_loader = DataLoader(test_dataset, BATCH_SIZE, shuffle=False, num_workers=2)

        results = []

        with torch.no_grad():
            for inputs, ids in test_loader:
                # If scale is not IMAGE_SIZE, interpolate back for the model.
                if scale != IMAGE_SIZE:
                    inputs = F.interpolate(inputs, size=(IMAGE_SIZE, IMAGE_SIZE),
                                           mode='bilinear', align_corners=False)

                inputs = inputs.to(DEVICE)
                outputs, _, attention = model(inputs)

                scores = _frame_anomaly_scores(inputs, outputs, attention).cpu().numpy()
                results.extend(
                    {'Id': id_str, 'Predicted': float(score)}
                    for id_str, score in zip(ids, scores)
                )

        all_predictions.append(pd.DataFrame(results))

    print(f"\n Averaging {len(scales)} scales...")
    # Every pass walks the same sorted listing without shuffling, so rows
    # line up by position.
    df = all_predictions[0].copy()
    for other in all_predictions[1:]:
        df['Predicted'] += other['Predicted'].values
    df['Predicted'] /= len(all_predictions)

    df[['vid', 'frame']] = df['Id'].str.split('_', expand=True).astype(int)
    df = df.sort_values(['vid', 'frame'])

    print(" Applying optimized temporal filter...")
    # Rows are in frame order within each video here, so the filter runs
    # along time; transform() aligns the result back on the index.
    df['Predicted'] = df.groupby('vid', sort=False)['Predicted'].transform(
        lambda video: _smooth_video_scores(video.to_numpy())
    )
    df = df.drop(columns=['vid', 'frame']).sort_values('Id')

    scores = df['Predicted'].values

    # Clipping to 0.5-99.5th percentile
    print(" Clipping to 0.5-99.5th percentile...")
    p05, p995 = np.percentile(scores, [0.5, 99.5])
    scores_clipped = np.clip(scores, p05, p995)

    print(f"\nFiltered & clipped scores:")
    print(f"  Min: {scores_clipped.min():.6f}")
    print(f"  Q25: {np.percentile(scores_clipped, 25):.6f}")
    print(f"  Q50: {np.percentile(scores_clipped, 50):.6f}")
    print(f"  Q75: {np.percentile(scores_clipped, 75):.6f}")
    print(f"  Max: {scores_clipped.max():.6f}")

    df['Predicted'] = _min_max_normalize(scores_clipped)

    df.to_csv(OUTPUT_PATH, index=False)

    print(f"\n Saved: {OUTPUT_PATH}")
    print(f"\nFirst 20 predictions:")
    print(df.head(20))

    return df


In [7]:
if __name__ == "__main__":
    # Train the autoencoder first; the returned model is fed straight to inference.
    model = train_autoencoder()
    # Score the test frames; run_inference also writes the CSV to OUTPUT_PATH.
    submission = run_inference(model)


TRAINING 

Samples: 9204

[1/8][0/95] Loss: 0.5054
[1/8][20/95] Loss: 0.4893
[1/8][40/95] Loss: 0.4508
[1/8][60/95] Loss: 0.4130
[1/8][80/95] Loss: 0.3916
Epoch 1 - Loss: 0.4444

[2/8][0/95] Loss: 0.3733
[2/8][20/95] Loss: 0.3385
[2/8][40/95] Loss: 0.3325
[2/8][60/95] Loss: 0.3020
[2/8][80/95] Loss: 0.2840
Epoch 2 - Loss: 0.3155

[3/8][0/95] Loss: 0.2780
[3/8][20/95] Loss: 0.2581
[3/8][40/95] Loss: 0.2418
[3/8][60/95] Loss: 0.2331
[3/8][80/95] Loss: 0.2262
Epoch 3 - Loss: 0.2414

[4/8][0/95] Loss: 0.2154
[4/8][20/95] Loss: 0.2207
[4/8][40/95] Loss: 0.2037
[4/8][60/95] Loss: 0.2033
[4/8][80/95] Loss: 0.1978
Epoch 4 - Loss: 0.2035

[5/8][0/95] Loss: 0.1898
[5/8][20/95] Loss: 0.1850
[5/8][40/95] Loss: 0.1863
[5/8][60/95] Loss: 0.1831
[5/8][80/95] Loss: 0.1791
Epoch 5 - Loss: 0.1830

[6/8][0/95] Loss: 0.1727
[6/8][20/95] Loss: 0.1751
[6/8][40/95] Loss: 0.1647
[6/8][60/95] Loss: 0.1727
[6/8][80/95] Loss: 0.1662
Epoch 6 - Loss: 0.1697

[7/8][0/95] Loss: 0.1644
[7/8][20/95] Loss: 0.1599
[7/8