# 08 - Audio-Visual Lip-Sync Deepfake Detection

## Objective
Detect deepfakes by analyzing audio-visual synchronization.

## Key Insight
Many deepfakes have lip-sync errors between audio and mouth movements.

## Approach
1. Extract mouth region from video frames
2. Extract audio features (MFCC, mel-spectrogram)
3. Use a SyncNet-style architecture to measure synchronization
4. Combine the synchronization measure with content-based features

In [None]:
import torch
import torch.nn as nn


class LipSyncDetector(nn.Module):
    """Two-stream network that classifies a clip as in-sync or out-of-sync.

    A 2-D CNN embeds each mouth-region frame, a 1-D CNN embeds the audio
    segment, and an MLP classifies the concatenation of the two embeddings.

    With the default arguments the network takes 3-channel frames and
    1-channel audio, uses 512-d embeddings and emits 2 logits. Layer order
    inside each ``nn.Sequential`` is fixed, so parameter names in the
    ``state_dict`` do not depend on the constructor arguments.

    Args:
        visual_channels: Channels per mouth frame (3 for RGB crops).
        audio_channels: Channels of the audio input (1 for a single-channel
            signal; e.g. the number of MFCC / mel bins when each coefficient
            is fed as its own channel).
        embed_dim: Width of the per-modality embedding.
        hidden_dim: Width of the classifier's hidden layer.
        num_classes: Number of output logits.
        dropout: Dropout probability inside the classifier.
    """

    def __init__(
        self,
        *,
        visual_channels: int = 3,
        audio_channels: int = 1,
        embed_dim: int = 512,
        hidden_dim: int = 256,
        num_classes: int = 2,
        dropout: float = 0.3,
    ) -> None:
        super().__init__()

        # Visual encoder (for mouth region). The adaptive pooling fixes the
        # spatial size at 4x4, so the Linear layer's input width does not
        # depend on the frame resolution.
        self.visual_encoder = nn.Sequential(
            nn.Conv2d(visual_channels, 64, kernel_size=5, stride=2, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.AdaptiveAvgPool2d((4, 4)),
            nn.Flatten(),
            nn.Linear(128 * 4 * 4, embed_dim),
        )

        # Audio encoder. The adaptive pooling fixes the temporal length at
        # 32, so the Linear layer's input width does not depend on the
        # audio segment length.
        self.audio_encoder = nn.Sequential(
            nn.Conv1d(audio_channels, 64, kernel_size=3, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(32),
            nn.Flatten(),
            nn.Linear(128 * 32, embed_dim),
        )

        # Sync measurement: classifies the concatenated embeddings.
        self.sync_classifier = nn.Sequential(
            nn.Linear(embed_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),  # In-sync vs out-of-sync
        )

    def forward(
        self, mouth_frames: torch.Tensor, audio_segment: torch.Tensor
    ) -> torch.Tensor:
        """Return sync logits for a batch of clips.

        Args:
            mouth_frames: Tensor of shape ``(B, T, C, H, W)`` with ``T >= 1``.
            audio_segment: Tensor of shape ``(B, audio_channels, L)``.

        Returns:
            Tensor of shape ``(B, num_classes)`` holding unnormalised logits.

        Raises:
            ValueError: If either input has the wrong number of dimensions
                or ``mouth_frames`` contains no frames.
        """
        if mouth_frames.dim() != 5:
            raise ValueError(
                "mouth_frames must be 5-D (B, T, C, H, W), "
                f"got shape {tuple(mouth_frames.shape)}"
            )
        if audio_segment.dim() != 3:
            raise ValueError(
                "audio_segment must be 3-D (B, C, L), "
                f"got shape {tuple(audio_segment.shape)}"
            )
        if mouth_frames.shape[1] == 0:
            raise ValueError("mouth_frames must contain at least one frame")

        # Encode each frame separately so BatchNorm sees one frame-batch at a
        # time, then average the embeddings over the time axis.
        # NOTE(review): the mean is invariant to frame order, so the visual
        # embedding carries no information about frame ordering.
        frame_feats = [
            self.visual_encoder(frame) for frame in mouth_frames.unbind(dim=1)
        ]
        visual_feat = torch.stack(frame_feats, dim=1).mean(dim=1)

        audio_feat = self.audio_encoder(audio_segment)

        # Combine and classify.
        combined = torch.cat([visual_feat, audio_feat], dim=1)
        return self.sync_classifier(combined)

# Notebook feedback: confirms that the cell ran and the class is defined.
print("Lip-Sync Detector Defined!")