In [1]:
#@title Load packages
# Import the PyTorch modules used for building neural networks and for data handling

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, Dataset

In [None]:
#@title Download data
# Download the multimodal dataset (images and audio) from Google Drive using gdown

!gdown --folder https://drive.google.com/drive/folders/1sfPdmwKSg1hf-Kn7k2piaXA1OPy7rVQI?usp=share_link

In [None]:
#@title Load data
# Load the pre-processed training and test data tensors for both image and audio modalities

# Single place to point at the data folder (presumably populated by the
# download cell above - adjust this one constant when running outside Colab).
DATA_DIR = '/content/nldl-2026-multimodal-tutorial'

# NOTE(review): torch.load unpickles file contents. Only load files from a
# source you trust; consider passing weights_only=True if your PyTorch version
# supports it.

# Image modality: inputs are cast to float32, labels to int64.
image_train = torch.load(f'{DATA_DIR}/training_images.pth').float()
image_train_labels = torch.load(f'{DATA_DIR}/training_images_labels.pth').long()
image_test = torch.load(f'{DATA_DIR}/test_images.pth').float()

# Audio modality: same casting as for the images.
audio_train = torch.load(f'{DATA_DIR}/training_audio.pth').float()
audio_train_labels = torch.load(f'{DATA_DIR}/training_audio_labels.pth').long()
audio_test = torch.load(f'{DATA_DIR}/test_audio.pth').float()

In [None]:
#@title Create dataset
# Define a custom PyTorch Dataset class that pairs images with audio samples and their labels.
# Split the training data into train and validation sets (80/20 split).


from typing import Optional, Tuple, Any

class MultiModalDataset(Dataset):
    """
    Small Torch Dataset-style wrapper that returns (image, audio, label).

    Args:
        image_tensor, audio_tensor: tensors or array-like objects of the same
            length; item ``i`` of each is returned together.
        image_labels, audio_labels: optional label tensors. When given, each
            must have the same length as the data.
        unify_labels: if True, ``__getitem__`` returns a single label (the
            image label when available, otherwise the audio label, otherwise
            None) and, when both label tensors are given, they must be equal.
            If False, ``__getitem__`` returns a tuple (image_label, audio_label).

    Raises:
        ValueError: if the two data tensors differ in length, if a label tensor
            does not match the data length, or if ``unify_labels`` is True and
            the two label tensors differ.

    Note: there are no transform / preprocessing hooks - the dataset returns
    the stored items as they are.
    """
    def __init__(self,
                 image_tensor: torch.Tensor,
                 audio_tensor: torch.Tensor,
                 image_labels: Optional[torch.Tensor] = None,
                 audio_labels: Optional[torch.Tensor] = None,
                 unify_labels: bool = True):
        super().__init__()
        if len(image_tensor) != len(audio_tensor):
            raise ValueError("image_tensor and audio_tensor must have the same length")

        # Fail early on mis-sized labels instead of raising an IndexError (or
        # silently ignoring surplus labels) later inside __getitem__.
        for name, labels in (("image_labels", image_labels),
                             ("audio_labels", audio_labels)):
            if labels is not None and len(labels) != len(image_tensor):
                raise ValueError(f"{name} must have the same length as the data tensors")

        self.images = image_tensor
        self.audios = audio_tensor
        self.image_labels = image_labels
        self.audio_labels = audio_labels
        self.unify_labels = unify_labels

        # A single unified label only makes sense if both modalities agree.
        if unify_labels and image_labels is not None and audio_labels is not None:
            if not torch.equal(image_labels, audio_labels):
                raise ValueError("image_labels and audio_labels differ but unify_labels=True")

    def __len__(self) -> int:
        """Number of paired samples."""
        return len(self.images)

    def __getitem__(self, index: int) -> Tuple[Any, Any, Any]:
        """Return ``(image, audio, label)`` for ``index`` (see class docstring for the label form)."""
        img = self.images[index]
        aud = self.audios[index]

        if self.unify_labels:
            # Prefer the image label; fall back to the audio label; else None.
            label = None
            if self.image_labels is not None:
                label = self.image_labels[index]
            elif self.audio_labels is not None:
                label = self.audio_labels[index]
            return img, aud, label

        img_lbl = None if self.image_labels is None else self.image_labels[index]
        aud_lbl = None if self.audio_labels is None else self.audio_labels[index]
        return img, aud, (img_lbl, aud_lbl)


# Full labelled training set; both modalities carry labels, which must agree
# because a single unified label is requested.
train_dataset_pre = MultiModalDataset(
    image_train,
    audio_train,
    image_labels=image_train_labels,
    audio_labels=audio_train_labels,
    unify_labels=True,
)

# The test set gets all-zero placeholder labels (one per test image) so that
# every item still carries a label tensor.
test_dataset = MultiModalDataset(
    image_test,
    audio_test,
    image_labels=torch.zeros(len(image_test), dtype=torch.long),  # dummy labels
    audio_labels=torch.zeros(len(image_test), dtype=torch.long),  # dummy labels
    unify_labels=True,
)

# Hold out a fraction of the training data for validation.
val_fraction = 0.2
_total_train = len(train_dataset_pre)
_val_len = int(_total_train * val_fraction)
_train_len = _total_train - _val_len

# A seeded generator keeps the train/validation split identical across runs.
_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = torch.utils.data.random_split(
    train_dataset_pre,
    lengths=[_train_len, _val_len],
    generator=_generator,
)


In [None]:
#@title Define model
# Define the neural network architecture with separate CNN encoders for image and audio,
# and a multimodal classifier that fuses features from both modalities.
# Students need to implement the fusion strategy.


class ImageEncoder(nn.Module):
    """CNN encoder that maps a batch of single-channel images to 100-dimensional features.

    Two 3x3 convolutions (1 -> 32 -> 64 channels), each followed by ReLU and
    2x2 max-pooling, then dropout and a linear projection. ``fc1`` expects
    64 * 7 * 7 flattened inputs, i.e. a 7x7 feature map after the two poolings
    (consistent with 28x28 input images).

    Note: ``bn1`` and ``bn2`` are registered as submodules but are not applied
    in ``forward``.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(num_features=32)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(num_features=64)
        self.fc1 = nn.Linear(in_features=64 * 7 * 7, out_features=100)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode ``x`` and return one 100-dimensional feature row per sample."""
        # conv -> ReLU -> 2x2 max-pool, twice
        hidden = F.max_pool2d(F.relu(self.conv1(x)), kernel_size=2)
        hidden = F.max_pool2d(F.relu(self.conv2(hidden)), kernel_size=2)

        # flatten everything except the batch dimension
        flat = hidden.view(hidden.size(0), -1)

        # dropout is active only while the module is in training mode
        flat = F.dropout(flat, p=0.25, training=self.training)
        return self.fc1(flat)

class AudioEncoder(nn.Module):
    """1-D CNN encoder that maps a batch of single-channel signals to 100-dimensional features.

    Two strided convolutions (kernel 7, stride 2; 1 -> 32 -> 64 channels), each
    followed by ReLU and max-pooling with window 4, then dropout and a linear
    projection. ``fc1`` expects 7936 (= 64 * 124) flattened inputs, so the
    input length must leave 124 time steps after the second pooling.

    Note: ``bn1`` and ``bn2`` are registered as submodules but are not applied
    in ``forward``.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=32, kernel_size=7, stride=2)
        self.bn1 = nn.BatchNorm1d(num_features=32)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=7, stride=2)
        self.bn2 = nn.BatchNorm1d(num_features=64)
        self.fc1 = nn.Linear(in_features=7936, out_features=100)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode ``x`` and return one 100-dimensional feature row per sample."""
        # conv -> ReLU -> max-pool(4), twice
        hidden = F.max_pool1d(F.relu(self.conv1(x)), kernel_size=4)
        hidden = F.max_pool1d(F.relu(self.conv2(hidden)), kernel_size=4)

        # flatten everything except the batch dimension
        flat = hidden.view(hidden.size(0), -1)

        # dropout is active only while the module is in training mode
        flat = F.dropout(flat, p=0.2, training=self.training)
        return self.fc1(flat)

class MultiModalClassifier(nn.Module):
    """Multi-modal classifier combining an image encoder and an audio encoder.

    This is a runnable baseline: the two feature vectors are concatenated,
    passed through one hidden layer with ReLU, and mapped to class logits.
    Replace ``fusion_module`` (and adjust ``fc_combined``) to experiment with
    other fusion strategies.

    Args:
        image_encoder: module mapping an image batch to (batch, image_dim) features.
        audio_encoder: module mapping an audio batch to (batch, audio_dim) features.
        image_dim: width of the image features (default 100).
        audio_dim: width of the audio features (default 100).
        hidden_dim: width of the hidden layer applied to the fused features.
        num_classes: number of output logits.
    """
    def __init__(self,
                 image_encoder: nn.Module,
                 audio_encoder: nn.Module,
                 image_dim: int = 100,
                 audio_dim: int = 100,
                 hidden_dim: int = 128,
                 num_classes: int = 10):
        super(MultiModalClassifier, self).__init__()
        self.image_encoder = image_encoder
        self.audio_encoder = audio_encoder

        # Concatenation fusion: the fused vector has image_dim + audio_dim entries.
        # Change these two layers if you change the fusion strategy.
        self.fc_combined = nn.Linear(image_dim + audio_dim, hidden_dim)
        self.output_layer = nn.Linear(hidden_dim, num_classes)

    def forward(self, image: torch.Tensor, audio: torch.Tensor) -> torch.Tensor:
        """Return class logits of shape (batch, num_classes) for paired inputs."""
        img_features = self.image_encoder(image)
        aud_features = self.audio_encoder(audio)

        # Fuse the modalities, then apply a small non-linear head.
        combined = self.fusion_module(img_features, aud_features)
        output = F.relu(self.fc_combined(combined))

        output = self.output_layer(output)
        return output

    def fusion_module(self, img_features: torch.Tensor, aud_features: torch.Tensor) -> torch.Tensor:
        """Fuse the per-modality features by concatenating them along the feature axis."""
        combined = torch.cat([img_features, aud_features], dim=1)
        return combined



In [None]:
#@title Training and evaluation functions
# Helper functions to train the model for one epoch and evaluate performance on validation/test sets


def train_one_epoch(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over ``loader`` and return the mean training loss.

    The model is switched to training mode. Each batch is moved to ``device``
    (labels cast to int64), then a zero_grad / forward / backward / step cycle
    is performed. The returned value is the per-sample average of the batch
    losses (each batch loss weighted by its batch size), or 0.0 when the
    loader yields no samples.
    """
    model.train()

    loss_sum = 0.0
    n_seen = 0
    for batch in loader:
        images, audios, labels = batch
        images = images.to(device)
        audios = audios.to(device)
        labels = labels.to(device, dtype=torch.long)

        optimizer.zero_grad()
        loss = criterion(model(images, audios), labels)
        loss.backward()
        optimizer.step()

        # weight the batch loss by its size so the final mean is per sample
        batch_n = images.size(0)
        loss_sum += loss.item() * batch_n
        n_seen += batch_n

    if n_seen > 0:
        return loss_sum / n_seen
    return 0.0

def evaluate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` and return ``(mean_loss, accuracy)``.

    The model is switched to eval mode and gradients are disabled. Each batch
    is moved to ``device`` (labels cast to int64). The loss is averaged per
    sample (batch losses weighted by batch size); accuracy is the fraction of
    samples whose arg-max logit equals the label. Both values are 0.0 when the
    loader yields no samples.
    """
    model.eval()

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for batch in loader:
            images, audios, labels = batch
            images = images.to(device)
            audios = audios.to(device)
            labels = labels.to(device, dtype=torch.long)

            logits = model(images, audios)
            batch_n = images.size(0)

            loss_sum += criterion(logits, labels).item() * batch_n
            n_correct += (logits.argmax(dim=1) == labels).sum().item()
            n_seen += batch_n

    if n_seen == 0:
        return 0.0, 0.0
    return loss_sum / n_seen, n_correct / n_seen



In [None]:
#@title Training and evaluation loop
# Set up data loaders, instantiate the model, and run the training loop for 20 epochs.
# Monitor training loss and validation accuracy after each epoch.

batch_size = 64
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on CPU-only runtimes instead of failing in `.to('cuda')`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Only the training loader is shuffled; validation and test keep dataset order
# (the test order is what the prediction ids later refer to).
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,  num_workers=0)
val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False, num_workers=0)
test_loader  = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False, num_workers=0)

model = MultiModalClassifier(ImageEncoder(), AudioEncoder())
model = model.to(device)

num_epochs = 20
criterion = nn.CrossEntropyLoss()
# Adam with its default hyper-parameters
optimizer = torch.optim.Adam(model.parameters())

# Run training with validation after each epoch
for epoch in range(1, num_epochs + 1):
    train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)

    print(f"Epoch {epoch:02d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")



In [None]:
#@title Generate predictions on test set and save as CSV
# Run inference on the test set using the trained model and save predictions to a CSV file

import numpy as np
import pandas as pd

# Put the model on the target device and switch it to inference mode
model = model.to(device)
model.eval()

# Predict a class index for every test batch; the (dummy) test labels are ignored.
pred_chunks = []
with torch.no_grad():
    for images, audios, _labels in test_loader:
        images = images.to(device, dtype=torch.float32)
        audios = audios.to(device, dtype=torch.float32)
        outputs = model(images, audios)
        # arg-max over the class dimension, moved to host memory as a NumPy array
        preds = outputs.argmax(dim=1).cpu().numpy()
        pred_chunks.append(preds)

# Stitch the per-batch predictions together and check that none went missing
preds = np.concatenate(pred_chunks, axis=0)
n = len(test_dataset)
assert preds.shape[0] == n, f"Predictions ({preds.shape[0]}) != test samples ({n})"

# Write a CSV with two columns: `id` (0..n-1, in test-set order) and `digit`
# (the predicted class index).
df = pd.DataFrame({"id": np.arange(n), "digit": preds})
df.to_csv('test_multimodal_predictions.csv', index=False)
print(f"Saved predictions for {n} samples to test_multimodal_predictions.csv")
#