In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader, Dataset
import numpy as np

# Parameters
# -- Problem dimensions --
num_modalities = 3     # Images, text, and speech
input_dim = 16         # Width of the feature vector for each modality
output_dim = 8         # Width of the features after the attention layer

# -- Optimisation settings --
num_epochs = 200       # Number of passes over the training loader
batch_size = 32        # Samples per mini-batch handed to the DataLoaders
learning_rate = 0.01   # Learning rate passed to the optimizer

# Dummy Data Creation
class MultimodalDummyDataset(Dataset):
    """Synthetic three-modality dataset with random features and binary labels.

    Each sample carries one feature vector per modality (image, text, speech)
    drawn with ``torch.randn`` and a label drawn with ``torch.randint(0, 2)``.
    Features and labels are generated independently of each other, so the
    data is only suitable for exercising a pipeline end to end.

    Args:
        num_samples: Number of samples to generate.
        feature_dim: Width of every modality's feature vector. ``None`` (the
            default) falls back to the module-level ``input_dim`` so existing
            callers keep their behaviour.
    """

    def __init__(self, num_samples=1000, feature_dim=None):
        if feature_dim is None:
            # Resolved at call time so the module-level setting is honoured.
            feature_dim = input_dim
        self.num_samples = num_samples
        self.feature_dim = feature_dim
        self.image_data = torch.randn(num_samples, feature_dim)   # Random image features
        self.text_data = torch.randn(num_samples, feature_dim)    # Random text features
        self.speech_data = torch.randn(num_samples, feature_dim)  # Random speech features
        self.labels = torch.randint(0, 2, (num_samples,))         # Random binary labels (0 or 1)

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of tensors.

        Keys are ``'image'``, ``'text'``, ``'speech'`` (feature vectors) and
        ``'label'`` (a zero-dimensional integer tensor).
        """
        return {
            'image': self.image_data[idx],
            'text': self.text_data[idx],
            'speech': self.speech_data[idx],
            'label': self.labels[idx]
        }

# Multimodal Graph Attention Layer
class MultimodalGraphAttentionLayer(nn.Module):
    """Gate input features with softmax attention scores.

    Scores are ``x @ attention_weights``, normalised with a softmax over
    dimension 1 and multiplied element-wise with the features.

    The scores have ``output_dim`` columns while ``x`` has ``input_dim``, so
    the element-wise product is only defined when the two sizes are equal or
    one of them is 1 (broadcasting). For any other combination ``x`` is first
    mapped to ``output_dim`` by a learned linear projection; otherwise the
    input is used unchanged.

    NOTE: a later class in this file reuses this name, so once the file has
    run, ``MultimodalGraphAttentionLayer`` refers to that later definition.

    Args:
        input_dim: Size of the last dimension of the input features.
        output_dim: Number of attention columns.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # torch.empty only allocates storage; Xavier init fills in the values.
        self.attention_weights = nn.Parameter(torch.empty(input_dim, output_dim))
        nn.init.xavier_uniform_(self.attention_weights)

        # Project only when scores and input could not be multiplied as-is,
        # so every configuration that already worked keeps its exact output.
        shapes_broadcast = input_dim == output_dim or 1 in (input_dim, output_dim)
        if shapes_broadcast:
            self.value_projection = nn.Identity()
        else:
            self.value_projection = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return the (possibly projected) features scaled by their attention.

        Args:
            x: Tensor whose last dimension has size ``input_dim``.

        Returns:
            The element-wise product of the softmax-normalised scores and the
            features.
        """
        attention_scores = torch.matmul(x, self.attention_weights)
        attention_weights = torch.softmax(attention_scores, dim=1)
        return attention_weights * self.value_projection(x)

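# NOTE: the Multimodal Adversarial Neural Network (`MultimodalAdversarialNN`, instantiated below)
# is not defined in this listing; it must already exist in the session before this code runs.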
# Multimodal Graph Attention Layer
class MultimodalGraphAttentionLayer(nn.Module):
    """Project features to ``output_dim`` and gate them with softmax attention.

    The input is mapped to ``output_dim`` by a linear layer. Attention scores
    are computed from that projection with a square ``output_dim x output_dim``
    weight matrix, normalised with a softmax over dimension 1, and multiplied
    element-wise with the projection.

    An earlier class in this file uses the same name; this later definition
    is the one the name refers to once the file has run.

    Args:
        input_dim: Size of the last dimension of the input features.
        output_dim: Size of the last dimension of the output.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # torch.empty follows the default dtype, exactly like nn.Linear below.
        # The legacy torch.FloatTensor constructor is always float32, which
        # makes the matmul in forward() fail under a non-float32 default dtype.
        self.attention_weights = nn.Parameter(torch.empty(output_dim, output_dim))
        self.linear_projection = nn.Linear(input_dim, output_dim)  # Project input to output_dim
        nn.init.xavier_uniform_(self.attention_weights)

    def forward(self, x):
        """Return the projected features scaled by their attention weights.

        Args:
            x: Tensor whose last dimension has size ``input_dim``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``output_dim``.
        """
        x_proj = self.linear_projection(x)  # Project input to output_dim
        # Scores are x_proj @ W^T, so each score column uses one row of W.
        attention_scores = torch.matmul(x_proj, self.attention_weights.t())
        attention_weights = torch.softmax(attention_scores, dim=1)
        return attention_weights * x_proj  # Multiply with projected input


# Training and Evaluation
# Two independently generated synthetic splits: 800 training samples and
# 200 validation samples.
train_dataset = MultimodalDummyDataset(800)
validate_dataset = MultimodalDummyDataset(200)

# Only the training loader shuffles; validation batches keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validate_loader = DataLoader(validate_dataset, batch_size=batch_size, shuffle=False)

# Model, optimizer over all of its parameters, and the classification loss.
model = MultimodalAdversarialNN(input_dim, output_dim, num_modalities)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Training Loop
for epoch in range(num_epochs):
    model.train()  # switch the model to training mode for this epoch
    for batch in train_loader:
        # Pull the three modality tensors and the targets out of the batch dict.
        image, text, speech, labels = (
            batch[field] for field in ('image', 'text', 'speech', 'label')
        )

        # One optimisation step: clear old gradients, forward pass, loss,
        # backward pass, parameter update.
        optimizer.zero_grad()
        outputs = model(image, text, speech)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # `loss` still refers to the final mini-batch of the epoch, so the value
    # reported here is that single batch's loss, not an epoch average.
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# Evaluation
model.eval()  # switch the model to evaluation mode
y_true, y_pred = [], []
with torch.no_grad():  # no gradients are needed while scoring
    for batch in validate_loader:
        image, text, speech, labels = (
            batch[field] for field in ('image', 'text', 'speech', 'label')
        )

        # The predicted class is the index of the largest score along dim 1.
        outputs = model(image, text, speech)
        _, predicted = outputs.max(dim=1)

        # Collect targets and predictions across batches for the metrics.
        y_true.extend(labels.numpy())
        y_pred.extend(predicted.numpy())

# Performance Metrics
# Every score is computed from the same (y_true, y_pred) pair, in this order,
# before anything is printed.
accuracy, precision, recall, f1 = (
    score(y_true, y_pred)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Epoch [1/200], Loss: 0.6775
Epoch [2/200], Loss: 0.6670
Epoch [3/200], Loss: 0.6860
Epoch [4/200], Loss: 0.6565
Epoch [5/200], Loss: 0.6838
Epoch [6/200], Loss: 0.5521
Epoch [7/200], Loss: 0.5768
Epoch [8/200], Loss: 0.3839
Epoch [9/200], Loss: 0.4500
Epoch [10/200], Loss: 0.4030
Epoch [11/200], Loss: 0.4865
Epoch [12/200], Loss: 0.4438
Epoch [13/200], Loss: 0.5197
Epoch [14/200], Loss: 0.3598
Epoch [15/200], Loss: 0.5107
Epoch [16/200], Loss: 0.4010
Epoch [17/200], Loss: 0.3872
Epoch [18/200], Loss: 0.3967
Epoch [19/200], Loss: 0.3528
Epoch [20/200], Loss: 0.3606
Epoch [21/200], Loss: 0.3815
Epoch [22/200], Loss: 0.4583
Epoch [23/200], Loss: 0.3484
Epoch [24/200], Loss: 0.3988
Epoch [25/200], Loss: 0.3777
Epoch [26/200], Loss: 0.4028
Epoch [27/200], Loss: 0.5175
Epoch [28/200], Loss: 0.3801
Epoch [29/200], Loss: 0.4022
Epoch [30/200], Loss: 0.3797
Epoch [31/200], Loss: 0.3466
Epoch [32/200], Loss: 0.3570
Epoch [33/200], Loss: 0.4612
Epoch [34/200], Loss: 0.3763
Epoch [35/200], Loss: 0