# Scenario 1: All modalities exists, no heterginity

In [1]:
import os
import torch
import torch.cuda as cuda
from torch import nn, optim
from torch.utils.data import DataLoader
import torchvision.transforms as transf
from torchsummary import summary
import numpy as np
import pandas as pd

from data_feed import DataFeed, DataFeed_image_pos
from build_net import resnet50, NN_beam_pred, MultinomialLogisticRegression, resnet18_mod
import networkx as nx
import matplotlib.pyplot as plt
import json

In [2]:
if torch.cuda.is_available():
    print("CUDA is available! PyTorch can use the GPU.")
else:
    print("CUDA is not available. PyTorch will use the CPU.")


CUDA is available! PyTorch can use the GPU.


In [3]:
batch_size = 64
val_batch_size = 1
lr = 1e-3
decay = 1e-4
num_epochs = 20
train_size = [1]
no_users = 20

val_losses_stand_alone = []
val_losses_FL = []
val_losses_SFMTL = []


In [4]:
########################################################################
########################### Data pre-processing ########################
########################################################################


img_resize = transf.Resize((224, 224))
img_norm = transf.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
proc_pipe = transf.Compose(
    [transf.ToPILImage(),
     img_resize,
     transf.ToTensor(),
     img_norm]
)
dataset_dir = "feature_IID/"
train_loaders = []
test_loaders = []
val_loaders = []

for user_id in range(no_users):
    train_dir = dataset_dir + f'user_{user_id}_pos_height_beam_train.csv'
    val_dir = dataset_dir + f'user_{user_id}_pos_height_beam_val.csv'
    test_dir = dataset_dir + f'user_{user_id}_pos_height_beam_test.csv'
    
    train_dataset = DataFeed_image_pos(train_dir, transform=proc_pipe)
    val_dataset = DataFeed_image_pos(root_dir=val_dir, transform=proc_pipe)
    test_dataset = DataFeed_image_pos(root_dir=test_dir, transform=proc_pipe)
    
    
    train_loaders.append(DataLoader(train_dataset,
                              batch_size=batch_size,
                              #num_workers=8,
                              shuffle=False))
    val_loaders.append(DataLoader(val_dataset,
                            batch_size=val_batch_size,
                            #num_workers=8,
                            shuffle=False))
    test_loaders.append(DataLoader(test_dataset,
                            batch_size=val_batch_size,
                            #num_workers=8,
                            shuffle=False))
    print(f"Loaded_user: {user_id}")

Loaded_user: 0
Loaded_user: 1
Loaded_user: 2
Loaded_user: 3
Loaded_user: 4
Loaded_user: 5
Loaded_user: 6
Loaded_user: 7
Loaded_user: 8
Loaded_user: 9
Loaded_user: 10
Loaded_user: 11
Loaded_user: 12
Loaded_user: 13
Loaded_user: 14
Loaded_user: 15
Loaded_user: 16
Loaded_user: 17
Loaded_user: 18
Loaded_user: 19


In [5]:
for _, (x, y) in zip(range(1), train_loaders[0]):
    print(x["pos_height"].shape)

torch.Size([64, 4])


In [6]:
# Model Preperation#
all_models = []
available_modalities = ["pos_height", "images"]
user_modalities = [available_modalities for _ in range(no_users)]
modality_size = {"pos_height": 512, "images": 2048}
output_sizes = [sum([modality_size[i] for i in user_modality]) for user_modality in user_modalities]

local_classifiers = [MultinomialLogisticRegression(output_sizes[i], 64) for i in range(no_users)]
for user_id in range(no_users):
    user_model = {}
    if "images" in user_modalities[user_id]:
        user_model["images"] = resnet50(pretrained=True, progress=True, num_classes=64).cuda()
    if "pos_height" in user_modalities[user_id]:
        user_model["pos_height"] = NN_beam_pred(num_features=4, num_output=64).cuda()
    all_models.append(user_model)
    

Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottle

In [7]:
# Create a dictionary to store similarity matrices for each modality
similarity_matrices = {modality: np.zeros((no_users, no_users), dtype=int) for modality in available_modalities}

# Populate the similarity matrices
for modality in available_modalities:
    for i in range(no_users):
        for j in range(no_users):
            if modality in user_modalities[i] and modality in user_modalities[j]:
                similarity_matrices[modality][i, j] = 1

# Print the resulting matrices
for modality, matrix in similarity_matrices.items():
    print(f"Similarity Matrix for '{modality}':")
    print(matrix)

Similarity Matrix for 'pos_height':
[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]
Similarity Matrix for 'images':
[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 

In [31]:
def sinkhorn_knopp(matrix, tol=1e-9, max_iter=1000):
    """
    Converts a given matrix to a doubly stochastic matrix using the Sinkhorn-Knopp algorithm.
    
    Parameters:
        matrix (np.ndarray): The input matrix to be transformed.
        tol (float): The tolerance for convergence.
        max_iter (int): Maximum number of iterations for convergence.
    
    Returns:
        np.ndarray: A doubly stochastic matrix.
    """
    matrix = matrix.copy()
    for _ in range(max_iter):
        # Normalize rows
        row_sums = matrix.sum(axis=1, keepdims=True)
        matrix /= row_sums

        # Normalize columns
        col_sums = matrix.sum(axis=0, keepdims=True)
        matrix /= col_sums

        # Check for convergence
        if np.allclose(matrix.sum(axis=1), 1, atol=tol) and np.allclose(matrix.sum(axis=0), 1, atol=tol):
            break

    return matrix
    
def create_random_topology(num_users, edge_probability=0.3):
    """
    Creates a connected random topology using NetworkX.
    Returns the adjacency matrix.
    """
    while True:
        graph = nx.erdos_renyi_graph(num_users, edge_probability)
        if nx.is_connected(graph):
            break

    # Convert graph to adjacency matrix
    adjacency_matrix = nx.to_numpy_array(graph)
    return adjacency_matrix

def prepare_mixing_matrices(adjacency_matrix, similarity_matrices):
    """
    Computes a mixing matrix for each modality by multiplying the adjacency matrix 
    with the similarity matrix for that modality.
    Returns a dictionary of mixing matrices.
    """
    adjacency_matrices = {}
    mixing_matrices = {}
    for modality, similarity_matrix in similarity_matrices.items():
        # Element-wise multiplication of adjacency and similarity matrices
        combined_matrix = adjacency_matrix * similarity_matrix
        adjacency_matrices[modality] = combined_matrix
        
        # Normalize to create a doubly matrix
        mixing_matrix = sinkhorn_knopp(combined_matrix)
        
        
        mixing_matrices[modality] = mixing_matrix
    
    return mixing_matrices, adjacency_matrices




In [32]:
# Create random connected topology
adjacency_matrix = create_random_topology(no_users, edge_probability=0.3)

# Prepare mixing matrices for each modality
mixing_matrices, adjacency_matrices = prepare_mixing_matrices(adjacency_matrix, similarity_matrices)


In [10]:


# Decentralized aggregation function
def decentralized_aggregation(user_models, mixing_matrices, available_modalities):
    num_users = len(user_models)
    
    for modality in available_modalities:
        # Get the mixing matrix for the current modality
        mixing_matrix = mixing_matrices[modality]
        
        # Convert user model parameters to vectors for aggregation
        aggregated_models = [torch.nn.utils.parameters_to_vector(user_model[modality].parameters()) for user_model in user_models]
        
        # Initialize aggregated updates
        aggregated_updates = [torch.zeros_like(aggregated_models[0]) for _ in range(num_users)]
        
        # Perform model aggregation based on the mixing matrix for this modality
        for i in range(num_users):
            for j in range(num_users):
                if mixing_matrix[i, j] > 0:
                    aggregated_updates[i] += mixing_matrix[i, j] * aggregated_models[j]
        
        # Update user models with aggregated parameters for the current modality
        for i in range(num_users):
            torch.nn.utils.vector_to_parameters(aggregated_updates[i], user_models[i][modality].parameters())


def train_local_model(local_modalities, models, train_loader, criterion, optimizers, epochs):

    for modality in local_modalities:
        print(f"Training for modality: {modality}")
        
        model = models[modality]
        optimizer = optimizers[modality]
        
        model.train()
        for epoch in range(epochs):
            for data, labels in train_loader:
                # Move data to GPU if available
                data = data[modality]
                data, labels = data.cuda(), labels.cuda()
                
                # Zero the gradients
                optimizer.zero_grad()
                
                # Forward pass
                temp, outputs = model(data)
                print(temp.shape)

               
                # Compute loss
                loss = criterion(outputs, labels)
                
                # Backward pass and optimization
                loss.backward()
                optimizer.step()
                
                # Optional: Print loss for debugging
                print(f"Epoch [{epoch + 1}/{epochs}], Modality: {modality}, Loss: {loss.item():.4f}")
    
    return models



In [11]:
def validate_user_models(user_id, user_models, val_loaders, criterion):

    print(f"Validating model for User {user_id + 1}")


    with torch.no_grad():  # Disable gradient computation for validation
        for modality, model in user_models.items():
            total_loss = 0.0
            total_correct = 0
            total_samples = 0
            model.eval()
            if modality not in user_models.keys():
                print(f"Skipping modality {modality} for User {user_id + 1}, no validation data.")
                continue
            
            for data, labels in val_loaders:  # Iterate over validation data for the modality
                data = data[modality]
                data, labels = data.cuda(), labels.cuda()

                # Forward pass
                _, outputs = model(data)
                loss = criterion(outputs, labels)

                # Accumulate loss and accuracy
                total_loss += loss.item() * labels.size(0)  # Sum loss for the batch
                _, predicted = torch.max(outputs, 1)
                total_correct += (predicted == labels).sum().item()
                total_samples += labels.size(0)

            # Compute metrics
            avg_loss = total_loss / total_samples if total_samples > 0 else 0.0
            accuracy = total_correct / total_samples if total_samples > 0 else 0.0
        
            print(f"User {user_id + 1}, modality: {modality} - Validation Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")
        return {'loss': avg_loss, 'accuracy': accuracy}


In [12]:

local_epochs = 1
global_rounds = 10
weight_decay = 1e-5
criterion = nn.CrossEntropyLoss()

# Decentralized Training
for round_num in range(global_rounds):
    print(f"Global Round {round_num + 1}")

    # Training for image_modalities 
    print("Training image modalitity models")
    # Train each user's local model
    for user_id in range(no_users):
        print(f"Training model for User {user_id + 1}")
        user_models = all_models[user_id]
        optimizers = {}
        for modality in user_model.keys():
            optimizers[modality] = optim.Adam(user_models[modality].parameters(), lr=lr, weight_decay=weight_decay)
        train_local_model(user_modalities[user_id], user_models, train_loaders[user_id], criterion, optimizers, local_epochs)

    # Decentralized aggregation
    print("Performing decentralized aggregation")
    #decentralized_aggregation(user_models, mixing_matrix)

    # Optionally, validate models
    print("Validating user models...")
    for user_id in range(no_users):
        user_models = all_models[user_id]
        val_dict = validate_user_models(user_id, user_models, val_loaders[user_id], criterion)
        val_losses_stand_alone.append(val_dict)


print("Decentralized federated learning complete.")

file_path = 'val_losses_stand_alone.json'

# Save the list of dictionaries to the file
with open(file_path, 'w') as file:
    json.dump(val_losses_stand_alone, file, indent=4)

Global Round 1
Training image modalitity models
Training model for User 1
Training for modality: pos_height
torch.Size([64, 128])
Epoch [1/1], Modality: pos_height, Loss: 4.1766
torch.Size([64, 128])
Epoch [1/1], Modality: pos_height, Loss: 4.1608
torch.Size([64, 128])
Epoch [1/1], Modality: pos_height, Loss: 4.1276
torch.Size([64, 128])
Epoch [1/1], Modality: pos_height, Loss: 4.1209
torch.Size([64, 128])
Epoch [1/1], Modality: pos_height, Loss: 4.1139
torch.Size([64, 128])
Epoch [1/1], Modality: pos_height, Loss: 4.0743
torch.Size([64, 128])
Epoch [1/1], Modality: pos_height, Loss: 4.0565
torch.Size([64, 128])
Epoch [1/1], Modality: pos_height, Loss: 4.0252
torch.Size([64, 128])
Epoch [1/1], Modality: pos_height, Loss: 3.9783
torch.Size([64, 128])
Epoch [1/1], Modality: pos_height, Loss: 3.9705
torch.Size([64, 128])
Epoch [1/1], Modality: pos_height, Loss: 3.9210
torch.Size([64, 128])
Epoch [1/1], Modality: pos_height, Loss: 3.8806
torch.Size([64, 128])
Epoch [1/1], Modality: pos_hei

KeyboardInterrupt: 

# Decentralized FL

In [None]:
# Model Preperation#
all_models = []
available_modalities = ["pos_height", "images"]
user_modalities = [available_modalities for _ in range(no_users)]
modality_size = {"pos_height": 512, "images": 2048}
output_sizes = [sum([modality_size[i] for i in user_modality]) for user_modality in user_modalities]

local_classifiers = [MultinomialLogisticRegression(output_sizes[i], 64) for i in range(no_users)]
for user_id in range(no_users):
    user_model = {}
    if "images" in user_modalities[user_id]:
        user_model["images"] = resnet50(pretrained=True, progress=True, num_classes=64).cuda()
    if "pos_height" in user_modalities[user_id]:
        user_model["pos_height"] = NN_beam_pred(num_features=4, num_output=64).cuda()
    all_models.append(user_model)
    

In [40]:

local_epochs = 1
global_rounds = 10
weight_decay = 1e-5
criterion = nn.CrossEntropyLoss()
# Decentralized Training
for round_num in range(global_rounds):
    print(f"Global Round {round_num + 1}")

    # Training for image_modalities 
    print("Training image modalitity models")
    # Train each user's local model
    for user_id in range(no_users):
        print(f"Training model for User {user_id + 1}")
        user_models = all_models[user_id]
        optimizers = {}
        for modality in user_model.keys():
            optimizers[modality] = optim.Adam(user_models[modality].parameters(), lr=lr, weight_decay=weight_decay)
        #train_local_model(user_modalities[user_id], user_models, train_loaders[user_id], criterion, optimizers, local_epochs)

    # Decentralized aggregation
    print("Performing decentralized aggregation")
    decentralized_aggregation(all_models, mixing_matrices, available_modalities)

    # Optionally, validate models
    print("Validating user models...")
    for user_id in range(no_users):
        user_models = all_models[user_id]
        val_dict = validate_user_models(user_id, user_models, val_loaders[user_id], criterion)
        val_losses_FL.append(val_dict)


print("Decentralized federated learning complete.")

file_path = 'val_losses_FL.json'

# Save the list of dictionaries to the file
with open(file_path, 'w') as file:
    json.dump(val_losses_FL, file, indent=4)

Global Round 1
Training image modalitity models
Training model for User 1
Performing decentralized aggregation
Validating user models...
Validating model for User 1
User 1, modality: images - Validation Loss: 4.1566, Accuracy: 0.0000
User 1, modality: pos_height - Validation Loss: 4.1579, Accuracy: 0.0117
Global Round 2
Training image modalitity models
Training model for User 1
Performing decentralized aggregation
Validating user models...
Validating model for User 1


KeyboardInterrupt: 

# Sheaf_assisted FML

In [50]:
# Model Preperation#
all_models = []
available_modalities = ["pos_height", "images"]
user_modalities = [available_modalities for _ in range(no_users)]
modality_size = {"pos_height": 128, "images": 128}
output_sizes = [sum([modality_size[i] for i in user_modality]) for user_modality in user_modalities]

local_classifiers = [MultinomialLogisticRegression(output_sizes[i], 64).cuda() for i in range(no_users)]
for user_id in range(no_users):
    user_model = {}
    if "images" in user_modalities[user_id]:
        user_model["images"] = resnet50(pretrained=True, progress=True, num_classes=64).cuda()
    if "pos_height" in user_modalities[user_id]:
        user_model["pos_height"] = NN_beam_pred(num_features=4, num_output=64).cuda()
    all_models.append(user_model)
    

Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottleneck'>
Output layer dim = 64
<class 'build_net.Bottle

In [51]:
factor = 0.05
neighbors = [np.nonzero(adjacency_matrix[i])[0].tolist() for i in range(no_users)]

# Initialize P_ij matrices
# Initialize P_ij matrices
P = {}
for i, j in zip(*adjacency_matrix.nonzero()):
    num_params_i = sum(p.numel() for p in local_classifiers[i].parameters())
    num_params_j = sum(p.numel() for p in local_classifiers[j].parameters())
    P[(i, j)] = torch.randn(int(factor*(num_params_i + num_params_j) // 2), num_params_i).cuda()
    P[(j, i)] = torch.randn(int(factor*(num_params_i + num_params_j) // 2), num_params_j).cuda()


In [63]:
def validate_user_model_FMTL(user_id, user_models, client_model, val_loader, loss_func):


    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    # Disable gradient computation for validation
    with torch.no_grad():
        # Set all models to evaluation mode
        client_model.eval()
        for model in user_models.values():
            model.eval()

        for X_batch, y_batch in val_loader:
            # Concatenate outputs from all modality models
            modality_outputs = []
            y_batch = y_batch.cuda()

            for modality, model in user_models.items():

                # Forward pass through modality model
                data = X_batch[modality].cuda()
                modality_output, _ = model(data)
                modality_outputs.append(modality_output)

            # Skip if no valid modalities are present
            if len(modality_outputs) == 0:
                print(f"No valid modalities for User {user_id + 1}, skipping batch.")
                continue

            # Concatenate all modality outputs
            concatenated_output = torch.cat(modality_outputs, dim=1)

            # Forward pass through the classifier
            predictions = client_model(concatenated_output)

            # Compute loss
            loss = loss_func(client_model, concatenated_output, y_batch, l2_strength=0.01)
            total_loss += loss.item() * y_batch.size(0)

            # Compute accuracy
            _, predicted = torch.max(predictions, 1)
            total_correct += (predicted == y_batch).sum().item()
            total_samples += y_batch.size(0)

    # Compute overall metrics
    avg_loss = total_loss / total_samples if total_samples > 0 else 0.0
    accuracy = total_correct / total_samples if total_samples > 0 else 0.0

    print(f"User {user_id + 1} - Validation Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")
    return {'loss': avg_loss, 'accuracy': accuracy}


In [64]:
from utils import cross_entropy_loss_with_l2
local_epochs = 1
global_rounds = 10
weight_decay = 1e-5
criterion = nn.CrossEntropyLoss()
loss_func = cross_entropy_loss_with_l2
alpha = 0.01
lambda_reg = 0.01
eta = 0.01

# Decentralized Training
for round_num in range(global_rounds):
    print(f"Global Round {round_num + 1}")

    # Training for image_modalities 
    print("Training image modalitity models")
    # Train each user's local model
    for user_id in range(no_users):
        print(f"Training model for User {user_id + 1}")
        user_models = all_models[user_id]
        optimizers = {}
        for modality in user_model.keys():
            optimizers[modality] = optim.Adam(user_models[modality].parameters(), lr=lr, weight_decay=weight_decay)
        train_local_model(user_modalities[user_id], user_models, train_loaders[user_id], criterion, optimizers, local_epochs)

    # FMTL Training Part
    print("Performing FMTL aggregation")
    for i in range(no_users):
        client_model = local_classifiers[i]
        user_models = all_models[user_id]
        optimizer = optim.Adam(client_model.parameters(), lr=alpha, weight_decay=weight_decay)
        
        # Training the model
        client_model.train()
        for X_batch, y_batch in train_loaders[i]:
            # Ensure all modality models are in evaluation mode
            for modality, model in user_models.items():
                model.eval()
        
            # Concatenate outputs from all modality models
            modality_outputs = []
            for modality, model in user_models.items():
                with torch.no_grad():  # Do not compute gradients for modality models 
                    data, y_batch = X_batch[modality].cuda(), y_batch.cuda()
                    modality_output, _ = model(data)
                    modality_outputs.append(modality_output)

        
            # Concatenate all modality outputs
            concatenated_output = torch.cat(modality_outputs, dim=1)

            # Forward pass through the classifier
            optimizer.zero_grad()
            predictions = client_model(concatenated_output)
            
                    
            # Compute the loss
            loss = loss_func(client_model, concatenated_output, y_batch, l2_strength=0.01)
        
            # Backward pass and optimizer step
            loss.backward()
            optimizer.step()
        
    
        with torch.no_grad():
            theta_i = torch.cat([param.view(-1) for param in client_model.parameters()])
    
            # Update theta_i based on P_ij and neighbors
            sum_P_terms = torch.zeros_like(theta_i)
            for j in range(no_users):
                if adjacency_matrix[i, j] == 1:
                    P_ij = P[(i, j)]
                    P_ji = P[(j, i)]
                    theta_j = torch.cat([param.view(-1) for param in local_classifiers[j].parameters()])
                    sum_P_terms += P_ij.T @ (P_ij @ theta_i - P_ji @ theta_j)
    
            # Apply update to theta_i
            theta_i -= alpha * lambda_reg * sum_P_terms
    
            # Put updated theta_i back into the model
            idx = 0
            for param in client_model.parameters():
                numel = param.numel()
                param.data.copy_(theta_i[idx:idx+numel].reshape(param.size()))
                idx += numel
    
            # Update P_ij matrices
            for j in range(no_users):
                if adjacency_matrix[i, j] == 1:
                    P_ij = P[(i, j)]
                    P_ji = P[(j, i)]
                    theta_j = torch.cat([param.view(-1) for param in local_classifiers[j].parameters()])
    
                    # Update P_ij
                    P[(i, j)] -= eta * lambda_reg * torch.outer(P_ij @ theta_i - P_ji @ theta_j, theta_i)
        

    # Optionally, validate models
    print("Validating user models...")
    for user_id in range(no_users):
        user_models = all_models[user_id]
        client_model = local_classifiers[i]
        val_dict = validate_user_model_FMTL(user_id, user_models, client_model, val_loaders[user_id], loss_func)
        val_losses_SFMTL.append(val_dict)



print("Decentralized federated learning complete.")

file_path = 'val_losses_SFMTL.json'

# Save the list of dictionaries to the file
with open(file_path, 'w') as file:
    json.dump(val_losses_SFMTL, file, indent=4)

Global Round 1
Training image modalitity models
Performing FMTL aggregation
Validating user models...
User 1 - Validation Loss: 4.4811, Accuracy: 0.0000
User 2 - Validation Loss: 4.7501, Accuracy: 0.0000


KeyboardInterrupt: 