In [1]:
from nvflare.app_common.widgets.intime_model_selector import IntimeModelSelector
from nvflare.app_common.workflows.fedavg import FedAvg
from nvflare.app_opt.pt.job_config.base_fed_job import BaseFedJob
from nvflare.job_config.script_runner import ScriptRunner

from meps_model import UnWeightedLitDNN



In [2]:
# Federated job definition for the MEPS Lightning model.
#   * initial_model - the unweighted Lightning DNN handed to the job as the
#     initial model.
#   * key_metric    - name of the validation metric used for global model
#     selection.
#
# "val_loss" is a lower-is-better metric, but the default IntimeModelSelector
# treats a *higher* key metric as better, which would tag the worst-loss model
# as the "best" one. Passing a selector with negate_key_metric=True flips the
# sign so the lowest validation loss wins.
# NOTE(review): the `intime_model_selector` argument requires an NVFlare release
# whose BaseFedJob accepts it (2.6+, I believe) - confirm against the installed
# version.
job = BaseFedJob(
    name="meps_lightning_fedavg",
    initial_model=UnWeightedLitDNN(),
    key_metric="val_loss",
    intime_model_selector=IntimeModelSelector(key_metric="val_loss", negate_key_metric=True),
)

In [3]:
# Number of participating sites; reused further down when the per-site
# script runners are attached to the job.
n_clients = 3

# Server-side FedAvg workflow: 5 rounds across all `n_clients` sites.
controller = FedAvg(num_rounds=5, num_clients=n_clients)
job.to(controller, "server")

In [4]:
# Attach one ScriptRunner per site. Sites are numbered from 1 (site_1, site_2, ...)
# and each one is pointed at the dataset directory carrying the same name.
for site_idx in range(1, n_clients + 1):
    site_name = f"site_{site_idx}"
    # Command-line arguments forwarded to the client training script.
    script_args = (
        "--batch_size 256 --local_epochs 100 --no-weights "
        f"--dataset_path /data/shared/analysis/bias_fairness/for_fl/{site_name}"
    )
    runner = ScriptRunner(script="meps_nvflare.py", script_args=script_args)
    job.to(runner, site_name)

In [5]:
# Execute the configured job with the NVFlare simulator; the path below is
# passed as the simulator's working directory.
simulator_workspace = "/data/shared/analysis/bias_fairness/nvflare_wkdir"
job.simulator_run(simulator_workspace)

2025-07-13 09:22:41,273 - SimulatorRunner - INFO - Create the Simulator Server.
2025-07-13 09:22:41,277 - CoreCell - INFO - server: creating listener on tcp://0:57403
2025-07-13 09:22:41,343 - CoreCell - INFO - server: created backbone external listener for tcp://0:57403
2025-07-13 09:22:41,344 - ConnectorManager - INFO - 545801: Try start_listener Listener resources: {'secure': False, 'host': 'localhost'}
2025-07-13 09:22:41,347 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector [CH00002 PASSIVE tcp://0:17452] is starting
2025-07-13 09:22:41,849 - CoreCell - INFO - server: created backbone internal listener for tcp://localhost:17452
2025-07-13 09:22:41,850 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector [CH00001 PASSIVE tcp://0:57403] is starting
2025-07-13 09:22:41,853 - SimulatorServer - INFO - max_reg_duration=60.0
2025-07-13 09:22:42,142 - nvflare.fuel.hci.server.hci - INFO - Starting Admin Server localhost on Port 33733
2025-07-13 09:22:42,143 - SimulatorRunner - INFO - 

INFO: 
  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | DeepNN | 7.6 K  | train
-----------------------------------------
7.6 K     Trainable params
0         Non-trainable params
7.6 K     Total params
0.030     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode


Epoch 0:   6%|▌         | 1/17 [00:00<00:00, 47.00it/s, v_num=0]2025-07-13 09:22:55,515 - pytorch_lightning.utilities.rank_zero - INFO - You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
Epoch 0:  12%|█▏        | 2/17 [00:00<00:00, 48.84it/s, v_num=0]2025-07-13 09:22:55,530 - pytorch_lightning.utilities.rank_zero - INFO - GPU available: False, used: False
2025-07-13 09:22:55,531 - pytorch_lightning.utilities.rank_zero - INFO - TPU available: False, using: 0 TPU cores
2025-07-13 09:22:55,531 - pytorch_lightning.utilities.rank_zero - INFO - HPU available: False, using: 0 HPUs
2025-07-13 09:22:55,532 - nvflare.app_common.executors.task_script_runner - INFO - 
[Current Round=0, Site = site_3]

2025-07-13 09:22:55,532 - nvflare.app_common.executors.task_script_runner - INFO - --- validate global model ---
Epoch 0:  24%|██▎       | 4/17 [00:00<00:00, 50.37it/s, v_num=0]2025-07-13 09:22:55,564 - pytorch_lightni

INFO: 
  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | DeepNN | 7.6 K  | train
-----------------------------------------
7.6 K     Trainable params
0         Non-trainable params
7.6 K     Total params
0.030     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode
INFO: 
  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | DeepNN | 7.6 K  | train
-----------------------------------------
7.6 K     Trainable params
0         Non-trainable params
7.6 K     Total params
0.030     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 17/17 [00:00<00:00, 37.52it/s, v_num=0]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/3 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/3 [00:00<?, ?it/s][A
Epoch 0:  24%|██▎       | 4/17 [00:00<00:00, 18.70it/s, v_num=0]1it/s][A
Validation DataLoader 0:  67%|██████▋   | 2/3 [00:00<00:00, 43.16it/s][A
Validation DataLoader 0: 100%|██████████| 3/3 [00:00<00:00, 57.54it/s][A
Epoch 0: 100%|██████████| 17/17 [00:00<00:00, 20.26it/s, v_num=0]     [A
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/3 [00:00<?, ?it/s][A
Epoch 1:  76%|███████▋  | 13/17 [00:00<00:00, 20.73it/s, v_num=0]
Epoch 0: 100%|██████████| 17/17 [00:00<00:00, 20.44it/s, v_num=0]2it/s][A
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation DataLoader 0:  67%|██████▋   | 2/3 [00:00<00:00, 60.61it/s] [A
Validation:   0%|          | 0/3 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/3 [00:

INFO: 
  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | DeepNN | 7.6 K  | train
-----------------------------------------
7.6 K     Trainable params
0         Non-trainable params
7.6 K     Total params
0.030     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode


Epoch 16: 100%|██████████| 17/17 [00:00<00:00, 19.82it/s, v_num=0]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/3 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/3 [00:00<?, ?it/s][A
Validation DataLoader 0:  33%|███▎      | 1/3 [00:00<00:00, 89.85it/s][A
Validation DataLoader 0:  67%|██████▋   | 2/3 [00:00<00:00, 46.16it/s][A
Validation DataLoader 0: 100%|██████████| 3/3 [00:00<00:00, 58.05it/s][A
Epoch 17:  18%|█▊        | 3/17 [00:00<00:00, 19.52it/s, v_num=0]2025-07-13 09:23:23,476 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=controller, peer=site_3, peer_run=simulate_job, task_name=train, task_id=69d977e0-3736-427b-a1ad-82692b64ccd6]: assigned task to client site_3: name=train, id=69d977e0-3736-427b-a1ad-82692b64ccd6
Epoch 17:  24%|██▎       | 4/17 [00:00<00:00, 19.61it/s, v_num=0]2025-07-13 09:23:23,478 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=controller, peer=site_3

INFO: 
  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | DeepNN | 7.6 K  | train
-----------------------------------------
7.6 K     Trainable params
0         Non-trainable params
7.6 K     Total params
0.030     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode
INFO: 
  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | DeepNN | 7.6 K  | train
-----------------------------------------
7.6 K     Trainable params
0         Non-trainable params
7.6 K     Total params
0.030     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode


Epoch 17: 100%|██████████| 17/17 [00:00<00:00, 17.10it/s, v_num=0]
Epoch 13:  12%|█▏        | 2/17 [00:00<00:01, 11.34it/s, v_num=0]
Validation:   0%|          | 0/3 [00:00<?, ?it/s][A
Epoch 19:  24%|██▎       | 4/17 [00:00<00:01, 12.27it/s, v_num=0]
Epoch 13:  18%|█▊        | 3/17 [00:00<00:01, 12.45it/s, v_num=0]it/s][A
Validation DataLoader 0:  67%|██████▋   | 2/3 [00:00<00:00, 36.34it/s][A
Validation DataLoader 0: 100%|██████████| 3/3 [00:00<00:00, 40.01it/s][A
Epoch 19: 100%|██████████| 17/17 [00:01<00:00, 11.98it/s, v_num=0]    [A
Epoch 13:  88%|████████▊ | 15/17 [00:01<00:00, 11.44it/s, v_num=0]
Validation:   0%|          | 0/3 [00:00<?, ?it/s][A
Epoch 18:  71%|███████   | 12/17 [00:01<00:00, 11.50it/s, v_num=0]
Epoch 13:  94%|█████████▍| 16/17 [00:01<00:00, 11.34it/s, v_num=0]t/s][A
Validation DataLoader 0:  67%|██████▋   | 2/3 [00:00<00:00, 25.47it/s][A
Validation DataLoader 0: 100%|██████████| 3/3 [00:00<00:00, 34.39it/s][A
Epoch 13: 100%|██████████| 17/17 [00:01<00:

INFO: 
  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | DeepNN | 7.6 K  | train
-----------------------------------------
7.6 K     Trainable params
0         Non-trainable params
7.6 K     Total params
0.030     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode


Epoch 20:  65%|██████▍   | 11/17 [00:00<00:00, 19.64it/s, v_num=0]2025-07-13 09:23:32,398 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=controller, peer=site_3, peer_run=simulate_job, task_name=train, task_id=01f7673a-6f68-4506-a35f-909ff914ab10]: assigned task to client site_3: name=train, id=01f7673a-6f68-4506-a35f-909ff914ab10
2025-07-13 09:23:32,399 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=controller, peer=site_3, peer_run=simulate_job, task_name=train, task_id=01f7673a-6f68-4506-a35f-909ff914ab10]: sent task assignment to client. client_name:site_3 task_id:01f7673a-6f68-4506-a35f-909ff914ab10
2025-07-13 09:23:32,400 - GetTaskCommand - INFO - return task to client.  client_name: site_3  task_name: train   task_id: 01f7673a-6f68-4506-a35f-909ff914ab10  sharable_header_task_id: 01f7673a-6f68-4506-a35f-909ff914ab10
2025-07-13 09:23:32,412 - Communicator - INFO - Received from simulator_server server. getTask: train size: 36KB 

INFO: 
  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | DeepNN | 7.6 K  | train
-----------------------------------------
7.6 K     Trainable params
0         Non-trainable params
7.6 K     Total params
0.030     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode
INFO: 
  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | DeepNN | 7.6 K  | train
-----------------------------------------
7.6 K     Trainable params
0         Non-trainable params
7.6 K     Total params
0.030     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode


Validation: |          | 0/? [00:00<?, ?it/s]2025-07-13 09:23:32,941 - lightning.pytorch.callbacks.model_summary - INFO - 
  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | DeepNN | 7.6 K  | train
-----------------------------------------
7.6 K     Trainable params
0         Non-trainable params
7.6 K     Total params
0.030     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode
2025-07-13 09:23:32,943 - RestoreState - INFO - optimizer states restored.
2025-07-13 09:23:32,948 - ClientRunner - INFO - [identity=site_2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=cf8fddc0-9fd3-475f-acf9-6128109ccd4f]: finished processing task
2025-07-13 09:23:32,949 - ClientRunner - INFO - [identity=site_2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=cf8fddc0-9fd3-475f-acf9-6128109ccd4f]: try #1: sending task result to server
20

INFO: 
  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | DeepNN | 7.6 K  | train
-----------------------------------------
7.6 K     Trainable params
0         Non-trainable params
7.6 K     Total params
0.030     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode


2025-07-13 09:23:38,985 - lightning.pytorch.callbacks.model_summary - INFO - 
  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | DeepNN | 7.6 K  | train
-----------------------------------------
7.6 K     Trainable params
0         Non-trainable params
7.6 K     Total params
0.030     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode
2025-07-13 09:23:38,987 - RestoreState - INFO - optimizer states restored.
Epoch 21:  88%|████████▊ | 15/17 [00:00<00:00, 19.98it/s, v_num=0]2025-07-13 09:23:39,758 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=controller, peer=site_3, peer_run=simulate_job, task_name=train, task_id=38b9a8ce-0082-47a0-832b-d84f13736e58]: assigned task to client site_3: name=train, id=38b9a8ce-0082-47a0-832b-d84f13736e58
2025-07-13 09:23:39,759 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=controller, peer=site_3, peer_run=simulate

INFO: 
  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | DeepNN | 7.6 K  | train
-----------------------------------------
7.6 K     Trainable params
0         Non-trainable params
7.6 K     Total params
0.030     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode
INFO: 
  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | DeepNN | 7.6 K  | train
-----------------------------------------
7.6 K     Trainable params
0         Non-trainable params
7.6 K     Total params
0.030     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode


Epoch 24:   0%|          | 0/17 [00:00<?, ?it/s]2025-07-13 09:23:40,684 - Communicator - INFO -  SubmitUpdate size: 36.2KB (36191 Bytes). time: 0.313414 seconds
2025-07-13 09:23:40,685 - ClientRunner - INFO - [identity=site_2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=1036253f-7088-47a3-b2af-e6f5182c7ca5]: task result sent to server
2025-07-13 09:23:40,685 - ClientTaskWorker - INFO - Finished one task run for client: site_2 interval: 2 task_processed: True
Epoch 22: 100%|██████████| 17/17 [00:00<00:00, 17.08it/s, v_num=0]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/3 [00:00<?, ?it/s][A
Epoch 24:  88%|████████▊ | 15/17 [00:00<00:00, 17.60it/s, v_num=0]
Validation DataLoader 0:  33%|███▎      | 1/3 [00:00<00:00, 103.50it/s][A
Validation DataLoader 0:  67%|██████▋   | 2/3 [00:00<00:00, 41.55it/s] [A
Validation DataLoader 0: 100%|██████████| 3/3 [00:00<00:00, 52.56it/s][A
Epoch 24: 100%|██████████| 17/17

INFO: 
  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | DeepNN | 7.6 K  | train
-----------------------------------------
7.6 K     Trainable params
0         Non-trainable params
7.6 K     Total params
0.030     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode
INFO: 
  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | DeepNN | 7.6 K  | train
-----------------------------------------
7.6 K     Trainable params
0         Non-trainable params
7.6 K     Total params
0.030     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode


Validation DataLoader 0: 100%|██████████| 3/3 [00:00<00:00, 53.80it/s]
2025-07-13 09:23:46,060 - nvflare.app_common.executors.task_script_runner - INFO - ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        val_loss            0.7748446464538574
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
2025-07-13 09:23:46,065 - nvflare.app_common.executors.task_script_runner - INFO - --- train new model ---
Epoch 25:  12%|█▏        | 2/17 [00:00<00:00, 19.61it/s, v_num=0]2025-07-13 09:23:46,164 - lightning.pytorch.callbacks.model_summary - INFO - 
  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | DeepNN | 7.6 K  | train
--------------------------

INFO: 
  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | DeepNN | 7.6 K  | train
-----------------------------------------
7.6 K     Trainable params
0         Non-trainable params
7.6 K     Total params
0.030     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode


Epoch 26: 100%|██████████| 17/17 [00:00<00:00, 20.02it/s, v_num=0]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/3 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/3 [00:00<?, ?it/s][A
Validation DataLoader 0:  33%|███▎      | 1/3 [00:00<00:00, 80.05it/s][A
Validation DataLoader 0:  67%|██████▋   | 2/3 [00:00<00:00, 43.63it/s][A
Validation DataLoader 0: 100%|██████████| 3/3 [00:00<00:00, 55.52it/s][A
Epoch 26: 100%|██████████| 17/17 [00:00<00:00, 17.82it/s, v_num=0]    [A
2025-07-13 09:23:48,749 - InProcessClientAPI - INFO - Try to send local model back to peer 
2025-07-13 09:23:49,157 - ClientRunner - INFO - [identity=site_3, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=72bcc029-2cf8-4830-bc18-b5c7c8a00e09]: finished processing task
2025-07-13 09:23:49,158 - ClientRunner - INFO - [identity=site_3, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=72b