# Federated Learning on small GPT model

In [1]:
!pip install -q 'flwr[simulation]' torch torchvision matplotlib

In [2]:
from collections import OrderedDict
from typing import List, Tuple, Dict, Optional

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
# import torchvision
# import torchvision.transforms as tfms
# from torchvision.datasets import CIFAR10
from torch.utils.data import Dataset, DataLoader, random_split
import string
import pandas as pd

import flwr as fl
from flwr.common import Metrics
from tqdm import tqdm

DEVICE = torch.device(
    "cuda" if torch.backends.cuda.is_built()
    else "mps" if torch.backends.mps.is_built()
    else "cpu"
)
# DEVICE = "cpu"

print(f"Training on {DEVICE} using PyTorch {torch.__version__} and Flwr {fl.__version__}")

Training on cuda using PyTorch 2.1.0+cu118 and Flwr 1.5.0


In [3]:
df = pd.read_csv('en-fr-mini.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,en,fr,en_len,fr_len
0,0,Changing Lives | Changing Society | How It Wor...,Il a transformé notre vie | Il a transformé la...,201,248
1,1,Site map,Plan du site,8,12
2,2,Feedback,Rétroaction,8,11
3,3,Credits,Crédits,7,7
4,4,Français,English,8,7


In [4]:
df.dropna(inplace=True)
df = df.reset_index(drop=True)
df = df.head(5000)
df['en_len'] = df['en'].apply(lambda x: len(x))
df['fr_len'] = df['fr'].apply(lambda x: len(x))
df.shape

(4999, 5)

In [5]:
df[['en_len', 'fr_len']].describe()

Unnamed: 0,en_len,fr_len
count,4999.0,4999.0
mean,82.970194,92.084217
std,83.590436,94.963313
min,1.0,1.0
25%,16.0,17.0
50%,72.0,80.0
75%,127.0,140.5
max,1814.0,2549.0


In [6]:
# CLASSES = ("plane", "car", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck",)
BATCH_SIZE = 32
NUM_CLIENTS = 3

vocab = list(set(' '.join(df['en'].values.tolist() + df['fr'].values.tolist())))
vocab = ["<s>", "</s>", "<pad>"] + vocab
ix2ch = {ix:ch for ix,ch in enumerate(vocab)}
ch2ix = {ch:ix for ix,ch in ix2ch.items()}
encode = lambda s: [ch2ix[c] for c in s]
decode = lambda l: ''.join([ix2ch[i] for i in l])


class NanoGptDataset(Dataset):
    def __init__(self, texts: List[str]) -> None:
        super().__init__()
        self.texts = texts

    def __len__(self) -> int:
        return len(self.texts)

    def __getitem__(self, ix: int):
        text = self.texts[ix]
#         text = ''.join([i if ord(i) < 128 else ' ' for i in text.strip])
        input_ids = [ch2ix['<s>']] + encode(text) # [<s> a b c d   e ]
        input_ids = input_ids[:256]
        output_ids = input_ids[1:] + [ch2ix['</s>']] # [ a  b c d e </s>]
        assert len(input_ids) == len(output_ids), print(input_ids, output_ids, "\n\n======= Something went wrong when encoding the input and outputs ========\n\n")
        return  {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'labels': torch.tensor(output_ids, dtype=torch.long)
        }


def collate_fn(batch):
    max_len = 0
    for b in batch:
        max_len = max(len(b['input_ids']), max_len)
#         print({k:v.shape for k, v in b.items()})

    res = None

    for b in batch:
        req_padding = max_len - len(b['input_ids'])
        if res is None:
            if req_padding == 0:
                res = {k:v[None, ...] for k,v in b.items()}
            else:
                res = {
                    'input_ids': torch.hstack([b['input_ids'], torch.tensor([ch2ix['<pad>']]*req_padding, dtype=torch.long)])[None, ...],
                    'labels': torch.hstack([b['labels'],  torch.tensor([ch2ix['<pad>']]*req_padding, dtype=torch.long)])[None, ...]
                }
            continue

        if res is not None:
            if req_padding == 0:
                res = {
                    k: torch.cat([res[k], b[k].view(1, max_len)], dim=0) for k,v in res.items()
                }
            else:
                tmp = {
                    'input_ids': torch.hstack([b['input_ids'], torch.tensor([ch2ix['<pad>']]*req_padding, dtype=torch.long)])[None, ...],
                    'labels': torch.hstack([b['labels'], torch.tensor([ch2ix['<pad>']]*req_padding, dtype=torch.long)])[None, ...]
                }
                res = {
                    k: torch.cat([res[k], tmp[k].view(1, max_len)], dim=0) for k,v in res.items()
                }
    return res


def load_datasets(train_texts: List[str], test_texts: List[str], num_clients: int):
    trainset = NanoGptDataset(texts=train_texts)
    testset = NanoGptDataset(texts=test_texts)

    # Split training set into `num_clients` partitions to simulate different local datasets
    partition_size = len(trainset) // num_clients
    lengths = [partition_size] * num_clients
    datasets = random_split(
        trainset, lengths=lengths, generator=torch.Generator().manual_seed(42))

    # Split each partition into train/val and create DataLoader
    trainloaders = []
    validloaders = []
    for ds in datasets:
        len_val = len(ds) // 10
        len_train = len(ds) - len_val
        lengths = [len_train, len_val]
        print(lengths)
        ds_train, ds_val = random_split(
            ds, lengths=lengths, generator=torch.Generator().manual_seed(42)
        )
        trainloaders.append(DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn))
        validloaders.append(DataLoader(ds_val, batch_size=BATCH_SIZE, collate_fn=collate_fn))
    testloader = DataLoader(testset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    return trainloaders, validloaders, testloader

In [7]:
texts = df['en'].values.tolist() + df['fr'].values.tolist()
import random
random.shuffle(texts)

train_texts = texts[:9000]
test_texts = texts[9000:]

trainloaders, validloaders, testloader = load_datasets(train_texts=train_texts, test_texts=test_texts, num_clients=NUM_CLIENTS)

[2700, 300]
[2700, 300]
[2700, 300]


## Step 1: Centralized Training with PyTorch

### Define a model

In [8]:
import math
from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.nn.functional as F


# ---------------------------------- Config ---------------------------------- #
@dataclass
class GPTConfig:
    buffer_size: int = 256
    vocab_size: int =  len(vocab) # 50304  # GPT2 has a total of 50257, padded to nearest multiple of 64 for efficiency
    n_layers: int = 3
    n_head: int = 4
    n_embed: int = 368
    dropout: float = 0.1
    bias: bool = False
    use_sinusoidal: bool = True


# ----------------------------- Attention Module ----------------------------- #
class Attention(nn.Module):
    '''Unlike RNNs where we were required to get one output and then pass it back onto the RNN and repeat the process
    again and again, here with masked attention, we simply find the lower triangular matrix and then weight them according
    the vector product the lower triangular matrix and the embedding vectors, we  build a masked representation for each word only using
    the values which occured/was predicted prior to the current index.
        - This is achieved by the torch.tril function and masking all zeros to -torch.inf (taking softmax makes it equal to zero)
    '''
    def __init__(self, n_embed: int, head_size: int) -> None:
        super().__init__()
        self.Q = nn.Linear(n_embed, head_size, bias=GPTConfig.bias)
        self.K = nn.Linear(n_embed, head_size, bias=GPTConfig.bias)
        self.V = nn.Linear(n_embed, head_size, bias=GPTConfig.bias)
        tril = torch.tril(
            torch.ones(size=(GPTConfig.buffer_size, GPTConfig.buffer_size))
        )
        self.register_buffer("tril", tril)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        _, T, _ = x.shape
        q, k, v = self.Q(x), self.K(x), self.V(x)  # (B, T, C) => (B, T, H)
        wei = (
            q @ k.mT * (1.0 / math.sqrt(k.size(-1)))
        )  # (B, T, H) @ (B, H, T) = (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        wei = F.softmax(wei, dim=-1)
        return wei @ v  # (B, T, T) @ (B, T, H) => (B, T, H)


# --------------------------- Multi Head Attention --------------------------- #
class MultiHeadAttention(nn.Module):
    def __init__(self, n_embed: int, n_heads: int) -> None:
        super().__init__()
        assert (
            n_embed % n_heads == 0
        ), "The number of heads must divide the embedding dimensions"
        head_size = n_embed // n_heads
        self.heads = nn.ModuleList(
            [Attention(n_embed=n_embed, head_size=head_size) for _ in range(n_heads)]
        )
        self.proj = nn.Linear(n_embed, n_embed, bias=GPTConfig.bias)
        self.dropout = nn.Dropout(p=GPTConfig.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, C) -> (B, T, C//N_HEADS) -> (B, T, C)
        return self.dropout(self.proj(x)) #  (B, T, C) -> (B, T, C)


# ------------------------------- Feed Forward ------------------------------- #
class FeedForward(nn.Module):
    def __init__(self, n_embed: int) -> None:
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed), nn.GELU(), nn.Linear(4 * n_embed, n_embed)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)


# ------------------------------ Attention Block ----------------------------- #
class AttentionBlock(nn.Module):
    def __init__(self, n_embed: int, n_heads: int) -> None:
        super().__init__()
        self.sa = MultiHeadAttention(n_embed=n_embed, n_heads=n_heads)
        self.ffwd = FeedForward(n_embed=n_embed)
        self.ln1 = nn.LayerNorm((n_embed,))
        self.ln2 = nn.LayerNorm((n_embed,))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.sa(self.ln1(x)) # (B, T, C) -> (B, T, C)
        x = x + self.ffwd(self.ln2(x))
        return x


# --------------------------- Positional Embeddings -------------------------- #
class PositionalEncoding(nn.Module):
    def __init__(self, n_embed: int, max_seq_len: int) -> None:
        super().__init__()

        position_id = torch.arange(0, max_seq_len).unsqueeze(1)
        frequencies = torch.arange(0, n_embed, 2, dtype=torch.float32) / n_embed
        frequencies = torch.pow(10000.0, -frequencies)

        positional_encodings = torch.zeros(size=(max_seq_len, n_embed))
        # print(frequencies.shape, position_id.shape, positional_encodings.shape)

        positional_encodings[:, 0::2] = torch.sin(position_id * frequencies)
        positional_encodings[:, 1::2] = torch.cos(position_id * frequencies)

        self.register_buffer("positional_encodings", positional_encodings)

        self.dropout = nn.Dropout(p=GPTConfig.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        pos_encodings = self.positional_encodings[: x.shape[1]]
        return self.dropout(pos_encodings + x)


# ------------------------------ NanoGPT Module ------------------------------ #
class NanoGPT(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        n_embed: int,
        n_heads: int,
        buffer_size: int,
        n_blocks: int,
    ) -> None:
        super().__init__()
        self.vocab_size = vocab_size
        self.n_embed = n_embed
        self.n_heads = n_heads
        self.buffer_size = buffer_size
        self.n_blocks = n_blocks

        self.token_embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=n_embed
        )

        if GPTConfig.use_sinusoidal:
            self.positional_encodings = PositionalEncoding(
                n_embed=n_embed, max_seq_len=GPTConfig.buffer_size
            )
        else:
            self.positional_encodings = nn.Embedding(
                num_embeddings=GPTConfig.buffer_size, embedding_dim=n_embed
            )

        self.blocks = nn.Sequential(
            *[AttentionBlock(n_embed=n_embed, n_heads=n_heads) for _ in range(n_blocks)]
        )
        self.ln = nn.LayerNorm((n_embed,))
        self.lm_head = nn.Sequential(
            nn.Linear(n_embed, n_embed // 2), nn.GELU(), nn.Linear(n_embed//2, vocab_size)
        )

    def forward(
        self, input_ids: torch.Tensor, labels: torch.Tensor = None
    ) -> torch.Tensor:
        B, T = input_ids.shape
        tok_emb = self.token_embeddings(input_ids) # (B, T, C)
        if GPTConfig.use_sinusoidal:
            x = self.positional_encodings.forward(tok_emb) # (B, T, C) -> (B, T, C)
        else:
            x = tok_emb + self.positional_encodings(
                torch.arange(T, dtype=torch.long, device=input_ids.shape)
            ) # (B, T, C) -> (B, T, C)
        x = self.blocks(x)
        x = self.ln(x)
        x = self.lm_head(x)

        loss = None
        if labels is not None:
            B, T, C = x.shape
            loss = F.cross_entropy(x.view(B * T, C), labels.view(B * T))

        return x, loss

    def generate(
        self, ids: torch.Tensor, max_len: int, temperature: float = 0.7
    ) -> int:
        for _ in range(max_len):
            ids_cond = ids[:, -GPTConfig.buffer_size :]
            logits, _ = self.forward(input_ids=ids_cond, labels=None)
            logit = logits[:, -1, :]
            probs = F.softmax(logit, dim=-1)
            val, idx = torch.topk(probs, k=int(probs.shape[1] * temperature), dim=-1)
            # print(val[0][0])
            idx_next = torch.multinomial(val, num_samples=1)
            idx_next = idx[:, idx_next][0]
            ids = torch.cat([ids, idx_next], dim=-1)
            if idx_next == 0:
                break
        return ids


if __name__ == "__main__":
    model = NanoGPT(
        vocab_size=GPTConfig.vocab_size,
        n_embed=GPTConfig.n_embed,
        n_heads=GPTConfig.n_head,
        buffer_size=GPTConfig.buffer_size,
        n_blocks=GPTConfig.n_layers,
    )

    print(model)

NanoGPT(
  (token_embeddings): Embedding(143, 368)
  (positional_encodings): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (blocks): Sequential(
    (0): AttentionBlock(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x Attention(
            (Q): Linear(in_features=368, out_features=92, bias=False)
            (K): Linear(in_features=368, out_features=92, bias=False)
            (V): Linear(in_features=368, out_features=92, bias=False)
          )
        )
        (proj): Linear(in_features=368, out_features=368, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=368, out_features=1472, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=1472, out_features=368, bias=True)
        )
      )
      (ln1): LayerNorm((368,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((368,), eps=1

### Engine funcs

In [9]:
def train_fn(net: NanoGPT, trainloader: torch.utils.data.DataLoader, epochs: int):
    loss_fct = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=3e-4)
    net.train()

    for epoch in range(epochs):
        # print('#'*15)
        # print(f'### Epoch {epoch+1}/{epochs}')
        # print('#'*15)

        # pbar = tqdm(trainloader, total=len(trainloader), desc='(train)')
        correct, total, running_loss = 0, 0, 0.0

        for batch in trainloader:
            X, y = batch['input_ids'], batch['labels']
            X, y = X.to(DEVICE), y.to(DEVICE)

            optimizer.zero_grad()
            yHat, loss = net.forward(input_ids=X, labels=y)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            total += y.size(0)
#             correct += (torch.argmax(yHat.data, dim=1) == y).sum().item()

            epoch_loss = running_loss / total
            # epoch_accuracy = correct / total

#             pbar.set_postfix({
#                 'loss': f'{epoch_loss:.4f}',
# #                 'accuracy': f'{epoch_accuracy*100:.4f}%'
#             })

        print(f'Epoch {epoch+1}: train loss {epoch_loss}')


@torch.no_grad()
def eval_fn(net: NanoGPT, testloader: torch.utils.data.DataLoader):
    loss_fct = nn.CrossEntropyLoss()
    correct, total, running_loss = 0., 0., 0.
    net.eval()

    # pbar = tqdm(testloader, total=len(testloader), desc='(eval)')

    for batch in testloader:
        X, y = batch['input_ids'], batch['labels']
        X, y = X.to(DEVICE), y.to(DEVICE)

        yHat, loss = net.forward(input_ids=X, labels=y)

        running_loss += loss.item()
        total += y.size(0)
#         correct += (torch.argmax(yHat.data, dim=1) == y).sum().item()

        epoch_loss = running_loss / total
#         epoch_accuracy = correct / total

#         pbar.set_postfix({
#             'loss': f'{epoch_loss:.4f}',
# #             'accuracy': f'{epoch_accuracy*100:.4f}%'
#         })

    return epoch_loss#, epoch_accuracy

In [10]:
trainloader = trainloaders[0]
valloader = validloaders[0]
net = NanoGPT(
    vocab_size=GPTConfig.vocab_size,
    n_embed=GPTConfig.n_embed,
    n_heads=GPTConfig.n_head,
    buffer_size=GPTConfig.buffer_size,
    n_blocks=GPTConfig.n_layers,
).to(DEVICE)

for epoch in range(5):
    train_fn(net, trainloader, 1)
    loss = eval_fn(net, valloader)
    print(f"Epoch {epoch+1}: validation loss {loss}")

loss = eval_fn(net, testloader)
print(f"Final test set performance:\n\tloss {loss}")

Epoch 1: train loss 0.041813092916100116
Epoch 1: validation loss 0.03464063922564189
Epoch 1: train loss 0.02955561763710446
Epoch 2: validation loss 0.03177231113115946
Epoch 1: train loss 0.027962340491789358
Epoch 3: validation loss 0.030970130960146586
Epoch 1: train loss 0.027175545780747025
Epoch 4: validation loss 0.030351797143618266
Epoch 1: train loss 0.026835898845284072
Epoch 5: validation loss 0.0296336030960083
Final test set performance:
	loss 0.02733169451266348


## Step 2: Federated Learning

### Updating model parameters

In [11]:
def get_parameters(net: NanoGPT) -> List[np.array]:
    return [val.cpu().numpy() for _, val in net.state_dict().items()]

def set_parameters(net: NanoGPT, parameters: List[np.ndarray]):
    params_dict = zip(net.state_dict().keys(), parameters)
    state_dict = OrderedDict({k: torch.tensor(v) for k,v in params_dict})
    net.load_state_dict(state_dict, strict=True)

### Client

In [12]:
class FlowerClient(fl.client.NumPyClient):
    def __init__(self, cid: str, net: NanoGPT, trainloader: torch.utils.data.DataLoader, validloader: torch.utils.data.DataLoader) -> None:
        self.cid = cid
        self.net = net
        self.trainloader = trainloader
        self.validloader = validloader

    def get_parameters(self, config):
        print(f"[Client {self.cid}] get_parameters")
        return get_parameters(self.net)

    def fit(self, parameters, config):
        server_round = config["server_round"]
        local_epochs = config["local_epochs"]

        print(f"[Client {self.cid}, round {server_round}] fit, config: {config}")
        set_parameters(self.net, parameters)
        train_fn(self.net, self.trainloader, epochs=local_epochs)
        return get_parameters(self.net), len(self.trainloader), {}

    def evaluate(self, parameters, config):
        print(f"[Client {self.cid}] evaluate: config: {config}")
        set_parameters(self.net, parameters)
        loss = eval_fn(self.net, self.validloader)
        return float(loss), len(self.validloader), {"loss": float(loss)}

### Virtual Client Engine

In [13]:
def client_fn(cid: str) -> FlowerClient:
    net = NanoGPT(
        vocab_size=GPTConfig.vocab_size,
        n_embed=GPTConfig.n_embed,
        n_heads=GPTConfig.n_head,
        buffer_size=GPTConfig.buffer_size,
        n_blocks=GPTConfig.n_layers,
    ).to(DEVICE)
    trainloader = trainloaders[int(cid)]
    validloader = validloaders[int(cid)]
    return FlowerClient(cid=cid, net=net, trainloader=trainloader, validloader=validloader)

## Starting training

### Server Side Evaluation

In [14]:
def evaluate_fn(
    server_round: int,
    parameters: fl.common.NDArrays,
    config: Dict[str, fl.common.Scalar]
) -> Optional[Tuple[float, Dict[str, fl.common.Scalar]]]:
    net = NanoGPT(
        vocab_size=GPTConfig.vocab_size,
        n_embed=GPTConfig.n_embed,
        n_heads=GPTConfig.n_head,
        buffer_size=GPTConfig.buffer_size,
        n_blocks=GPTConfig.n_layers,
    ).to(DEVICE)
    set_parameters(net, parameters)
    loss = eval_fn(net, testloader=testloader)
    print(f">>> Server-side evaluation loss {loss}")
    # del net
    return loss, {"loss": loss}

In [15]:
def fit_config_fn(server_round: int):
    """Return training config dict for each round.

    Perform two rounds of training with one local epoch, increase two local
    epochs afterwards
    """
    config = {
        "server_round": server_round,
        "local_epochs": 2 if server_round < 2 else 3
    }
    return config

In [16]:
strategy = fl.server.strategy.FedAvg(
    fraction_fit=1.0,              # Sample 100% of the available clients for training
    fraction_evaluate=0.5,         # Sample 50% of available clients for eval
    min_fit_clients=NUM_CLIENTS,            # Never sample less than 10 clients for training
    min_evaluate_clients=NUM_CLIENTS,        # Never sample less than 5 clients for eval
    min_available_clients=NUM_CLIENTS,      # Wait until all 10 clients are available
    evaluate_fn=evaluate_fn,       # Evaluate function after every round
    on_fit_config_fn=fit_config_fn # Fit Config
)

client_resources = None
if DEVICE.type == "cuda":
    import multiprocessing
    client_resources = {"num_gpus": 1, "num_cpus": multiprocessing.cpu_count()}

fl.simulation.start_simulation(
    client_fn=client_fn,
    num_clients=NUM_CLIENTS,
    config=fl.server.ServerConfig(num_rounds=5),
    strategy=strategy,
    client_resources=client_resources
)

INFO flwr 2023-11-08 14:44:26,480 | app.py:175 | Starting Flower simulation, config: ServerConfig(num_rounds=5, round_timeout=None)
INFO:flwr:Starting Flower simulation, config: ServerConfig(num_rounds=5, round_timeout=None)
2023-11-08 14:44:30,123	INFO worker.py:1621 -- Started a local Ray instance.
INFO flwr 2023-11-08 14:44:33,134 | app.py:210 | Flower VCE: Ray initialized with resources: {'GPU': 1.0, 'node:172.28.0.12': 1.0, 'object_store_memory': 3923902464.0, 'node:__internal_head__': 1.0, 'CPU': 2.0, 'memory': 7847804928.0}
INFO:flwr:Flower VCE: Ray initialized with resources: {'GPU': 1.0, 'node:172.28.0.12': 1.0, 'object_store_memory': 3923902464.0, 'node:__internal_head__': 1.0, 'CPU': 2.0, 'memory': 7847804928.0}
INFO flwr 2023-11-08 14:44:33,137 | app.py:224 | Flower VCE: Resources for each Virtual Client: {'num_gpus': 1, 'num_cpus': 2}
INFO:flwr:Flower VCE: Resources for each Virtual Client: {'num_gpus': 1, 'num_cpus': 2}
INFO flwr 2023-11-08 14:44:33,181 | app.py:270 | Flo

[2m[36m(DefaultActor pid=7793)[0m [Client 0] get_parameters


INFO flwr 2023-11-08 14:44:40,295 | server.py:94 | initial parameters (loss, other metrics): 0.1535356809237677, {'loss': 0.1535356809237677}
INFO:flwr:initial parameters (loss, other metrics): 0.1535356809237677, {'loss': 0.1535356809237677}
INFO flwr 2023-11-08 14:44:40,301 | server.py:104 | FL starting
INFO:flwr:FL starting
DEBUG flwr 2023-11-08 14:44:40,308 | server.py:222 | fit_round 1: strategy sampled 3 clients (out of 3)
DEBUG:flwr:fit_round 1: strategy sampled 3 clients (out of 3)


>>> Server-side evaluation loss 0.1535356809237677
[2m[36m(DefaultActor pid=7793)[0m [Client 1, round 1] fit, config: {'server_round': 1, 'local_epochs': 2}
[2m[36m(DefaultActor pid=7793)[0m Epoch 1: train loss 0.0410448893794307
[2m[36m(DefaultActor pid=7793)[0m Epoch 2: train loss 0.030041650975192034
[2m[36m(DefaultActor pid=7793)[0m [Client 0, round 1] fit, config: {'server_round': 1, 'local_epochs': 2}
[2m[36m(DefaultActor pid=7793)[0m Epoch 1: train loss 0.04103764615677021
[2m[36m(DefaultActor pid=7793)[0m Epoch 2: train loss 0.029813554573942115
[2m[36m(DefaultActor pid=7793)[0m [Client 2, round 1] fit, config: {'server_round': 1, 'local_epochs': 2}
[2m[36m(DefaultActor pid=7793)[0m Epoch 1: train loss 0.04141158201076366


DEBUG flwr 2023-11-08 14:45:34,617 | server.py:236 | fit_round 1 received 3 results and 0 failures
DEBUG:flwr:fit_round 1 received 3 results and 0 failures


[2m[36m(DefaultActor pid=7793)[0m Epoch 2: train loss 0.03054422363086983


INFO flwr 2023-11-08 14:45:35,963 | server.py:125 | fit progress: (1, 0.029720867265441375, {'loss': 0.029720867265441375}, 55.654963270999815)
INFO:flwr:fit progress: (1, 0.029720867265441375, {'loss': 0.029720867265441375}, 55.654963270999815)
DEBUG flwr 2023-11-08 14:45:35,967 | server.py:173 | evaluate_round 1: strategy sampled 3 clients (out of 3)
DEBUG:flwr:evaluate_round 1: strategy sampled 3 clients (out of 3)


>>> Server-side evaluation loss 0.029720867265441375
[2m[36m(DefaultActor pid=7793)[0m [Client 0] evaluate: config: {}
[2m[36m(DefaultActor pid=7793)[0m [Client 1] evaluate: config: {}
[2m[36m(DefaultActor pid=7793)[0m [Client 2] evaluate: config: {}


DEBUG flwr 2023-11-08 14:45:37,513 | server.py:187 | evaluate_round 1 received 3 results and 0 failures
DEBUG:flwr:evaluate_round 1 received 3 results and 0 failures
DEBUG flwr 2023-11-08 14:45:37,521 | server.py:222 | fit_round 2: strategy sampled 3 clients (out of 3)
DEBUG:flwr:fit_round 2: strategy sampled 3 clients (out of 3)


[2m[36m(DefaultActor pid=7793)[0m [Client 0, round 2] fit, config: {'server_round': 2, 'local_epochs': 3}
[2m[36m(DefaultActor pid=7793)[0m Epoch 1: train loss 0.028348260521888734
[2m[36m(DefaultActor pid=7793)[0m Epoch 2: train loss 0.02744697074095408
[2m[36m(DefaultActor pid=7793)[0m Epoch 3: train loss 0.026725006854092632
[2m[36m(DefaultActor pid=7793)[0m [Client 2, round 2] fit, config: {'server_round': 2, 'local_epochs': 3}
[2m[36m(DefaultActor pid=7793)[0m Epoch 1: train loss 0.028720699592872902
[2m[36m(DefaultActor pid=7793)[0m Epoch 2: train loss 0.028131604260868498
[2m[36m(DefaultActor pid=7793)[0m Epoch 3: train loss 0.027299850274015356
[2m[36m(DefaultActor pid=7793)[0m [Client 1, round 2] fit, config: {'server_round': 2, 'local_epochs': 3}
[2m[36m(DefaultActor pid=7793)[0m Epoch 1: train loss 0.028688570790820652
[2m[36m(DefaultActor pid=7793)[0m Epoch 2: train loss 0.027953380809889898


DEBUG flwr 2023-11-08 14:46:58,303 | server.py:236 | fit_round 2 received 3 results and 0 failures
DEBUG:flwr:fit_round 2 received 3 results and 0 failures


[2m[36m(DefaultActor pid=7793)[0m Epoch 3: train loss 0.026900179364063123


INFO flwr 2023-11-08 14:46:59,620 | server.py:125 | fit progress: (2, 0.027068215166638515, {'loss': 0.027068215166638515}, 139.31262042599997)
INFO:flwr:fit progress: (2, 0.027068215166638515, {'loss': 0.027068215166638515}, 139.31262042599997)
DEBUG flwr 2023-11-08 14:46:59,626 | server.py:173 | evaluate_round 2: strategy sampled 3 clients (out of 3)
DEBUG:flwr:evaluate_round 2: strategy sampled 3 clients (out of 3)


>>> Server-side evaluation loss 0.027068215166638515
[2m[36m(DefaultActor pid=7793)[0m [Client 0] evaluate: config: {}
[2m[36m(DefaultActor pid=7793)[0m [Client 2] evaluate: config: {}
[2m[36m(DefaultActor pid=7793)[0m [Client 1] evaluate: config: {}


DEBUG flwr 2023-11-08 14:47:01,135 | server.py:187 | evaluate_round 2 received 3 results and 0 failures
DEBUG:flwr:evaluate_round 2 received 3 results and 0 failures
DEBUG flwr 2023-11-08 14:47:01,140 | server.py:222 | fit_round 3: strategy sampled 3 clients (out of 3)
DEBUG:flwr:fit_round 3: strategy sampled 3 clients (out of 3)


[2m[36m(DefaultActor pid=7793)[0m [Client 1, round 3] fit, config: {'server_round': 3, 'local_epochs': 3}
[2m[36m(DefaultActor pid=7793)[0m Epoch 1: train loss 0.02626778304576874
[2m[36m(DefaultActor pid=7793)[0m Epoch 2: train loss 0.02532010696552418
[2m[36m(DefaultActor pid=7793)[0m Epoch 3: train loss 0.02422482289649822
[2m[36m(DefaultActor pid=7793)[0m [Client 2, round 3] fit, config: {'server_round': 3, 'local_epochs': 3}
[2m[36m(DefaultActor pid=7793)[0m Epoch 1: train loss 0.026572932998339335
[2m[36m(DefaultActor pid=7793)[0m Epoch 2: train loss 0.025285077624850803
[2m[36m(DefaultActor pid=7793)[0m Epoch 3: train loss 0.024233889778455097
[2m[36m(DefaultActor pid=7793)[0m [Client 0, round 3] fit, config: {'server_round': 3, 'local_epochs': 3}
[2m[36m(DefaultActor pid=7793)[0m Epoch 1: train loss 0.026237533004195603
[2m[36m(DefaultActor pid=7793)[0m Epoch 2: train loss 0.025318835995815418


DEBUG flwr 2023-11-08 14:48:22,121 | server.py:236 | fit_round 3 received 3 results and 0 failures
DEBUG:flwr:fit_round 3 received 3 results and 0 failures


[2m[36m(DefaultActor pid=7793)[0m Epoch 3: train loss 0.024211104225229332


INFO flwr 2023-11-08 14:48:23,427 | server.py:125 | fit progress: (3, 0.024353809729367792, {'loss': 0.024353809729367792}, 223.11887349699987)
INFO:flwr:fit progress: (3, 0.024353809729367792, {'loss': 0.024353809729367792}, 223.11887349699987)
DEBUG flwr 2023-11-08 14:48:23,430 | server.py:173 | evaluate_round 3: strategy sampled 3 clients (out of 3)
DEBUG:flwr:evaluate_round 3: strategy sampled 3 clients (out of 3)


>>> Server-side evaluation loss 0.024353809729367792
[2m[36m(DefaultActor pid=7793)[0m [Client 1] evaluate: config: {}
[2m[36m(DefaultActor pid=7793)[0m [Client 0] evaluate: config: {}
[2m[36m(DefaultActor pid=7793)[0m [Client 2] evaluate: config: {}


DEBUG flwr 2023-11-08 14:48:24,971 | server.py:187 | evaluate_round 3 received 3 results and 0 failures
DEBUG:flwr:evaluate_round 3 received 3 results and 0 failures
DEBUG flwr 2023-11-08 14:48:24,978 | server.py:222 | fit_round 4: strategy sampled 3 clients (out of 3)
DEBUG:flwr:fit_round 4: strategy sampled 3 clients (out of 3)


[2m[36m(DefaultActor pid=7793)[0m [Client 2, round 4] fit, config: {'server_round': 4, 'local_epochs': 3}
[2m[36m(DefaultActor pid=7793)[0m Epoch 1: train loss 0.024207838199756762
[2m[36m(DefaultActor pid=7793)[0m Epoch 2: train loss 0.023170905223599186
[2m[36m(DefaultActor pid=7793)[0m Epoch 3: train loss 0.022867680920494928
[2m[36m(DefaultActor pid=7793)[0m [Client 0, round 4] fit, config: {'server_round': 4, 'local_epochs': 3}
[2m[36m(DefaultActor pid=7793)[0m Epoch 1: train loss 0.023711012513549238
[2m[36m(DefaultActor pid=7793)[0m Epoch 2: train loss 0.022952256622137845
[2m[36m(DefaultActor pid=7793)[0m Epoch 3: train loss 0.02227636006143358
[2m[36m(DefaultActor pid=7793)[0m [Client 1, round 4] fit, config: {'server_round': 4, 'local_epochs': 3}
[2m[36m(DefaultActor pid=7793)[0m Epoch 1: train loss 0.02387305882241991
[2m[36m(DefaultActor pid=7793)[0m Epoch 2: train loss 0.022774532967143588


DEBUG flwr 2023-11-08 14:49:45,626 | server.py:236 | fit_round 4 received 3 results and 0 failures
DEBUG:flwr:fit_round 4 received 3 results and 0 failures


[2m[36m(DefaultActor pid=7793)[0m Epoch 3: train loss 0.022698394899015074


INFO flwr 2023-11-08 14:49:47,081 | server.py:125 | fit progress: (4, 0.02242309741959543, {'loss': 0.02242309741959543}, 306.7734330379999)
INFO:flwr:fit progress: (4, 0.02242309741959543, {'loss': 0.02242309741959543}, 306.7734330379999)
DEBUG flwr 2023-11-08 14:49:47,085 | server.py:173 | evaluate_round 4: strategy sampled 3 clients (out of 3)
DEBUG:flwr:evaluate_round 4: strategy sampled 3 clients (out of 3)


>>> Server-side evaluation loss 0.02242309741959543
[2m[36m(DefaultActor pid=7793)[0m [Client 1] evaluate: config: {}
[2m[36m(DefaultActor pid=7793)[0m [Client 2] evaluate: config: {}
[2m[36m(DefaultActor pid=7793)[0m [Client 0] evaluate: config: {}


DEBUG flwr 2023-11-08 14:49:48,674 | server.py:187 | evaluate_round 4 received 3 results and 0 failures
DEBUG:flwr:evaluate_round 4 received 3 results and 0 failures
DEBUG flwr 2023-11-08 14:49:48,679 | server.py:222 | fit_round 5: strategy sampled 3 clients (out of 3)
DEBUG:flwr:fit_round 5: strategy sampled 3 clients (out of 3)


[2m[36m(DefaultActor pid=7793)[0m [Client 0, round 5] fit, config: {'server_round': 5, 'local_epochs': 3}
[2m[36m(DefaultActor pid=7793)[0m Epoch 1: train loss 0.021668896123215004
[2m[36m(DefaultActor pid=7793)[0m Epoch 2: train loss 0.02110940527032923
[2m[36m(DefaultActor pid=7793)[0m Epoch 3: train loss 0.020221523521123108
[2m[36m(DefaultActor pid=7793)[0m [Client 2, round 5] fit, config: {'server_round': 5, 'local_epochs': 3}
[2m[36m(DefaultActor pid=7793)[0m Epoch 1: train loss 0.021937631479016055
[2m[36m(DefaultActor pid=7793)[0m Epoch 2: train loss 0.021356288614096466
[2m[36m(DefaultActor pid=7793)[0m Epoch 3: train loss 0.020630395853960957
[2m[36m(DefaultActor pid=7793)[0m [Client 1, round 5] fit, config: {'server_round': 5, 'local_epochs': 3}
[2m[36m(DefaultActor pid=7793)[0m Epoch 1: train loss 0.02221730688103923
[2m[36m(DefaultActor pid=7793)[0m Epoch 2: train loss 0.02112869832250807


DEBUG flwr 2023-11-08 14:51:09,852 | server.py:236 | fit_round 5 received 3 results and 0 failures
DEBUG:flwr:fit_round 5 received 3 results and 0 failures


[2m[36m(DefaultActor pid=7793)[0m Epoch 3: train loss 0.02093438204791811


INFO flwr 2023-11-08 14:51:11,286 | server.py:125 | fit progress: (5, 0.020866947476276175, {'loss': 0.020866947476276175}, 390.978452478)
INFO:flwr:fit progress: (5, 0.020866947476276175, {'loss': 0.020866947476276175}, 390.978452478)
DEBUG flwr 2023-11-08 14:51:11,290 | server.py:173 | evaluate_round 5: strategy sampled 3 clients (out of 3)
DEBUG:flwr:evaluate_round 5: strategy sampled 3 clients (out of 3)


>>> Server-side evaluation loss 0.020866947476276175
[2m[36m(DefaultActor pid=7793)[0m [Client 0] evaluate: config: {}
[2m[36m(DefaultActor pid=7793)[0m [Client 1] evaluate: config: {}
[2m[36m(DefaultActor pid=7793)[0m [Client 2] evaluate: config: {}


DEBUG flwr 2023-11-08 14:51:13,152 | server.py:187 | evaluate_round 5 received 3 results and 0 failures
DEBUG:flwr:evaluate_round 5 received 3 results and 0 failures
INFO flwr 2023-11-08 14:51:13,155 | server.py:153 | FL finished in 392.8468516779999
INFO:flwr:FL finished in 392.8468516779999
INFO flwr 2023-11-08 14:51:13,158 | app.py:225 | app_fit: losses_distributed [(1, 0.03148935437202453), (2, 0.028638044264581466), (3, 0.02566991004678938), (4, 0.02355352802409066), (5, 0.02186498741308848)]
INFO:flwr:app_fit: losses_distributed [(1, 0.03148935437202453), (2, 0.028638044264581466), (3, 0.02566991004678938), (4, 0.02355352802409066), (5, 0.02186498741308848)]
INFO flwr 2023-11-08 14:51:13,160 | app.py:226 | app_fit: metrics_distributed_fit {}
INFO:flwr:app_fit: metrics_distributed_fit {}
INFO flwr 2023-11-08 14:51:13,166 | app.py:227 | app_fit: metrics_distributed {}
INFO:flwr:app_fit: metrics_distributed {}
INFO flwr 2023-11-08 14:51:13,168 | app.py:228 | app_fit: losses_centrali

History (loss, distributed):
	round 1: 0.03148935437202453
	round 2: 0.028638044264581466
	round 3: 0.02566991004678938
	round 4: 0.02355352802409066
	round 5: 0.02186498741308848
History (loss, centralized):
	round 0: 0.1535356809237677
	round 1: 0.029720867265441375
	round 2: 0.027068215166638515
	round 3: 0.024353809729367792
	round 4: 0.02242309741959543
	round 5: 0.020866947476276175
History (metrics, centralized):
{'loss': [(0, 0.1535356809237677), (1, 0.029720867265441375), (2, 0.027068215166638515), (3, 0.024353809729367792), (4, 0.02242309741959543), (5, 0.020866947476276175)]}