<a href="https://colab.research.google.com/github/anshulchauhan502/btp/blob/main/Untitled18.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install flwr tensorflow numpy pandas scikit-learn


Collecting flwr
  Downloading flwr-1.22.0-py3-none-any.whl.metadata (14 kB)
Collecting click<8.2.0 (from flwr)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting cryptography<45.0.0,>=44.0.1 (from flwr)
  Downloading cryptography-44.0.3-cp39-abi3-manylinux_2_34_x86_64.whl.metadata (5.7 kB)
Collecting grpcio-health-checking<2.0.0,>=1.62.3 (from flwr)
  Downloading grpcio_health_checking-1.75.1-py3-none-any.whl.metadata (1.1 kB)
Collecting iterators<0.0.3,>=0.0.2 (from flwr)
  Downloading iterators-0.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting pathspec<0.13.0,>=0.12.1 (from flwr)
  Downloading pathspec-0.12.1-py3-none-any.whl.metadata (21 kB)
Collecting protobuf<5.0.0,>=4.21.6 (from flwr)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting pycryptodome<4.0.0,>=3.18.0 (from flwr)
  Downloading pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting tomli<3.0.0,>=2.0.1 

In [None]:
# ================================
# STAGE 2: Federated (IID) - FedAvg simulation (EfficientNetB0)
# Self-contained; reuses same preprocessing/model as Stage-1
# ================================

# ---------- Cell A: Installs (uncomment if flwr is required later) ----------
# !pip install -q flwr    # not required for this script (we use a simple FedAvg sim)

# ---------- Cell 1: Imports, setup ----------
import os, random, math
from pathlib import Path
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import mixed_precision, callbacks, optimizers
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

# Repro
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Mixed precision (same as Stage-1)
mixed_precision.set_global_policy('mixed_float16')
print("Mixed precision policy:", mixed_precision.global_policy())

# Paths (update if different)
DATA_DIR = "/content/drive/MyDrive/ham10000_data"
IMG_DIR = os.path.join(DATA_DIR, "all_images")
META_CSV = os.path.join(DATA_DIR, "HAM10000_metadata.csv")

# ---------- Config (tune these) ----------
NUM_CLIENTS = 5          # number of simulated hospitals
ROUNDS = 8               # federated rounds (global aggregation steps)
LOCAL_EPOCHS = 1         # local training epochs per round
CLIENT_BATCH_SIZE = 32   # smaller than centralized to save memory
IMG_SIZE = 224
VERBOSE = 1

# ---------- Utility ----------
AUTOTUNE = tf.data.AUTOTUNE

# ---------- Cell 2: Load metadata and build image_path, filter missing ----------
meta = pd.read_csv(META_CSV, dtype=str)

# ensure image_id exists
if 'image_id' not in meta.columns and 'imageId' in meta.columns:
    meta = meta.rename(columns={'imageId':'image_id'})

meta['image_id'] = meta['image_id'].astype(str)
# create a case-insensitive mapping from file stems to actual file path
existing_files = list(Path(IMG_DIR).glob("*"))
stem2path = {p.stem.lower(): str(p) for p in existing_files}

def get_path(img_id):
    return stem2path.get(img_id.lower(), None)

meta['image_path'] = meta['image_id'].apply(get_path)
missing = meta['image_path'].isna().sum()
print("Metadata rows:", len(meta), "Missing images:", missing)
meta = meta.dropna(subset=['image_path']).reset_index(drop=True)
print("After filtering, rows:", len(meta))

# ---------- Cell 3: Binary label mapping and Train/Val/Test split ----------
benign_labels = ["nv", "bkl", "df", "vasc"]
malignant_labels = ["mel", "bcc", "akiec"]

meta['binary_label'] = meta['dx'].apply(lambda x: 1 if x in malignant_labels else 0).astype(int)
print("Overall class counts:\n", meta['binary_label'].value_counts())

# Centralized splits (to keep identical to Stage-1)
train_df, temp_df = train_test_split(meta, test_size=0.3, stratify=meta['binary_label'], random_state=SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.33, stratify=temp_df['binary_label'], random_state=SEED)
print("Train/Val/Test sizes:", len(train_df), len(val_df), len(test_df))

# Compute global class-weights (used for local training to handle imbalance)
classes = np.unique(train_df['binary_label'])
weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_df['binary_label'])
class_weights = {int(c): float(w) for c, w in zip(classes, weights)}
print("Class weights:", class_weights)

# ---------- Cell 4: IID client split (label-balanced) ----------
def split_iid_labelwise(df, n_clients):
    """Split df into n_clients IID by distributing each label's indices equally."""
    clients = [pd.DataFrame(columns=df.columns) for _ in range(n_clients)]
    for label in sorted(df['binary_label'].unique()):
        label_df = df[df['binary_label'] == label].sample(frac=1, random_state=SEED).reset_index(drop=True)
        parts = np.array_split(label_df, n_clients)
        for i, part in enumerate(parts):
            clients[i] = pd.concat([clients[i], part], ignore_index=True)
    # shuffle rows inside each client
    for i in range(n_clients):
        clients[i] = clients[i].sample(frac=1, random_state=SEED).reset_index(drop=True)
    return clients

client_dfs = split_iid_labelwise(train_df, NUM_CLIENTS)
for i, cdf in enumerate(client_dfs):
    print(f"Client {i}: {len(cdf)} samples, label distribution:\n{cdf['binary_label'].value_counts().to_dict()}")

# ---------- Cell 5: Dataset builder (same preprocessing as Stage-1) ----------
# Preprocess function (pure TF graph)
def preprocess_tf(path, label):
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, [IMG_SIZE, IMG_SIZE])
    img = tf.cast(img, tf.float32)
    img = preprocess_input(img)   # EfficientNet preprocessing
    return img, label

# Simple augmentation (same as Stage-1)
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal_and_vertical"),
    tf.keras.layers.RandomRotation(0.06),
    tf.keras.layers.RandomZoom(0.08),
    tf.keras.layers.RandomTranslation(0.05, 0.05),
])

def make_dataset_from_df(df, batch_size=CLIENT_BATCH_SIZE, training=False):
    paths = df['image_path'].values
    labels = df['binary_label'].astype(np.int32).values
    ds = tf.data.Dataset.from_tensor_slices((paths, labels))
    ds = ds.map(preprocess_tf, num_parallel_calls=AUTOTUNE)
    if training:
        ds = ds.shuffle(1000, seed=SEED)
        ds = ds.map(lambda x, y: (data_augmentation(x, training=True), y), num_parallel_calls=AUTOTUNE)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(AUTOTUNE)
    return ds

# Build client datasets (train and small local val)
client_train_ds = []
client_val_ds = []
client_num_examples = []
for cdf in client_dfs:
    # small local validation split per client (10% holdout)
    if len(cdf) > 10:
        local_train, local_val = train_test_split(cdf, test_size=0.10, stratify=cdf['binary_label'], random_state=SEED)
    else:
        local_train, local_val = cdf, cdf  # tiny clients - use same for val
    client_train_ds.append(make_dataset_from_df(local_train, batch_size=CLIENT_BATCH_SIZE, training=True))
    client_val_ds.append(make_dataset_from_df(local_val, batch_size=CLIENT_BATCH_SIZE, training=False))
    client_num_examples.append(len(local_train))

# Centralized test dataset (for final evaluation)
test_ds = make_dataset_from_df(test_df, batch_size=CLIENT_BATCH_SIZE, training=False)

# ---------- Cell 6: Model builder (returns model and base model) ----------
def build_model(base_trainable=True):
    base_model = EfficientNetB0(include_top=False, weights='imagenet', input_tensor=Input(shape=(IMG_SIZE, IMG_SIZE, 3)))
    base_model.trainable = base_trainable
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dropout(0.3)(x)
    x = Dense(128, activation='relu')(x)
    out = Dense(1, activation='sigmoid', dtype='float32')(x)
    model = Model(inputs=base_model.input, outputs=out)
    return model, base_model

# ---------- Cell 7: Create client models (one per client) ----------
client_models = []
for i in range(NUM_CLIENTS):
    m, b = build_model(base_trainable=True)   # allow base to be trainable for federated updates
    m.compile(optimizer=optimizers.Adam(learning_rate=1e-4),
              loss='binary_crossentropy',
              metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])
    client_models.append((m, b))

# Create a global model (will hold aggregated weights)
global_model, _ = build_model(base_trainable=True)
global_model.compile(optimizer=optimizers.Adam(learning_rate=1e-4),
                     loss='binary_crossentropy',
                     metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

# Initialize global weights from centralized pretrained head (or ImageNet init)
global_weights = global_model.get_weights()

# ---------- Cell 8: FedAvg simulation ----------
def fedavg(weights_list, weights_counts):
    """Weighted average of a list of weight lists (lists of ndarrays)."""
    # weights_list: list of client weights (each is a list of numpy arrays)
    # weights_counts: list of ints (num examples for each client)
    total = float(sum(weights_counts))
    new_weights = []
    for layer_idx in range(len(weights_list[0])):
        layer_sum = np.zeros_like(weights_list[0][layer_idx], dtype=np.float64)
        for w, n in zip(weights_list, weights_counts):
            layer_sum += (w[layer_idx].astype(np.float64) * (n / total))
        new_weights.append(layer_sum.astype(weights_list[0][layer_idx].dtype))
    return new_weights

history = {"round": [], "val_loss": [], "val_accuracy": [], "val_auc": []}

print("Starting FedAvg simulation: clients:", NUM_CLIENTS, "rounds:", ROUNDS)
for r in range(1, ROUNDS+1):
    print(f"\n--- Round {r}/{ROUNDS} ---")
    client_weights = []
    client_sizes = []
    # For each client: set global weights, train locally, collect weights
    for i in range(NUM_CLIENTS):
        model_i, _ = client_models[i]
        # set global weights
        model_i.set_weights(global_weights)
        # local training
        model_i.fit(client_train_ds[i],
                    epochs=LOCAL_EPOCHS,
                    class_weight=class_weights,
                    verbose=VERBOSE)
        # collect weights and size
        client_weights.append([w.copy() for w in model_i.get_weights()])
        client_sizes.append(client_num_examples[i])
    # Aggregate weights (FedAvg)
    global_weights = fedavg(client_weights, client_sizes)
    global_model.set_weights(global_weights)
    # evaluate global model on validation set (or test set)
    val_loss, val_acc, val_auc = global_model.evaluate(make_dataset_from_df(val_df, batch_size=CLIENT_BATCH_SIZE, training=False), verbose=0)
    print(f"After round {r}: val_loss={val_loss:.4f}, val_acc={val_acc:.4f}, val_auc={val_auc:.4f}")
    history["round"].append(r)
    history["val_loss"].append(float(val_loss))
    history["val_accuracy"].append(float(val_acc))
    history["val_auc"].append(float(val_auc))

# ---------- Cell 9: Final evaluation on centralized test set ----------
test_loss, test_acc, test_auc = global_model.evaluate(test_ds, verbose=1)
print(f"\nFinal global model on centralized test set -> Test Accuracy: {test_acc*100:.2f}%, Test AUC: {test_auc:.4f}")

# Save final weights if needed
out_dir = os.path.join(DATA_DIR, "federated_iid_experiment")
os.makedirs(out_dir, exist_ok=True)
global_model.save(os.path.join(out_dir, "global_model_fed_iid"), include_optimizer=False)
print("Saved global model to:", os.path.join(out_dir, "global_model_fed_iid"))


Mixed precision policy: <DTypePolicy "mixed_float16">
Metadata rows: 10015 Missing images: 0
After filtering, rows: 10015
Overall class counts:
 binary_label
0    8061
1    1954
Name: count, dtype: int64
Train/Val/Test sizes: 7010 2013 992
Class weights: {0: 0.6212336051045728, 1: 2.5621345029239766}
Client 0: 1403 samples, label distribution:
{0: 1129, 1: 274}
Client 1: 1403 samples, label distribution:
{0: 1129, 1: 274}
Client 2: 1402 samples, label distribution:
{0: 1128, 1: 274}
Client 3: 1401 samples, label distribution:
{0: 1128, 1: 273}
Client 4: 1401 samples, label distribution:
{0: 1128, 1: 273}


  return bound(*args, **kwds)


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Starting FedAvg simulation: clients: 5 rounds: 8

--- Round 1/8 ---
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6551s[0m 155s/step - accuracy: 0.5893 - auc: 0.6442 - loss: 0.6793
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6302s[0m 155s/step - accuracy: 0.5820 - auc: 0.6335 - loss: 0.6826
[1m 3/40[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1:37:15[0m 158s/step - accuracy: 0.5972 - auc: 0.4888 - loss: 0.7476