In [1]:
# Imports for preprocessing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load and preprocess data
data = pd.read_csv('/global/homes/b/billmj/fl_unsw/data/centralized_test_data.csv')

# Separate features and target
X = data.drop(columns=['label'])
y = data['label']

# Identify categorical columns
categorical_columns = ['proto', 'service', 'state']

# Apply one-hot encoding to categorical columns and scaling to numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.select_dtypes(include=['int64', 'float64']).columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ])

X = preprocessor.fit_transform(X)

# Convert sparse matrix to dense format
X = X.toarray()

# Print the type and shape of the transformed data
print("Type of X after transformation:", type(X))
print("Shape of X after transformation:", X.shape)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Convert to numpy arrays (should be already numpy arrays, but this is to ensure)
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

# Print shapes to verify
print("Final X_train shape:", X_train.shape)
print("Final X_test shape:", X_test.shape)
print("Final y_train shape:", y_train.shape)
print("Final y_test shape:", y_test.shape)


Type of X after transformation: <class 'numpy.ndarray'>
Shape of X after transformation: (82332, 191)
Final X_train shape: (65865, 191)
Final X_test shape: (16467, 191)
Final y_train shape: (65865,)
Final y_test shape: (16467,)


In [2]:
# Imports for federated learning
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix, classification_report, matthews_corrcoef
from torch.utils.data import DataLoader, TensorDataset
from sklearn.utils import shuffle

# Shuffle the training data
X_train, y_train = shuffle(X_train, y_train, random_state=42)

# Split data into clients (simulating different clients)
num_clients = 10
client_data = []
for i in range(num_clients):
    start = i * len(X_train) // num_clients
    end = (i + 1) * len(X_train) // num_clients
    client_data.append((X_train[start:end], y_train[start:end]))

# Define the model
class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 16)
        self.dropout1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(16, 8)
        self.dropout2 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(8, 1)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = torch.sigmoid(self.fc3(x))
        return x

# Define client update function with learning rate scheduler
def client_update(model, train_loader, criterion, optimizer, scheduler, num_epochs):
    model.train()
    computation_time = 0
    for epoch in range(num_epochs):
        for data, target in train_loader:
            comp_start = time.time()
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target.unsqueeze(1).float())
            loss.backward()
            optimizer.step()
            comp_end = time.time()
            computation_time += comp_end - comp_start
        scheduler.step()
    return model.state_dict(), computation_time

# Calculate model size
def calculate_model_size(model):
    return sum(p.numel() * p.element_size() for p in model.parameters()) / (1024 * 1024)  # Size in MB

# Implement federated learning with FedAvg and adaptive learning rate
def federated_learning_with_fedavg(num_rounds, num_epochs_per_round, batch_size, initial_lr, checkpoint_interval=5):
    input_dim = X_train.shape[1]
    global_model = SimpleNN(input_dim)
    global_weights = global_model.state_dict()

    # Create checkpoint directory
    checkpoint_dir = 'checkpoints'
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Initialize the communication metrics
    communication_metrics = {
        "Round": [],
        "Average_RTT(s)": [],
        "Uplink_Data(MB)": [],
        "Downlink_Data(MB)": [],
        "Total_Data_Transferred(MB)": [],
        "Computation_Time(s)": [],
        "Communication_Time(s)": [],
        "Round_Total_Time(s)": [],
        "Round_Accuracy": []
    }

    for round in range(num_rounds):
        client_updates = []
        total_comm_time = 0
        total_comp_time = 0
        total_uplink_data = 0
        total_downlink_data = 0
        round_start_time = time.time()

        for client_x, client_y in client_data:
            client_tensor_x = torch.tensor(client_x, dtype=torch.float32)
            client_tensor_y = torch.tensor(client_y, dtype=torch.int64)
            client_dataset = TensorDataset(client_tensor_x, client_tensor_y)
            client_loader = DataLoader(client_dataset, batch_size=batch_size, shuffle=True)

            client_model = SimpleNN(input_dim)
            
            # Downlink: Global model to client
            downlink_start = time.time()
            client_model.load_state_dict(global_weights)
            downlink_end = time.time()
            downlink_time = downlink_end - downlink_start
            downlink_data = calculate_model_size(client_model)

            criterion = nn.BCELoss()
            optimizer = optim.Adam(client_model.parameters(), lr=initial_lr)
            scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)  # Reduce LR by 10% each epoch

            # Client computation
            new_weights, client_comp_time = client_update(client_model, client_loader, criterion, optimizer, scheduler, num_epochs_per_round)

            # Uplink: Client model to server
            uplink_start = time.time()
            client_updates.append(new_weights)
            uplink_end = time.time()
            uplink_time = uplink_end - uplink_start
            uplink_data = calculate_model_size(client_model)

            total_comm_time += downlink_time + uplink_time
            total_comp_time += client_comp_time
            total_uplink_data += uplink_data
            total_downlink_data += downlink_data

        # Average updates from all clients (FedAvg)
        global_weights = {k: torch.stack([client[k] for client in client_updates], 0).mean(0) for k in global_weights.keys()}
        global_model.load_state_dict(global_weights)

        # Save checkpoint
        if (round + 1) % checkpoint_interval == 0:
            checkpoint_path = os.path.join(checkpoint_dir, f'round_{round+1}.pth')
            torch.save(global_model.state_dict(), checkpoint_path)
            print(f"Checkpoint saved at round {round + 1}")

        # Evaluate global model
        global_model.eval()
        test_tensor_x = torch.tensor(X_test, dtype=torch.float32)
        test_tensor_y = torch.tensor(y_test, dtype=torch.int64)
        with torch.no_grad():
            test_output = global_model(test_tensor_x).squeeze()
            test_accuracy = ((test_output > 0.5).int() == test_tensor_y).float().mean()

        round_end_time = time.time()
        round_total_time = round_end_time - round_start_time
        avg_rtt = total_comm_time / num_clients

        print(f"Round {round + 1}, Test accuracy: {test_accuracy:.4f}")

        # Log communication metrics
        communication_metrics["Round"].append(round + 1)
        communication_metrics["Average_RTT(s)"].append(avg_rtt)
        communication_metrics["Uplink_Data(MB)"].append(total_uplink_data)
        communication_metrics["Downlink_Data(MB)"].append(total_downlink_data)
        communication_metrics["Total_Data_Transferred(MB)"].append(total_uplink_data + total_downlink_data)
        communication_metrics["Computation_Time(s)"].append(total_comp_time)
        communication_metrics["Communication_Time(s)"].append(total_comm_time)
        communication_metrics["Round_Total_Time(s)"].append(round_total_time)
        communication_metrics["Round_Accuracy"].append(test_accuracy.item())

    # Save communication metrics to CSV
    metrics_df = pd.DataFrame(communication_metrics)
    metrics_df.to_csv('communication_metrics.csv', index=False)
    print("Communication metrics saved to communication_metrics.csv")

    return global_model

# Run federated learning with FedAvg and adaptive learning rate
start_time = time.time()
final_model = federated_learning_with_fedavg(
    num_rounds=15,
    num_epochs_per_round=5,
    batch_size=256,
    initial_lr=0.001,
    checkpoint_interval=5
)

# Final evaluation
final_model.eval()
test_tensor_x = torch.tensor(X_test, dtype=torch.float32)
test_tensor_y = torch.tensor(y_test, dtype=torch.int64)
with torch.no_grad():
    test_output = final_model(test_tensor_x).squeeze()
    y_pred = (test_output > 0.5).int()

# Calculate and print various metrics
accuracy = accuracy_score(test_tensor_y, y_pred)
f1 = f1_score(test_tensor_y, y_pred)
precision = precision_score(test_tensor_y, y_pred)
recall = recall_score(test_tensor_y, y_pred)
roc_auc = roc_auc_score(test_tensor_y, y_pred)
mcc = matthews_corrcoef(test_tensor_y, y_pred)

print(f"Final Test accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Matthews Correlation Coefficient: {mcc:.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(test_tensor_y, y_pred))

print("\nClassification Report:")
print(classification_report(test_tensor_y, y_pred))

total_time = time.time() - start_time
print(f"\nTotal execution time: {total_time:.2f} seconds")



Round 1, Test accuracy: 0.8392
Round 2, Test accuracy: 0.9078
Round 3, Test accuracy: 0.9374
Round 4, Test accuracy: 0.9472
Checkpoint saved at round 5
Round 5, Test accuracy: 0.9531
Round 6, Test accuracy: 0.9573
Round 7, Test accuracy: 0.9614
Round 8, Test accuracy: 0.9633
Round 9, Test accuracy: 0.9647
Checkpoint saved at round 10
Round 10, Test accuracy: 0.9665
Round 11, Test accuracy: 0.9689
Round 12, Test accuracy: 0.9707
Round 13, Test accuracy: 0.9722
Round 14, Test accuracy: 0.9740
Checkpoint saved at round 15
Round 15, Test accuracy: 0.9767
Communication metrics saved to communication_metrics.csv
Final Test accuracy: 0.9767
F1 Score: 0.9791
Precision: 0.9672
Recall: 0.9914
ROC AUC Score: 0.9751
Matthews Correlation Coefficient: 0.9532

Confusion Matrix:
[[7095  305]
 [  78 8989]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.96      0.97      7400
           1       0.97      0.99      0.98      9067

    accurac