In [1]:
import os
import math
import random
import copy
import ast
from collections import Counter, defaultdict
from itertools import chain, combinations
from typing import Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
from tqdm import tqdm

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, 
    confusion_matrix, 
    ConfusionMatrixDisplay, 
    classification_report
)

In [2]:
# Silence every scikit-learn ConvergenceWarning for the rest of the session.
# NOTE(review): presumably needed because the logistic-regression clients are trained
# with small iteration budgets (e.g. max_iter=10) — confirm this is intentional, since
# the filter ignores the whole warning category.
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

### Read The Spambase Dataset

In [None]:
# Load the raw Spambase table. header=None tells pandas that the file has no header
# row, so the columns are labelled by integer position.
file_path = '.../data/spambase.data'  # Adjust the path as needed
df = pd.read_csv(file_path, header=None)

In [4]:
# Every column except the last one is a feature; the last column is the label.
feature_frame, label_series = df.iloc[:, :-1], df.iloc[:, -1]
X, y = feature_frame.to_numpy(), label_series.to_numpy()

# Hold out 20% of the rows as the global test set (fixed seed for a stable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize with statistics estimated on the training rows only.
# NOTE(review): the experiment cells visible in this notebook pass the unscaled X_train
# (each client shard is standardized separately) together with X_test_scaled.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

#### Import the split data function for partitioning the data among the clients

In [5]:
from utils.Spambase.split_data import split_data_equal

#### Import the data-corruption functions used to corrupt the clients' data

In [6]:
from utils.Spambase.corrupt_data_spambase import corrupt_data, corrupt_clients

#### Import the aggregation functions for aggregating the clients' models in FedLR and FedFor

In [7]:
from utils.aggregate_functions import aggregate_lr_models, FederatedForest

#### Import the decision tree Model

In [8]:
from utils.DecisionTree import DecisionTree

#### Import the evaluate coalitions function for evaluating all possible coalitions

In [9]:
from utils.evaluate_coalitions import evaluate_coalitions

#### Import the function for finding Nash equilibria

In [10]:
from utils.Nash import find_nash_equilibria_v2

In [None]:
def prepare_partitions(X_train, y_train, n_clients, random_seed, corruption_settings, noise_std, corrupt_client_indices, corrupt_function):
    """Build the per-client training shards for one trial.

    Steps:
      1. ``split_data_equal`` divides the training data among ``n_clients``
         (shuffled, seeded with ``random_seed``).
      2. ``corrupt_clients`` applies ``corrupt_function`` to the shards listed in
         ``corrupt_client_indices``; its second return value is discarded.
      3. Every shard is standardized with its own ``StandardScaler`` fitted on
         that shard alone.

    Args:
        X_train, y_train: Training features and labels to distribute.
        n_clients: Number of shards to create.
        random_seed: Seed for the split; also passed as ``base_seed`` to the corruption step.
        corruption_settings: Mapping that may hold ``corruption_prob`` (default 0.6),
            ``nan_prob`` (default 0.5) and ``label_corruption_prob`` (default 0.1).
        noise_std: Noise level forwarded to ``corrupt_clients``.
        corrupt_client_indices: Indices of the shards to corrupt.
        corrupt_function: Corruption routine forwarded to ``corrupt_clients``.

    Returns:
        list of ``(X_normalized, y)`` tuples, one per shard.
    """
    clean_shards = split_data_equal(
        X_train, y_train, n_clients=n_clients, shuffle=True, random_seed=random_seed
    )

    # Probabilities come from the settings mapping; missing keys fall back to defaults.
    corruption_kwargs = {
        "corruption_prob": corruption_settings.get("corruption_prob", 0.6),
        "nan_prob": corruption_settings.get("nan_prob", 0.5),
        "noise_std": noise_std,
        "label_corruption_prob": corruption_settings.get("label_corruption_prob", 0.1),
        "base_seed": random_seed,
    }
    shards_after_corruption, _ = corrupt_clients(
        corrupt_function, clean_shards, corrupt_client_indices, **corruption_kwargs
    )

    # A fresh scaler per shard: no statistics are shared between clients.
    return [
        (StandardScaler().fit_transform(shard_X), shard_y)
        for shard_X, shard_y in shards_after_corruption
    ]

#### Function for FedLR training

In [None]:
def train_models_fedlr(partitions, random_seed, X_test, y_test, max_iter):
    """Train one logistic-regression model per client and score it on the test set.

    Rows with a NaN in any feature are dropped before fitting. A client contributes
    ``None`` for both its model and its accuracy when no rows remain after that
    cleaning, or when fitting or scoring raises. In the latter case a
    ``RuntimeWarning`` is emitted so the failure is visible.

    Exactly one entry per client is appended to each returned list, so both lists
    always have ``len(partitions)`` elements and stay index-aligned.

    Args:
        partitions: Iterable of ``(X_i, y_i)`` array pairs, one per client.
        random_seed: ``random_state`` given to every ``LogisticRegression``.
        X_test, y_test: Data used to score each client model.
        max_iter: ``max_iter`` given to every ``LogisticRegression``.

    Returns:
        tuple ``(client_models, client_global_accuracies)``: two lists indexed by
        client position; entries are ``None`` for clients that could not be trained.
    """
    client_models = []
    client_global_accuracies = []

    for client_idx, (X_i, y_i) in enumerate(partitions):
        # Keep only rows in which every feature is present.
        complete_rows = ~np.isnan(X_i).any(axis=1)
        X_clean = X_i[complete_rows]
        y_clean = y_i[complete_rows]

        fitted_model = None
        accuracy = None
        if len(y_clean) > 0:
            model = LogisticRegression(random_state=random_seed, max_iter=max_iter)
            try:
                model.fit(X_clean, y_clean)
                accuracy = model.score(X_test, y_test)
                # Publish the model only once scoring has succeeded, so a failure in
                # score() cannot leave a model without a matching accuracy.
                fitted_model = model
            except Exception as exc:
                # Best effort: the client is skipped, but not silently.
                warnings.warn(
                    f"FedLR client {client_idx}: training/scoring failed "
                    f"({type(exc).__name__}: {exc}); client is skipped.",
                    RuntimeWarning,
                    stacklevel=2,
                )

        client_models.append(fitted_model)
        client_global_accuracies.append(accuracy)

    return client_models, client_global_accuracies

#### Function for FedFor training

In [None]:
def train_models_fedfor(partitions, X_test, y_test, max_depth, random_seed=None):
    """Train one decision tree per client and score it on the test set.

    Args:
        partitions: Iterable of ``(X_i, y_i)`` array pairs, one per client.
        X_test, y_test: Data used to score each client tree.
        max_depth: ``max_depth`` given to every ``DecisionTreeClassifier``.
        random_seed: Optional seed for reproducible trees. When ``None`` (the
            default) each tree's ``random_state`` is drawn from NumPy's global
            random generator, as before. When given, the per-tree seeds are drawn
            from a private ``RandomState(random_seed)``, so repeated calls with
            the same seed build trees with the same ``random_state`` values.

    Returns:
        tuple ``(client_models, client_global_accuracies)``: a list of fitted trees
        and a dict mapping client position to test accuracy.
    """
    # Both sources expose randint(low, high) with the same semantics.
    seed_source = np.random if random_seed is None else np.random.RandomState(random_seed)

    client_models = []
    client_global_accuracies = {}

    for i, (X_i, y_i) in enumerate(partitions):
        tree_seed = seed_source.randint(0, 100000)
        model = DecisionTreeClassifier(max_depth=max_depth, random_state=tree_seed)
        # NOTE(review): unlike train_models_fedlr, rows containing NaN are not dropped
        # here; whether fit() accepts them presumably depends on the installed
        # scikit-learn version — confirm.
        model.fit(X_i, y_i)
        client_models.append(model)
        y_pred = model.predict(X_test)
        client_global_accuracies[i] = np.mean(y_pred == y_test)  # equivalent to accuracy_score

    return client_models, client_global_accuracies

#### Function for training procedure in each trial

In [None]:
def run_trial(approach, trial_seed, n_clients, X_train, y_train, X_test, y_test,
              hyper_param, noise_std, corrupt_client_indices, corruption_settings, corrupt_function):
    """Run one trial: partition and corrupt the data, train the clients, evaluate coalitions.

    Args:
        approach: ``'fedlr'`` (logistic regression, ``hyper_param`` is ``max_iter``)
            or ``'fedfor'`` (decision trees, ``hyper_param`` is ``max_depth``).
        trial_seed: Seed used for partitioning/corruption and, for FedLR, the models.
        n_clients: Number of clients.
        X_train, y_train: Training data to distribute among the clients.
        X_test, y_test: Data used to score client models and coalitions.
        hyper_param: Approach-specific hyper-parameter (see ``approach``).
        noise_std, corrupt_client_indices, corruption_settings, corrupt_function:
            Forwarded to ``prepare_partitions``.

    Returns:
        tuple ``(df_results, client_global_acc)``: the output of
        ``evaluate_coalitions`` and the per-client accuracies (a list for FedLR,
        a dict for FedFor).

    Raises:
        ValueError: If ``approach`` is neither ``'fedlr'`` nor ``'fedfor'``.
    """
    # Fail fast: reject a bad approach before doing any partitioning work.
    if approach not in ('fedlr', 'fedfor'):
        raise ValueError("Unknown approach specified.")

    partitions = prepare_partitions(
        X_train, y_train, n_clients, trial_seed, corruption_settings,
        noise_std, corrupt_client_indices, corrupt_function
    )

    if approach == 'fedlr':
        client_models, client_global_acc = train_models_fedlr(
            partitions, trial_seed, X_test, y_test, max_iter=hyper_param
        )
        aggregator = aggregate_lr_models
    else:
        client_models, client_global_acc = train_models_fedfor(
            partitions, X_test, y_test, max_depth=hyper_param
        )
        aggregator = FederatedForest

    df_results = evaluate_coalitions(
        client_models, client_global_acc, n_clients, aggregator,
        X_test, y_test, corrupt_client_indices, approach=approach
    )
    return df_results, client_global_acc

#### Main function to perform all processes

In [None]:
def run_experiment(approach, n_trials, n_clients, hyper_params, partitions_corrupted,
                   corrupt_client_indices, X_train, y_train, X_test, y_test,
                   noise_std, save_dir, corrupt_function=corrupt_data, corruption_settings=None,
                   base_random_seed=42):
    """Run repeated trials per hyper-parameter and collect the Nash-equilibrium rows.

    For every value in ``hyper_params`` the function runs ``n_trials`` trials through
    ``run_trial``, passes each trial's coalition table to ``find_nash_equilibria_v2``
    and tags the returned rows with the trial number, noise level, number of corrupted
    clients and hyper-parameter. The per-client accuracies of the same trial are then
    merged onto those rows.

    Args:
        approach: ``'fedlr'`` or ``'fedfor'``; forwarded to ``run_trial``.
        n_trials: Number of trials per hyper-parameter value.
        n_clients: Number of clients.
        hyper_params: Iterable of hyper-parameter values (``max_iter`` or ``max_depth``).
        partitions_corrupted: Number of clients to corrupt; only used when
            ``corrupt_client_indices`` is ``None``.
        corrupt_client_indices: Indices of the corrupted clients, or ``None`` to draw
            ``partitions_corrupted`` distinct indices with ``np.random.choice``.
        X_train, y_train, X_test, y_test: Data forwarded to ``run_trial``.
        noise_std: Noise level forwarded to ``run_trial`` and recorded in the output.
        save_dir: Directory (created if missing) that receives the details CSV.
        corrupt_function: Corruption routine forwarded to ``run_trial``.
        corruption_settings: Optional probabilities mapping; defaults to
            ``corruption_prob=0.6``, ``nan_prob=0.5``, ``label_corruption_prob=0.1``.
        base_random_seed: Base value mixed into every trial seed.

    Returns:
        pandas.DataFrame with one row per Nash-equilibrium row and trial, across all
        hyper-parameter values.

    Raises:
        ValueError: Raised by ``pd.concat`` when there is nothing to concatenate
            (``n_trials`` is 0 or ``hyper_params`` is empty).
    """
    os.makedirs(save_dir, exist_ok=True)

    # Default corruption settings if none provided.
    if corruption_settings is None:
        corruption_settings = {'corruption_prob': 0.6, 'nan_prob': 0.5, 'label_corruption_prob': 0.1}

    # If no corrupted client indices are provided, draw them once for the whole experiment.
    if corrupt_client_indices is None:
        corrupt_client_indices = np.random.choice(n_clients, size=partitions_corrupted, replace=False)
    n_corrupted = len(corrupt_client_indices)

    # Accuracy column names do not change between trials, so build them once.
    accuracy_columns = [
        f'Client {j+1} Accuracy' + (" (low-quality client)" if j in corrupt_client_indices else "")
        for j in range(n_clients)
    ]
    merge_keys = ['Trial', 'Max Iter or Depth', 'Noise Std', 'Corrupted Clients']

    all_details = []
    for hyper_param in hyper_params:
        details_for_this_param = []
        client_accuracy_details = []

        for trial in range(n_trials):
            # NOTE(review): the seed mixes in a draw from the global `random` module, so
            # runs repeat exactly only if the caller seeds `random` beforehand.
            rand_component = random.randint(0, 500)
            trial_seed = base_random_seed + trial + int(1000 * hyper_param) + 2 * rand_component

            df_results, client_global_acc = run_trial(
                approach, trial_seed, n_clients, X_train, y_train, X_test, y_test,
                hyper_param, noise_std, corrupt_client_indices, corruption_settings, corrupt_function
            )

            # Tag the equilibrium rows; assign() works on a copy, so the frame returned
            # by find_nash_equilibria_v2 is never modified in place.
            df_nash = find_nash_equilibria_v2(df_results.reset_index()).assign(**{
                'Trial': trial + 1,
                'Noise Std': noise_std,
                'Corrupted Clients': n_corrupted,
                'Max Iter or Depth': hyper_param,
            })
            details_for_this_param.append(df_nash)

            # Collect client accuracy details for this trial.
            trial_acc = {
                'Trial': trial + 1,
                'Max Iter or Depth': hyper_param,
                'Noise Std': noise_std,
                'Corrupted Clients': n_corrupted
            }
            for j, col_name in enumerate(accuracy_columns):
                if approach == 'fedlr':
                    # FedLR returns a list in which failed clients are None.
                    accuracy = client_global_acc[j]
                    trial_acc[col_name] = accuracy if accuracy is not None else np.nan
                else:
                    # FedFor returns a dict keyed by client position.
                    trial_acc[col_name] = client_global_acc.get(j, np.nan)
            client_accuracy_details.append(trial_acc)

        # Aggregate details for the current hyper parameter.
        df_details = pd.concat(details_for_this_param, ignore_index=True)
        df_client_accuracy = pd.DataFrame(client_accuracy_details)
        all_details.append(df_details.merge(df_client_accuracy, on=merge_keys, how='left'))

    final_details_df = pd.concat(all_details, ignore_index=True)
    details_path = os.path.join(save_dir, f"Nash_Equilibrium_Details_{approach}_noise_{noise_std}_c{n_corrupted}.csv")
    # An existing file is never overwritten; delete it first to refresh stored results.
    if not os.path.exists(details_path):
        final_details_df.to_csv(details_path, index=False)

    return final_details_df


### FedLR Spambase: Approximately 350 samples for each client:
noise std values = [0.1, 0.3, 0.5, 0.7, 1, 2, 3, 4, 5]
and low-quality client counts = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [None]:
# FedLR sweep over every (noise std, number of low-quality clients) pair.
noise_std_values = [0.1, 0.3, 0.5, 0.7, 1, 2, 3, 4, 5]
corrupted_clients_counts = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
save_dir = ".../results/FedLR_Spambase_LQC_0_to_10"
max_iters = [10, 100]
n_clients = 10
# Label counted below: one '1' per client. NOTE(review): presumably this encodes the
# coalition in which every client participates — confirm against evaluate_coalitions.
all_clients_coalition = '1' * n_clients
results = {noise: [] for noise in noise_std_values}

for noise in noise_std_values:
    for cc in corrupted_clients_counts:
        # The first `cc` clients are the low-quality ones.
        client_indices = list(range(cc))

        results_fedlr = run_experiment(
            approach='fedlr',
            n_trials=50,
            n_clients=n_clients,
            hyper_params=max_iters,
            partitions_corrupted=cc,
            corrupt_client_indices=client_indices,
            X_train=X_train,
            y_train=y_train,
            X_test=X_test_scaled,
            y_test=y_test,
            noise_std=noise,
            save_dir=save_dir,
            corrupt_function=corrupt_data
        )

        # Count the returned rows whose combination is the all-clients label.
        occurrence_count = (results_fedlr['Combination'] == all_clients_coalition).sum()
        results[noise].append(occurrence_count)
        print(f"Noise Std: {noise}, Bad Clients: {cc}, max_iters: {max_iters}, Occurrences: {occurrence_count}")

# One column per noise level, one row per number of low-quality clients.
results_df = pd.DataFrame(results, index=corrupted_clients_counts)
results_df.index.name = "Number of Bad Clients"
results_csv_path = os.path.join(save_dir, "nash_occurrence_results.csv")
results_df.to_csv(results_csv_path)



### FedLR Spambase: Approximately 350 samples for each client:
noise std values = [0.1]
and low-quality client counts = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], with 1000 trials in total (500 trials for each of the two `max_iter` values)

In [None]:
# FedLR at a single noise level, with 500 trials per max_iter value.
noise_std_values = [0.1]
corrupted_clients_counts = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
save_dir = ".../results/FedLR_Spambase_LQC_0_to_10_with_1000Trials"
max_iters = [10, 100]
n_clients = 10
# Label counted below: one '1' per client. NOTE(review): presumably this encodes the
# coalition in which every client participates — confirm against evaluate_coalitions.
all_clients_coalition = '1' * n_clients
results = {noise: [] for noise in noise_std_values}

for noise in noise_std_values:
    for cc in corrupted_clients_counts:
        # The first `cc` clients are the low-quality ones.
        client_indices = list(range(cc))

        results_fedlr = run_experiment(
            approach='fedlr',
            n_trials=500,
            n_clients=n_clients,
            hyper_params=max_iters,
            partitions_corrupted=cc,
            corrupt_client_indices=client_indices,
            X_train=X_train,
            y_train=y_train,
            X_test=X_test_scaled,
            y_test=y_test,
            noise_std=noise,
            save_dir=save_dir,
            corrupt_function=corrupt_data
        )

        # Count the returned rows whose combination is the all-clients label.
        occurrence_count = (results_fedlr['Combination'] == all_clients_coalition).sum()
        results[noise].append(occurrence_count)
        print(f"Noise Std: {noise}, Bad Clients: {cc}, max_iters: {max_iters}, Occurrences: {occurrence_count}")

# One column per noise level, one row per number of low-quality clients.
results_df = pd.DataFrame(results, index=corrupted_clients_counts)
results_df.index.name = "Number of Bad Clients"
results_csv_path = os.path.join(save_dir, "nash_occurrence_results.csv")
results_df.to_csv(results_csv_path)



### FedFor Spambase: Approximately 350 samples for each client:
noise std values = [0.1, 0.3, 0.5, 0.7, 1, 2, 3, 4, 5]
and low-quality client counts = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [None]:
# FedFor sweep over every (noise std, number of low-quality clients) pair.
noise_std_values = [0.1, 0.3, 0.5, 0.7, 1, 2, 3, 4, 5]
corrupted_clients_counts = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
max_depths = [10, 100]
save_dir = ".../results/FedFor_Spambase_LQC_0_to_10"
n_clients = 10
# Label counted below: one '1' per client. NOTE(review): presumably this encodes the
# coalition in which every client participates — confirm against evaluate_coalitions.
all_clients_coalition = '1' * n_clients

results = {noise: [] for noise in noise_std_values}

for noise in noise_std_values:
    for cc in corrupted_clients_counts:
        # The first `cc` clients are the low-quality ones.
        client_indices = list(range(cc))

        results_fedfor = run_experiment(
            approach='fedfor',
            n_trials=50,
            n_clients=n_clients,
            hyper_params=max_depths,
            partitions_corrupted=cc,
            corrupt_client_indices=client_indices,
            X_train=X_train,
            y_train=y_train,
            X_test=X_test_scaled,
            y_test=y_test,
            noise_std=noise,
            save_dir=save_dir,
            corrupt_function=corrupt_data
        )

        # Count the returned rows whose combination is the all-clients label.
        occurrence_count = (results_fedfor['Combination'] == all_clients_coalition).sum()
        results[noise].append(occurrence_count)
        print(f"Noise Std: {noise}, Bad Clients: {cc}, max_depths: {max_depths}, Occurrences: {occurrence_count}")

# One column per noise level, one row per number of low-quality clients.
results_df = pd.DataFrame(results, index=corrupted_clients_counts)
results_df.index.name = "Number of Bad Clients"
results_csv_path = os.path.join(save_dir, "nash_occurrence_results.csv")
results_df.to_csv(results_csv_path)



### FedFor Spambase: Approximately 350 samples for each client:
noise std values = [0.1]
and low-quality client counts = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], with 1000 trials in total (500 trials for each of the two `max_depth` values)

In [None]:
# FedFor at a single noise level, with 500 trials per max_depth value.
noise_std_values = [0.1]
corrupted_clients_counts = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
max_depths = [10, 100]
save_dir = ".../results/FedFor_Spambase_LQC_0_to_10_with_1000Trials"
n_clients = 10
# Label counted below: one '1' per client. NOTE(review): presumably this encodes the
# coalition in which every client participates — confirm against evaluate_coalitions.
all_clients_coalition = '1' * n_clients

results = {noise: [] for noise in noise_std_values}

for noise in noise_std_values:
    for cc in corrupted_clients_counts:
        # The first `cc` clients are the low-quality ones.
        client_indices = list(range(cc))

        results_fedfor = run_experiment(
            approach='fedfor',
            n_trials=500,
            n_clients=n_clients,
            hyper_params=max_depths,
            partitions_corrupted=cc,
            corrupt_client_indices=client_indices,
            X_train=X_train,
            y_train=y_train,
            X_test=X_test_scaled,
            y_test=y_test,
            noise_std=noise,
            save_dir=save_dir,
            corrupt_function=corrupt_data
        )

        # Count the returned rows whose combination is the all-clients label.
        occurrence_count = (results_fedfor['Combination'] == all_clients_coalition).sum()
        results[noise].append(occurrence_count)
        print(f"Noise Std: {noise}, Bad Clients: {cc}, max_depths: {max_depths}, Occurrences: {occurrence_count}")

# One column per noise level, one row per number of low-quality clients.
results_df = pd.DataFrame(results, index=corrupted_clients_counts)
results_df.index.name = "Number of Bad Clients"
results_csv_path = os.path.join(save_dir, "nash_occurrence_results.csv")
results_df.to_csv(results_csv_path)

