In [1]:
import os
import math
import random
import argparse
import copy
import ast
import warnings
from collections import Counter, defaultdict
from itertools import chain, combinations
from typing import Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Silence scikit-learn ConvergenceWarning for the rest of the session.
# The filter is installed process-wide, so it also covers model fits in later cells.
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter("ignore", category=ConvergenceWarning)

### Read The Global HuGaDB Dataset

In [3]:
# Locations of the per-part CSV files; `{i:02d}` is replaced by the part number (01..10).
# NOTE(review): these are absolute paths on the author's machine — adjust before running elsewhere.
train_files_pattern = "/Users/abbaszal/Documents/Thesis_Project_Spambase/data/metadata/train_{i:02d}.csv"
test_files_pattern = "/Users/abbaszal/Documents/Thesis_Project_Spambase/data/metadata/test_{i:02d}.csv"

# Expand the patterns into the ten concrete file names of each split.
train_paths = [train_files_pattern.format(i=part) for part in range(1, 11)]
test_paths = [test_files_pattern.format(i=part) for part in range(1, 11)]

# Stack all parts of each split and discard every row that contains a missing value.
df_train_global = pd.concat([pd.read_csv(path) for path in train_paths]).dropna()
df_test_global = pd.concat([pd.read_csv(path) for path in test_paths]).dropna()

# The 'act' column is the target; every other column is a feature.
y_train_global = df_train_global['act']
X_train_global = df_train_global.drop(columns='act')

y_test_global = df_test_global['act']
X_test_global = df_test_global.drop(columns='act')

# Fit the label encoder on the training labels only, then reuse it for the test labels.
label_encoder = LabelEncoder()
y_train_global = label_encoder.fit_transform(y_train_global)
y_test_global = label_encoder.transform(y_test_global)

# Standardise the features with statistics estimated on the training split only.
scaler_global = StandardScaler()
X_train_global_scaled = scaler_global.fit_transform(X_train_global)
X_test_global_scaled = scaler_global.transform(X_test_global)

In [4]:
# Create a stratified subsample of the test set to speed up the runtime.
# The subsample replaces the module-level test arrays that the training and
# experiment functions read, so the cell overwrites its own inputs. The size
# guard keeps it idempotent: without it, re-running the cell on the already
# reduced arrays requests train_size == number of samples, which
# train_test_split rejects with a ValueError.
subsample_size = 950
if X_test_global_scaled.shape[0] > subsample_size:
    X_test_global_scaled, _, y_test_global, _ = train_test_split(
        X_test_global_scaled, y_test_global,
        train_size=subsample_size,
        random_state=42,            # fixed seed so the same rows are selected on every fresh run
        stratify=y_test_global      # keep the class proportions of the full test set
    )
print("Subsampled test set shape:", X_test_global_scaled.shape)

Subsampled test set shape: (950, 38)


#### Import the data-corruption functions used to corrupt the clients' data

In [5]:
from utils.HuGaDB.corrupt_data_hugadb import corrupt_data, corrupt_labels

#### Import the prepare partitions function to prepare client data, with possible data corruption

In [8]:
from utils.HuGaDB.prepare_partitions import prepare_partitions

#### Import the aggregation functions for aggregating the clients' models in FedLR and FedFor

In [17]:
from utils.aggregate_functions import aggregate_lr_models, FederatedForest

#### Import the decision tree Model

In [9]:
from utils.DecisionTree import DecisionTree

#### Import the evaluate coalitions function for evaluating all possible coalitions (2^10 − 1 = 1023 for 10 clients)

In [10]:
from utils.evaluate_coalitions import evaluate_coalitions

#### Import the function for finding Nash equilibria

In [11]:
from utils.Nash import find_nash_equilibria_v2

#### Function for FedLR training

In [12]:
def train_models_fedlr(n_clients, trial_seed, sample_size, num_corrupted_clients,
                       train_files_pattern, corruption_params, label_corruption_prob, hyper_param):
    """
    Trains one LogisticRegression model per client (FedLR) and scores each on the global test set.

    Besides its arguments, the function reads the notebook-level names
    `label_encoder`, `X_test_global_scaled` and `y_test_global`.

    Args:
        n_clients: Number of clients; client ids run from 1 to n_clients.
        trial_seed: Seed forwarded to `prepare_partitions` and used as the
            model's `random_state`.
        sample_size, num_corrupted_clients, train_files_pattern,
        corruption_params, label_corruption_prob: Forwarded unchanged to
            `prepare_partitions`.
        hyper_param: Used as `max_iter` of the LogisticRegression.

    Returns:
        (client_models, client_global_accuracies): the fitted models in client
        order, and a dict mapping the 0-based client index to that model's
        accuracy on the global test set.
    """
    client_models = []
    client_global_accuracies = {}
    for client_idx in range(1, n_clients + 1):
        X_train_scaled, y_train = prepare_partitions(
            client_idx, trial_seed, sample_size, num_corrupted_clients,
            train_files_pattern, corruption_params, label_corruption_prob, label_encoder
        )

        model = LogisticRegression(random_state=trial_seed, max_iter=hyper_param)
        # The fit must run inside the catch_warnings context: the filter added
        # here is discarded as soon as the `with` block exits, so an empty
        # block would suppress nothing.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning)
            model.fit(X_train_scaled, y_train)
        acc_global = accuracy_score(y_test_global, model.predict(X_test_global_scaled))

        client_models.append(model)
        # Accuracies are keyed by 0-based index, while client ids are 1-based.
        client_global_accuracies[client_idx - 1] = acc_global
    return client_models, client_global_accuracies

#### Function for FedFor training

In [13]:
def train_models_fedfor(n_clients, trial_seed, sample_size, num_corrupted_clients,
                        train_files_pattern, corruption_params, label_corruption_prob, hyper_param):
    """
    Trains one DecisionTree model per client (FedFor) and scores each on the global test set.

    Besides its arguments, the function reads the notebook-level names
    `label_encoder`, `X_test_global_scaled` and `y_test_global`.

    Args:
        n_clients: Number of clients; client ids run from 1 to n_clients.
        trial_seed: Seed forwarded to `prepare_partitions` and used as the
            model's `random_state`.
        sample_size, num_corrupted_clients, train_files_pattern,
        corruption_params, label_corruption_prob: Forwarded unchanged to
            `prepare_partitions`.
        hyper_param: Used as `max_depth` of the DecisionTree.

    Returns:
        (client_models, client_global_accuracies): the fitted models in client
        order, and a dict mapping the 0-based client index to that model's
        accuracy on the global test set.
    """
    client_models = []
    client_global_accuracies = {}
    for client_idx in range(1, n_clients + 1):
        X_train_scaled, y_train = prepare_partitions(
            client_idx, trial_seed, sample_size, num_corrupted_clients,
            train_files_pattern, corruption_params, label_corruption_prob, label_encoder
        )

        model = DecisionTree(max_depth=hyper_param, random_state=trial_seed)
        # Keep the fit inside the catch_warnings context (mirrors the FedLR
        # trainer): a filter added in an empty `with` block is dropped on exit
        # and would never apply to the training call.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning)
            model.fit(X_train_scaled, y_train)
        acc_global = accuracy_score(y_test_global, model.predict(X_test_global_scaled))

        client_models.append(model)
        # Accuracies are keyed by 0-based index, while client ids are 1-based.
        client_global_accuracies[client_idx - 1] = acc_global
    return client_models, client_global_accuracies

#### Function for training procedure in each trial

In [14]:
def run_trial(approach, n_clients, trial_seed, sample_size, num_corrupted_clients,
              train_files_pattern, corruption_params, label_corruption_prob, hyper_param,
              aggregator_func, X_test, y_test, corrupt_client_indices):
    """
    Executes one trial for a single hyper-parameter value.

    Steps: train the per-client models, evaluate the coalitions with
    `evaluate_coalitions`, then extract the Nash equilibria with
    `find_nash_equilibria_v2`.

    Args:
        approach: 'fedlr' selects the FedLR trainer; any other value selects
            the FedFor trainer.
        n_clients, trial_seed, sample_size, num_corrupted_clients,
        train_files_pattern, corruption_params, label_corruption_prob,
        hyper_param: Forwarded unchanged to the selected trainer.
        aggregator_func, X_test, y_test, corrupt_client_indices: Forwarded
            unchanged to `evaluate_coalitions`.

    Returns:
        Tuple of (coalition evaluation DataFrame, Nash equilibrium DataFrame,
        dict of per-client global accuracies).
    """
    # Pick the training routine once; both trainers share the same signature.
    trainer = train_models_fedlr if approach == 'fedlr' else train_models_fedfor
    client_models, client_global_accuracies = trainer(
        n_clients, trial_seed, sample_size, num_corrupted_clients,
        train_files_pattern, corruption_params, label_corruption_prob, hyper_param
    )

    coalition_table = evaluate_coalitions(
        client_models, client_global_accuracies, n_clients, aggregator_func,
        X_test, y_test, corrupt_client_indices, approach
    )

    # The equilibrium search receives the table with its index turned into regular columns.
    nash_table = find_nash_equilibria_v2(coalition_table.reset_index())
    return coalition_table, nash_table, client_global_accuracies

#### Main function to perform all processes

In [15]:
def run_experiment(approach, n_trials, n_clients, hyper_params, noise_std, num_corrupted_clients,
                   save_dir, sample_size=350, base_random_seed=42,
                   data_root="/Users/abbaszal/Documents/Thesis_Project_Spambase/data/metadata"):
    """
    Runs the overall experiment across multiple trials and hyper parameters.

    For every value in `hyper_params`, `n_trials` trials are run via `run_trial`.
    The Nash equilibrium rows of each trial are tagged with the trial
    metadata, joined with the per-client accuracies of that trial, and the
    combined table is written to
    `<save_dir>/Nash_Equilibrium_Details_<approach>_noise_<noise_std>_c<num_corrupted_clients>.csv`.

    The global test data is read from the notebook-level names
    `X_test_global_scaled` and `y_test_global`.

    Reproducibility: the random offset in each trial seed is drawn from a
    private generator seeded with `base_random_seed`, so the sequence of trial
    seeds depends only on the arguments and not on the state of the global
    `random` module when the function is called. The global `random` and
    `numpy.random` generators are re-seeded with the trial seed before each trial.

    Args:
        approach: 'fedlr' for logistic regression; any other value selects the
            FedFor path.
        n_trials: Number of trials per hyper-parameter value.
        n_clients: Number of clients.
        hyper_params: Iterable of hyper-parameter values (`max_iter` for FedLR,
            `max_depth` for FedFor).
        noise_std: Stored as 'noise_std' in the corruption parameters.
        num_corrupted_clients: Number of low-quality clients; they are taken
            to be the first `num_corrupted_clients` client indices.
        save_dir: Output directory; created if missing.
        sample_size: Forwarded to the trainers.
        base_random_seed: Base of every trial seed and seed of the private generator.
        data_root: Directory that contains the `train_XX.csv` files.
            NOTE(review): the default is an absolute path on the author's
            machine; pass an explicit value when running elsewhere.

    Returns:
        final_details_df: DataFrame with complete experiment details.

    Raises:
        ValueError: from `pd.concat` when `n_trials` is 0 or `hyper_params` is empty.
    """
    # Positional `{:02d}` placeholder, to be filled with the client number downstream.
    train_files_pattern = os.path.join(data_root, "train_{:02d}.csv")

    # Set up corruption parameters.
    corruption_params = {
        'corruption_prob': 0.6,
        'nan_prob': 0.5,
        'noise_std': noise_std
    }
    label_corruption_prob = 0.1
    os.makedirs(save_dir, exist_ok=True)

    # Assume the first num_corrupted_clients are the low-quality ones.
    corrupt_client_indices = list(range(num_corrupted_clients))

    # Choose aggregator function based on approach.
    if approach == 'fedlr':
        aggregator_func = aggregate_lr_models  # Should accept a list of models.
    else:
        aggregator_func = lambda: FederatedForest()  # Returns a new aggregator instance.

    # Private generator for the per-trial seed offsets. Drawing them from the
    # global `random` module would make the seeds depend on whatever state
    # earlier code left behind, defeating `base_random_seed`.
    seed_rng = random.Random(base_random_seed)

    # Columns shared by the Nash table and the accuracy table; used as join keys.
    key_columns = ['Trial', 'Max Iter or Depth', 'Noise Std', 'Corrupted Clients']

    all_details = []
    for hyper_param in hyper_params:
        details_for_this_param = []
        client_accuracy_details = []

        for trial in range(n_trials):
            rand_component = seed_rng.randint(0, 500)
            trial_seed = base_random_seed + trial + int(1000 * hyper_param) + rand_component
            # Seed the global generators as well, for code that relies on them.
            random.seed(trial_seed)
            np.random.seed(trial_seed)

            df_results, df_nash, client_global_accuracies = run_trial(
                approach, n_clients, trial_seed, sample_size, num_corrupted_clients,
                train_files_pattern, corruption_params, label_corruption_prob, hyper_param,
                aggregator_func, X_test_global_scaled, y_test_global, corrupt_client_indices
            )

            # Tag every equilibrium row with the settings of this trial.
            df_nash['Trial'] = trial + 1
            df_nash['Noise Std'] = noise_std
            df_nash['Corrupted Clients'] = num_corrupted_clients
            df_nash['Max Iter or Depth'] = hyper_param
            details_for_this_param.append(df_nash)

            # One record per trial with the accuracy of every client.
            trial_acc = {
                'Trial': trial + 1,
                'Max Iter or Depth': hyper_param,
                'Noise Std': noise_std,
                'Corrupted Clients': num_corrupted_clients
            }
            for j in range(n_clients):
                col_name = f'Client {j+1} Accuracy'
                if j in corrupt_client_indices:
                    col_name += "(low-quality client)"
                accuracy = client_global_accuracies[j]
                trial_acc[col_name] = accuracy if accuracy is not None else np.nan
            client_accuracy_details.append(trial_acc)

        df_details = pd.concat(details_for_this_param, ignore_index=True)
        df_client_accuracy = pd.DataFrame(client_accuracy_details)
        # Left join: keep every equilibrium row and attach that trial's accuracies.
        df_combined = df_details.merge(df_client_accuracy, on=key_columns, how='left')
        all_details.append(df_combined)

    final_details_df = pd.concat(all_details, ignore_index=True)
    details_path = os.path.join(save_dir, f"Nash_Equilibrium_Details_{approach}_noise_{noise_std}_c{num_corrupted_clients}.csv")
    final_details_df.to_csv(details_path, index=False)
    return final_details_df


### FedLR HuGaDB: 350 samples for each client:
noise std values = [0.1 , 0.3 , 0.5 , 0.7 ,1 , 2 , 3 , 4 , 5] 
and low-quality clients counts = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [None]:
# FedLR sweep over every (noise std, number of low-quality clients) pair.
noise_std_values = [0.1, 0.3, 0.5, 0.7, 1, 2, 3, 4, 5]
corrupted_clients_counts = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# NOTE(review): "..." is used verbatim as a directory name relative to the
# working directory — replace it with the real results root if that is not intended.
save_dir = ".../results/FedLR_HuGaDB_LQC_0_to_10_test"

n_clients = 10
# Key of the coalition that contains every client, derived from n_clients so it
# stays correct if the number of clients changes.
# assumes 'Combination' holds one '0'/'1' character per client — TODO confirm against evaluate_coalitions
grand_coalition = '1' * n_clients

# One list per noise level; entries follow the order of corrupted_clients_counts.
results = {noise: [] for noise in noise_std_values}


for noise in noise_std_values:
    for cc in corrupted_clients_counts:
        final_details_df = run_experiment(
            approach='fedlr',
            n_trials=50,
            n_clients=n_clients,
            hyper_params=[10, 100],
            noise_std=noise,
            num_corrupted_clients=cc,
            save_dir=save_dir,
            sample_size=350,
            base_random_seed=42,
            data_root="/Users/abbaszal/Documents/Thesis_Project_Spambase/data/metadata"
        )

        # Count the equilibrium rows (over all trials and hyper parameters)
        # whose coalition is the grand coalition.
        occurrence_count = (final_details_df['Combination'] == grand_coalition).sum()
        results[noise].append(occurrence_count)
        print(f"Noise Std: {noise}, Bad Clients: {cc}, Occurrences: {occurrence_count}")


# Rows: number of low-quality clients; columns: noise std.
results_df = pd.DataFrame(results, index=corrupted_clients_counts)
results_df.index.name = "Number of Bad Clients"
results_csv_path = os.path.join(save_dir, "nash_occurrence_results.csv")
results_df.to_csv(results_csv_path)
print(f"Results saved to {results_csv_path}")

### FedLR HuGaDB: 350 samples for each client:
noise std values = [0.1] 
and low-quality client counts = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], with 1000 trials in total per setting (500 trials for each of the 2 `max_iter` values)

In [18]:
# FedLR at a single noise level with 500 trials per hyper-parameter value.
noise_std_values = [0.1]
corrupted_clients_counts = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
save_dir = "/Users/abbaszal/Documents/Thesis_Project_Spambase/results/FedLR_HuGaDB_LQC_0_to_10_with_1000Trials"

n_clients = 10
# Key of the coalition that contains every client, derived from n_clients so it
# stays correct if the number of clients changes.
# assumes 'Combination' holds one '0'/'1' character per client — TODO confirm against evaluate_coalitions
grand_coalition = '1' * n_clients

# One list per noise level; entries follow the order of corrupted_clients_counts.
results = {noise: [] for noise in noise_std_values}


for noise in noise_std_values:
    for cc in corrupted_clients_counts:
        final_details_df = run_experiment(
            approach='fedlr',
            n_trials=500,
            n_clients=n_clients,
            hyper_params=[10, 100],
            noise_std=noise,
            num_corrupted_clients=cc,
            save_dir=save_dir,
            sample_size=350,
            base_random_seed=42,
            data_root="/Users/abbaszal/Documents/Thesis_Project_Spambase/data/metadata"
        )

        # Count the equilibrium rows (over all trials and hyper parameters)
        # whose coalition is the grand coalition.
        occurrence_count = (final_details_df['Combination'] == grand_coalition).sum()
        results[noise].append(occurrence_count)
        print(f"Noise Std: {noise}, Bad Clients: {cc}, Occurrences: {occurrence_count}")


# Rows: number of low-quality clients; columns: noise std.
results_df = pd.DataFrame(results, index=corrupted_clients_counts)
results_df.index.name = "Number of Bad Clients"
results_csv_path = os.path.join(save_dir, "nash_occurrence_results.csv")
results_df.to_csv(results_csv_path)
# The recorded output of this cell ends with this line; keep the code in sync with it.
print(f"Results saved to {results_csv_path}")

Noise Std: 0.1, Bad Clients: 0, Occurrences: 996
Noise Std: 0.1, Bad Clients: 1, Occurrences: 995
Noise Std: 0.1, Bad Clients: 2, Occurrences: 997
Noise Std: 0.1, Bad Clients: 3, Occurrences: 994
Noise Std: 0.1, Bad Clients: 4, Occurrences: 999
Noise Std: 0.1, Bad Clients: 5, Occurrences: 998
Noise Std: 0.1, Bad Clients: 6, Occurrences: 995
Noise Std: 0.1, Bad Clients: 7, Occurrences: 1000
Noise Std: 0.1, Bad Clients: 8, Occurrences: 1000
Noise Std: 0.1, Bad Clients: 9, Occurrences: 1000
Noise Std: 0.1, Bad Clients: 10, Occurrences: 1000
Results saved to /Users/abbaszal/Documents/Thesis_Project_Spambase/results/FedLR_HuGaDB_LQC_0_to_10_with_1000Trials/nash_occurrence_results.csv


### FedFor HuGaDB: 350 samples for each client:
noise std values = [0.1 , 0.3 , 0.5 , 0.7 ,1 , 2 , 3 , 4 , 5] 
and low-quality clients counts = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [None]:
# FedFor sweep over every (noise std, number of low-quality clients) pair.
noise_std_values = [0.1, 0.3, 0.5, 0.7, 1, 2, 3, 4, 5]
corrupted_clients_counts = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# NOTE(review): "..." is used verbatim as a directory name relative to the
# working directory — replace it with the real results root if that is not intended.
save_dir = ".../results/FedFor_HuGaDB_LQC_0_to_10"

n_clients = 10
# Key of the coalition that contains every client, derived from n_clients so it
# stays correct if the number of clients changes.
# assumes 'Combination' holds one '0'/'1' character per client — TODO confirm against evaluate_coalitions
grand_coalition = '1' * n_clients

# One list per noise level; entries follow the order of corrupted_clients_counts.
results = {noise: [] for noise in noise_std_values}


for noise in noise_std_values:
    for cc in corrupted_clients_counts:
        final_details_df = run_experiment(
            approach='fedfor',
            n_trials=50,
            n_clients=n_clients,
            hyper_params=[10, 100],
            noise_std=noise,
            num_corrupted_clients=cc,
            save_dir=save_dir,
            sample_size=350,
            base_random_seed=42,
            data_root="/Users/abbaszal/Documents/Thesis_Project_Spambase/data/metadata"
        )

        # Count the equilibrium rows (over all trials and hyper parameters)
        # whose coalition is the grand coalition.
        occurrence_count = (final_details_df['Combination'] == grand_coalition).sum()
        results[noise].append(occurrence_count)
        print(f"Noise Std: {noise}, Bad Clients: {cc}, Occurrences: {occurrence_count}")


# Rows: number of low-quality clients; columns: noise std.
results_df = pd.DataFrame(results, index=corrupted_clients_counts)
results_df.index.name = "Number of Bad Clients"
results_csv_path = os.path.join(save_dir, "nash_occurrence_results.csv")
results_df.to_csv(results_csv_path)
# Report the output location, as the FedLR sweep cell does.
print(f"Results saved to {results_csv_path}")


### FedFor HuGaDB: 350 samples for each client:
noise std values = [0.1] 
and low-quality client counts = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], with 1000 trials in total per setting (500 trials for each of the 2 `max_depth` values)

In [None]:
# FedFor at a single noise level with 500 trials per hyper-parameter value.
noise_std_values = [0.1]
corrupted_clients_counts = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
save_dir = "/Users/abbaszal/Documents/Thesis_Project_Spambase/results/FedFor_HuGaDB_LQC_0_to_10_with_1000Trials"

n_clients = 10
# Key of the coalition that contains every client, derived from n_clients so it
# stays correct if the number of clients changes.
# assumes 'Combination' holds one '0'/'1' character per client — TODO confirm against evaluate_coalitions
grand_coalition = '1' * n_clients

# One list per noise level; entries follow the order of corrupted_clients_counts.
results = {noise: [] for noise in noise_std_values}


for noise in noise_std_values:
    for cc in corrupted_clients_counts:
        final_details_df = run_experiment(
            approach='fedfor',
            n_trials=500,
            n_clients=n_clients,
            hyper_params=[10, 100],
            noise_std=noise,
            num_corrupted_clients=cc,
            save_dir=save_dir,
            sample_size=350,
            base_random_seed=42,
            data_root="/Users/abbaszal/Documents/Thesis_Project_Spambase/data/metadata"
        )

        # Count the equilibrium rows (over all trials and hyper parameters)
        # whose coalition is the grand coalition.
        occurrence_count = (final_details_df['Combination'] == grand_coalition).sum()
        results[noise].append(occurrence_count)
        print(f"Noise Std: {noise}, Bad Clients: {cc}, Occurrences: {occurrence_count}")


# Rows: number of low-quality clients; columns: noise std.
results_df = pd.DataFrame(results, index=corrupted_clients_counts)
results_df.index.name = "Number of Bad Clients"
results_csv_path = os.path.join(save_dir, "nash_occurrence_results.csv")
results_df.to_csv(results_csv_path)
# Report the output location, as the FedLR cells do.
print(f"Results saved to {results_csv_path}")
