In [1]:
import os, math, random, argparse, ast, copy
from collections import Counter, defaultdict
from itertools import chain, combinations
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


### Local Imports

In [2]:
from utils.Spambase.split_data import split_data_equal
from utils.aggregate_functions import FederatedForest
from utils.DecisionTree import DecisionTree
from utils.nash1 import find_nash_equilibria_v2
from utils.evaluate_coalitions_new import evaluate_coalitions2

In [3]:
from utils.Nash_lottery import find_nash_equilibria_lottery

# HuGaDB Dataset 

In [4]:
import numpy as np

def _make_logistic_payoff(name: str, intercept: float, mu_coef: float, sigma_coef: float):
    """Build one payoff function from its three hard-coded coefficients.

    The returned callable evaluates the logistic function
    ``1 / (1 + exp(-(intercept + mu_coef * mu + sigma_coef * sigma)))``.

    Args:
        name: Name given to the returned function (``__name__`` / ``__qualname__``).
        intercept: Constant term of the linear predictor.
        mu_coef: Multiplier applied to ``mu``.
        sigma_coef: Multiplier applied to ``sigma``.

    Returns:
        A function ``payoff(mu, sigma)`` returning a value in (0, 1).
    """
    def payoff(mu: float, sigma: float) -> float:
        # Same evaluation order as the hand-written versions: (a + b*mu) + c*sigma.
        z = intercept + mu_coef * mu + sigma_coef * sigma
        return 1 / (1 + np.exp(-z))

    payoff.__name__ = payoff.__qualname__ = name
    payoff.__doc__ = (
        f"Logistic payoff: sigmoid({intercept} + {mu_coef} * mu + {sigma_coef} * sigma)."
    )
    return payoff


# HuGaDB coefficients as (intercept, mu coefficient, sigma coefficient), keyed by
# the number of clients encoded in the function name.
# NOTE(review): presumably these come from an offline fit — confirm provenance.
_HUGADB_PAYOFF_COEFS = {
    10:  (1.9566, 0.0324,  0.0134),
    20:  (1.8198, 0.0048, -0.0042),
    30:  (1.6367, 0.0218, -0.0011),
    40:  (1.4733, 0.0425,  0.0044),
    50:  (1.3218, 0.0702,  0.0204),
    60:  (1.1931, 0.0930,  0.0030),
    70:  (1.0139, 0.1545,  0.0026),
    80:  (0.8174, 0.1995,  0.0159),
    90:  (0.6717, 0.2508,  0.0197),
    100: (0.5125, 0.2550, -0.0079),
}

# Explicit module-level names are kept so existing callers (and lookups of the
# form globals()[f"payoff_{n}"]) continue to work. The Spambase section further
# down defines functions with these same names, so the bindings below are only
# the HuGaDB ones until that later cell is executed.
payoff_10 = _make_logistic_payoff("payoff_10", *_HUGADB_PAYOFF_COEFS[10])
payoff_20 = _make_logistic_payoff("payoff_20", *_HUGADB_PAYOFF_COEFS[20])
payoff_30 = _make_logistic_payoff("payoff_30", *_HUGADB_PAYOFF_COEFS[30])
payoff_40 = _make_logistic_payoff("payoff_40", *_HUGADB_PAYOFF_COEFS[40])
payoff_50 = _make_logistic_payoff("payoff_50", *_HUGADB_PAYOFF_COEFS[50])
payoff_60 = _make_logistic_payoff("payoff_60", *_HUGADB_PAYOFF_COEFS[60])
payoff_70 = _make_logistic_payoff("payoff_70", *_HUGADB_PAYOFF_COEFS[70])
payoff_80 = _make_logistic_payoff("payoff_80", *_HUGADB_PAYOFF_COEFS[80])
payoff_90 = _make_logistic_payoff("payoff_90", *_HUGADB_PAYOFF_COEFS[90])
payoff_100 = _make_logistic_payoff("payoff_100", *_HUGADB_PAYOFF_COEFS[100])


In [5]:
import warnings
# Hide any warning whose message starts with "X does not have valid feature names".
# NOTE(review): presumably this is scikit-learn's feature-name check firing during
# predict/transform on plain arrays — confirm the emitter before relying on this.
warnings.filterwarnings("ignore", message="X does not have valid feature names")

In [6]:
# Per-file CSV path templates; {i:02d} is the zero-padded file number.
train_files_pattern = "/Users/abbaszal/Documents/Thesis_Project_Spambase/data/metadata/train_{i:02d}.csv"
test_files_pattern = "/Users/abbaszal/Documents/Thesis_Project_Spambase/data/metadata/test_{i:02d}.csv"

# Read files 01..10 of each split, stack them, and discard rows containing NaNs.
df_train_global = pd.concat(
    [pd.read_csv(train_files_pattern.format(i=file_no)) for file_no in range(1, 11)]
).dropna()
df_test_global = pd.concat(
    [pd.read_csv(test_files_pattern.format(i=file_no)) for file_no in range(1, 11)]
).dropna()

# The 'act' column is the target; every other column is a feature.
X_train_global, y_train_global = df_train_global.drop(columns='act'), df_train_global['act']
X_test_global, y_test_global = df_test_global.drop(columns='act'), df_test_global['act']

# Integer-encode the labels; the encoder is fitted on the training labels only.
label_encoder = LabelEncoder().fit(y_train_global)
y_train_global = label_encoder.transform(y_train_global)
y_test_global = label_encoder.transform(y_test_global)

# Standardise features using statistics from the training split.
scaler_global = StandardScaler().fit(X_train_global)
X_train_global_scaled = scaler_global.transform(X_train_global)
X_test_global_scaled = scaler_global.transform(X_test_global)

# Create a stratified subsample of the test set to speed up the runtime.
# The scaled test matrix and encoded test labels are overwritten by the subsample.
subsample_size = 950
X_test_global_scaled, _, y_test_global, _ = train_test_split(
    X_test_global_scaled,
    y_test_global,
    stratify=y_test_global,
    train_size=subsample_size,
    random_state=42,
)
print("Subsampled test set shape:", X_test_global_scaled.shape)

Subsampled test set shape: (950, 38)


In [None]:
# Parameters
eps               = 1e-8            # tolerance used in the lottery incentive comparison
n_clients_list    = [10,20,30,40,50,60,70,80,90,100]
n_trials          = 100
base_random_seed  = 42
max_depths        = [100]
approach          = 'fedfor'
trial_pool_size   = 10000           # rows drawn (stratified) from the full training CSV per trial
total_client_rows = 3500            # rows shared equally between the clients of one trial

# Saving directory
save_dir = "/Users/abbaszal/Documents/Fit/hugadb_fedfor_test"
os.makedirs(save_dir, exist_ok=True)

train_csv_path = "/Users/abbaszal/Documents/Thesis_Project_Spambase/data/metadata/new_runs/train.csv"
df_full_train = pd.read_csv(train_csv_path).dropna(subset=['act']).reset_index(drop=True)

# Dedicated, seeded generator for the per-trial seed jitter. Drawing the jitter
# from the unseeded global `random` module made every run use different trial
# seeds, so results could not be reproduced.
seed_rng = random.Random(base_random_seed)

for n_clients in n_clients_list:
    print(f"\n> n_clients = {n_clients}")

    # Look up the payoff function that belongs to this number of clients.
    # (Previously payoff_100 was applied to every n_clients.)
    # NOTE: the Spambase section defines functions with the same payoff_<n>
    # names; run this cell before that one so the HuGaDB versions are bound.
    payoff_func_name = f"payoff_{n_clients}"
    if payoff_func_name not in globals():
        raise ValueError(f"Missing payoff function for {n_clients} clients")
    payoff_func = globals()[payoff_func_name]

    for max_depth in max_depths:
        print(f"\n max_depth = {max_depth}")

        # reset counters for this config
        counts_static = Counter()
        lottery_count = 0

        for trial in range(1, n_trials + 1):

            rand_component = seed_rng.randint(0, 500)
            trial_seed = base_random_seed + trial + int(1000 * max_depth) + 2 * rand_component

            # Stratified pool of rows for this trial; the remainder is discarded.
            df_trial, _ = train_test_split(
                df_full_train,
                train_size=trial_pool_size,
                random_state=trial_seed,
                stratify=df_full_train['act']
            )

            # Carve equally sized, stratified, disjoint client partitions out of the pool.
            df_remaining = df_trial
            client_partitions = []
            sample_size = total_client_rows // n_clients
            for client_idx in range(n_clients):
                if len(df_remaining) < sample_size:
                    raise ValueError(
                        f"Only {len(df_remaining)} rows left, cannot draw "
                        f"{sample_size} rows for client {client_idx}"
                    )

                if len(df_remaining) == sample_size:
                    # Exactly one partition left: take everything (a split would fail).
                    df_client = df_remaining.copy()
                    df_remaining = df_remaining.iloc[0:0]
                else:
                    df_client, df_remaining = train_test_split(
                        df_remaining,
                        train_size=sample_size,
                        random_state=trial_seed,
                        stratify=df_remaining['act']
                    )
                client_partitions.append(df_client.reset_index(drop=True))

            # train each client
            client_models = []
            client_global_accuracies = {}

            for client_idx, df_client in enumerate(client_partitions):
                X_client = df_client.drop(columns=['act'])
                y_client = df_client['act']

                # NOTE(review): each client standardises with its own scaler, while
                # X_test_global_scaled was produced by scaler_global — confirm that
                # evaluating in the globally scaled space is intended.
                client_scaler = StandardScaler()
                X_client_scaled = client_scaler.fit_transform(X_client)
                y_client_encoded = label_encoder.transform(y_client)

                # fit model
                model = DecisionTree(
                    max_depth=max_depth,
                    random_state=trial_seed
                )
                model.fit(X_client_scaled, y_client_encoded)
                client_models.append(model)

                # record its global-test accuracy
                y_pred_global = model.predict(X_test_global_scaled)
                client_global_accuracies[client_idx] = accuracy_score(y_test_global, y_pred_global)

            # coalition evaluation
            df_res = evaluate_coalitions2(
                client_models=client_models,
                client_global_accuracies=client_global_accuracies,
                n_clients=n_clients,
                aggregator_func=FederatedForest,
                X_test=X_test_global_scaled,
                y_test=y_test_global,
                corrupt_client_indices=[],
                approach=approach
            )

            # static-game Nash counts: one increment per index entry returned
            df_ne = find_nash_equilibria_v2(df_res)
            if not df_ne.empty:
                for coalition in df_ne.index:
                    counts_static[coalition] += 1

            # lottery-game incentive check: mean and sample std of the clients'
            # stand-alone accuracies feed the payoff function for this n_clients.
            vals      = np.array(list(client_global_accuracies.values()))
            mu_full   = vals.mean()
            sig_full  = vals.std(ddof=1)
            payoff_f  = payoff_func(mu_full, sig_full)

            # A trial counts towards the lottery tally only when no client's own
            # accuracy exceeds the payoff by more than eps.
            has_incentive = any(
                acc > payoff_f + eps
                for acc in client_global_accuracies.values()
            )
            if not has_incentive:
                lottery_count += 1

        static_count = sum(counts_static.values())

        # One-row summary per (n_clients, max_depth). The 'max_iter' column and the
        # '_maxiter_' file-name part actually carry max_depth; the names are kept
        # so existing readers of these CSVs keep working.
        counts_df = pd.DataFrame([{
            'n_clients':           n_clients,
            'max_iter':            max_depth,
            'Static_Occurrences':  static_count,
            'Lottery_Occurrences': lottery_count
        }])
        fname = (f"Nash_Counts_{approach}"
                 f"_nclients_{n_clients}"
                 f"_maxiter_{max_depth}.csv")
        out_path = os.path.join(save_dir, fname)
        counts_df.to_csv(out_path, index=False)
        print(f"saved {fname}")


# Spambase Dataset

In [8]:
# Read the raw Spambase file. header=None tells pandas the file has no header
# row, so the columns receive integer labels.
file_path = '/Users/abbaszal/Documents/Thesis_Project_Spambase/data/spambase.data'  # Adjust the path as needed
df = pd.read_csv(file_path, header=None)

In [12]:
import numpy as np

def _make_logistic_payoff(name: str, intercept: float, mu_coef: float, sigma_coef: float):
    """Build one payoff function from its three hard-coded coefficients.

    The returned callable evaluates the logistic function
    ``1 / (1 + exp(-(intercept + mu_coef * mu + sigma_coef * sigma)))``.

    Args:
        name: Name given to the returned function (``__name__`` / ``__qualname__``).
        intercept: Constant term of the linear predictor.
        mu_coef: Multiplier applied to ``mu``.
        sigma_coef: Multiplier applied to ``sigma``.

    Returns:
        A function ``payoff(mu, sigma)`` returning a value in (0, 1).
    """
    def payoff(mu: float, sigma: float) -> float:
        # Same evaluation order as the hand-written versions: (a + b*mu) + c*sigma.
        z = intercept + mu_coef * mu + sigma_coef * sigma
        return 1 / (1 + np.exp(-z))

    payoff.__name__ = payoff.__qualname__ = name
    payoff.__doc__ = (
        f"Logistic payoff: sigmoid({intercept} + {mu_coef} * mu + {sigma_coef} * sigma)."
    )
    return payoff


# Spambase coefficients as (intercept, mu coefficient, sigma coefficient), keyed
# by the number of clients encoded in the function name.
# NOTE(review): presumably these come from an offline fit — confirm provenance.
_SPAMBASE_PAYOFF_COEFS = {
    10:  (2.4315, 0.0231, -0.0085),
    20:  (2.4312, 0.0180,  0.0088),
    30:  (2.4131, 0.0186,  0.0111),
    40:  (2.3715, 0.0121,  0.0112),
    50:  (2.3369, 0.0114,  0.0058),
    60:  (2.3019, 0.0244,  0.0080),
    70:  (2.2800, 0.0150,  0.0093),
    80:  (2.2433, 0.0138,  0.0072),
    90:  (2.2230, 0.0204,  0.0040),
    100: (2.1893, 0.0150,  0.0016),
}

# WARNING: these assignments rebind the payoff_<n> names that the HuGaDB section
# defined earlier in the notebook. After this cell runs, any lookup of
# payoff_<n> (including globals()[f"payoff_{n}"]) yields the Spambase versions.
payoff_10 = _make_logistic_payoff("payoff_10", *_SPAMBASE_PAYOFF_COEFS[10])
payoff_20 = _make_logistic_payoff("payoff_20", *_SPAMBASE_PAYOFF_COEFS[20])
payoff_30 = _make_logistic_payoff("payoff_30", *_SPAMBASE_PAYOFF_COEFS[30])
payoff_40 = _make_logistic_payoff("payoff_40", *_SPAMBASE_PAYOFF_COEFS[40])
payoff_50 = _make_logistic_payoff("payoff_50", *_SPAMBASE_PAYOFF_COEFS[50])
payoff_60 = _make_logistic_payoff("payoff_60", *_SPAMBASE_PAYOFF_COEFS[60])
payoff_70 = _make_logistic_payoff("payoff_70", *_SPAMBASE_PAYOFF_COEFS[70])
payoff_80 = _make_logistic_payoff("payoff_80", *_SPAMBASE_PAYOFF_COEFS[80])
payoff_90 = _make_logistic_payoff("payoff_90", *_SPAMBASE_PAYOFF_COEFS[90])
payoff_100 = _make_logistic_payoff("payoff_100", *_SPAMBASE_PAYOFF_COEFS[100])


In [10]:
def train_models_fedfor(partitions, X_test, y_test, max_depth):
    """Fit one decision tree per client partition and score each on the test set.

    Args:
        partitions: Iterable of ``(X_i, y_i)`` pairs, one per client.
        X_test: Feature matrix every fitted tree is evaluated on.
        y_test: Labels compared element-wise with each tree's predictions.
        max_depth: ``max_depth`` passed to every ``DecisionTreeClassifier``.

    Returns:
        ``(client_models, client_global_accuracies)``: the fitted trees in
        partition order, and a dict mapping the partition index to the fraction
        of test predictions equal to ``y_test``.
    """
    fitted_trees = []
    accuracy_by_client = {}

    for client_id, (features, labels) in enumerate(partitions):
        # Each tree's seed is drawn from NumPy's global RNG, so results depend on
        # the np.random state at call time.
        tree_seed = np.random.randint(0, 100000)
        tree = DecisionTreeClassifier(max_depth=max_depth, random_state=tree_seed)

        # NOTE(review): training data is re-standardised per client, but X_test is
        # passed to predict() as given — confirm the two feature spaces are meant
        # to differ.
        standardized = StandardScaler().fit_transform(features)
        tree.fit(standardized, labels)
        fitted_trees.append(tree)

        hits = tree.predict(X_test) == y_test
        accuracy_by_client[client_id] = np.mean(hits)

    return fitted_trees, accuracy_by_client

In [11]:
random_seed = 42

# The last column of df is the label; all preceding columns are features.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

# Hold out 20% of the rows as the shared test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=random_seed, test_size=0.2
)

# Standardise both splits with statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Parameters
eps             = 1e-8     # tolerance used in the lottery incentive comparison
n_clients_list  = [10,20,30,40,50,60,70,80,90,100]
n_trials        = 100
base_seed       = 42
max_depths      = [100]
approach        = 'fedfor'


save_dir = (
"/Users/abbaszal/Documents/Fit/spambase_fedfor"
)
os.makedirs(save_dir, exist_ok=True)

all_results = []

# Dedicated, seeded generator for the per-trial seed jitter. The jitter used to
# come from the global `random` module, whose state for the first trial was
# unseeded, which made the whole chain of trial seeds differ between runs.
seed_rng = random.Random(base_seed)

for n_clients in n_clients_list:
    print(f"\n> n_clients = {n_clients}")

    # Dynamically get the corresponding payoff function
    payoff_func_name = f"payoff_{n_clients}"
    if payoff_func_name not in globals():
        raise ValueError(f"Missing payoff function for {n_clients} clients")
    payoff_func = globals()[payoff_func_name]

    for max_depth in max_depths:
        print(f"\n max_depth = {max_depth}")
        counts_static = Counter()
        lottery_count = 0

        for trial in range(1, n_trials+1):
            rc = seed_rng.randint(0, 500)
            trial_seed = base_seed + (trial-1) + 1000*max_depth + 2*rc
            # Seed the global generators as well: train_models_fedfor draws each
            # tree's random_state from np.random.
            random.seed(trial_seed)
            np.random.seed(trial_seed)

            partitions = split_data_equal(
                X_train, y_train,
                n_clients=n_clients,
                shuffle=True,
                random_seed=trial_seed
            )
            client_models, client_accs = train_models_fedfor(
                partitions=partitions,
                X_test=X_test,
                y_test=y_test,
                max_depth=max_depth
            )

            # coalition evaluation
            df_res = evaluate_coalitions2(
                client_models=client_models,
                client_global_accuracies=client_accs,
                n_clients=n_clients,
                aggregator_func=FederatedForest,
                X_test=X_test,
                y_test=y_test,
                corrupt_client_indices=[],
                approach=approach
            )

            # static-game Nash counts: one increment per index entry returned
            df_ne = find_nash_equilibria_v2(df_res)
            if not df_ne.empty:
                for coalition in df_ne.index:
                    counts_static[coalition] += 1

            # lottery-game incentive check: mean and sample std of the clients'
            # stand-alone accuracies feed the payoff function for this n_clients.
            vals = np.array(list(client_accs.values()))
            mu_full   = vals.mean()
            sig_full  = vals.std(ddof=1)
            payoff_f  = payoff_func(mu_full, sig_full)

            # Apply the eps tolerance (it was defined above but never used), matching
            # the comparison in the HuGaDB loop: a trial counts towards the lottery
            # tally only when no client's accuracy exceeds the payoff by more than eps.
            has_incentive = any(
                acc > payoff_f + eps
                for acc in client_accs.values()
            )
            if not has_incentive:
                lottery_count += 1

        # aggregate counts into single numbers
        static_count = sum(counts_static.values())

        # One-row summary per (n_clients, max_depth). The 'max_iter' column and the
        # '_maxiter_' file-name part actually carry max_depth; the names are kept
        # so existing readers of these CSVs keep working.
        counts_df = pd.DataFrame([{
            'n_clients':           n_clients,
            'max_iter':            max_depth,
            'Static_Occurrences':  static_count,
            'Lottery_Occurrences': lottery_count
        }])
        fname = (f"Nash_Counts_{approach}"
                    f"_nclients_{n_clients}"
                    f"_maxiter_{max_depth}.csv")
        out_path = os.path.join(save_dir, fname)
        counts_df.to_csv(out_path, index=False)
        print(f" saved {fname}")