In [1]:
import os
import math
import random
import argparse
from collections import Counter, defaultdict
from itertools import chain, combinations
import ast
import copy
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.linear_model import LogisticRegression


### Local Imports

In [2]:
from utils.Spambase.split_data import split_data_equal
from utils.aggregate_functions import aggregate_lr_models
from utils.evaluate_coalitions import evaluate_coalitions
from utils.nash1 import find_nash_equilibria_v2
from utils.evaluate_coalitions_new import evaluate_coalitions2

In [3]:
from utils.nash_lot import find_nash_equilibria_lottery

In [4]:
# disable ConvergenceWarnings
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# HuGaDB Dataset

In [5]:
import numpy as np

def payoff_10(mu: float, sigma: float) -> float:
    z = (
        0.6847
        + 0.0157 * mu
        - 0.0003 * sigma
    )
    return 1 / (1 + np.exp(-z))

def payoff_20(mu: float, sigma: float) -> float:
    z = (
        0.7031
        + 0.0146 * mu
        - 0.0024 * sigma
    )
    return 1 / (1 + np.exp(-z))

def payoff_30(mu: float, sigma: float) -> float:
    z = (
        0.6867
        + 0.0084 * mu
        + 0.0054 * sigma
    )
    return 1 / (1 + np.exp(-z))

def payoff_40(mu: float, sigma: float) -> float:
    z = (
        0.6603
        + 0.0116 * mu
        + 0.0035 * sigma
    )
    return 1 / (1 + np.exp(-z))

def payoff_50(mu: float, sigma: float) -> float:
    z = (
        0.6424
        + 0.0129 * mu
        + 0.0070 * sigma
    )
    return 1 / (1 + np.exp(-z))

def payoff_60(mu: float, sigma: float) -> float:
    z = (
        0.6253
        + 0.0109 * mu
        + 0.0023 * sigma
    )
    return 1 / (1 + np.exp(-z))

def payoff_70(mu: float, sigma: float) -> float:
    z = (
        0.6056
        + 0.0128 * mu
        + 0.0044 * sigma
    )
    return 1 / (1 + np.exp(-z))

def payoff_80(mu: float, sigma: float) -> float:
    z = (
        0.5879
        + 0.0144 * mu
        + 0.0074 * sigma
    )
    return 1 / (1 + np.exp(-z))

def payoff_90(mu: float, sigma: float) -> float:
    z = (
        0.5715
        + 0.0106 * mu
        + 0.0057 * sigma
    )
    return 1 / (1 + np.exp(-z))

def payoff_100(mu: float, sigma: float) -> float:
    z = (
        0.5580
        + 0.0020 * mu
        - 0.0015 * sigma
    )
    return 1 / (1 + np.exp(-z))


In [7]:
# File patterns.
train_files_pattern = "/Users/abbaszal/Documents/Thesis_Project_Spambase/data/metadata/train_{i:02d}.csv"
test_files_pattern = "/Users/abbaszal/Documents/Thesis_Project_Spambase/data/metadata/test_{i:02d}.csv"

In [6]:
# File patterns.
train_files_pattern = "/Users/abbaszal/Documents/Thesis_Project_Spambase/data/metadata/train_{i:02d}.csv"
test_files_pattern = "/Users/abbaszal/Documents/Thesis_Project_Spambase/data/metadata/test_{i:02d}.csv"

# Concatenate all training files.
df_train_global = pd.concat([
    pd.read_csv(train_files_pattern.format(i=i)) for i in range(1, 18)
]).dropna()

# Concatenate all testing files.
df_test_global = pd.concat([
    pd.read_csv(test_files_pattern.format(i=i)) for i in range(1, 18)
]).dropna()

# Split features and labels.
X_train_global = df_train_global.drop('act', axis=1)
y_train_global = df_train_global['act']

X_test_global = df_test_global.drop('act', axis=1)
y_test_global = df_test_global['act']

# Encode labels.
label_encoder = LabelEncoder()
y_train_global = label_encoder.fit_transform(y_train_global)
y_test_global = label_encoder.transform(y_test_global)

# Scale features.
scaler_global = StandardScaler()
X_train_global_scaled = scaler_global.fit_transform(X_train_global)
X_test_global_scaled  = scaler_global.transform(X_test_global)

# Create a stratified subsample of the test set to speed up the runtime.
subsample_size = 950  
X_test_global_scaled, _, y_test_global, _ = train_test_split(
    X_test_global_scaled, y_test_global,
    train_size=subsample_size,
    random_state=42,
    stratify=y_test_global
)
print("Subsampled test set shape:", X_test_global_scaled.shape)

Subsampled test set shape: (950, 38)


In [8]:
# Concatenate all training files.
df_train_global = pd.concat([
    pd.read_csv(train_files_pattern.format(i=i)) for i in range(1, 18)
]).dropna()

# Concatenate all testing files.
df_test_global = pd.concat([
    pd.read_csv(test_files_pattern.format(i=i)) for i in range(1, 18)
]).dropna()

# Split features and labels.
X_train_global = df_train_global.drop('act', axis=1)
y_train_global = df_train_global['act']

X_test_global = df_test_global.drop('act', axis=1)
y_test_global = df_test_global['act']

# Encode labels.
label_encoder = LabelEncoder()
y_train_global = label_encoder.fit_transform(y_train_global)
y_test_global = label_encoder.transform(y_test_global)

# Scale features.
scaler_global = StandardScaler()
X_train_global_scaled = scaler_global.fit_transform(X_train_global)
X_test_global_scaled  = scaler_global.transform(X_test_global)

In [10]:
# Parameters
eps               = 1e-8
n_clients_list    = [10,20,30,40,50,60,70,80,90,100]
n_trials          = 100
base_random_seed  = 42
max_iters         = [ 100]
approach          = 'fedlr'

# Saving directory
save_dir = "/Users/abbaszal/Documents/Fit/hugadb_fedlr_test_random"
os.makedirs(save_dir, exist_ok=True)

train_csv_path = "/Users/abbaszal/Documents/Thesis_Project_Spambase/data/metadata/new_runs/train.csv"
df_full_train = pd.read_csv(train_csv_path).dropna(subset=['act']).reset_index(drop=True)

for n_clients in n_clients_list:
    print(f"\n> n_clients = {n_clients}")

    # Dynamically get the corresponding payoff function
    payoff_func_name = f"payoff_{n_clients}"
    if payoff_func_name not in globals():
        raise ValueError(f"Missing payoff function for {n_clients} clients")
    payoff_func = globals()[payoff_func_name]
    #print(payoff_func)
    
    for max_iter in max_iters:
        print(f"  max_iter = {max_iter}")

        # reset counters for this config
        counts_static = Counter()
        lottery_count = 0

        for trial in range(1, n_trials + 1):
            
            rand_component = random.randint(0, 500)
            trial_seed = base_random_seed + trial + int(1000 * max_iter) + 2 * rand_component


            X_test_full_scaled = X_test_global_scaled.copy()
            y_test_full        = y_test_global.copy()

            # Create a stratified subsample of the test set to speed up the runtime.
            subsample_size = 950  
            X_test_glob, _, y_test_glob, _ = train_test_split(
                X_test_full_scaled, y_test_full,
                train_size=subsample_size,
                random_state=trial_seed,
                stratify=y_test_full
            )


            df_trial, _ = train_test_split(
                df_full_train,
                train_size=10000,
                random_state=trial_seed,
                stratify=df_full_train['act']
            )
   
            df_remaining = df_trial.copy()


            client_partitions = []

            sample_size = 5000 // n_clients
            for client_idx in range(n_clients):
                if len(df_remaining) < sample_size:
                    raise ValueError("err")

                if len(df_remaining) == sample_size:
                    df_client = df_remaining.copy()
                    df_remaining = df_remaining.iloc[0:0] 

                else:
                    df_client, df_remaining = train_test_split(
                        df_remaining,
                        train_size=sample_size,
                        random_state=trial_seed,
                        stratify=df_remaining['act']
                    )
                df_client = df_client.reset_index(drop=True)
                client_partitions.append(df_client)

            # train each client
            client_models = []
            client_accs   = {}

            for client_idx, df_client in enumerate(client_partitions):

                X_client = df_client.drop(columns=['act'])
                y_client = df_client['act']

                client_scaler = StandardScaler()
                X_client_scaled = client_scaler.fit_transform(X_client)
                y_client_encoded = label_encoder.transform(y_client)

                mdl = LogisticRegression(random_state=trial_seed,
                                         max_iter=max_iter)
                mdl.fit(X_client_scaled, y_client_encoded)
                client_models.append(mdl)

                pred = mdl.predict(X_test_glob)
                client_accs[client_idx] = accuracy_score(y_test_glob, pred)

            # coalition evaluation
            df_res = evaluate_coalitions2(
                client_models=client_models,
                client_global_accuracies=client_accs,
                n_clients=n_clients,
                aggregator_func=aggregate_lr_models,
                X_test=X_test_glob,
                y_test=y_test_glob,
                corrupt_client_indices=[],
                approach=approach
            )

            # static‐game Nash counts
            df_ne = find_nash_equilibria_v2(df_res)
            if not df_ne.empty:
                for coalition in df_ne.index:
                    counts_static[coalition] += 1


            # lottery‐game incentive check
            vals      = np.array(list(client_accs.values()))
            print(vals)
            mu_full   = vals.mean()
            sig_full  = vals.std(ddof=1)
            payoff_f  = payoff_func(mu_full, sig_full)
            print(payoff_f)

            has_incentive = any(
                acc > payoff_f + eps
                for acc in client_accs.values()
            )
            if not has_incentive:
                lottery_count += 1


        static_count = sum(counts_static.values())

        counts_df = pd.DataFrame([{
            'n_clients':           n_clients,
            'max_iter':            max_iter,
            'Static_Occurrences':  static_count,
            'Lottery_Occurrences': lottery_count
        }])
        fname = (f"Nash_Counts_{approach}"
                 f"_nclients_{n_clients}"
                 f"_maxiter_{max_iter}.csv")
        out_path = os.path.join(save_dir, fname)
        counts_df.to_csv(out_path, index=False)
        print(f"saved {fname}")



> n_clients = 10
  max_iter = 100
[0.65157895 0.62526316 0.61894737 0.63894737 0.65263158 0.63684211
 0.58210526 0.60736842 0.61052632 0.61684211]
0.666965431881558
[0.62421053 0.62736842 0.61789474 0.63263158 0.61473684 0.62526316
 0.59894737 0.60736842 0.61578947 0.61894737]
0.6669460309780182
[0.63473684 0.63157895 0.63263158 0.63473684 0.63789474 0.60526316
 0.61684211 0.60315789 0.57684211 0.63789474]
0.6669552468886831
[0.61368421 0.61263158 0.63578947 0.62315789 0.58947368 0.62210526
 0.60421053 0.65473684 0.6        0.60315789]
0.6669369799179993
[0.62210526 0.62210526 0.61052632 0.61894737 0.63052632 0.61052632
 0.6        0.61473684 0.60421053 0.62526316]
0.666937605933944
[0.60736842 0.57157895 0.59368421 0.60631579 0.61894737 0.60421053
 0.63368421 0.60210526 0.59578947 0.61157895]
0.6668975108908602
[0.56631579 0.63368421 0.60842105 0.6        0.59263158 0.59578947
 0.61052632 0.58842105 0.59157895 0.61052632]
0.6668808959064613
[0.60526316 0.61684211 0.59894737 0.6178947

# Spambase Dataset

In [7]:
file_path = '/Users/abbaszal/Documents/Thesis_Project_Spambase/data/spambase.data'  # Adjust the path as needed
df = pd.read_csv(file_path, header=None)

In [8]:
import numpy as np

def payoff_10(mu: float, sigma: float) -> float:
    z = (
        2.5376
        + 0.0116 * mu
        + 0.0019 * sigma
    )
    return 1 / (1 + np.exp(-z))

def payoff_20(mu: float, sigma: float) -> float:
    z = (
        2.5093
        + 0.0052 * mu
        - 0.0007 * sigma
    )
    return 1 / (1 + np.exp(-z))

def payoff_30(mu: float, sigma: float) -> float:
    z = (
        2.4309
        + 0.0116 * mu
        + 0.0083 * sigma
    )
    return 1 / (1 + np.exp(-z))

def payoff_40(mu: float, sigma: float) -> float:
    z = (
        2.3424
        + 0.0049 * mu
        + 0.0016 * sigma
    )
    return 1 / (1 + np.exp(-z))

def payoff_50(mu: float, sigma: float) -> float:
    z = (
        2.2725
        + 0.0038 * mu
        - 0.0012 * sigma
    )
    return 1 / (1 + np.exp(-z))

def payoff_60(mu: float, sigma: float) -> float:
    z = (
        2.2128
        + 0.0061 * mu
        + 0.0035 * sigma
    )
    return 1 / (1 + np.exp(-z))

def payoff_70(mu: float, sigma: float) -> float:
    z = (
        2.1594
        + 0.0048 * mu
        + 0.0010 * sigma
    )
    return 1 / (1 + np.exp(-z))

def payoff_80(mu: float, sigma: float) -> float:
    z = (
        2.1235
        + 0.0059 * mu
        - 0.0002 * sigma
    )
    return 1 / (1 + np.exp(-z))

def payoff_90(mu: float, sigma: float) -> float:
    z = (
        2.0829
        + 0.0042 * mu
        + 0.0010 * sigma
    )
    return 1 / (1 + np.exp(-z))

def payoff_100(mu: float, sigma: float) -> float:
    z = (
        2.0504
        + 0.0042 * mu
        + 0.0025 * sigma
    )
    return 1 / (1 + np.exp(-z))


In [9]:
random_seed= 42
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
def train_models_fedlr(partitions, random_seed, X_test, y_test, max_iter):
    client_models = []
    client_global_accuracies = []
    
    for X_i, y_i in partitions:
        nan_mask = ~np.isnan(X_i).any(axis=1)
        X_clean = X_i[nan_mask]
        y_clean = y_i[nan_mask]
        if len(y_clean) == 0:
            client_models.append(None)
            client_global_accuracies.append(None)
            continue
        
        model = LogisticRegression(random_state=random_seed, max_iter=max_iter)
        try:
            local_scaler = StandardScaler()
            model.fit(local_scaler.fit_transform(X_clean), y_clean)
            client_models.append(model)
            client_global_accuracies.append(model.score(X_test, y_test))
        except Exception as e:
            client_models.append(None)
            client_global_accuracies.append(None)
    
    return client_models, client_global_accuracies

In [None]:
# Parameters
eps     = 1e-8
n_trials    = 100
n_clients_list    = [10,20,30,40,50,60,70,80,90,100]
base_seed   = 42
max_iters   = [100]
approach    = 'fedlr'


save_dir = (
"/Users/abbaszal/Documents/Fit/spambase_fedlr"
)
os.makedirs(save_dir, exist_ok=True)

all_results = []


for n_clients in n_clients_list:
    print(f"\n> n_clients = {n_clients}")

    # Dynamically get the corresponding payoff function
    payoff_func_name = f"payoff_{n_clients}"
    if payoff_func_name not in globals():
        raise ValueError(f"Missing payoff function for {n_clients} clients")
    payoff_func = globals()[payoff_func_name]

    for max_iter in max_iters:
        print(f"\n max_iter = {max_iter} ")
        counts_static = Counter()
        lottery_count = 0

        for trial in range(1, n_trials+1):
            rc = random.randint(0, 500)
            trial_seed = base_seed + (trial-1) + 1000*max_iter + 2*rc
            random.seed(trial_seed)
            np.random.seed(trial_seed)

            partitions = split_data_equal(
                X_train, y_train,
                n_clients=n_clients,
                shuffle=True,
                random_seed=trial_seed
            )
            
            client_models, client_accs = train_models_fedlr(
                partitions=partitions,
                random_seed=trial_seed,
                X_test=X_test,
                y_test=y_test,
                max_iter=max_iter
            )

    
            # coalition evaluation
            df_res = evaluate_coalitions2(
                client_models=client_models,
                client_global_accuracies=client_accs,
                n_clients=n_clients,
                aggregator_func=aggregate_lr_models,
                X_test=X_test,
                y_test=y_test,
                corrupt_client_indices=[],
                approach=approach
            )

            # static‐game Nash counts
            df_ne = find_nash_equilibria_v2(df_res)
            if not df_ne.empty:
                for coalition in df_ne.index:
                    counts_static[coalition] += 1


            # lottery‐game incentive check
            vals = np.array(client_accs)
            mu_full   = vals.mean()
            sig_full  = vals.std(ddof=1)
            payoff_f  = payoff_func(mu_full, sig_full)



            has_incentive = any(
                acc > payoff_f 
                for acc in client_accs
            )
            if not has_incentive:
                lottery_count += 1

        # aggregate counts into single numbers
        static_count = sum(counts_static.values())


        counts_df = pd.DataFrame([{
            'n_clients':           n_clients,
            'max_iter':            max_iter,
            'Static_Occurrences':  static_count,
            'Lottery_Occurrences': lottery_count
        }])
        fname = (f"Nash_Counts_{approach}"
                    f"_nclients_{n_clients}"
                    f"_maxiter_{max_iter}.csv")
        out_path = os.path.join(save_dir, fname)
        counts_df.to_csv(out_path, index=False)
        print(f" saved {fname}")

In [None]:
import pandas as pd
import os

# Load the data
file_path = "/Users/abbaszal/Documents/Spambase_results_with_LR.csv"  # Adjust path if needed
df = pd.read_csv(file_path)

# Identify columns containing client accuracies
client_accuracy_cols = [col for col in df.columns if "Client" in col and "Accuracy" in col]

# Also keep MaxIter and Trial columns
columns_to_keep = ['MaxIter', 'Trial'] + client_accuracy_cols

# Get first row for each (MaxIter, Trial)
df_first_rows = df.groupby(['MaxIter', 'Trial'], as_index=False).first()

# Select only the desired columns
df_filtered = df_first_rows[columns_to_keep]

# Save the filtered dataframe
save_path = "/Users/abbaszal/Documents/Spambase_FedLR_Fit.csv"
df_filtered.to_csv(save_path, index=False)

print(f"Saved filtered client accuracies to {save_path}")

Saved filtered client accuracies to /Users/abbaszal/Documents/Spambase_FedLR_Fit.csv
