In [7]:
# client_ALS.py
import flwr as fl
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import xgboost as xgb

# -----------------------------
# CLIENT CONFIG
# -----------------------------
client_name = "ALS"
dataset_path = r"D:\ML\Neurodivergent\Minsk2020_ALS_dataset.csv"
target_column = "Diagnosis (ALS)"
server_address = "10.244.107.149:5555"  # ZeroTier IP + port

# -----------------------------
# Flower Client
# -----------------------------
class ALSClient(fl.client.NumPyClient):
    def __init__(self):
        # Load dataset
        df = pd.read_csv(dataset_path)
        X = df.drop(columns=[target_column])
        y = df[target_column]

        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Preprocessing
        numeric_features = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()
        categorical_features = X_train.select_dtypes(exclude=["int64", "float64"]).columns.tolist()

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", StandardScaler(), numeric_features),
                ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
            ]
        )

        self.X_train = preprocessor.fit_transform(X_train)
        self.X_test = preprocessor.transform(X_test)
        self.y_train = np.array(y_train)
        self.y_test = np.array(y_test)

        # XGBoost model
        self.model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss")

    def get_parameters(self, config):
        # Tree-based model: return empty list (FedAvg not applicable)
        return []

    def fit(self, parameters, config):
        self.model.fit(self.X_train, self.y_train)
        accuracy = self.model.score(self.X_train, self.y_train)
        return [], len(self.X_train), {"accuracy": float(accuracy), "client_name": client_name}

    def evaluate(self, parameters, config):
        accuracy = self.model.score(self.X_test, self.y_test)
        return 0.0, len(self.X_test), {"accuracy": float(accuracy), "client_name": client_name}

# -----------------------------
# START CLIENT
# -----------------------------
fl.client.start_client(
    server_address=server_address,
    client=ALSClient().to_client()
)


	Instead, use the `flower-supernode` CLI command to start a SuperNode as shown below:

		$ flower-supernode --insecure --superlink='<IP>:<PORT>'

	To view all available options, run:

		$ flower-supernode --help

	Using `start_client()` is deprecated.

            This is a deprecated feature. It will be removed
            entirely in future versions of Flower.
        
[92mINFO [0m:      
[92mINFO [0m:      Received: get_parameters message 617e68a7-43c5-47f3-ab7f-1b5679b2cd24
[92mINFO [0m:      Sent reply
[92mINFO [0m:      
[92mINFO [0m:      Received: train message 7bdaa83f-7c73-4504-81fb-b22fa26c3c8a
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[92mINFO [0m:      Sent reply
[92mINFO [0m:      
[92mINFO [0m:      Received: reconnect message 45d8be14-bf9d-4f0b-90fe-2f87b0b86bc4
[92mINFO [0m:      Disconnect and shut down


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
file_path = r"D:\ML\Neurodivergent\Minsk2020_ALS_dataset.csv"

try:
    df = pd.read_csv(file_path)
    print("File loaded successfully")
except Exception as e:
    print(f"Error loading file: {e}")
    raise

print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nColumn names:")
print(df.columns.tolist())
print("\nData types:")
print(df.dtypes)
print("\nMissing values:")
print(df.isnull().sum())
print("\nBasic statistics:")
print(df.describe())

# Check if dataframe is empty
if df.shape[0] == 0 or df.shape[1] == 0:
    raise ValueError("DataFrame is empty! Please check your file path and data.")

# Identify the target column - adjust based on your dataset
# Common target column names for ALS datasets
possible_target_cols = ['target', 'Target', 'diagnosis', 'Diagnosis', 'class', 'Class', 
                       'label', 'Label', 'ALS', 'als', 'outcome', 'Outcome', 'status', 
                       'Status', 'disease', 'Disease']

target_col = None
for col in possible_target_cols:
    if col in df.columns:
        target_col = col
        break

# If not found, assume last column is target
if target_col is None:
    target_col = df.columns[-1]
    print(f"\nNo standard target column found. Using last column '{target_col}' as target.")
else:
    print(f"\nUsing '{target_col}' as target column.")

# Separate features and target
X = df.drop(columns=[target_col])
y = df[target_col]

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target unique values: {y.unique()}")
print(f"Target value counts:\n{y.value_counts()}")

# Handle any non-numeric columns if present
label_encoders = {}
for col in X.columns:
    if X[col].dtype == 'object':
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        label_encoders[col] = le
        print(f"Encoded column: {col}")

# Encode target if it's categorical
le_target = None
if y.dtype == 'object':
    le_target = LabelEncoder()
    y = le_target.fit_transform(y)
    print(f"\nEncoded target classes: {le_target.classes_}")

# Handle missing values
if X.isnull().sum().sum() > 0:
    print("\nHandling missing values...")
    X = X.fillna(X.median(numeric_only=True))
    for col in X.columns:
        if X[col].isnull().sum() > 0:
            X[col] = X[col].fillna(X[col].mode()[0])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")
print(f"\nTarget distribution in training set:")
print(pd.Series(y_train).value_counts())
print(f"\nTarget distribution in test set:")
print(pd.Series(y_test).value_counts())

# Determine if binary or multiclass
n_classes = len(np.unique(y))
is_binary = n_classes == 2
print(f"\nProblem type: {'Binary' if is_binary else 'Multiclass'} Classification ({n_classes} classes)")

# Define the objective function for Optuna
def objective(trial):
    """
    Objective function for Optuna to optimize XGBoost hyperparameters
    """
    # Suggest hyperparameters
    params = {
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'eta': trial.suggest_float('eta', 0.01, 0.3, log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
        'tree_method': 'hist',
        'random_state': 42,
        'n_jobs': -1
    }
    
    # Add objective and eval_metric based on problem type
    if is_binary:
        params['objective'] = 'binary:logistic'
        params['eval_metric'] = 'logloss'
    else:
        params['objective'] = 'multi:softmax'
        params['num_class'] = n_classes
        params['eval_metric'] = 'mlogloss'
    
    # Create model
    model = xgb.XGBClassifier(**params)
    
    # Use stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # Use F1 score for evaluation
    scoring = 'f1_weighted' if n_classes > 2 else 'f1'
    
    cv_scores = cross_val_score(
        model, X_train, y_train, 
        cv=skf, 
        scoring=scoring,
        n_jobs=-1
    )
    
    return cv_scores.mean()

# Create and run the Optuna study
print("\n" + "="*70)
print("Starting Optuna Hyperparameter Optimization for ALS Dataset")
print("="*70)

study = optuna.create_study(
    direction='maximize',
    study_name='xgboost_als_minsk_optimization',
    sampler=optuna.samplers.TPESampler(seed=42)
)

# Run optimization with progress bar
print(f"\nOptimizing for {n_classes}-class classification problem...")
print("This may take a while depending on your dataset size and n_trials...")

study.optimize(
    objective, 
    n_trials=100,  # Adjust based on computational resources
    timeout=3600,  # 1 hour timeout (optional)
    show_progress_bar=True
)

# Print optimization results
print("\n" + "="*70)
print("Optimization Results")
print("="*70)
print(f"Number of finished trials: {len(study.trials)}")
print(f"\nBest trial:")
print(f"  Value (CV F1 Score): {study.best_trial.value:.4f}")
print(f"\nBest hyperparameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

# Train final model with best parameters
print("\n" + "="*70)
print("Training Final Model with Best Parameters")
print("="*70)

best_params = study.best_params.copy()

# Add fixed parameters
if is_binary:
    best_params['objective'] = 'binary:logistic'
    best_params['eval_metric'] = 'logloss'
else:
    best_params['objective'] = 'multi:softmax'
    best_params['num_class'] = n_classes
    best_params['eval_metric'] = 'mlogloss'

best_params['tree_method'] = 'hist'
best_params['random_state'] = 42
best_params['n_jobs'] = -1

# Train best model
best_model = xgb.XGBClassifier(**best_params)
best_model.fit(X_train, y_train)

# Evaluate on test set
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred, average='weighted' if n_classes > 2 else 'binary')

print(f"\nTest Set Performance:")
print(f"  Accuracy: {test_accuracy:.4f}")
print(f"  F1 Score: {test_f1:.4f}")
print("\nClassification Report:")
if le_target is not None:
    print(classification_report(y_test, y_pred, target_names=le_target.classes_))
else:
    print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 15 Most Important Features:")
print(feature_importance.head(15))

# Save the best model and encoders
import joblib

model_path = r"D:\ML\Neurodivergent\best_xgboost_als_minsk_model.pkl"
encoders_path = r"D:\ML\Neurodivergent\label_encoders_als.pkl"

joblib.dump(best_model, model_path)
joblib.dump({
    'feature_encoders': label_encoders, 
    'target_encoder': le_target,
    'feature_names': X.columns.tolist()
}, encoders_path)

print(f"\nModel saved to: {model_path}")
print(f"Encoders saved to: {encoders_path}")

# Save feature importance to CSV
feature_importance_path = r"D:\ML\Neurodivergent\feature_importance_als.csv"
feature_importance.to_csv(feature_importance_path, index=False)
print(f"Feature importance saved to: {feature_importance_path}")

# Visualize optimization history
try:
    import plotly.graph_objects as go
    
    # Optimization history
    fig1 = plot_optimization_history(study)
    fig1.write_html(r"D:\ML\Neurodivergent\optimization_history_als.html")
    
    # Parameter importance
    fig2 = plot_param_importances(study)
    fig2.write_html(r"D:\ML\Neurodivergent\param_importances_als.html")
    
    print("\nVisualization files saved:")
    print("  - optimization_history_als.html")
    print("  - param_importances_als.html")
except Exception as e:
    print(f"\nCould not create visualizations: {e}")

# Summary of best trial
print("\n" + "="*70)
print("Best Trial Summary")
print("="*70)
print(f"Trial number: {study.best_trial.number}")
print(f"CV F1 Score: {study.best_trial.value:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

print("\n" + "="*70)
print("Optimization Complete!")
print("="*70)

File loaded successfully
Dataset shape: (64, 135)

First few rows:
   ID Sex  Age      J1_a      J3_a      J5_a     J55_a      S1_a      S3_a  \
0   8   M   58  0.321817  0.141230  0.199128  0.923634  6.044559  3.196477   
1  20   F   57  0.344026  0.177032  0.206458  0.827714  1.967728  0.856639   
2  21   F   58  0.264740  0.148228  0.177078  0.532566  1.850893  0.942743   
3  22   F   70  0.455793  0.174870  0.243660  0.962641  2.883768  1.284926   
4  24   M   66  0.269335  0.143961  0.167465  0.547745  2.327924  1.164109   

       S5_a  ...   dCCi(7)   dCCi(8)   dCCi(9)  dCCi(10)  dCCi(11)  dCCi(12)  \
0  3.770575  ... -0.024467 -0.005300  0.051874 -0.037710 -0.026549 -0.021149   
1  1.179851  ...  0.002485 -0.004535 -0.000225 -0.006977 -0.012510  0.014773   
2  1.071950  ... -0.013927  0.007908  0.007960 -0.009022 -0.012488 -0.015588   
3  1.915058  ... -0.019285 -0.021768  0.020495  0.035976 -0.034648  0.008021   
4  1.420891  ... -0.005743  0.004726 -0.015247  0.003900 -0.0076

[I 2025-10-30 11:15:25,904] A new study created in memory with name: xgboost_als_minsk_optimization



Optimizing for 2-class classification problem...
This may take a while depending on your dataset size and n_trials...


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-10-30 11:15:38,661] Trial 0 finished with value: 0.7533333333333333 and parameters: {'booster': 'dart', 'lambda': 0.007177141927992002, 'alpha': 0.0006155564318973012, 'max_depth': 4, 'eta': 0.01699897838270077, 'gamma': 2.9152036385288193e-08, 'grow_policy': 'depthwise', 'subsample': 0.8540362888980227, 'colsample_bytree': 0.5102922471479012, 'colsample_bylevel': 0.9849549260809971, 'colsample_bynode': 0.9162213204002109, 'min_child_weight': 3, 'n_estimators': 263, 'max_delta_step': 2}. Best is trial 0 with value: 0.7533333333333333.
[I 2025-10-30 11:16:29,000] Trial 1 finished with value: 0.7484848484848484 and parameters: {'booster': 'dart', 'lambda': 2.85469785779718e-05, 'alpha': 2.1371407316372935e-06, 'max_depth': 9, 'eta': 0.01607123851203988, 'gamma': 2.1734877073417355e-06, 'grow_policy': 'lossguide', 'subsample': 0.8925879806965068, 'colsample_bytree': 0.5998368910791798, 'colsample_bylevel': 0.7571172192068059, 'colsample_bynode': 0.7962072844310213, 'min_child_weig