In [65]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Normalizer, OneHotEncoder, StandardScaler
from category_encoders import TargetEncoder
from sklearn.metrics import f1_score, make_scorer
from skopt import BayesSearchCV
from scipy.stats import zscore
from tqdm import tqdm
from typing import Optional, Tuple


def categorial_encoding(X_train, X_test, encoders=None):
    """
    Encodes categorical features using One-Hot Encoding.
    Handles cases where categories might be missing in one dataset.

    Parameters:
    - X_train (pd.DataFrame): Training features.
    - X_test (pd.DataFrame): Testing features.
    - encoders (dict): Dictionary of already-fitted encoders (optional).

    Returns:
    - Tuple[pd.DataFrame, pd.DataFrame, dict]: Transformed X_train, X_test, and dictionary of encoders.
    """
    categorical_columns = X_train.select_dtypes(include=['object']).columns
    if encoders is None:
        encoders = {}

    for col in categorical_columns:
        num_categories = X_train[col].nunique()
        # TODO if num_categories is large OneHotEncoding works bad and there are alternatives
        # if num_categories > 10:
        # ...

        encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown="infrequent_if_exist")
        encoder.fit(X_train[[col]])

        # Get expected columns from encoder
        expected_cols = encoder.get_feature_names_out([col])
        
        # Transform both datasets
        train_encoded = encoder.transform(X_train[[col]])
        test_encoded = encoder.transform(X_test[[col]])
        
        # Create DataFrames with consistent columns
        train_encoded_df = pd.DataFrame(train_encoded, columns=expected_cols, index=X_train.index)
        test_encoded_df = pd.DataFrame(test_encoded, columns=expected_cols, index=X_test.index)
        
        # Drop original column and join encoded data
        X_train = X_train.drop(columns=col).join(train_encoded_df)
        X_test = X_test.drop(columns=col).join(test_encoded_df)
        
        encoders[col] = ('onehot', encoder, expected_cols.tolist())

    return X_train, X_test, encoders


def handle_missing_data(
    train_df: pd.DataFrame, 
    test_df: Optional[pd.DataFrame] = None
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
    """
    Fills missing values in the training and (optionally) test DataFrame without dropping any rows.

    - For numeric columns (int or float), missing values are filled with the column mean.
    - For categorical columns (object or other types), missing values are filled with the column mode.
    - The same fill values from the training set are applied to the test set (if provided), ensuring consistency.

    Parameters:
        train_df (pd.DataFrame): The training dataset with possible missing values.
        test_df (Optional[pd.DataFrame]): The test dataset with possible missing values. Default is None.

    Returns:
        Tuple[pd.DataFrame, Optional[pd.DataFrame]]: The training and (optionally) test DataFrames with missing values handled.
    """
    # Copy to avoid modifying original data
    train_df = train_df.copy()
    if test_df is not None:
        test_df = test_df.copy()

    # Fill missing values in training data
    for col in train_df.columns:
        if train_df[col].dtype in ['float64', 'int64']:
            fill_value = train_df[col].median()
        else:
            fill_value = train_df[col].mode()
        train_df[col] = train_df[col].fillna(fill_value)
        
        # Fill missing values in test data using same fill values
        if test_df is not None and col in test_df:
            test_df[col] = test_df[col].fillna(fill_value)

    return train_df, test_df


def drop_outliers(X_train, y_train, threshold=5):
    """
    Filters out rows from training data with Z-score above threshold (outliers).

    Parameters:
    - X_train (pd.DataFrame): Training features.
    - y_train (pd.Series): Training diabetess.
    - threshold (float): Z-score threshold for detecting outliers.

    Returns:
    - Tuple[pd.DataFrame, pd.Series]: Filtered training data.
    """
    numeric_train = X_train.select_dtypes(include=[np.number])
    z_train = np.abs(zscore(numeric_train))
    mask = (z_train < threshold).all(axis=1)

    return X_train[mask], y_train[mask]


def scale_features(X_train, X_test, scaler=None):
    """
    Applies feature scaling using StandardScaler.

    Parameters:
    - X_train (pd.DataFrame): Training features.
    - X_test (pd.DataFrame): Test features.
    - scaler (StandardScaler): Optionally provide pre-fitted scaler.

    Returns:
    - Tuple[pd.DataFrame, pd.DataFrame, StandardScaler]: Scaled train and test data, and fitted scaler.
    """
    scaler = scaler or StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
    return X_train_scaled, X_test_scaled, scaler

def normalize_features(X_train, X_test, normalizer=None):
    """
    Applies L2 normalization to the feature vectors (rows).

    Parameters:
    - X_train (pd.DataFrame): Training features.
    - X_test (pd.DataFrame): Test features.
    - normalizer (Normalizer): Optionally provide pre-fitted Normalizer.

    Returns:
    - Tuple[pd.DataFrame, pd.DataFrame, Normalizer]: Normalized train and test data, and fitted normalizer.
    """
    normalizer = normalizer or Normalizer(norm='l2')
    X_train_norm = pd.DataFrame(normalizer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test_norm = pd.DataFrame(normalizer.transform(X_test), columns=X_test.columns, index=X_test.index)
    return X_train_norm, X_test_norm, normalizer


f1 = make_scorer(f1_score , average='macro')

def train_model(X_train, y_train, model, param_space, n_iterations=10, n_splits=5, scoring='accuracy'):
    """
    Trains a model using Bayesian hyperparameter optimization with k-fold cross validation.
    
    Parameters:
    - X_train (pd.DataFrame/array): Training features
    - y_train (pd.Series/array): Training targets
    - model: Scikit-learn compatible model/estimator
    - param_space (dict): Parameter search space for Bayesian optimization
    - n_iterations (int): Number of optimization iterations
    - n_splits (int): Number of folds for cross validation
    - scoring (str/callable): Scoring metric to optimize for
    
    Returns:
    - BayesSearchCV: Fitted optimizer with best parameters and cross-validation results
    """
    
    # Set up stratified k-fold (preserves class distribution in splits)
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
            
    # Bayesian optimization with cross-validation
    opt = BayesSearchCV(
        estimator=model,
        search_spaces=param_space,
        n_iter=n_iterations,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        verbose=0,
        random_state=42,
        return_train_score=True
    )
    
    print(f"Starting Bayesian Optimization with {n_splits}-fold CV...")
    with tqdm(total=n_iterations, desc="Bayesian Optimization Progress") as pbar:
        def on_step(optim_result):
            pbar.update(1)
        opt.fit(X_train, y_train, callback=on_step)
    
    # Print comprehensive results
    print("\nOptimization complete!")
    print(f"Best parameters: {opt.best_params_}")
    print(f"Best {scoring} score: {opt.best_score_:.4f}")
    print(f"Best estimator: {opt.best_estimator_}")    
    print("---------------------------------------")
    
    # Analyze cross-validation performance
    cv_results = opt.cv_results_
    best_index = opt.best_index_
    best_train_score = cv_results['mean_train_score'][best_index]
    best_validation_score = opt.best_score_

    print(f"\nCross-validation summary ({n_splits}-fold):")
    print(f"  - Best training score:    {best_train_score:.4f}")
    print(f"  - Best validation score:  {best_validation_score:.4f}")
    # print(f"Standard deviation: {np.std(cv_results['mean_test_score']):.4f}")
    return opt

def evaluate_model(model, X, y):
    """
    Prints the macro F1 score for the given model and dataset.

    Parameters:
    - model: Trained model.
    - X (pd.DataFrame): Features.
    - y (pd.Series): True Surviveds.
    """
    y_pred = model.predict(X)
    print("Macro F1 Score:", f1_score(y, y_pred, average='macro'))


In [66]:

# 1. Load data
data = pd.read_csv(r"C:\Users\Timo\OneDrive\REDI School\s25\Session 14 - ML Classification - Lab Class\data\diabetes_prediction_dataset.csv", sep=',')
X = data.copy()
y = data['diabetes']

In [73]:

# 2. Preprocessing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test = handle_missing_data(X_train, X_test)
X_train, y_train = drop_outliers(X_train, y_train)    # TODO: Test if replacing with mmean/mode is better
X_train, X_test, encoders = categorial_encoding(X_train, X_test)

X_train = X_train.drop(columns=['diabetes'], errors='ignore')
X_test = X_test.drop(columns=['diabetes'], errors='ignore')

X_train, X_test, scaler = scale_features(X_train, X_test)
#X_train, X_test, normalizer = normalize_features(X_train, X_test) # not all ML models need normalization. Test if it improves results. Should help with SVM and not with tree-based models.

# TODO: remove multicolonarity columns


In [68]:
# 3. Model training and evaluation LogisticRegression

# from sklearn.linear_model import LogisticRegression

model = LogisticRegression(class_weight='balanced',  max_iter=1000)
param_space = {
    'C': (0.01, 10)  # Regularization strength
}
model = train_model(X_train, y_train, model, param_space, n_iterations=10, scoring=f1)



Starting Bayesian Optimization with 5-fold CV...


Bayesian Optimization Progress: 100%|██████████| 10/10 [00:05<00:00,  1.75it/s]


Optimization complete!
Best parameters: OrderedDict({'C': 8.37550967197731})
Best make_scorer(f1_score, response_method='predict', average=macro) score: 0.7250
Best estimator: LogisticRegression(C=8.37550967197731, class_weight='balanced', max_iter=1000)
---------------------------------------

Cross-validation summary (5-fold):
  - Best training score:    0.7247
  - Best validation score:  0.7250





In [69]:
# Atttention: runs very long on whole dataset!!!
# needs data normalisation

# 3. Model training and evaluation Support Vector Machine
from sklearn.svm import SVC

model = SVC( probability=True)
param_space = {
    'C': (0.01, 10),
    'gamma': (0.001, 1)
}
# only some data as SVM are slow with a lot of data
model = train_model(X_train[:10000], y_train[:10000], model, param_space, n_iterations=10, scoring=f1)


Starting Bayesian Optimization with 5-fold CV...


Bayesian Optimization Progress: 100%|██████████| 10/10 [00:28<00:00,  2.84s/it]


Optimization complete!
Best parameters: OrderedDict({'C': 8.12583592369006, 'gamma': 0.17269968983516415})
Best make_scorer(f1_score, response_method='predict', average=macro) score: 0.8453
Best estimator: SVC(C=8.12583592369006, gamma=0.17269968983516415, probability=True)
---------------------------------------

Cross-validation summary (5-fold):
  - Best training score:    0.8519
  - Best validation score:  0.8453





In [74]:
# 3. Model training and evaluation RandomForestClassifier

model = RandomForestClassifier(class_weight='balanced')
param_space = {    
    'max_depth': (1, 10),
    'n_estimators': (2, 50)
}
model = train_model(X_train, y_train, model, param_space, n_iterations=10, scoring=f1)


Starting Bayesian Optimization with 5-fold CV...


Bayesian Optimization Progress: 100%|██████████| 10/10 [00:09<00:00,  1.10it/s]


Optimization complete!
Best parameters: OrderedDict({'max_depth': 1, 'n_estimators': 41})
Best make_scorer(f1_score, response_method='predict', average=macro) score: 0.7932
Best estimator: RandomForestClassifier(class_weight='balanced', max_depth=1, n_estimators=41)
---------------------------------------

Cross-validation summary (5-fold):
  - Best training score:    0.7934
  - Best validation score:  0.7932





In [72]:
# 3. Model training and evaluation XGBoostClassifier

from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

model = XGBClassifier()
param_space = {
    'max_depth': (1, 10),
    'n_estimators': (2, 50)
}
model = train_model(X_train, y_train, model, param_space, n_iterations=10, scoring=f1)


Starting Bayesian Optimization with 5-fold CV...


Bayesian Optimization Progress: 100%|██████████| 10/10 [00:05<00:00,  1.93it/s]


Optimization complete!
Best parameters: OrderedDict({'max_depth': 6, 'n_estimators': 46})
Best make_scorer(f1_score, response_method='predict', average=macro) score: 0.8911
Best estimator: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=6,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=46,
              n_jobs=None, num_parallel_tree=None, ...)
---------------------------------------

Cross-validat




LogisticRegression: validation score:      0.7520
Support Vector Machine: validation score:  0.8453
RandomForestClassifier: validation score:  0.7932
XGB: validation score:                     0.8936