In [1]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # 启用 IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [2]:
def run_model_selection(data: pd.DataFrame, target: str, id_column: str, 
                        n_splits: int = 5, random_state: int = 42):
    """
    Perform model selection given a dataset using Accuracy, Precision, Recall, and F1-score.

    This function:
        1. Drops rows with more than 95% missing values.
        2. Splits data into features (X) and target (y).
        3. Uses MICE (IterativeImputer) with max_iter=20 to impute missing values.
        4. Trains multiple classification models using cross-validation.
        5. Prints a summary table of Accuracy, Precision, Recall, and F1-score.
        6. Returns the best model based on F1-score.

    :param data: Input DataFrame containing features and target.
    :param target: The name of the target column.
    :param id_column: The name of the ID column which is not used for training.
    :param n_splits: Number of folds for Stratified K-Fold cross validation.
    :param random_state: Random seed for reproducibility.
    :return: The best performing model (already fit on the entire dataset).
    """

    # Step 1: Drop rows with missing ratio > 0.95
    row_missing_ratio = data.isnull().sum(axis=1) / data.shape[1]
    data = data.loc[row_missing_ratio <= 0.95].copy()

    # Separate target and ID from the features
    X = data.drop(columns=[target, id_column])
    y = data[target]

    # Step 2: Apply MICE for missing data imputation
    imputer = IterativeImputer(max_iter=12, random_state=random_state)
    X_imputed = imputer.fit_transform(X)
    
    # Define classification models
    models = {
        "Decision Tree": DecisionTreeClassifier(),
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Naive Bayes": GaussianNB(),
        "KNN": KNeighborsClassifier(),
        "Multilayer Perceptron": MLPClassifier(max_iter=1000),
        "SVM": SVC(probability=True),
        "Gradient Boosting": GradientBoostingClassifier(),
        "AdaBoost": AdaBoostClassifier(),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
        "Random Forest": RandomForestClassifier(),
        "LightGBM": LGBMClassifier(),
        "Extra Trees": ExtraTreesClassifier(),
        "CatBoost": CatBoostClassifier(verbose=0),
        "Bagging Classifier": BaggingClassifier(),
        "HistGradientBoosting": HistGradientBoostingClassifier(),
    }

    # Step 3: Cross-validation for each model
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    results = []
    best_model = None
    best_f1_score = 0.0
`
    # Define scoring metrics
    scoring_metrics = {
        "accuracy": make_scorer(accuracy_score),
        "precision": make_scorer(precision_score, average="weighted"),
        "recall": make_scorer(recall_score, average="weighted"),
        "f1_score": make_scorer(f1_score, average="weighted"),
    }

    # Evaluate each model
    for model_name, model in models.items():
        accuracy = np.mean(cross_val_score(model, X_imputed, y, cv=skf, scoring=scoring_metrics["accuracy"]))
        precision = np.mean(cross_val_score(model, X_imputed, y, cv=skf, scoring=scoring_metrics["precision"]))
        recall = np.mean(cross_val_score(model, X_imputed, y, cv=skf, scoring=scoring_metrics["recall"]))
        f1 = np.mean(cross_val_score(model, X_imputed, y, cv=skf, scoring=scoring_metrics["f1_score"]))

        results.append([model_name, accuracy, precision, recall, f1])

        # Keep track of the best model based on F1-score
        if f1 > best_f1_score:
            best_f1_score = f1
            best_model = model_name

    # Create a results DataFrame
    results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-score"])
    
    # Print the results in table form
    print("\n===== Model Evaluation Results =====")
    print(results_df.to_string(index=False))

    # Print the best model
    print(f"\n===== Best Model (Based on F1-score) =====")
    print(f"Best Model: {best_model}")
    print(f"Best F1-score: {best_f1_score:.4f}")

    # Fit the best model on the entire dataset
    final_model = models[best_model]
    final_model.fit(X_imputed, y)

    return final_model

In [3]:
def main(data: pd.DataFrame, target: str, id_column: str):
    """
    Main function to load data and perform model selection.
    """
    best_model = run_model_selection(data, target, id_column)

In [4]:
if __name__ == "__main__":
    # Load the NICU dataset
    nicu_120 = pd.read_csv(r"../data/final/nicu_120.csv")

    # Define target column and ID column
    target_column = "is_infected"  # Target variable
    id_column = "SUBJECT_ID"       # ID column

    # Run the main function
    main(data=nicu_120, target=target_column, id_column=id_column)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[LightGBM] [Info] Number of positive: 3208, number of negative: 2016
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002087 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21231
[LightGBM] [Info] Number of data points in the train set: 5224, number of used features: 84
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.614089 -> initscore=0.464532
[LightGBM] [Info] Start training from score 0.464532
[LightGBM] [Info] Number of positive: 3208, number of negative: 2017
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001715 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21226
[LightGBM] [Info] Number of data points in the train set: 5225, number of used features: 84
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.613971 -> initscore=0.464036
[LightGBM] [Info] Start training from score 0.464036
[LightGBM] [Info] Nu