In [1]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [2]:
data_path = "../data/final/nicu_30.csv"
df = pd.read_csv(data_path)

# Drop rows or columns if needed or do other preliminary cleaning
# For this example, let's assume we only drop columns that are not needed
# and handle missing values only in the pipeline.
df = df.dropna(axis=1, how='all')  # example: remove columns with all NaNs

# Separate features (X) and target (y)
X = df.drop('is_infected', axis=1)
y = df['is_infected']

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42
)

In [4]:
pipeline = Pipeline([
    ("imputer", SimpleImputer()),  # placeholder, will be replaced in param_grid
    ("classifier", LogisticRegression(max_iter=1000))
])

In [5]:
param_grid = [
    {
        "imputer": [SimpleImputer()],
        "imputer__strategy": ["mean", "median", "most_frequent"]
    },
    {
        "imputer": [KNNImputer()],
        "imputer__n_neighbors": [2, 5, 10]
    },
    {
        "imputer": [IterativeImputer(random_state=42)],
        "imputer__max_iter": [5, 10, 20],
    }
]

In [6]:
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="accuracy",      # you can choose other metrics, e.g. "roc_auc"
    cv=5,                    # 5-fold cross-validation
    n_jobs=-1,               # use all available CPU cores
    verbose=1                # show progress messages
)

grid_search.fit(X_train, y_train)

# Display the best parameters and best score
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters found: {'imputer': SimpleImputer(), 'imputer__strategy': 'median'}
Best cross-validation accuracy: 0.6545627039652581


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
test_score = grid_search.score(X_test, y_test)
print("Test set accuracy:", test_score)

Test set accuracy: 0.6223241590214067
