In [39]:
!pip install bayesian-optimization



In [40]:
# ML
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from bayes_opt import BayesianOptimization
import xgboost as xgb

In [41]:
df = pd.read_csv("Microbiota_composition.csv")
df = df.drop("Unnamed: 0", axis=1)

In [None]:
# Features
X = df.drop(columns=["condition"], axis=1)

# Target
y = df["condition"]

# lambda to scale the target into numeric (CRC : 1, control : 0)
y = y.apply(lambda x: 1 if x == "CRC" else 0)

# Split df into train and test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Boolean filter to keep numeric column
numeric_features = df.select_dtypes(include=['number']).columns
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Selection of the categorical feature to preprocess
# categorical_feature = X.select_dtypes(exclude=['number']).columns
# feature_transformer = Pipeline(steps=[("encoder", OneHotEncoder(drop="first"))])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        # ("cat", feature_transformer, categorical_feature),
    ]
)

# Prétraitement des données (X_train, X_test) avec une normalisation
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
# input_shape = X_train_scaled.shape[1]

# Définir la fonction d'évaluation
def evaluate_xgb(n_estimators, max_depth, learning_rate, gamma, min_child_weight):
    model = xgb.XGBClassifier(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        learning_rate=learning_rate,
        gamma=gamma,
        objective='binary:logistic',
        min_child_weight=min_child_weight,
        random_state=42
        )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# Définir l'espace de recherche
pbounds = {
    'n_estimators': (50, 250),
    'max_depth': (3, 6),
    'learning_rate': (0.01, 0.1),
    'gamma': (0, 0.4),
    'min_child_weight': (1, 2)
    }

# Initialiser l'optimiseur bayésien
optimizer = BayesianOptimization(f=evaluate_xgb, pbounds=pbounds, random_state=42)

# Lancer l'optimisation
optimizer.maximize(init_points=5, n_iter=50)

# Récupérer les hyperparamètres optimaux
best_params = optimizer.max['params']

# Entraîner le modèle final avec les meilleurs hyperparamètres
best_model = xgb.XGBClassifier(n_estimators=int(best_params['n_estimators']),
                           max_depth=int(best_params['max_depth']),
                           learning_rate=best_params['learning_rate'],
                           gamma=best_params['gamma'],
                           min_child_weight=best_params['min_child_weight'])
best_model.fit(X_train, y_train)

|   iter    |  target   |   gamma   | learni... | max_depth | min_ch... | n_esti... |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.7987   [0m | [0m0.1498   [0m | [0m0.09556  [0m | [0m5.928    [0m | [0m1.599    [0m | [0m81.2     [0m |
| [0m2        [0m | [0m0.7727   [0m | [0m0.0624   [0m | [0m0.01523  [0m | [0m6.465    [0m | [0m1.601    [0m | [0m191.6    [0m |
| [0m3        [0m | [0m0.7922   [0m | [0m0.008234 [0m | [0m0.09729  [0m | [0m6.33     [0m | [0m1.212    [0m | [0m86.36    [0m |
| [0m4        [0m | [0m0.7662   [0m | [0m0.07336  [0m | [0m0.03738  [0m | [0m5.099    [0m | [0m1.432    [0m | [0m108.2    [0m |
| [95m5        [0m | [95m0.8052   [0m | [95m0.2447   [0m | [95m0.02255  [0m | [95m4.169    [0m | [95m1.366    [0m | [95m141.2    [0m |
| [0m6        [0m | [0m0.7792   [0m | [0m0.341    [0m | [0m0.1      [0m | [0m3.0      [0m | [0m1.979

In [None]:
# Make predictions on the test set
predictions = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.4f}")