In [9]:
# !pip install bayesian-optimization

In [2]:
# ML
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from bayes_opt import BayesianOptimization
import xgboost as xgb
import pickle

In [3]:
df = pd.read_csv("pages/Microbiota_composition.csv")
# df = df.drop("country", axis=1)

In [5]:
df.head()

Unnamed: 0,condition,Streptococcus_anginosus___ref_mOTU_v2_0004__,Enterobacteriaceae_sp.___ref_mOTU_v2_0036__,Citrobacter_sp.___ref_mOTU_v2_0076__,Klebsiella_michiganensis/oxytoca___ref_mOTU_v2_0079__,Enterococcus_faecalis___ref_mOTU_v2_0116__,Lactobacillus_salivarius___ref_mOTU_v2_0125__,Dielma_fastidiosa___ref_mOTU_v2_0138__,Streptococcus_constellatus/intermedius___ref_mOTU_v2_0143__,Streptococcus_parasanguinis___ref_mOTU_v2_0144__,...,unknown_Clostridiales___meta_mOTU_v2_7778__,unknown_Clostridiales___meta_mOTU_v2_7781__,unknown_Clostridiales___meta_mOTU_v2_7782__,unknown_Clostridiales___meta_mOTU_v2_7784__,Clostridium_sp._CAG__230___meta_mOTU_v2_7788__,Clostridium_sp._CAG__1193___meta_mOTU_v2_7789__,unknown_Erysipelotrichaceae___meta_mOTU_v2_7790__,unknown_Clostridiales___meta_mOTU_v2_7795__,unknown_Clostridiales___meta_mOTU_v2_7800__,country
0,control,0.0,0.0,0.0,0.0,0.0,0.0,8.3e-05,0.0,0.000249,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.3e-05,8.3e-05,Austria
1,control,0.000591,6.6e-05,0.0,0.0,0.0,0.0,0.0,6.6e-05,0.002102,...,0.0,0.0,0.000788,0.0,0.0,0.0,0.0,0.0,0.0,Austria
2,control,0.00084,6.5e-05,0.002454,0.0,6.5e-05,0.013111,0.0,6.5e-05,0.012013,...,0.0,0.000194,6.5e-05,0.0,0.00084,0.0,0.0,0.001808,0.0,Austria
3,control,0.0,7.1e-05,0.0,0.0,0.0,0.0,0.000213,7.1e-05,0.000142,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Austria
4,control,8.5e-05,0.019236,0.001111,0.000256,8.5e-05,8.5e-05,0.0,0.0,0.000513,...,0.000171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Austria


In [12]:
# Features
X = df.drop(columns=["condition", "country"], axis=1)

# Target
y = df["condition"]

# lambda to scale the target into numeric (CRC : 1, control : 0)
y = y.apply(lambda x: 1 if x == "CRC" else 0)

# Split df into train and test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Boolean filter to keep numeric column
numeric_features = df.select_dtypes(include=['number']).columns
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Selection of the categorical feature to preprocess
# categorical_feature = X.select_dtypes(exclude=['number']).columns
# feature_transformer = Pipeline(steps=[("encoder", OneHotEncoder(drop="first"))])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        # ("cat", feature_transformer, categorical_feature),
    ]
)

# Prétraitement des données (X_train, X_test) avec une normalisation
preprocessor.fit(X_train)

# Spécifiez le nom du fichier dans lequel vous souhaitez enregistrer l'objet
scaler_name = "scaler.pkl"

# Utilisez la bibliothèque pickle pour enregistrer l'objet scaler dans le fichier
with open(scaler_name, 'wb') as fichier:
    pickle.dump(preprocessor, fichier)

X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)
# input_shape = X_train_scaled.shape[1]

# Définir la fonction d'évaluation
def evaluate_xgb(n_estimators, max_depth, learning_rate, gamma, min_child_weight, alpha, reg_lambda, eta):
    model = xgb.XGBClassifier(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        learning_rate=learning_rate,
        gamma=gamma,
        objective='binary:logistic',
        min_child_weight=min_child_weight,
        random_state=42,
        alpha=alpha,
        reg_lambda=reg_lambda,
        booster='gbtree',
        eta=eta
        )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# Définir l'espace de recherche
pbounds = {
    'max_depth': (3, 10),
    'learning_rate': (0.001, 0.1),
    'n_estimators': (100, 200),
    'gamma': (0, 5.0),
    'min_child_weight': (0, 10.0),
    'alpha': (0, 2),
    'reg_lambda': (0, 2),
    'eta': (0, 1)
    }

# Initialiser l'optimiseur bayésien
optimizer = BayesianOptimization(f=evaluate_xgb, pbounds=pbounds, random_state=42)

# Lancer l'optimisation
optimizer.maximize(init_points=10, n_iter=40) # 5 points aléatoires, 30 itérations

# Récupérer les hyperparamètres optimaux
best_params = optimizer.max['params']

# Entraîner le modèle final avec les meilleurs hyperparamètres
best_model = xgb.XGBClassifier(n_estimators=int(best_params['n_estimators']),
                           max_depth=int(best_params['max_depth']),
                           learning_rate=best_params['learning_rate'],
                           gamma=best_params['gamma'],
                           min_child_weight=best_params['min_child_weight'],
                           alpha=best_params['alpha'],
                           reg_lambda=best_params['reg_lambda'],
                           eta=best_params['eta']
                           )

best_model.fit(X_train, y_train)

# Enregistrez le modèle dans un fichier pickle 
with open("xgboost_classifier_model.pkl", "wb") as file:
    pickle.dump(best_model, file)

# Charger le modèle depuis le fichier pickle
with open("xgboost_classifier_model.pkl", "rb") as file:
    loaded_model = pickle.load(file)

# Make predictions on the test set
# predictions = best_model.predict(X_test)

# Utilisez le modèle chargé pour faire des prédictions, par exemple
predictions = loaded_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.4f}")

|   iter    |  target   |   alpha   |    eta    |   gamma   | learni... | max_depth | min_ch... | n_esti... | reg_la... |
-------------------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.7792   [0m | [0m0.7491   [0m | [0m0.9507   [0m | [0m3.66     [0m | [0m0.06027  [0m | [0m4.092    [0m | [0m1.56     [0m | [0m105.8    [0m | [0m1.732    [0m |
| [95m2        [0m | [95m0.7857   [0m | [95m1.202    [0m | [95m0.7081   [0m | [95m0.1029   [0m | [95m0.09702  [0m | [95m8.827    [0m | [95m2.123    [0m | [95m118.2    [0m | [95m0.3668   [0m |
| [0m3        [0m | [0m0.7597   [0m | [0m0.6085   [0m | [0m0.5248   [0m | [0m2.16     [0m | [0m0.02983  [0m | [0m7.283    [0m | [0m1.395    [0m | [0m129.2    [0m | [0m0.7327   [0m |
| [95m4        [0m | [95m0.7987   [0m | [95m0.9121   [0m | [95m0.7852   [0m | [95m0.9984   [0m | [95m0.05191  [0m | [95m7.147  