In [8]:
import os

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

!pip install mlflow
!pip install boto3 awscli
!pip install mlflow optuna xgboost
!pip install optuna optuna-integration[mlflow]
import optuna.integration.mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna.integration.mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import mlflow
import ast
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

!aws configure
# Chemin vers les fichiers
os.makedirs("/content/drive/MyDrive/ProjetEts/models", exist_ok=True)
data_dir = "/content/drive/MyDrive/ProjetEts/HDFS_results/"
train_file = os.path.join(data_dir, "Event_occurence_matrix_HDFS_train.csv")
valid_file = os.path.join(data_dir, "Event_occurence_matrix_HDFS_valid.csv")

# Chargement des données
df_train = pd.read_csv(train_file)
df_valid = pd.read_csv(valid_file)

# Suppression des colonnes non pertinentes
X_train = df_train.drop(columns=["BlockId", "Label", "Time", "Date", "Type"], errors='ignore')
y_train = df_train["Label"].apply(lambda x: 1 if x == "Fail" else 0)

X_valid = df_valid.drop(columns=["BlockId", "Label", "Time", "Date", "Type"], errors='ignore')
y_valid = df_valid["Label"].apply(lambda x: 1 if x == "Fail" else 0)

# Définir l'expérience MLflow
mlflow.set_tracking_uri("http://ec2-18-207-206-140.compute-1.amazonaws.com:5000")
mlflow.set_experiment("HDFS")

with mlflow.start_run(run_name="RandomForest-HDFS") as run:
    # Modèle
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    # Log des paramètres
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("random_state", 42)

    # Log du modèle
    mlflow.sklearn.log_model(rf, "model")

    # Évaluation sur validation (calculs dans script suivant)
    val_score = rf.score(X_valid, y_valid)
    mlflow.log_metric("val_accuracy", val_score)

    print(f"✅ Modèle entraîné avec une précision sur validation : {val_score:.4f}")
    print(f"🔗 Run ID : {run.info.run_id}")

    #  Visualisation de l'importance des features ===
    importances = rf.feature_importances_
    feature_names = X_train.columns
    feature_df = pd.DataFrame({"Feature": feature_names, "Importance": importances})
    feature_df = feature_df.sort_values(by="Importance", ascending=False).head(20)

    plt.figure(figsize=(10, 6))
    sns.barplot(x="Importance", y="Feature", data=feature_df)
    plt.title("Top 20 Features Importantes - Random Forest")
    plt.tight_layout()

    # Sauvegarder localement
    fig_path = "/content/drive/MyDrive/ProjetEts/models/feature_importance.png"
    plt.savefig(fig_path)
    plt.close()

    # Enregistrer dans MLflow comme artefact
    mlflow.log_artifact(fig_path)






with mlflow.start_run(run_name="XGBoost-HDFS") as run:
    model = xgb.XGBClassifier(n_estimators=100, max_depth=5, use_label_encoder=False, eval_metric='logloss')
    model.fit(X_train, y_train)

    # Log des paramètres
    mlflow.log_param("model", "XGBoost")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("max_depth", 5)

    # Log du modèle
    mlflow.sklearn.log_model(model, "model")

    # Évaluation
    val_score = model.score(X_valid, y_valid)
    mlflow.log_metric("val_accuracy", val_score)

    print(f"✅ XGBoost Accuracy : {val_score:.4f}")
    print(f"🔗 Run ID : {run.info.run_id}")

joblib.dump(rf, "/content/drive/MyDrive/ProjetEts/models/random_forest.pkl")


Mounted at /content/drive
Collecting optuna-integration[mlflow]
  Downloading optuna_integration-4.4.0-py3-none-any.whl.metadata (12 kB)
Downloading optuna_integration-4.4.0-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.9/98.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: optuna-integration
Successfully installed optuna-integration-4.4.0
AWS Access Key ID [****************ADFV]: AKIAQTO4ENAVVPUDADFV
AWS Secret Access Key [****************SlAV]: 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
Default region name [us-east-1]: us-east-1
Default output format [None]: 




✅ Modèle entraîné avec une précision sur validation : 0.9772
🔗 Run ID : 653382e45dd9461c8a084e1999b4ceac
🏃 View run RandomForest-HDFS at: http://ec2-18-207-206-140.compute-1.amazonaws.com:5000/#/experiments/904462445519544081/runs/653382e45dd9461c8a084e1999b4ceac
🧪 View experiment at: http://ec2-18-207-206-140.compute-1.amazonaws.com:5000/#/experiments/904462445519544081


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ XGBoost Accuracy : 0.9778
🔗 Run ID : 392d30f60341441893306b116655b1e9
🏃 View run XGBoost-HDFS at: http://ec2-18-207-206-140.compute-1.amazonaws.com:5000/#/experiments/904462445519544081/runs/392d30f60341441893306b116655b1e9
🧪 View experiment at: http://ec2-18-207-206-140.compute-1.amazonaws.com:5000/#/experiments/904462445519544081


['/content/drive/MyDrive/ProjetEts/models/random_forest.pkl']

In [5]:
import os

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

!pip install mlflow
!pip install boto3 awscli
!pip install mlflow optuna xgboost
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn
import os
import ast
import joblib
!aws configure
# Chemin vers les fichiers
os.makedirs("/content/drive/MyDrive/ProjetEts/models", exist_ok=True)
data_dir = "/content/drive/MyDrive/ProjetEts/HDFS_results/"
train_file = os.path.join(data_dir, "Event_occurence_matrix_HDFS_train.csv")
valid_file = os.path.join(data_dir, "Event_occurence_matrix_HDFS_valid.csv")

# Chargement des données
df_train = pd.read_csv(train_file)
df_valid = pd.read_csv(valid_file)

# Suppression des colonnes non pertinentes
X_train = df_train.drop(columns=["BlockId", "Label", "Time", "Date", "Type"], errors='ignore')
y_train = df_train["Label"].apply(lambda x: 1 if x == "Fail" else 0)

X_valid = df_valid.drop(columns=["BlockId", "Label", "Time", "Date", "Type"], errors='ignore')
y_valid = df_valid["Label"].apply(lambda x: 1 if x == "Fail" else 0)

# Définir l'expérience MLflow
mlflow.set_tracking_uri("http://ec2-18-207-206-140.compute-1.amazonaws.com:5000")
mlflow.set_experiment("HDFS")

with mlflow.start_run(run_name="RandomForest-HDFS") as run:
    # Modèle
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    # Log des paramètres
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("random_state", 42)

    # Log du modèle
    mlflow.sklearn.log_model(rf, "model")

    # Évaluation sur validation (calculs dans script suivant)
    val_score = rf.score(X_valid, y_valid)
    mlflow.log_metric("val_accuracy", val_score)

    print(f"✅ Modèle entraîné avec une précision sur validation : {val_score:.4f}")
    print(f"🔗 Run ID : {run.info.run_id}")

joblib.dump(rf, "/content/drive/MyDrive/ProjetEts/models/random_forest.pkl")


Mounted at /content/drive
AWS Access Key ID [****************ADFV]: AKIAQTO4ENAVVPUDADFV
AWS Secret Access Key [****************SlAV]: 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
Default region name [us-east-1]: us-east-1
Default output format [None]: 




✅ Modèle entraîné avec une précision sur validation : 0.9772
🔗 Run ID : 7ff0dbd1675d409585817fea5c3df0df
🏃 View run RandomForest-HDFS at: http://ec2-18-207-206-140.compute-1.amazonaws.com:5000/#/experiments/904462445519544081/runs/7ff0dbd1675d409585817fea5c3df0df
🧪 View experiment at: http://ec2-18-207-206-140.compute-1.amazonaws.com:5000/#/experiments/904462445519544081


['/content/drive/MyDrive/ProjetEts/models/random_forest.pkl']