# Notebook de test pour lancer de manière guidée de nombreux tests

In [52]:
# All needed import
import numpy as np
import pandas as pd
from decouple import config

import mlflow
from mlflow.client import MlflowClient

from typing import Tuple

In [53]:
# All import needed for the pre-processing
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [54]:
# All model import
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor, HistGradientBoostingRegressor

In [55]:
# Global variable
RANDOM_STATE = 42
SEED = 42
URI = config("URI")
EXPERIMENT_ID = "415539499946844786"

In [56]:
# Path to the dataset
DATA_PATH = "../../data/solar_weather.csv"

In [57]:
MODEL_REGRESSION = ["ridge", "mlp_regressor", "knn_regressor", "light_gmb_poisson", "adaboost"]
MODEL_FUNCTION = {
    # regression
    "ridge": Ridge(),
    "mlp_regressor": MLPRegressor(),
    "light_gmb_poisson": HistGradientBoostingRegressor(loss="poisson"),
    "adaboost": AdaBoostRegressor(),
    "knn_regressor": KNeighborsRegressor(),
}

In [58]:
# Function to get and pre-process the datas
def get_data(frac: float = 1.0) -> Tuple:
    """Function used for the weather dataset"""

    data = pd.read_csv(DATA_PATH).sample(frac=frac, random_state=RANDOM_STATE)
    target_column = "Energy delta[Wh]"
    data = data.drop(["Time"], axis=1)
    # No features to modify

    iforest = IsolationForest(contamination=0.1, random_state=RANDOM_STATE)
    outliers = iforest.fit_predict(data)
    clean_data = data[(outliers != -1)]

    # we normalize
    min_max_scaler = MinMaxScaler()
    clean_array = min_max_scaler.fit_transform(clean_data)
    clean_data = pd.DataFrame(clean_array, columns=clean_data.keys())

    data_values = clean_data.drop([target_column], axis=1)
    data_target = clean_data[target_column]
    x_train, x_test, y_train, y_test = train_test_split(
        data_values, data_target, test_size=0.3, random_state=RANDOM_STATE
    )
    return (x_train, y_train), (x_test, y_test)

In [59]:
# (x_train, y_train), (x_test, y_test) = get_data(0.01)

In [60]:
def log_params(data_train, data_test, model_name) -> None:
    mlflow.log_param("nb_features", data_train[0].shape[1])
    mlflow.log_param("nb_samples_train", data_train[0].shape[0])
    mlflow.log_param("nb_samples_test", data_test[0].shape[0])
    mlflow.log_param("model_name", model_name)


def launch_model(run_name, experiment_id, data_train: tuple, data_test: tuple, model):
    log_params(data_train, data_test, model)
    model = MODEL_FUNCTION[model]
    model.fit(data_train[0], data_train[1])
    mlflow.sklearn.log_model(model, "model")
    model_uri = mlflow.get_artifact_uri("model")
    eval_data = data_test[0]
    eval_data["label"] = data_test[1]
    mlflow.evaluate(
        model=model_uri,
        data=eval_data,
        targets="label",
        model_type="regressor",
        evaluators="default",
    )
    print(f"Model saved in run {mlflow.active_run().info.run_uuid}")
    mlflow.end_run()


In [72]:
def main():
    mlflow.set_tracking_uri(URI)
    mlflow.sklearn.autolog()
    size = [1]
    # for data in DATA_REGRESSION:
    for frac in size:
        print("data loading")
        (X_train, Y_train), (X_test, Y_test) = get_data(frac)
        for model_name in MODEL_REGRESSION:
            run_name = model_name + "-Test-" + str(frac) + "-bis"
            with mlflow.start_run(run_name=run_name, experiment_id=EXPERIMENT_ID):
                model = MODEL_FUNCTION[model_name]
                model.fit(X_train, Y_train)
                mlflow.sklearn.log_model(model, "model")
                model_uri = mlflow.get_artifact_uri("model")
                eval_data = X_test
                eval_data["label"] = Y_test
                mlflow.evaluate(
                    model=model_uri,
                    data=eval_data,
                    targets="label",
                    model_type="regressor",
                    evaluators="default",
                )
                mlflow.end_run()
    print(mlflow.get_tracking_uri())

In [73]:
main()

data loading


2023/05/23 17:01:03 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '4138ac0ad1054a639ddb8232d518b261', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
X does not have valid feature names, but IsolationForest was fitted with feature names
2023/05/23 17:01:42 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/05/23 17:01:43 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Linear is used.
Unable to serialize underlying model using MLflow, will use SHAP serialization
2023/05/23 17:02:11 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/05/23 17:03:48 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/05/23 17:04:42 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/05/23 17:04:43 INFO mlflow.models.evaluation.def

http://localhost:5000


In [63]:
mlflow.end_run()