# Notebook de test pour lancer de manière guidée de nombreux tests

In [11]:
# All needed import
import numpy as np
import pandas as pd
from decouple import config

import mlflow

from typing import Tuple

In [12]:
# All import needed for the pre-processing
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [13]:
# All model import
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor, HistGradientBoostingRegressor

In [3]:
# import for the logs
from utils.mlflow_logs import log_confusion_matrix, log_fn_and_fp, log_f1_score, log_parameters

In [15]:
# Global variable
RANDOM_STATE = 42
SEED = 42
URI = config("URI")
EXPERIMENT_ID = "415539499946844786"

In [16]:
# Path to the dataset
DATA_PATH = "datasets/solar_weather.csv"

In [17]:
MODEL_FUNCTION = {
    # regression
    "ridge": Ridge(),
    "mlp_regressor": MLPRegressor(),
    "light_gmb_poisson": HistGradientBoostingRegressor(loss="poisson"),
    "adaboost": AdaBoostRegressor(),
    "knn_regressor": KNeighborsRegressor(),
}

In [18]:
# Function to get and pre-process the datas
def get_data(frac: float = 1.0) -> Tuple:
    """Function used for the weather dataset"""

    data = pd.read_csv(DATA_PATH).sample(frac=frac, random_state=RANDOM_STATE)
    target_column = "Energy delta[Wh]"
    data = data.drop(["Time"], axis=1)
    # No features to modify

    iforest = IsolationForest(contamination=0.1, random_state=RANDOM_STATE)
    outliers = iforest.fit_predict(data)
    clean_data = data[(outliers != -1)]

    # we normalize
    min_max_scaler = MinMaxScaler()
    clean_array = min_max_scaler.fit_transform(clean_data)
    clean_data = pd.DataFrame(clean_array, columns=clean_data.keys())

    data_values = clean_data.drop([target_column], axis=1)
    data_target = clean_data[target_column]
    x_train, x_test, y_train, y_test = train_test_split(
        data_values, data_target, test_size=0.3, random_state=RANDOM_STATE
    )
    return (x_train, y_train), (x_test, y_test)

In [19]:
# (x_train, y_train), (x_test, y_test) = get_data(0.01)

In [20]:
def log_params(X_train, X_test, model_name) -> None:
    mlflow.log_param("nb_features", X_train.shape[1])
    mlflow.log_param("nb_samples_train", X_train.shape[0])
    mlflow.log_param("nb_samples_test", X_test.shape[0])
    mlflow.log_param("model_name", model_name)

In [21]:
def main():
    mlflow.set_tracking_uri(URI)
    mlflow.sklearn.autolog()
    frac = 0.1  # part of the total dataset to use
    print("data loading")
    (X_train, Y_train), (X_test, Y_test) = get_data(frac)
    for model_name in MODEL_FUNCTION:
        run_name = f"Run-of-{model_name}"
        with mlflow.start_run(run_name=run_name, experiment_id=EXPERIMENT_ID):
            model = MODEL_FUNCTION[model_name]
            model.fit(X_train, Y_train)
            mlflow.sklearn.log_model(model, "model")
            log_params(X_train, X_test, model_name)
            model_uri = mlflow.get_artifact_uri("model")
            eval_data = X_test
            eval_data["label"] = Y_test
            mlflow.evaluate(
                model=model_uri,
                data=eval_data,
                targets="label",
                model_type="regressor",
                evaluators="default",
            )
            mlflow.end_run()
    print(mlflow.get_tracking_uri())

In [22]:
main()

data loading


2023/05/24 08:43:16 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e7f7326641b74398b56b90b828915937', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2023/05/24 08:43:50 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/05/24 08:43:52 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Linear is used.
Unable to serialize underlying model using MLflow, will use SHAP serialization
2023/05/24 08:44:14 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/05/24 08:44:32 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/05/24 08:44:48 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/05/24 08:44:48 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used.
2023/05/24 08:45:12 INFO mlflow.models.eva

http://localhost:5000
