# Notebook de test pour lancer de manière guidée de nombreux tests

In [37]:
# All needed import
import numpy as np
import pandas as pd
from decouple import config

import mlflow
from mlflow.client import MlflowClient

from typing import Tuple

In [38]:
# All import needed for the pre-processing
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [59]:
# All model import
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor, HistGradientBoostingRegressor

In [40]:
# Global variable
RANDOM_STATE = 42
SEED = 42
URI = config("URI")
EXPERIMENT_ID = 415539499946844786

In [41]:
# Path to the dataset
DATA_PATH = "../../data/solar_weather.csv"

In [60]:
MODEL_REGRESSION = ["ridge", "mlp_regressor", "knn_regressor", "light_gmb_poisson", "adaboost"]
MODEL_FUNCTION = {
    # regression
    "ridge": Ridge(),
    "mlp_regressor": MLPRegressor(),
    "light_gmb_poisson": HistGradientBoostingRegressor(loss="poisson"),
    "adaboost": AdaBoostRegressor(),
    "knn_regressor": KNeighborsRegressor(),
}

In [43]:
# Function to get and pre-process the datas
def get_data(frac: float = 1.0) -> Tuple:
    """Function used for the weather dataset"""

    data = pd.read_csv(DATA_PATH).sample(frac=frac, random_state=RANDOM_STATE)
    target_column = "Energy delta[Wh]"
    data = data.drop(["Time"], axis=1)
    # No features to modify

    iforest = IsolationForest(contamination=0.1, random_state=RANDOM_STATE)
    outliers = iforest.fit_predict(data)
    clean_data = data[(outliers != -1)]

    # we normalize
    min_max_scaler = MinMaxScaler()
    clean_array = min_max_scaler.fit_transform(clean_data)
    clean_data = pd.DataFrame(clean_array, columns=clean_data.keys())

    data_values = clean_data.drop([target_column], axis=1)
    data_target = clean_data[target_column]
    x_train, x_test, y_train, y_test = train_test_split(
        data_values, data_target, test_size=0.3, random_state=RANDOM_STATE
    )
    return (x_train, y_train), (x_test, y_test)

In [44]:
# (x_train, y_train), (x_test, y_test) = get_data(0.01)



In [53]:
def mlflow_preparation():
    mlflow.set_tracking_uri(URI)
    experiment_id = mlflow.set_experiment(experiment_id=EXPERIMENT_ID)
    return experiment_id


def log_params(data_train, data_test, model_name) -> None:
    mlflow.log_param("nb_features", data_train[0].shape[1])
    mlflow.log_param("nb_samples_train", data_train[0].shape[0])
    mlflow.log_param("nb_samples_test", data_test[0].shape[0])
    mlflow.log_param("model_name", model_name)


def launch_model(run_name, experiment_id, data_train: tuple, data_test: tuple, model):
    mlflow.start_run(run_name=run_name, experiment_id=experiment_id)
    log_params(data_train, data_test, model)
    model = MODEL_FUNCTION[model]
    model.fit(data_train[0], data_train[1])
    mlflow.sklearn.log_model(model, "model")
    model_uri = mlflow.get_artifact_uri("model")
    eval_data = data_test[0]
    eval_data["label"] = data_test[1]
    mlflow.evaluate(
        model=model_uri,
        data=eval_data,
        targets="label",
        model_type="regressor",
        evaluators="default",
    )
    print(f"Model saved in run {mlflow.active_run().info.run_uuid}")
    mlflow.end_run()


In [66]:
def main():
    size = [1]
    experiment_id = mlflow_preparation()
    # for data in DATA_REGRESSION:
    for frac in size:
        print("data loading")
        data_train, data_test = get_data(frac)
        print("data loaded")
        for model in MODEL_REGRESSION:
            run_name = model + "-Test-" + str(frac)
            print(run_name + " start")
            launch_model(run_name, EXPERIMENT_ID, data_train, data_test, model)
            print(run_name + " over")
    print(mlflow.get_tracking_uri())
    mlflow.end_run()

In [67]:
main()

data loading


X does not have valid feature names, but IsolationForest was fitted with feature names


data loaded
ridge-Test-0.1 start


2023/05/23 14:42:33 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/05/23 14:42:33 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Linear is used.
Unable to serialize underlying model using MLflow, will use SHAP serialization


Model saved in run bb5da16b977544a9bb6bd848b6d4726e
ridge-Test-0.1 over
mlp_regressor-Test-0.1 start


2023/05/23 14:42:47 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.


Model saved in run 7e9e21ea28de4de68b54b8511f4bf3ff
mlp_regressor-Test-0.1 over
knn_regressor-Test-0.1 start


2023/05/23 14:42:58 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.


Model saved in run 63ba824f3d6147708f4a596501e2280e
knn_regressor-Test-0.1 over
light_gmb_poisson-Test-0.1 start


2023/05/23 14:43:09 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/05/23 14:43:09 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used.


Model saved in run 23fd91d993b74be6bfd81073057d4e76
light_gmb_poisson-Test-0.1 over
adaboost-Test-0.1 start


2023/05/23 14:43:23 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.


Model saved in run 06c44fb37ec149ccbeb75382b86e9c32
adaboost-Test-0.1 over
http://localhost:5000
