In [None]:
# General-purpose imports
import numpy as np
import pandas as pd
from decouple import config

import mlflow

from typing import Tuple

In [None]:
# Imports needed for the pre-processing
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [None]:
# Model imports
# TODO Import your model here
...

In [None]:
# Imports for the logging helpers
from utils.mlflow_logs import log_confusion_matrix, log_fn_and_fp, log_f1_score, log_parameters

In [None]:
# Global configuration shared by the cells below
RANDOM_STATE = 42  # seed passed to DataFrame.sample, IsolationForest and train_test_split in get_data()
SEED = 42  # NOTE(review): not referenced in the visible cells - confirm whether it is still needed
URI = config("URI")  # read through decouple's config(); main() passes it to mlflow.set_tracking_uri()
EXPERIMENT_ID = ...  # TODO put your experiment id here (main() passes it to mlflow.start_run())

In [None]:
# Path to the dataset; get_data() loads it with pd.read_csv()
DATA_PATH = ...  # TODO put the path to your dataset here

In [None]:
# Registry of the models to train: main() iterates over the names, looks each one up
# and calls .fit(X_train, Y_train) on the stored value.
# NOTE: until it is replaced, the placeholder `{...}` is a set literal containing Ellipsis,
# not a dictionary.
MODEL_FUNCTION = {...}  # TODO create a dictionary for all : model_name -> class   /!\ each model must have a different name

In [None]:
# Function to load and pre-process the data
def get_data(frac: float = 1.0) -> Tuple:
    """Load the dataset, clean it and return a scaled train/test split.

    Pipeline: read the CSV at ``DATA_PATH`` and keep a random fraction of its
    rows, drop the unused columns, drop rows with missing values, label-encode
    the listed columns, discard the rows flagged as outliers by an
    ``IsolationForest``, split 70/30 and finally min-max scale the features.

    Args:
        frac: Fraction of the rows to keep, forwarded to ``DataFrame.sample``.

    Returns:
        ``((x_train, y_train), (x_test, y_test))`` where ``x_*`` are DataFrames
        of scaled features and ``y_*`` are Series holding the unscaled target.
    """
    data = pd.read_csv(DATA_PATH).sample(frac=frac, random_state=RANDOM_STATE)
    target_column = ...  # TODO give here the target column
    data = data.drop([...], axis=1)  # TODO drop here the unnecessary columns

    # Drop incomplete rows BEFORE encoding: otherwise a missing value in an
    # encoded column is turned into a category of its own (or makes the encoder
    # fail on mixed types) and the row is never removed.
    data = data.dropna(axis=0)
    for column in [...]:  # TODO columns to transform into numerical values
        data[column] = LabelEncoder().fit_transform(data[column])

    # fit_predict returns -1 for the rows considered outliers; keep the others.
    iforest = IsolationForest(contamination=0.1, random_state=RANDOM_STATE)
    outliers = iforest.fit_predict(data)
    clean_data = data[outliers != -1].reset_index(drop=True)

    data_values = clean_data.drop([target_column], axis=1)
    data_target = clean_data[target_column]
    x_train, x_test, y_train, y_test = train_test_split(
        data_values, data_target, test_size=0.3, random_state=RANDOM_STATE
    )

    # Normalise the features only, with a scaler fitted on the training split:
    # fitting on the whole dataset leaks test-set statistics into training, and
    # scaling the target turns class labels into fractional values.
    # Test values may therefore fall slightly outside [0, 1].
    min_max_scaler = MinMaxScaler()
    x_train = pd.DataFrame(
        min_max_scaler.fit_transform(x_train),
        columns=x_train.columns,
        index=x_train.index,
    )
    x_test = pd.DataFrame(
        min_max_scaler.transform(x_test),
        columns=x_test.columns,
        index=x_test.index,
    )
    return (x_train, y_train), (x_test, y_test)

In [None]:
# Parameters used by the visualisation application (DO NOT REMOVE THE model_name PARAMETER!)
def log_params(X_train, X_test, model_name) -> None:
    """Log the dataset dimensions and the model name to the active MLflow run.

    Args:
        X_train: Training features; ``shape[0]`` is logged as the number of
            training samples and ``shape[1]`` as the number of features.
        X_test: Test features; ``shape[0]`` is logged as the number of test samples.
        model_name: Name logged under the ``model_name`` parameter.
    """
    train_rows, feature_count = X_train.shape[0], X_train.shape[1]
    entries = (
        ("nb_features", feature_count),
        ("nb_samples_train", train_rows),
        ("nb_samples_test", X_test.shape[0]),
        ("model_name", model_name),
        # TODO add parameters here if you want
    )
    for key, value in entries:
        mlflow.log_param(key, value)

In [None]:
def main():
    """Train every model of ``MODEL_FUNCTION`` and record each one as an MLflow run.

    The data is loaded once with ``get_data``. For each registered model a run
    is started in ``EXPERIMENT_ID``; the model is fitted, logged, described
    with ``log_params`` and evaluated on the test split with ``mlflow.evaluate``.
    """
    mlflow.set_tracking_uri(URI)
    mlflow.sklearn.autolog()  # TODO change autolog() if you are using an other library than sklearn
    frac = ...  # TODO put here the wanted part of the total dataset to use (between 0 and 1)
    print("data loading")
    (X_train, Y_train), (X_test, Y_test) = get_data(frac)

    # Build the evaluation frame once, as a NEW DataFrame: writing the label
    # column onto X_test itself would leave the target inside the test features
    # for every later use of X_test.
    eval_data = X_test.assign(label=Y_test)

    for model_name, model in MODEL_FUNCTION.items():
        # The registry is documented as name -> class; instantiate classes so
        # that both classes and ready-made instances can be registered.
        if isinstance(model, type):
            model = model()
        run_name = f"Run-of-{model_name}"  # TODO you can change the name of the run here
        # The context manager ends the run on exit, so no explicit end_run() is needed.
        with mlflow.start_run(run_name=run_name, experiment_id=EXPERIMENT_ID):
            model.fit(X_train, Y_train)
            mlflow.sklearn.log_model(model, "model")  # TODO change log_model() if you are using an other library than sklearn
            log_params(X_train, X_test, model_name)
            model_uri = mlflow.get_artifact_uri("model")
            mlflow.evaluate(
                model=model_uri,
                data=eval_data,
                targets="label",
                model_type=...,  # TODO complete here the type of model ("regressor" or "classifier")
                evaluators="default",
            )

In [None]:
main()  # Run the whole pipeline: load the data, then fit, log and evaluate every registered model