In [1]:
# All needed import
import numpy as np
import pandas as pd
from decouple import config

import mlflow

from typing import Tuple

In [2]:
# All import needed for the pre-processing
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [3]:
# All model import
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import AdaBoostRegressor, HistGradientBoostingRegressor

In [4]:
# import for the logs
from utils.mlflow_logs import log_confusion_matrix, log_fn_and_fp, log_f1_score, log_parameters

In [5]:
# Global variable
RANDOM_STATE = 42
SEED = 42
URI = config("URI")
EXPERIMENT_ID = "760849660285383714"  # TODO put your experiment id here (str)

In [6]:
# Path to the datasets
DATA_PATH = "datasets/ds_salaries.csv"  # TODO put the path to your dataset here

In [7]:
MODEL_FUNCTION = {
    "lightGBM with poisson loss 1" : HistGradientBoostingRegressor(loss="poisson", learning_rate=0.01, tol=1e-7),
    "lightGBM with poisson loss 2" : HistGradientBoostingRegressor(loss="poisson", learning_rate=0.05, tol=1e-7),
    "lightGBM with poisson loss 3" : HistGradientBoostingRegressor(loss="poisson", learning_rate=0.1, tol=1e-7),
    "lightGBM with poisson loss 4" : HistGradientBoostingRegressor(loss="poisson", learning_rate=0.01, tol=1e-5),
    "lightGBM with poisson loss 5" : HistGradientBoostingRegressor(loss="poisson", learning_rate=0.05, tol=1e-5),
    "lightGBM with poisson loss 6" : HistGradientBoostingRegressor(loss="poisson", learning_rate=0.1, tol=1e-5),
    "lightGBM with abs loss 1" : HistGradientBoostingRegressor(loss="absolute_error", learning_rate=0.01, tol=1e-7),
    "lightGBM with abs loss 2" : HistGradientBoostingRegressor(loss="absolute_error", learning_rate=0.05, tol=1e-7),
    "lightGBM with abs loss 3" : HistGradientBoostingRegressor(loss="absolute_error", learning_rate=0.1, tol=1e-7),
    "lightGBM with abs loss 4" : HistGradientBoostingRegressor(loss="absolute_error", learning_rate=0.01, tol=1e-5),
    "lightGBM with abs loss 5" : HistGradientBoostingRegressor(loss="absolute_error", learning_rate=0.05, tol=1e-5),
    "lightGBM with abs loss 6" : HistGradientBoostingRegressor(loss="absolute_error", learning_rate=0.1, tol=1e-5),
    "multi layer perceptron 1" : MLPRegressor(hidden_layer_sizes=100, activation="identity"),
    "multi layer perceptron 2" : MLPRegressor(hidden_layer_sizes=100, activation="logistic"),
    "multi layer perceptron 3" : MLPRegressor(hidden_layer_sizes=100, activation="tanh"),
    "multi layer perceptron 4" : MLPRegressor(hidden_layer_sizes=100, activation="relu"),
    "multi layer perceptron 5" : MLPRegressor(hidden_layer_sizes=150, activation="identity"),
    "multi layer perceptron 6" : MLPRegressor(hidden_layer_sizes=150, activation="logistic"),
    "multi layer perceptron 7" : MLPRegressor(hidden_layer_sizes=150, activation="tanh"),
    "multi layer perceptron 8" : MLPRegressor(hidden_layer_sizes=150, activation="relu"),
    "Adaboost 1" : AdaBoostRegressor(n_estimators=50, loss="linear"),
    "Adaboost 2" : AdaBoostRegressor(n_estimators=50, loss="square"),
    "Adaboost 3" : AdaBoostRegressor(n_estimators=50, loss="exponential"),
    "Adaboost 4" : AdaBoostRegressor(n_estimators=100, loss="linear"),
    "Adaboost 5" : AdaBoostRegressor(n_estimators=100, loss="square"),
    "Adaboost 6" : AdaBoostRegressor(n_estimators=100, loss="exponential"),
    "Ridge 1" : Ridge(alpha=0.25),
    "Ridge 2" : Ridge(alpha=0.5),
    "Ridge 3" : Ridge(alpha=1),
    "Ridge 4" : Ridge(alpha=1.5),
    "Ridge 5" : Ridge(alpha=2),
}  # TODO create a dictionary for all : model_name -> class    /!\ each model must have a different name

In [8]:
# Function to get and pre-process the datas
def get_data(frac: float = 1.0) -> Tuple:
    """Function used for the weather dataset"""

    data = pd.read_csv(DATA_PATH).sample(frac=frac, random_state=RANDOM_STATE)
    target_column = "salary_in_usd" # TODO give here the target column
    data = data.drop(["salary_currency", "salary"], axis=1)  # TODO drop here the unecessary column
    for column in ["experience_level", "employment_type", "job_title", "employee_residence", "company_location", "company_size"]:  # TODO column to transform in numerical values
        data[column] = LabelEncoder().fit_transform(data[column])
    data = data.dropna(axis=0)

    iforest = IsolationForest(contamination=0.1, random_state=RANDOM_STATE)
    outliers = iforest.fit_predict(data)
    clean_data = data[(outliers != -1)]

    # we normalize
    min_max_scaler = MinMaxScaler()
    clean_array = min_max_scaler.fit_transform(clean_data)
    clean_data = pd.DataFrame(clean_array, columns=clean_data.keys())

    data_values = clean_data.drop([target_column], axis=1)
    data_target = clean_data[target_column]
    x_train, x_test, y_train, y_test = train_test_split(
        data_values, data_target, test_size=0.3, random_state=RANDOM_STATE
    )
    return (x_train, y_train), (x_test, y_test)

In [9]:
def log_params(X_train, X_test, model_name) -> None:
    mlflow.log_param("nb_features", X_train.shape[1])
    mlflow.log_param("nb_samples_train", X_train.shape[0])
    mlflow.log_param("nb_samples_test", X_test.shape[0])
    mlflow.log_param("model_name", model_name)
    mlflow.log_param("model_class", type(MODEL_FUNCTION[model_name]))

In [10]:
def main():
    mlflow.set_tracking_uri(URI)
    mlflow.sklearn.autolog()
    frac = 1  # TODO put here the wanted part of the total dataset to use (between 0 and 1)
    print("data loading")
    (X_train, Y_train), (X_test, Y_test) = get_data(frac)
    for model_name in MODEL_FUNCTION:
        run_name = f"Run of {model_name}"  # TODO you can change the name of the form here
        with mlflow.start_run(run_name=run_name, experiment_id=EXPERIMENT_ID):
            model = MODEL_FUNCTION[model_name]
            model.fit(X_train, Y_train)
            mlflow.sklearn.log_model(model, "model")
            log_params(X_train, X_test, model_name)  # this line is optional
            model_uri = mlflow.get_artifact_uri("model")
            eval_data = X_test
            eval_data["label"] = Y_test
            mlflow.evaluate(
                model=model_uri,
                data=eval_data,
                targets="label",
                model_type="regressor",  # TODO complete here the type of model ("regressor" or "classifier")
                evaluators="default",
            )
            mlflow.end_run()

In [11]:
main()  # We launch it all

data loading


2023/06/06 11:16:00 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '3b57aaf0b5e64075bcbc6564c3c4a1f3', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2023/06/06 11:16:28 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/06 11:16:30 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used.
Unable to serialize underlying model using MLflow, will use SHAP serialization
2023/06/06 11:16:51 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/06 11:16:51 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used.
Unable to serialize underlying model using MLflow, will use SHAP serialization
2023/06/06 11:17:12 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/06 11:17:12 INFO mlflow.models.evaluation.default_evaluator: S