In [1]:
# All needed import
import numpy as np
import pandas as pd
from decouple import config

import mlflow

from typing import Tuple

In [2]:
# All import needed for the pre-processing
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [3]:
# All model import
# TODO Import your model here
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [4]:
# import for the logs
from utils.mlflow_logs import log_confusion_matrix, log_fn_and_fp, log_f1_score, log_parameters

In [5]:
# Global variable
RANDOM_STATE = 42
SEED = 42
URI = config("URI")
EXPERIMENT_ID = 131349728120206495  # TODO put your experiment id here

In [6]:
# Path to the datasets
DATA_PATH = "datasets/gender_classification_v7.csv"  # TODO put the path to your dataset here

In [7]:
MODEL_FUNCTION = {
    "LightGBM with binary_crossentropy loss 1" : HistGradientBoostingClassifier(loss="binary_crossentropy", learning_rate=0.01, tol=1e-7),
    "LightGBM with binary_crossentropy loss 2" : HistGradientBoostingClassifier(loss="binary_crossentropy", learning_rate=0.05, tol=1e-7),
    "LightGBM with binary_crossentropy loss 3" : HistGradientBoostingClassifier(loss="binary_crossentropy", learning_rate=0.1, tol=1e-7),
    "LightGBM with binary_crossentropy loss 4" : HistGradientBoostingClassifier(loss="binary_crossentropy", learning_rate=0.01, tol=1e-5),
    "LightGBM with binary_crossentropy loss 5" : HistGradientBoostingClassifier(loss="binary_crossentropy", learning_rate=0.05, tol=1e-5),
    "LightGBM with binary_crossentropy loss 6" : HistGradientBoostingClassifier(loss="binary_crossentropy", learning_rate=0.1, tol=1e-5),
    "LightGBM with log_loss loss 1" : HistGradientBoostingClassifier(loss="log_loss", learning_rate=0.01, tol=1e-7),
    "LightGBM with log_loss loss 2" : HistGradientBoostingClassifier(loss="log_loss", learning_rate=0.05, tol=1e-7),
    "LightGBM with log_loss loss 3" : HistGradientBoostingClassifier(loss="log_loss", learning_rate=0.1, tol=1e-7),
    "LightGBM with log_loss loss 4" : HistGradientBoostingClassifier(loss="log_loss", learning_rate=0.01, tol=1e-5),
    "LightGBM with log_loss loss 5" : HistGradientBoostingClassifier(loss="log_loss", learning_rate=0.05, tol=1e-5),
    "LightGBM with log_loss loss 6" : HistGradientBoostingClassifier(loss="log_loss", learning_rate=0.1, tol=1e-5),
    "Multi layer perceptron 1" : MLPClassifier(hidden_layer_sizes=100, activation="identity"),
    "Multi layer perceptron 2" : MLPClassifier(hidden_layer_sizes=100, activation="logistic"),
    "Multi layer perceptron 3" : MLPClassifier(hidden_layer_sizes=100, activation="tanh"),
    "Multi layer perceptron 4" : MLPClassifier(hidden_layer_sizes=100, activation="relu"),
    "Multi layer perceptron 5" : MLPClassifier(hidden_layer_sizes=150, activation="identity"),
    "Multi layer perceptron 6" : MLPClassifier(hidden_layer_sizes=150, activation="logistic"),
    "Multi layer perceptron 7" : MLPClassifier(hidden_layer_sizes=150, activation="tanh"),
    "Multi layer perceptron 8" : MLPClassifier(hidden_layer_sizes=150, activation="relu"),
    "Random Forest 1" : RandomForestClassifier(n_estimators=50, criterion="gini"),
    "Random Forest 2" : RandomForestClassifier(n_estimators=50, criterion="entropy"),
    "Random Forest 3" : RandomForestClassifier(n_estimators=50, criterion="log_loss"),
    "Random Forest 4" : RandomForestClassifier(n_estimators=100, criterion="gini"),
    "Random Forest 5" : RandomForestClassifier(n_estimators=100, criterion="entropy"),
    "Random Forest 6" : RandomForestClassifier(n_estimators=100, criterion="log_loss"),
    "SVC with rbf 1" : SVC(kernel="linear", C=1, gamma="scale", random_state=RANDOM_STATE),
    "SVC with rbf 2" : SVC(kernel="linear", C=0.5, gamma="scale", random_state=RANDOM_STATE),
    "SVC with rbf 3" : SVC(kernel="linear", C=10, gamma="scale", random_state=RANDOM_STATE),
    "SVC with rbf 4" : SVC(kernel="linear", C=1, gamma="auto", random_state=RANDOM_STATE),
    "SVC with rbf 5" : SVC(kernel="linear", C=0.5, gamma="auto", random_state=RANDOM_STATE),
    "SVC with rbf 6" : SVC(kernel="linear", C=10, gamma="auto", random_state=RANDOM_STATE),
}

In [8]:
# Function to get and pre-process the datas
def get_data(frac: float = 1.0) -> Tuple:
    """Function used for the weather dataset"""

    data = pd.read_csv(DATA_PATH).sample(frac=frac, random_state=RANDOM_STATE)
    target_column = "gender" # TODO give here the target column
    data = data.drop([], axis=1)  # TODO drop here the unecessary column
    for column in ["gender"]:  # TODO column to transform in numerical values
        data[column] = LabelEncoder().fit_transform(data[column])
    data = data.dropna(axis=0)

    iforest = IsolationForest(contamination=0.1, random_state=RANDOM_STATE)
    outliers = iforest.fit_predict(data)
    clean_data = data[(outliers != -1)]

    # we normalize
    min_max_scaler = MinMaxScaler()
    clean_array = min_max_scaler.fit_transform(clean_data)
    clean_data = pd.DataFrame(clean_array, columns=clean_data.keys())

    data_values = clean_data.drop([target_column], axis=1)
    data_target = clean_data[target_column]
    x_train, x_test, y_train, y_test = train_test_split(
        data_values, data_target, test_size=0.3, random_state=RANDOM_STATE
    )
    return (x_train, y_train), (x_test, y_test)

In [9]:
def log_params(X_train, X_test, model_name) -> None:
    mlflow.log_param("nb_features", X_train.shape[1])
    mlflow.log_param("nb_samples_train", X_train.shape[0])
    mlflow.log_param("nb_samples_test", X_test.shape[0])
    mlflow.log_param("model_name", model_name)

In [10]:
def main():
    mlflow.set_tracking_uri(URI)
    mlflow.sklearn.autolog()  # TODO change autolog() if you are using an other library than sklearn
    frac = 1  # TODO put here the wanted part of the total dataset to use (between 0 and 1)
    print("data loading")
    (X_train, Y_train), (X_test, Y_test) = get_data(frac)
    for model_name in MODEL_FUNCTION:
        run_name = f"{model_name}"  # TODO you can change the name of the form here
        with mlflow.start_run(run_name=run_name, experiment_id=EXPERIMENT_ID):
            model = MODEL_FUNCTION[model_name]
            model.fit(X_train, Y_train)
            mlflow.sklearn.log_model(model, "model")  # TODO change autolog() if you are using an other library than sklearn
            log_params(X_train, X_test, model_name)
            model_uri = mlflow.get_artifact_uri("model")
            eval_data = X_test
            eval_data["label"] = Y_test
            mlflow.evaluate(
                model=model_uri,
                data=eval_data,
                targets="label",
                model_type="classifier",  # TODO complete here the type of model ("regressor" or "classifier")
                evaluators="default",
            )
            mlflow.end_run()

In [11]:
main()  # We launch it all

data loading


2023/06/01 09:36:51 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd4c6c34903ed4a60936cb07ade1976f9', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2023/06/01 09:37:25 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/01 09:37:25 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1.0, negative label is 0.0.
2023/06/01 09:37:31 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used.
Unable to serialize underlying model using MLflow, will use SHAP serialization
The loss 'binary_crossentropy' was deprecated in v1.1 and will be removed in version 1.3. Use 'log_loss' which is equivalent.
2023/06/01 09:37:58 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/01 09:37:58 INFO mlflow.models.evaluation.default

In [12]:
(X_train, Y_train), (X_test, Y_test) = get_data(1)
print(Y_test.count(1))

2023/06/01 10:18:51 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '3a5c5b22ab5f48bcbe2c20add6e17e32', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
X does not have valid feature names, but IsolationForest was fitted with feature names
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. ser.count(level=1) should use ser.groupby(level=1).count().


ValueError: Series.count level is only valid with a MultiIndex

In [17]:
nb_0 = 0
nb_1 = 0
for i in Y_test.to_numpy():
    if i == 1:
        nb_1 += 1
    elif i == 0:
        nb_0 += 1
    else:
        print("Non-binaire")
print(nb_0)
print(nb_1)

689
662
