In [1]:
import mlflow
import mlflow.sklearn
import spacy
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop

In [1]:
def build_model(
    training_set,
    pipeline,
    mlflow_run_tags=None,
    mlflow_run_parameters=None,
    mlflow_run_description=None,
    validation_set=None
):
    """
    Build a sentiment analysis model, print the evaluation result and store everything to MLFlow.

    @param: training_set: pandas dataframe containing the input training set
    @param: pipeline: scikit-learn pipeline that will be applied to the input data
    @param: mlflow_run_tags: dict of tags that will be stored in the MLFlow run
    @param: mlflow_run_parameters: dict of parameters that will be stored in the MLFlow run
    @param: mlflow_run_description: textual description of the run
    @param: validation_set: if provided, used to evaluate the model and log result in MLFlow
    @return: the trained pipeline
    """
    # Séparation des features et des labels
    X_train,y_train = training_set[0], training_set[1]
    if validation_set is not None:
        X_val,y_val  = validation_set[0],validation_set[1]
    with mlflow.start_run() as run:
        mlflow.sklearn.autolog(log_datasets=False)
        # Ajout de tags
        if mlflow_run_tags:
            for tag, value in mlflow_run_tags.items():
                mlflow.set_tag(tag, value)        
        if mlflow_run_parameters:
            for param, value in mlflow_run_parameters.items():
                mlflow.log_param(param, value)
        if mlflow_run_description:
            mlflow.set_tag("Description", mlflow_run_description)

        # Entraînement le modèle
        pipeline.fit(X_train, y_train)
        # Évalue le modèle si le jeu de validation est fourni
        if validation_set is not None:
            y_pred = pipeline.predict(X_val)
            accuracy = accuracy_score(y_val, y_pred)
            precision = precision_score(y_val, y_pred)
            recall = recall_score(y_val, y_pred)
            
            mlflow.log_metric("accuracy_val", accuracy)
            mlflow.log_metric("precision_val", precision)
            mlflow.log_metric("recall_val", recall)
        mlflow.sklearn.log_model(pipeline, "model_pipeline")

    return pipeline


In [3]:
nlp = spacy.load("fr_core_news_sm")
train_data = pd.read_csv("../archive/train.csv")
valid_data = pd.read_csv("../archive/valid.csv")
test_data  = pd.read_csv("../archive/test.csv")

train_reviews = train_data.review.values
valid_review  = valid_data.review.values

y_val  = valid_data['polarity']
y_train = train_data['polarity']

### Experimentations


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB


mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

mlflow.set_experiment("MLflow log_reg experience")

MAX_ITER = 500
scores = []
parameters = {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1','l2']}
for c in parameters.get('C'):
    for pen in parameters.get('penalty'):
        lg = LogisticRegression(C=c, penalty=pen, max_iter=MAX_ITER, solver='liblinear', random_state=42)
        estimators = [('pre_processing', TfidfVectorizer(stop_words=list(fr_stop))),('lrg', lg)]
        pipe = Pipeline(estimators)
        pipe_res  = build_model((train_reviews,y_train),pipe,validation_set=(valid_review,y_val))



2024/11/12 17:30:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run secretive-wasp-751 at: http://127.0.0.1:5000/#/experiments/999691338119668531/runs/c7b822185b154413a28079b9c6fc94e1.
2024/11/12 17:30:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/999691338119668531.
2024/11/12 17:32:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run bedecked-owl-858 at: http://127.0.0.1:5000/#/experiments/999691338119668531/runs/384aee30ee6644c1a9284c352c0d0756.
2024/11/12 17:32:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/999691338119668531.
2024/11/12 17:35:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run adorable-robin-205 at: http://127.0.0.1:5000/#/experiments/999691338119668531/runs/942cdd35b96e428fa4cb4ca743d2a79f.
2024/11/12 17:35:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/99

KeyboardInterrupt: 

In [None]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.metrics import accuracy_score

def build_optimized_model(
    training_set,
    validation_set,
    pipeline_template,
    search_space,
    mlflow_run_tags=None,
    mlflow_run_description=None
):
    """
    Build and optimize a machine learning model, logging all results to MLFlow.

    Parameters:
    - training_set: tuple (X_train, y_train)
    - validation_set: tuple (X_val, y_val)
    - pipeline_template: a scikit-learn pipeline template
    - search_space: hyperopt search space
    - mlflow_run_tags: dict of tags for MLFlow
    - mlflow_run_description: textual description for the MLFlow run

    Returns:
    - The trained and optimized pipeline.
    """
    X_train, y_train = training_set
    X_val, y_val = validation_set

    def objective(params):
        pipeline = pipeline_template.set_params(**params)        
        pipeline.fit(X_train, y_train)
        # Evaluate on the validation set
        y_pred = pipeline.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred)
        return {'loss': 1 - accuracy, 'status': STATUS_OK}

    # Use of hyperopt to find the best parameters
    trials = Trials()
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=5, 
        trials=trials
    )
    print(best_params)
    # Train the final model with the best parameters
    best_params_with_prefix = {f"lrg__{key}": value for key, value in best_params.items()}
    optimized_pipeline = pipeline_template.set_params(**best_params_with_prefix)
    optimized_pipeline.fit(X_train, y_train)
    
    # Log everything to MLFlow
    with mlflow.start_run() as run:
        if mlflow_run_tags:
            for tag, value in mlflow_run_tags.items():
                mlflow.set_tag(tag, value)
        if mlflow_run_description:
            mlflow.set_tag("Description", mlflow_run_description)

        # Log the best hyperparameters
        mlflow.log_params(best_params)
        
        # Evaluate on the validation set
        y_pred = optimized_pipeline.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred)
        mlflow.log_metric("accuracy_val", accuracy)

        # Log the final model
        mlflow.sklearn.log_model(optimized_pipeline, "optimized_pipeline")

    return optimized_pipeline


In [12]:
### Search space
search_space = {
    'lrg__C': hp.loguniform('C', 0.01, 100),  # log scale for C (e.g., 0.01 to 100)
    'lrg__penalty': hp.choice('penalty', ['l1', 'l2']),
}

X_train, y_train = train_reviews, train_data['polarity']
X_val, y_val = valid_review, valid_data['polarity']

pipeline_template = Pipeline([
    ('pre_processing', TfidfVectorizer(stop_words=list(fr_stop))),
    ('lrg', LogisticRegression(solver='liblinear', random_state=42)),
])

optimized_model = build_optimized_model(
    training_set=(X_train, y_train),
    validation_set=(X_val, y_val),
    pipeline_template=pipeline_template,
    search_space=search_space,
    mlflow_run_tags={"optimization": "hyperopt"},
    mlflow_run_description="Hyperparameter optimization with hyperopt"
)


  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]





 60%|██████    | 3/5 [03:03<01:59, 59.56s/trial, best loss: 0.12490000000000001]




100%|██████████| 5/5 [05:03<00:00, 60.62s/trial, best loss: 0.12490000000000001]


InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'elasticnet', 'l1', 'l2'} or None. Got 1 instead.