### Import

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from spacy.lang.fr import stop_words
from sklearn.linear_model import LogisticRegression
from mlflow.models import infer_signature
stop_words = list(stop_words.STOP_WORDS)
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import mlflow
import subprocess
from mlflow import MlflowClient

### Function

In [3]:
def build_model(
    training_set,
    pipeline,
    name,
    registered_name,
    mlflow_run_tags = None,
    mlflow_run_parameters = None,
    mlflow_run_description = None,
    test_set = None
):
    """
    Train a sentiment analysis pipeline inside an MLFlow run, log it and register it.

    @param: training_set: table-like object (e.g. pandas dataframe) exposing a "review"
            column used as input and a "polarity" column used as target
    @param: pipeline: scikit-learn pipeline that will be fitted on the training set
    @param: name: name under which the model is logged in the MLFlow run
    @param: registered_name: name of the registered model in the MLFlow model registry
    @param: mlflow_run_tags: optional dict of tags stored on the MLFlow run
    @param: mlflow_run_parameters: optional dict of parameters stored on the MLFlow run
    @param: mlflow_run_description: optional textual description of the run; when given it
            takes precedence over a "mlflow.note.content" entry in mlflow_run_tags
    @param: test_set: if provided (same columns as training_set), used to compute the
            accuracy, logged as the "test_accuracy_score" metric
    @return: the trained pipeline
    """
    # The context manager closes the run even when fitting or logging raises,
    # so a failed call does not leave an active run behind.
    with mlflow.start_run():
        # Tags and parameters are optional: skip the calls when nothing was provided.
        if mlflow_run_tags:
            mlflow.set_tags(mlflow_run_tags)
        # Set after the generic tags so the explicit description is not overwritten.
        if mlflow_run_description is not None:
            mlflow.set_tag("mlflow.note.content", mlflow_run_description)
        if mlflow_run_parameters:
            mlflow.log_params(mlflow_run_parameters)

        reviews = training_set["review"]
        pipeline.fit(reviews, training_set["polarity"])

        if test_set is not None:
            pred = pipeline.predict(test_set["review"])
            # scikit-learn convention: ground truth first, predictions second.
            test_accuracy_score = accuracy_score(test_set["polarity"].to_numpy(), pred)
            mlflow.log_metric("test_accuracy_score", test_accuracy_score)

        # Input/output schema inferred from the training reviews and their predictions.
        signature = infer_signature(reviews, pipeline.predict(reviews))
        mlflow.sklearn.log_model(pipeline, name=name, signature=signature, registered_model_name=registered_name)

        # Tag the registered model with its owner.
        client = MlflowClient()
        client.set_registered_model_tag(registered_name, "owner", "aina")

    return pipeline



### Test

In [4]:
# Address of the MLflow tracking server used by every run started in this notebook.
remote_server_uri = "http://localhost:5000/"
mlflow.set_tracking_uri(uri=remote_server_uri)
# Enable scikit-learn autologging, with dataset logging switched off.
mlflow.sklearn.autolog(log_datasets=False)
# Kept as the last expression of the cell so the selected Experiment is displayed.
mlflow.set_experiment(experiment_name="/1st-experiment")

<Experiment: artifact_location='mlflow-artifacts:/529200409768567991', creation_time=1763472285759, experiment_id='529200409768567991', last_update_time=1763472285759, lifecycle_stage='active', name='/1st-experiment', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [5]:
# Load the train / test / validation splits (paths are relative to the notebook folder).
df_train = pd.read_csv("../../data/train.csv")
df_test = pd.read_csv("../../data/test.csv")
df_valid = pd.read_csv("../../data/valid.csv")
# NOTE(review): df_test and df_valid are loaded but not passed to build_model below,
# so no "test_accuracy_score" metric is logged for this run -- confirm this is intended.

# Baseline: TF-IDF features (French stop words removed) fed to a logistic regression.
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words=stop_words)),
    ("logreg", LogisticRegression(solver="liblinear"))
])

# text=True + strip(): check_output otherwise returns raw bytes ending with a newline,
# which would be stored as-is in the MLflow tag.
commit_hash = subprocess.check_output(["git", "rev-parse", "HEAD"], text=True).strip()
# Record the branch that is actually checked out instead of the literal "HEAD"
# (git itself still answers "HEAD" when the repository is in detached-HEAD state).
branch_name = subprocess.check_output(
    ["git", "rev-parse", "--abbrev-ref", "HEAD"], text=True
).strip()

# "mlflow.note.content" is deliberately not set here: it is the same tag that
# build_model writes from mlflow_run_description, and the two values would overwrite
# each other.
mlflow_run_tag={
"mlflow.source.name" : "model_design_2.ipynb",
"mlflow.source.git.commit" : commit_hash,
"mlflow.source.git.branch" : branch_name,
}
mlflow_run_parameters =  {"stop_words" :stop_words , "solver" : "liblinear"}
mlflow_run_description = "This is a TfidfVectorizer couple with Logistic regression model"


# Assign the result so the cell does not echo the function's return value.
trained_pipeline = build_model(df_train, pipeline,"sentiment-analyzer-baseline", "sentiment-analyzer-baseline",
            mlflow_run_tags=mlflow_run_tag, mlflow_run_parameters=mlflow_run_parameters,
            mlflow_run_description=mlflow_run_description)


Successfully registered model 'sentiment-analyzer-baseline'.
2025/11/25 14:14:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: sentiment-analyzer-baseline, version 1
Created version '1' of model 'sentiment-analyzer-baseline'.


🏃 View run capricious-bear-718 at: http://localhost:5000/#/experiments/529200409768567991/runs/9fc033e53ddc45ca82e840ff5453943e
🧪 View experiment at: http://localhost:5000/#/experiments/529200409768567991
