# New model design

This notebook presents the design of a word-to-vector embedding model used to predict the polarity of a film review.

In [3]:
import logging
import mlflow
import polars as pl
import numpy as np
import datetime

# Grab the logger registered under the name "mlflow" and lower its threshold to
# DEBUG, so debug-level messages sent to that logger are no longer filtered out.
# NOTE(review): this is very verbose in cell outputs — confirm DEBUG is still needed.
logger = logging.getLogger("mlflow")
logger.setLevel(logging.DEBUG)

In [8]:
# Locations of the two CSV splits, relative to the notebook's working directory.
train_csv_path = '../data/train/train.csv'
test_csv_path = '../data/test/test.csv'

# Go through the lazy scanner and materialise each split immediately.
df_train = pl.scan_csv(train_csv_path).collect()
df_test = pl.scan_csv(test_csv_path).collect()

# Show the first rows of the training split as the cell output.
df_train.head()

Unnamed: 0_level_0,film-url,review,polarity
i64,str,str,i64
0,"""http://www.allocine.fr/film/fi…","""Si vous cherchez du cinéma abr…",0
1,"""http://www.allocine.fr/film/fi…","""Trash, re-trash et re-re-trash…",0
2,"""http://www.allocine.fr/film/fi…","""Et si, dans les 5 premières mi…",0
3,"""http://www.allocine.fr/film/fi…","""Mon dieu ! Quelle métaphore fi…",0
4,"""http://www.allocine.fr/film/fi…","""Premier film de la saga Kozure…",1


In [13]:
# Persist the first rows of the raw training split as a small CSV file;
# the same path is later handed to build_model as `dataset_preview`.
preview_csv_path = '../data/dataset_preview_brut.csv'

dataset_preview = df_train.head()
dataset_preview.write_csv(preview_csv_path)

In [6]:
def compute_metrics(pred, true):
    """
        Compute accuracy, precision and recall for binary (0/1) labels.
        @param: pred: predicted labels; a NumPy array, an object exposing `.int()`
                (e.g. a tensor) or a plain sequence of 0/1 values
        @param: true: ground-truth labels, same accepted kinds and same length as `pred`
        @return: tuple (accuracy, precision, recall); each value is 0 when its
                 denominator is 0 (e.g. empty input or no positive prediction)
    """
    def _as_labels(values):
        # Each argument is normalised on its own, so mixing a NumPy array with a
        # tensor-like object no longer depends on which one is passed first.
        if isinstance(values, np.ndarray):
            return values
        if hasattr(values, "int"):
            return values.int()
        return np.asarray(values)

    pred = _as_labels(pred)
    true = _as_labels(true)

    # Confusion-matrix cells, with 1 as the positive class.
    TP = ((pred == 1) & (true == 1)).sum().item()
    TN = ((pred == 0) & (true == 0)).sum().item()
    FP = ((pred == 1) & (true == 0)).sum().item()
    FN = ((pred == 0) & (true == 1)).sum().item()

    total = TP + TN + FP + FN
    # Same zero-denominator fallback for all three metrics (accuracy used to raise).
    accuracy  = (TP + TN) / total if total > 0 else 0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall    = TP / (TP + FN) if (TP + FN) > 0 else 0

    return accuracy, precision, recall


## Build a production-style pipeline for publishing the model to MLflow

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import git
from datetime import datetime

def build_model(
        training_set,
        pipeline,
        pipeline_name = "sentiment_analysis_pipeline",
        mlflow_run_tags = None,
        mlflow_run_parameters = None,
        dataset_preview = None,
        test_set = None,
        traking_uri = "http://localhost:5000",
        experiment_name = "sentiment_analysis_experiment"
    ):
    """
        Build a sentiment analysis model, evaluate it and store everything to MLFlow
        @param: training_set: polars dataframe containing the input training set
                ("review" column as input, "polarity" column as target)
        @param: pipeline: scikit-learn pipeline that will be applied to the input data
        @param: pipeline_name: prefix of the name under which the model is logged;
                a timestamp is appended to it
        @param: mlflow_run_tags: dict of tags that will be stored in the MLFlow run
        @param: mlflow_run_parameters: dict of parameters that will be stored in the MLFlow run
        @param: dataset_preview: path of a file logged as an artifact of the run
        @param: test_set: if provided, used to evaluate the model and log result in MLFlow
                (same columns as training_set)
        @param: traking_uri: URI of the MLFlow tracking server
        @param: experiment_name: name of the MLFlow experiment receiving the run
        @return: the trained pipeline
    """
    X_train = training_set.select("review").to_numpy().ravel()
    y_train = training_set.select("polarity").to_numpy().ravel()
    mlflow.set_tracking_uri(traking_uri)
    mlflow.set_experiment(experiment_name)
    mlflow.sklearn.autolog()
    with mlflow.start_run():
        if mlflow_run_tags:
            mlflow.set_tags(mlflow_run_tags)
        if mlflow_run_parameters:
            mlflow.log_params(mlflow_run_parameters)
        if dataset_preview:
            # NOTE(review): MLflow documents artifact_path as a directory inside the run's
            # artifact store, so the file likely lands under "dataset_preview.csv/" — confirm
            # this layout is intended before renaming it.
            mlflow.log_artifact(dataset_preview, artifact_path="dataset_preview.csv")

        pipeline.fit(X_train, y_train)

        if test_set is not None:
            X_test = test_set.select("review").to_numpy().ravel()
            y_test = test_set.select("polarity").to_numpy().ravel()
            preds = pipeline.predict(X_test)
            # compute_metrics expects (pred, true): passing them the other way round
            # swaps the reported precision and recall.
            acc, prec, rec = compute_metrics(preds, y_test)

            # Confusion matrix
            cm = confusion_matrix(y_test, preds)
            disp = ConfusionMatrixDisplay(confusion_matrix=cm)
            fig, ax = plt.subplots(figsize=(6, 6))
            disp.plot(ax=ax)
            ax.set_title("Confusion Matrix")
            mlflow.log_figure(fig, "test_confusion_matrix.png")
            plt.close(fig)

            # ROC AUC, using column 1 of predict_proba as the positive-class score
            probs = pipeline.predict_proba(X_test)[:, 1]  # binary example
            auc = roc_auc_score(y_test, probs)
            fpr, tpr, _ = roc_curve(y_test, probs)
            fig, ax = plt.subplots()
            ax.plot(fpr, tpr)
            ax.set_xlabel("FPR")
            ax.set_ylabel("TPR")
            ax.set_title(f"ROC curve (AUC = {auc:.3f})")
            mlflow.log_figure(fig, "test_roc_curve.png")
            plt.close(fig)
            mlflow.log_metrics({
                "testing_accuracy": acc,
                "testing_precision": prec,
                "testing_recall": rec,
                "testing_auc": auc
                })

        # A full timestamp keeps logged names distinct and sortable; `.microsecond` alone
        # is only the 0-999999 sub-second part and can repeat between runs.
        run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        mlflow.sklearn.log_model(sk_model=pipeline, name=f"{pipeline_name}_{run_stamp}")

    return pipeline

In [None]:

# Free-text note attached to the MLflow run (stored under "mlflow.note.content").
desc = "a sklearn logistic regression model for predict polarity \n the dataset had been preprosed with tfidf vectorizer with : \n - max_features=5000\n - strip_accents='ascii'\n - lowercase=True\n - stop_words=list(fr_stop)\n - spacy.lang.fr.stop_words stop words\n\n and StandardScaler with with_mean=False"

# Locate the enclosing git repository to record code provenance with the run.
repo = git.Repo(".", search_parent_directories=True)

remote = repo.remotes.origin.url
# Drop a trailing "/" and only a trailing ".git"; str.replace would also remove
# any ".git" appearing elsewhere in the URL.
remote_clean = remote.rstrip("/")
if remote_clean.endswith(".git"):
    remote_clean = remote_clean[:-len(".git")]

# Treat ":" like "/" so scp-style remotes (git@host:owner/repo) yield the owner
# and the repository name just like https remotes do.
remote_parts = remote_clean.replace(":", "/").split("/")
git_user = remote_parts[-2]
git_repo = remote_parts[-1]
git_branch = repo.active_branch.name
git_commit = repo.head.commit.hexsha

tags ={
        "mlflow.user": git_user,
        "git.repo": git_repo,
        "git.remote": remote,
        "mlflow.source.git.branch": git_branch,
        "mlflow.source.git.commit": git_commit,
        "mlflow.source.name":"notebook/model_design3.ipynb",
        "mlflow.source.type":"NOTEBOOK",
        "mlflow.note.content":desc
    }

# Logistic-regression settings; passed to the estimator and logged as run parameters.
hyperParam = {
        'C':1.25,
        'penalty':'l2',
        'solver':'saga'
    }

# TF-IDF features -> scaling without centering (with_mean=False) -> logistic regression.
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, strip_accents='ascii', lowercase=True, stop_words=list(fr_stop))),
    ('scaler', StandardScaler(with_mean=False)),
    ('lr', LogisticRegression(**hyperParam))
])

build_model(
    training_set = df_train,
    pipeline = pipe,
    mlflow_run_tags = tags,
    mlflow_run_parameters = hyperParam,
    dataset_preview = '../data/dataset_preview_brut.csv',
    test_set = df_test
)

2025/11/30 19:44:42 DEBUG mlflow.utils.autologging_utils: Called autolog() method for sklearn autologging with args '()' and kwargs '{'log_input_examples': False, 'log_model_signatures': True, 'log_models': True, 'log_datasets': True, 'disable': False, 'exclusive': False, 'disable_for_unsupported_versions': False, 'silent': False, 'max_tuning_runs': 5, 'log_post_training_metrics': True, 'serialization_format': 'cloudpickle', 'registered_model_name': None, 'pos_label': None, 'extra_tags': None}'
2025/11/30 19:44:42 DEBUG mlflow.utils.gorilla: Patch fn on destination already existed. Overwrite old patch.
2025/11/30 19:44:42 DEBUG mlflow.utils.gorilla: Patch fn on destination already existed. Overwrite old patch.
2025/11/30 19:44:42 DEBUG mlflow.utils.gorilla: Patch fn on destination already existed. Overwrite old patch.
2025/11/30 19:44:42 DEBUG mlflow.utils.gorilla: Patch fn on destination already existed. Overwrite old patch.
2025/11/30 19:44:42 DEBUG mlflow.utils.gorilla: Patch fn on 

🏃 View run classy-donkey-656 at: http://localhost:5000/#/experiments/2/runs/92324300620b4203a3e5a7288a910ee7
🧪 View experiment at: http://localhost:5000/#/experiments/2


In [28]:
# Second experiment: TF-IDF features fed directly to the logistic regression.
# Relies on names created by earlier cells: git_user, git_repo, remote, git_branch,
# git_commit and hyperParam.
desc = "a sklearn logistic regression model for predict polarity \n the dataset had been preprosed with tfidf vectorizer with : \n - max_features=5000\n - strip_accents='ascii'\n - lowercase=True\n - stop_words=list(fr_stop)\n - spacy.lang.fr.stop_words stop words\n\n"

# Run tags: git provenance, notebook source and the description above.
tags = {
    "mlflow.user": git_user,
    "git.repo": git_repo,
    "git.remote": remote,
    "mlflow.source.git.branch": git_branch,
    "mlflow.source.git.commit": git_commit,
    "mlflow.source.name": "notebook/model_design3.ipynb",
    "mlflow.source.type": "NOTEBOOK",
    "mlflow.note.content": desc,
}

# Assemble the pipeline from named steps.
pipeline_steps = [
    (
        'tfidf',
        TfidfVectorizer(
            max_features=5000,
            strip_accents='ascii',
            lowercase=True,
            stop_words=list(fr_stop),
        ),
    ),
    ('lr', LogisticRegression(**hyperParam)),
]
pipe = Pipeline(pipeline_steps)

# Train, evaluate on the test split and log the run to MLflow.
build_model(
    df_train,
    pipe,
    mlflow_run_tags=tags,
    mlflow_run_parameters=hyperParam,
    dataset_preview='../data/dataset_preview_brut.csv',
    test_set=df_test,
)

2025/11/30 19:51:16 DEBUG mlflow.utils.autologging_utils: Called autolog() method for sklearn autologging with args '()' and kwargs '{'log_input_examples': False, 'log_model_signatures': True, 'log_models': True, 'log_datasets': True, 'disable': False, 'exclusive': False, 'disable_for_unsupported_versions': False, 'silent': False, 'max_tuning_runs': 5, 'log_post_training_metrics': True, 'serialization_format': 'cloudpickle', 'registered_model_name': None, 'pos_label': None, 'extra_tags': None}'
2025/11/30 19:51:16 DEBUG mlflow.utils.gorilla: Patch fn on destination already existed. Overwrite old patch.
2025/11/30 19:51:16 DEBUG mlflow.utils.gorilla: Patch fn on destination already existed. Overwrite old patch.
2025/11/30 19:51:16 DEBUG mlflow.utils.gorilla: Patch fn on destination already existed. Overwrite old patch.
2025/11/30 19:51:16 DEBUG mlflow.utils.gorilla: Patch fn on destination already existed. Overwrite old patch.
2025/11/30 19:51:16 DEBUG mlflow.utils.gorilla: Patch fn on 

🏃 View run tasteful-turtle-282 at: http://localhost:5000/#/experiments/2/runs/1220e18196024416902215a7234b6d01
🧪 View experiment at: http://localhost:5000/#/experiments/2
