In [7]:
import subprocess

import mlflow
import pandas as pd
import spacy
from mlflow import MlflowClient
from mlflow.models import infer_signature
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop

In [3]:
# Load the spaCy pipeline named "fr_core_news_sm".
nlp = spacy.load("fr_core_news_sm")

# Read the train / validation / test splits, in that order.
train_data, valid_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../archive/train.csv",
        "../archive/valid.csv",
        "../archive/test.csv",
    )
)

# Underlying arrays of the "review" column; these feed the vectorizer.
train_reviews = train_data["review"].values
test_reviews = test_data["review"].values


In [4]:
# TF-IDF features. Stop words come from spaCy's French stop-word set.
french_stop_words = list(fr_stop)
vectorizer = TfidfVectorizer(stop_words=french_stop_words)

# Fit on the training reviews only, then apply the fitted vectorizer to the
# test reviews so both matrices share the same vocabulary.
train_reviews_vectorised = vectorizer.fit_transform(train_reviews)
test_reviews_vectorised = vectorizer.transform(test_reviews)




In [5]:
# Target labels: the "polarity" column of the train and test splits.
y_train, y_test = train_data["polarity"], test_data["polarity"]

# Classifier: logistic regression constructed with no arguments
# (i.e. the library defaults).
model = LogisticRegression()

### MLflow Tracking

In [11]:
# --- Run configuration -------------------------------------------------------
TRACKING_URI = "http://127.0.0.1:5000"      # tracking server used for logging
EXPERIMENT_NAME = "MLflow Quickstart"
REGISTERED_MODEL_NAME = "logistic_model"    # name in the model registry
SOURCE_NAME = "model_design_2.ipynb"        # recorded as the run's source


def _git_rev_parse(*args):
    """Return the output of ``git rev-parse <args>`` as text, or ``None``.

    Git metadata is only used to tag the run. If the ``git`` executable is
    missing, or the notebook is not running inside a repository, ``None`` is
    returned so that the training run itself is not aborted.
    """
    try:
        raw = subprocess.check_output(
            ["git", "rev-parse", *args], stderr=subprocess.DEVNULL
        )
    except (OSError, subprocess.CalledProcessError):
        return None
    return raw.strip().decode("utf-8")


mlflow.set_tracking_uri(uri=TRACKING_URI)
# Select the experiment that the run below is recorded under.
mlflow.set_experiment(EXPERIMENT_NAME)

# Enable scikit-learn autologging with dataset logging switched off.
# There is no explicit `log_params` call: hyperparameter capture is left to
# autologging.
mlflow.sklearn.autolog(log_datasets=False)

with mlflow.start_run():
    # Train on the TF-IDF training matrix and predict on the test matrix.
    model.fit(train_reviews_vectorised, y_train)
    y_pred = model.predict(test_reviews_vectorised)

    # Test-set classification metrics.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mlflow.log_metrics(
        {
            "accuracy_test": accuracy,
            "recall_test": recall,
            "precision_test": precision,
            "f1_test": f1,
        }
    )

    # Tags that remind us what this run was for and where it came from.
    run_tags = {
        "mlflow.source.name": SOURCE_NAME,
        "Training Info": "Basic LR model for review relevant",
        "Preprocessing": "TF-IDF Vectorizer",
    }

    # Git provenance is best-effort: tag only what could be resolved.
    commit = _git_rev_parse("HEAD")
    branch = _git_rev_parse("--abbrev-ref", "HEAD")
    if commit is not None:
        run_tags["mlflow.source.git.commit"] = commit
    if branch is not None:
        run_tags["mlflow.source.git.branch"] = branch
    if commit is None or branch is None:
        print("git metadata unavailable; skipping unresolved mlflow.source.git.* tags")

    mlflow.set_tags(run_tags)

    # Log the fitted model and register it as a new version of the registered
    # model.
    # NOTE(review): the model is logged without an explicit signature or input
    # example.
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="Logistic_Regression v1",
        registered_model_name=REGISTERED_MODEL_NAME,
    )

    # Tag the registered model itself (not the run).
    # NOTE(review): the key/value below look like placeholders.
    client = MlflowClient()
    client.set_registered_model_tag(REGISTERED_MODEL_NAME, "tags", "no idea")



Registered model 'logistic_model' already exists. Creating a new version of this model...
2024/11/12 16:34:05 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: logistic_model, version 2
Created version '2' of model 'logistic_model'.
2024/11/12 16:34:05 INFO mlflow.tracking._tracking_service.client: 🏃 View run placid-duck-186 at: http://127.0.0.1:5000/#/experiments/714668274150591966/runs/1e9864d16bfa49ec8e07376f3a492ceb.
2024/11/12 16:34:05 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/714668274150591966.
