https://www.youtube.com/watch?v=H4Fd7wsueZw
https://www.youtube.com/watch?v=OBjYJ5UvMQc
https://towardsdatascience.com/imdb-reviews-or-8143fe57c825
https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [None]:
!pip install mlflow==2.11.1 optuna==4.0.0

In [1]:
import optuna
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Load only the first 10,000 reviews; `nrows` avoids parsing the whole CSV
# just to discard most of it.
dataset = pd.read_csv("data/IMDB_Dataset.csv", nrows=10_000)
y = dataset["sentiment"].map({"positive": 1, "negative": 0})

# Split the raw text BEFORE vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training reviews only. Fitting on the full
# dataset would leak test-set statistics into the features.
reviews_train, reviews_test, y_train, y_test = train_test_split(
    dataset["review"], y, test_size=0.2, stratify=y, random_state=42
)

vectorizer = TfidfVectorizer(stop_words='english', max_features=5_000)
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)

# Full feature matrix (built with the train-fitted vectorizer), kept so any
# later cell that still refers to `X` continues to work.
X = vectorizer.transform(dataset["review"])

In [None]:
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score,  f1_score


def objective(trial):
    """Train one random forest for an Optuna trial and log it to MLflow.

    The trial suggests ``n_estimators`` (1-200) and ``max_depth`` (1-16).
    The fitted model, its hyperparameters and its train/test metrics are
    recorded as a single MLflow run in the "ML-RFC-IMDB-Reviews3" experiment.

    Relies on the notebook-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test`` variables.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The F1 score on the test split (higher is better).
    """
    mlflow.set_experiment("ML-RFC-IMDB-Reviews3")

    with mlflow.start_run():
        params = dict(
            n_estimators=trial.suggest_int("n_estimators", low=1, high=200),
            max_depth=trial.suggest_int("max_depth", low=1, high=16),
        )

        model = RandomForestClassifier(**params, n_jobs=-1, random_state=42)
        model.fit(X_train, y_train)

        # Predict once per split and reuse the result for every metric
        # instead of re-running inference for each score.
        pred_train = model.predict(X_train)
        pred_test = model.predict(X_test)

        metrics = {
            "accuracy_train": accuracy_score(y_train, pred_train),
            "accuracy_test": accuracy_score(y_test, pred_test),
            "precision_test": precision_score(y_test, pred_test),
            "recall_test": recall_score(y_test, pred_test),
            "f1_test": f1_score(y_test, pred_test),
        }

        mlflow.log_params(params)
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(
            sk_model=model,
            # A handful of rows is enough to illustrate the input format;
            # passing the whole training matrix would store it with every run.
            input_example=X_train[:5],
            artifact_path="model"
        )

    return metrics["f1_test"]

# The objective returns the test-set F1 score, which should be as high as
# possible. Optuna minimizes by default, so the direction is set explicitly;
# otherwise the search (and `best_params`) would favour the worst model.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

study.best_params

In [None]:
!mlflow server --host 127.0.0.1 --port 8080