In [1]:
# 1. Installation
# Run once in terminal or notebook:
# !pip install mlflow pandas scikit-learn numpy

import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

def simulate_ecommerce_data(num_users=100, num_items=50, random_state=42):
    """
    Simulate a user-item rating matrix for recommendation.
    Returns a DataFrame with columns: user_id, item_id, rating
    """
    np.random.seed(random_state)
    user_ids = np.random.choice(range(num_users), size=num_users*10)
    item_ids = np.random.choice(range(num_items), size=num_users*10)
    # Ratings between 1 and 5
    ratings = np.random.randint(1, 6, size=num_users*10)

    df = pd.DataFrame({
        "user_id": user_ids,
        "item_id": item_ids,
        "rating": ratings
    })
    return df

def prepare_features(df):
    """
    Create simple user and item features for regression.
    One-hot encode user_id and item_id.
    """
    user_features = pd.get_dummies(df["user_id"], prefix="user")
    item_features = pd.get_dummies(df["item_id"], prefix="item")
    features = pd.concat([user_features, item_features], axis=1)
    return features

def train_and_log_recommendation_model(df, alpha=1.0):
    """
    Train a Ridge regression recommendation model, log experiment in MLflow.
    Returns MLflow run id or None on failure.
    """
    try:
        mlflow.set_experiment("Ecommerce_Recommendation")

        with mlflow.start_run() as run:
            run_id = run.info.run_id

            # Prepare features and target
            X = prepare_features(df)
            y = df["rating"]

            # Train-test split
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42
            )

            # Train Ridge regression model
            model = Ridge(alpha=alpha)
            model.fit(X_train, y_train)

            # Predict and evaluate
            y_pred = model.predict(X_test)
            rmse = mean_squared_error(y_test, y_pred, squared=False)

            # Log parameters and metrics
            mlflow.log_param("alpha", alpha)
            mlflow.log_metric("rmse", rmse)

            # Log model
            mlflow.sklearn.log_model(model, "ridge_recommendation_model")

            print(f"Run {run_id} logged with RMSE: {rmse:.4f}")
            return run_id
    except Exception as e:
        print(f"Error during training/logging: {e}")
        return None

# --- Example usage and unit test ---
import unittest

class TestEcommerceRecommendation(unittest.TestCase):

    def setUp(self):
        self.df = simulate_ecommerce_data(num_users=50, num_items=20)

    def test_simulated_data(self):
        self.assertFalse(self.df.empty)
        self.assertIn("user_id", self.df.columns)
        self.assertIn("item_id", self.df.columns)
        self.assertIn("rating", self.df.columns)

    def test_model_training_logging(self):
        run_id = train_and_log_recommendation_model(self.df, alpha=0.5)
        self.assertIsNotNone(run_id)

if __name__ == "__main__":
    print("Simulating e-commerce dataset...")
    data = simulate_ecommerce_data()

    print("Training and logging recommendation model...")
    run_id = train_and_log_recommendation_model(data)

    print("\nTo view the MLflow UI, run:\nmlflow ui --port 5000\nand open http://localhost:5000")

    # Run unit tests
    unittest.main(argv=['first-arg-is-ignored'], exit=False)


2025/05/23 18:51:52 INFO mlflow.tracking.fluent: Experiment with name 'Ecommerce_Recommendation' does not exist. Creating a new experiment.
F.
FAIL: test_model_training_logging (__main__.TestEcommerceRecommendation)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_13874/3796272557.py", line 98, in test_model_training_logging
    self.assertIsNotNone(run_id)
AssertionError: unexpectedly None

----------------------------------------------------------------------
Ran 2 tests in 0.054s

FAILED (failures=1)


Simulating e-commerce dataset...
Training and logging recommendation model...
Error during training/logging: got an unexpected keyword argument 'squared'

To view the MLflow UI, run:
mlflow ui --port 5000
and open http://localhost:5000
Error during training/logging: got an unexpected keyword argument 'squared'
