<a href="https://colab.research.google.com/github/achett/mlflow/blob/main/MLFlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install mlflow
!pip install statsmodels
!pip install numpy
!pip install pandas

Collecting mlflow
  Downloading mlflow-2.12.1-py3-none-any.whl (20.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.2/20.2 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.0.0-py3-none-any.whl (147 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.6/147.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython<4,>=3.1.9 (from mlflow)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting graphene<4 (from mlflow)
  Downloading graphene-3.3-py2.py3-none-any.whl (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [32]:
import os
import mlflow
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from getpass import getpass

os.environ['MLFLOW_TRACKING_USERNAME'] = 'achett2'
os.environ['MLFLOW_TRACKING_PASSWORD'] = '2af8bb9bfaeec663848b89f850e29831f075854c'

mlflow.set_tracking_uri('https://dagshub.com/achett2/my-first-repo.mlflow')

# generate a time index with a daily frequency
dates = pd.date_range(start="2022-12-01", end="2023-12-01", freq="D")

# generate the seasonal component (weekly)
seasonality = np.sin(np.arange(len(dates)) * (2 * np.pi / 365.25) * 7)

# generate the trend component
trend = np.linspace(-5, 5, len(dates)) + 2 * np.sin(
    np.arange(len(dates)) * (2 * np.pi / 365.25) * 0.1
)

# generate the residual component
residuals = np.random.normal(0, 1, len(dates))

# generate the final time series by adding the components
time_series = seasonality + trend + residuals

# create a dataframe from the time series
data = pd.DataFrame({"date": dates, "value": time_series})
data.set_index("date", inplace=True)

# Split data into train and test sets
split = int(len(data) * 0.80)
train, test = data[0:split], data[split:len(data)]

# Set the MLflow experiment name
mlflow.set_experiment('Time Series ARIMA with statsmodels')

# Start an MLflow run
with mlflow.start_run():
    # Log dataset statistics
    mlflow.log_param("Dataset size", len(data))
    mlflow.log_param("Training size", len(train))
    mlflow.log_param("Test size", len(test))

    # Fit an ARIMA model
    model = ARIMA(train, order=(5,2,0))  # Modify order as appropriate for the dataset
    model_fit = model.fit()

    # Log model parameters
    mlflow.log_param("ARIMA order", "(5,2,0)")

    # Make predictions
    predictions = model_fit.forecast(steps=len(test))

    # Calculate and log mean squared error
    mse = mean_squared_error(test, predictions)
    mlflow.log_metric("MSE", mse)

    # Save the model to file
    model_fit.save("model_arima.pkl")
    mlflow.log_artifact("model_arima.pkl", "model")

    # Log the sklearn model and register as version 1
    mlflow.statsmodels.log_model(
        statsmodels_model=model_fit,
        artifact_path="stats-model",
        registered_model_name="arima_model2",
    )

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
Registered model 'arima_model2' already exists. Creating a new version of this model...
2024/04/22 23:03:30 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: arima_model2, version 4
Created version '4' of model 'arima_model2'.


In [33]:
#################
# VIEW MODELS
#################
from mlflow.tracking import MlflowClient
from pprint import pprint

client = MlflowClient()
for rm in client.search_registered_models():
    pprint(dict(rm), indent=4)

{   'aliases': {},
    'creation_timestamp': 1713745529578,
    'description': '',
    'last_updated_timestamp': 1713745750547,
    'latest_versions': [   <ModelVersion: aliases=[], creation_timestamp=1713745750547, current_stage='None', description='', last_updated_timestamp=1713745750547, name='arima_model', run_id='2048cfbcec474b33b32cb27e7e2dc3df', run_link='', source='mlflow-artifacts:/39c361fdc23d4d1aa2d568056b49e217/2048cfbcec474b33b32cb27e7e2dc3df/artifacts/stats-model', status='READY', status_message='', tags={}, user_id='', version='2'>],
    'name': 'arima_model',
    'tags': {}}
{   'aliases': {},
    'creation_timestamp': 1713746137964,
    'description': '',
    'last_updated_timestamp': 1713827010835,
    'latest_versions': [   <ModelVersion: aliases=[], creation_timestamp=1713827010835, current_stage='None', description='', last_updated_timestamp=1713827010835, name='arima_model2', run_id='0d3f9fc2290240fd9e9be9fd2a33fcbc', run_link='', source='mlflow-artifacts:/39c361f

In [34]:
#################
# SERVE MODEL
#################
import mlflow.pyfunc

model_name = "arima_model2"
model_version = 4

model_served = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{model_version}")

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

2024-01-01   -13.523994
2024-01-02   -13.691295
2024-01-03   -13.858597
2024-01-04   -14.025899
2024-01-05   -14.193201
2024-01-06   -14.360502
2024-01-07   -14.527804
Freq: D, Name: predicted_mean, dtype: float64

In [None]:
#################
# GENERATE PREDICTION
#################
# prediction dataframes for a TimeSeriesModel must have exactly one row and include columns called start and end
start = pd.to_datetime("2024-01-01")
end = pd.to_datetime("2024-01-07")

# generate predictions
prediction_data = pd.DataFrame({"start": start, "end": end}, index=[0])

model_served.predict(prediction_data)

In [None]:
#################
# BUILD DOCKER CONTAINER
#################
# !mlflow models build-docker --model-uri "models:/arima_model/2" --name "arima_model_mlops" --enable-mlserver

run_id = '2048cfbcec474b33b32cb27e7e2dc3df'
image_name = "arima_model_mlops"
mlflow.models.build_docker(
    model_uri=f"https://dagshub.com/achett2/my-first-repo.mlflow/#/models/arima_model/versions/1",
    name=image_name,
    enable_mlserver=True,
)

ValueError: not enough values to unpack (expected 2, got 1)