In [0]:
# -------------------------------
# Installing all required libraries and dependencies
# -------------------------------
# numpy, pandas, scikit-learn, jax, jaxlib, numpyro and lightweight-mmm are pinned to
# exact versions; mlflow and openpyxl are left unpinned, so pip resolves whatever
# release is current at install time.
# --force-reinstall reinstalls every listed package even if it is already present, and
# --no-cache-dir makes pip skip its local cache.
# NOTE(review): consider pinning mlflow and openpyxl as well so reruns are reproducible.
%pip install --force-reinstall --no-cache-dir \
  numpy==1.25.2 \
  pandas==2.1.3 \
  scikit-learn==1.3.2 \
  jax==0.4.25 \
  jaxlib==0.4.25 \
  numpyro==0.13.2 \
  lightweight-mmm==0.1.9 \
  mlflow \
  openpyxl

In [0]:
# -------------------------------
# Restarting python after installing new libraries
# -------------------------------
# `dbutils` is not defined anywhere in this notebook; it is presumably the utility object
# that the Databricks notebook runtime injects — confirm when running elsewhere.
# This call sits in its own cell, before the import cell, so that the imports below
# run after the restart.
dbutils.library.restartPython()

In [0]:
# -------------------------------
# Importing all relevant libraries
# -------------------------------

# Library to suppress warnings or deprecation notes
import warnings

warnings.filterwarnings("ignore")

from sklearn import metrics
from datetime import datetime

import mlflow
import numpyro
import warnings
import itertools
import arviz as az
import numpy as np
import pandas as pd
import mlflow.pyfunc
import mlflow.sklearn 
import jax.numpy as jnp
import databricks.connect as db_connect
import mlflow.tracking._model_registry.utils
from sklearn import datasets
from itertools import cycle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet, lasso_path, enet_path
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [0]:
# -------------------------------
# Set Experiment Name
# -------------------------------
path = "/Workspace/Users/ankur242199@exlservice.com/Data_Files/"
exp_path = "/Workspace/Users/ankur242199@exlservice.com/experiments/"
# NOTE(review): the experiment name is built from `path` (Data_Files), not `exp_path` —
# confirm this is the intended experiment location.
experiment_name = path + "lightweight_mmm_20250831_072528"

# -------------------------------
# Load the experiment (a single lookup; fail loudly if it does not exist)
# -------------------------------
print(experiment_name)
experiment = mlflow.get_experiment_by_name(experiment_name)
print(experiment)

if experiment is None:
    # Without this guard the next statement fails with an opaque
    # "'NoneType' object has no attribute 'experiment_id'".
    raise ValueError(
        f"MLflow experiment not found: {experiment_name!r}. "
        "Check the experiment name/path before listing its runs."
    )

# -------------------------------
# List all runs (iterations) of the experiment as a pandas DataFrame
# -------------------------------
runs_df = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    output_format="pandas",
)

# -------------------------------
# Ensure numeric conversion for all metric columns.
# Every "metrics.*" column is coerced instead of a hand-written list: the previous list
# used lower-case names (metrics.train_r2 / metrics.test_r2) while the model-selection
# cell reads metrics.train_R2 / metrics.test_R2, so those columns were silently skipped.
# Values that cannot be parsed become NaN.
# -------------------------------
metric_columns = [col for col in runs_df.columns if str(col).startswith("metrics.")]
for col in metric_columns:
    runs_df[col] = pd.to_numeric(runs_df[col], errors="coerce")

In [0]:
# -------------------------------
# Below logic is to select the best model from all the iterations
# 1. Keep only models where the absolute difference between train R² and test R² is ≤ 5% of |train R²|
# 2. a. From these, pick the model with the highest train R²
#    b. If there are no models in Step 1, find models whose train R² is within 5% of the maximum train R².
#       Among these, pick the model with the smallest relative drop from train R² to test R².
# Runs that are missing either R² metric are ignored, because they cannot be ranked.
# -------------------------------

# Coerce here as well so this cell does not depend on an earlier cell having converted
# exactly these (case-sensitive) column names.
runs_df["metrics.train_R2"] = pd.to_numeric(runs_df["metrics.train_R2"], errors="coerce")
runs_df["metrics.test_R2"] = pd.to_numeric(runs_df["metrics.test_R2"], errors="coerce")

# -------------------------------
# Step 1: Filter models with abs(train R2 - test R2) <= 5% of |train R2|
# -------------------------------
# abs() in the denominator keeps the percentage non-negative when train R² is negative
# (a signed denominator produced negative percentages that always passed the filter).
# A train R² of exactly 0 yields inf/NaN, which never satisfies "<= 5".
runs_df["r2_diff_pct"] = (
    (runs_df["metrics.train_R2"] - runs_df["metrics.test_R2"]).abs()
    / runs_df["metrics.train_R2"].abs()
    * 100
)

rankable_runs = runs_df.dropna(subset=["metrics.train_R2", "metrics.test_R2"])
if rankable_runs.empty:
    raise ValueError(
        "No run has both metrics.train_R2 and metrics.test_R2 logged; "
        "cannot select a best model."
    )

filtered_candidates = rankable_runs[rankable_runs["r2_diff_pct"] <= 5].copy()

if not filtered_candidates.empty:
    # Step 2a: Pick model with maximum train R²
    best_run = filtered_candidates.loc[filtered_candidates["metrics.train_R2"].idxmax()]
else:
    # -------------------------------
    # Step 2b: Fallback: previous logic
    # -------------------------------
    max_train_r2 = rankable_runs["metrics.train_R2"].max()
    # Same as max * 0.95 for a positive maximum, but still <= max when the maximum is
    # negative, so the best run itself is always a candidate.
    threshold = max_train_r2 - 0.05 * abs(max_train_r2)

    candidates = rankable_runs[rankable_runs["metrics.train_R2"] >= threshold].copy()
    candidates["r2_drop_pct"] = (
        (candidates["metrics.train_R2"] - candidates["metrics.test_R2"])
        / candidates["metrics.train_R2"].abs()
        * 100
    )

    best_run = candidates.loc[candidates["r2_drop_pct"].idxmin()]


# -------------------------------
# Display the final selected run
# -------------------------------
print("Final selected run:")
print(best_run[["artifact_uri","run_id", "metrics.train_R2", "metrics.test_R2", "r2_diff_pct"]])

# -------------------------------
# Downloading the artifacts of the selected run
# -------------------------------
artifact_uri = best_run["artifact_uri"]
run_id = best_run["run_id"]

# Locate the logged model inside the selected run instead of hard-coding the path of one
# specific run: a model directory is a top-level artifact directory containing "MLmodel".
artifact_client = mlflow.tracking.MlflowClient()
model_dirs = [
    entry.path
    for entry in artifact_client.list_artifacts(run_id)
    if entry.is_dir
    and any(
        child.path.rsplit("/", 1)[-1] == "MLmodel"
        for child in artifact_client.list_artifacts(run_id, entry.path)
    )
]
if len(model_dirs) != 1:
    raise ValueError(
        f"Expected exactly one logged model in run {run_id}, found {len(model_dirs)}: {model_dirs}"
    )

artifact_path = f"{str(artifact_uri).rstrip('/')}/{model_dirs[0]}/MLmodel"
print("Artifacts_path")
print(artifact_path)

local_path = mlflow.artifacts.download_artifacts(
    artifact_uri=artifact_path,
    dst_path=exp_path
)


In [0]:
# Name under which the selected model is registered in the MLflow Model Registry.
model_name = "sales_order_model"

In [0]:
def print_model_info(mod):
    """Print name, version, run_id and current_stage of every item in ``mod``.

    Args:
        mod: Iterable of objects exposing ``name``, ``version``, ``run_id`` and
            ``current_stage`` attributes.

    Each field is written on its own line as the label immediately followed by the
    value (no separator). Nothing is returned.
    """
    fields = ("name", "version", "run_id", "current_stage")
    for entry in mod:
        for field in fields:
            print(f"{field}{getattr(entry, field)}")

In [0]:
# -------------------------------
# Register the selected run's model in the MLflow Model Registry
# -------------------------------
client = mlflow.tracking.MlflowClient()

# Create the registered model if needed. Only "already exists" is tolerated; any other
# failure (permissions, connectivity, invalid name, ...) is re-raised instead of being
# silently swallowed.
try:
    client.create_registered_model(model_name)
except mlflow.exceptions.MlflowException as exc:
    if exc.error_code != "RESOURCE_ALREADY_EXISTS":
        raise

# Derive the model location from the selected run, so that `source` and `run_id` always
# refer to the same run (the previous hard-coded path pointed at one fixed run).
selected_run_id = best_run["run_id"]
model_dirs = [
    entry.path
    for entry in client.list_artifacts(selected_run_id)
    if entry.is_dir
    and any(
        child.path.rsplit("/", 1)[-1] == "MLmodel"
        for child in client.list_artifacts(selected_run_id, entry.path)
    )
]
if len(model_dirs) != 1:
    raise ValueError(
        f"Expected exactly one logged model in run {selected_run_id}, "
        f"found {len(model_dirs)}: {model_dirs}"
    )

# `source` must be the model directory (the folder that contains MLmodel), not the
# MLmodel file itself.
model_source = f"{str(best_run['artifact_uri']).rstrip('/')}/{model_dirs[0]}"
# Kept for later cells that may still read `artifact_path`: path of the MLmodel file.
artifact_path = f"{model_source}/MLmodel"

model_version = client.create_model_version(model_name, model_source, selected_run_id)
