In [0]:
# -------------------------------
# Installing all required libraries and dependencies
# -------------------------------
%pip install --force-reinstall --no-cache-dir \
  numpy==1.25.2 \
  pandas==2.1.3 \
  scikit-learn==1.3.2 \
  jax==0.4.25 \
  jaxlib==0.4.25 \
  numpyro==0.13.2 \
  lightweight-mmm==0.1.9 \
  mlflow \
  openpyxl

In [0]:
# -------------------------------
# Restarting python after installing new libraries
# -------------------------------
dbutils.library.restartPython()

In [0]:
# -------------------------------
# Importing all relevant libraries
# -------------------------------
from sklearn import metrics
from datetime import datetime

import mlflow
import numpyro
import warnings
import itertools
import arviz as az
import numpy as np
import pandas as pd
import mlflow.pyfunc
import mlflow.sklearn 
import jax.numpy as jnp
import databricks.connect as db_connect
import mlflow.tracking._model_registry.utils
from sklearn import datasets
from itertools import cycle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet, lasso_path, enet_path
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

# libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings("ignore")

In [0]:
# -------------------------------
# Import the relevant modules of the lightweight mmm library
# -------------------------------
from lightweight_mmm import lightweight_mmm
from lightweight_mmm import optimize_media
from lightweight_mmm import plot
from lightweight_mmm import preprocessing
from lightweight_mmm import utils

In [0]:
# -------------------------------
# Read the POC, train and test CSV files from the workspace into pandas DataFrames
# -------------------------------
path = "/Workspace/Users/ankur242199@exlservice.com/Data_Files/"
exp_path = "/Workspace/Users/ankur242199@exlservice.com/experiments/"

# The three files live in the same folder and are read in the original order.
data, Train, Test = (
    pd.read_csv(f"{path}{csv_name}")
    for csv_name in ("POC_Clean_Dataset.csv", "Train_Dataset.csv", "Test_Dataset.csv")
)

In [0]:
# Show the size of the pre-split datasets (a bare tuple in the middle of a cell
# is evaluated and discarded, so print it explicitly).
print("Train/Test shapes:", Train.shape, Test.shape)

# Drop the date column as it is a string column.
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
data = data.drop(columns=['WEEK_START'], errors='ignore')

data.columns


In [0]:
def plot_elastic (x,y,l1_ratio):
    """Plot the ElasticNet coefficient paths against -log10(alpha) and save the figure.

    Parameters
    ----------
    x, y :
        Feature matrix and target, forwarded unchanged to ``enet_path``.
    l1_ratio :
        ElasticNet mixing parameter, forwarded to ``enet_path`` and shown in the title.

    Returns
    -------
    The matplotlib figure. It has already been saved to
    ``exp_path + 'elasticnet_path.png'`` and closed when it is returned.

    Side effects
    ------------
    Writes the PNG file and rebinds the module-level name ``image`` to the figure
    (kept for backward compatibility with code that reads ``image``).
    """
    global image

    eps = 5e-3
    alphas_enet, coefs_enet, _ = enet_path(x, y, eps=eps, l1_ratio=l1_ratio)

    fig = plt.figure()
    # Named `ax`, not `az`, so the arviz alias used elsewhere is not shadowed.
    ax = plt.gca()
    colors = cycle(['b', 'r', 'g', 'c', 'k'])
    neg_log_alphas = -np.log10(alphas_enet)

    # One row per coefficient path. The reshape also flattens the extra leading
    # axis that enet_path returns for a 2-D (multi-output) y.
    coef_paths = np.asarray(coefs_enet).reshape(-1, len(alphas_enet))

    # Iterate over the coefficient paths (the original zipped over the alphas,
    # which handed scalars to plot and never drew the paths).
    for coef_path, c in zip(coef_paths, colors):
        ax.plot(neg_log_alphas, coef_path, c=c, linestyle='--')

    ax.set_xlabel('Log Alpha')
    ax.set_ylabel('Cofficients')
    title = "ElasticNet path by alpha for l1_ratio = " + str(l1_ratio)
    ax.set_title(title)
    ax.axis('tight')

    image = fig
    fig.savefig(exp_path + 'elasticnet_path.png')
    plt.close(fig)
    return image

In [0]:
def metric_for_model(actual,pred):
    """Return the regression scores ``(rmse, mae, r2)`` of ``pred`` against ``actual``."""
    squared_error = mean_squared_error(actual, pred)
    scores = (
        np.sqrt(squared_error),             # root mean squared error
        mean_absolute_error(actual, pred),  # mean absolute error
        r2_score(actual, pred),             # coefficient of determination
    )
    return scores
    

In [0]:
def train_sales_order_model(data, l1_alpha=None, l1_ratio=None, random_state=42):
    """Train an ElasticNet model on ``ORDERS``, log it to MLflow and register it.

    Parameters
    ----------
    data :
        DataFrame containing the ``ORDERS`` target column; every other column
        is used as a feature.
    l1_alpha :
        ElasticNet ``alpha``. ``None`` falls back to 0.05.
    l1_ratio :
        ElasticNet ``l1_ratio``. ``None`` falls back to 0.5.
    random_state :
        Seed passed to ``train_test_split`` so repeated calls are evaluated on
        the same split.

    Returns
    -------
    The ``run.info`` object of the MLflow run created by this call.

    Side effects
    ------------
    Sets the active MLflow experiment, logs params/metrics/model to a new run,
    registers the model, and (re)writes a saved copy of the model under the
    workspace ``experiments`` folder.
    """
    import os
    import shutil

    # Set the registry URI to use Unity Catalog
    #mlflow.set_registry_uri("databricks-uc")

    train, test = train_test_split(data, random_state=random_state)
    train_x = train.drop(['ORDERS'], axis=1)
    train_y = train[['ORDERS']]
    test_x = test.drop(['ORDERS'], axis=1)
    test_y = test[['ORDERS']]
    print("shape_x:", train_x.shape, test_x.shape)
    print("shape_y:", train_y.shape, test_y.shape)

    # Fall back to defaults when the caller gave no value. The check has to be
    # made on the raw argument: float(None) raises, and a float is never None.
    alpha = 0.05 if l1_alpha is None else float(l1_alpha)
    ratio = 0.5 if l1_ratio is None else float(l1_ratio)

    CATALOG_NAME = "dmpipeline-dev"
    SCHEMA_NAME = "poc"
    MODEL_NAME = "sales_order_model"
    registered_model_name = f"{CATALOG_NAME}.{SCHEMA_NAME}.{MODEL_NAME}"
    print("Registered model name:", registered_model_name)

    # Select the experiment BEFORE starting the run: set_experiment has no
    # effect on a run that is already active.
    experiment_name = "/Shared/name_of_experiment/"
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run() as run:
        lr = ElasticNet(alpha=alpha, l1_ratio=ratio, random_state=42)
        lr.fit(train_x, train_y)
        lr_predict = lr.predict(test_x)
        rmse, mae, r2 = metric_for_model(test_y, lr_predict)
        # Report the resolved values, so a None argument cannot break formatting.
        print("ElasticNet Model(alpha=%f, l1_ratio=%f)" % (alpha, ratio))
        print("RMSE:", rmse)
        print("MAE:", mae)
        print("R2:", r2)

        input_example = test_x.iloc[[0]]

        # Hyperparameters are params; evaluation results are metrics.
        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", ratio)
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("R2", r2)
        mlflow.sklearn.log_model(
            lr,
            "model",
            input_example=input_example,
            registered_model_name=registered_model_name,
        )

        # Define the model path
        model_path = "/Workspace/Users/ankur242199@exlservice.com/experiments/model-%f-%f" % (alpha, ratio)

        # Remove existing contents if the path exists (save_model needs a fresh path)
        if os.path.exists(model_path):
            shutil.rmtree(model_path)

        # Save the model
        mlflow.sklearn.save_model(lr, model_path)

        # Log the artifact
        mlflow.log_artifact(model_path)

        return run.info
    


In [0]:
# Optional clean-up, kept disabled: removes the saved experiments folder
#%fs rm -r /Workspace/Users/ankur242199@exlservice.com/experiments/
# Preview the first rows of the dataset passed to train_sales_order_model
data.head()

# Optional, kept disabled: install MLflow with the Databricks extras
#%pip install mlflow[databricks]

In [0]:

# First training run with alpha=0.01 and l1_ratio=0.01.
# The returned run info is not stored; it is only shown as the cell output.
train_sales_order_model(data, 0.01, 0.01)


In [0]:
# Second training run with alpha=0.65 and l1_ratio=0.21; its run info is kept for
# the registration cell below.
# NOTE(review): no comparison against the first run happens here, so "best" is
# by assignment only. Confirm this is the intended run.
best_model=train_sales_order_model(data, 0.65, 0.21)

In [0]:
# Bare registry name used when creating the registered model / model version
model_name="sales_order_model"
# Show the run info returned by the last training call
print(best_model)

In [0]:
def print_model_info(mod):
    """Print name, version, run id and current stage for every entry in ``mod``.

    ``mod`` is any iterable of objects exposing the attributes ``name``,
    ``version``, ``run_id`` and ``current_stage``. Each attribute is printed on
    its own line as the attribute name immediately followed by its value.
    """
    reported_fields = ("name", "version", "run_id", "current_stage")
    for model_version in mod:
        for field in reported_fields:
            print(f"{field}{getattr(model_version, field)}")

In [0]:
# Create the registered model (best-effort) and add a new version that points at
# the model artifact logged by the selected training run.
client = mlflow.tracking.MlflowClient()
try:
    client.create_registered_model(model_name)
except Exception as exc:
    # Deliberately non-fatal (the registered model may already exist), but the
    # reason is reported instead of being silently swallowed so genuine
    # failures such as permission errors stay visible.
    print(f"create_registered_model('{model_name}') skipped: {exc}")
model_version = client.create_model_version(
    model_name,
    f"{best_model.artifact_uri}/model",
    best_model.run_id,
)


In [0]:
# Rebuild the three-level (catalog.schema.model) name that
# train_sales_order_model uses when it registers the model.
CATALOG_NAME = "dmpipeline-dev"
SCHEMA_NAME = "poc"
MODEL_NAME = "sales_order_model"
registered_model_name=f"{CATALOG_NAME}.{SCHEMA_NAME}.{MODEL_NAME}"
print("Registered model name:", registered_model_name)

# NOTE(review): the registration cell above uses the bare `model_name`
# ("sales_order_model"), while this lookup uses the three-level name. Confirm
# both resolve to the registry entry you expect.
# NOTE(review): get_latest_versions and `current_stage` are stage-based; verify
# that the configured model registry supports them.
model_x=client.get_latest_versions(name=registered_model_name)
print_model_info(model_x)

In [0]:
# -------------------------------
# Column configuration: media channels, control variables, target and date column
# -------------------------------
media_cols = [
    "ASA_APP",
    "BING_DISPLAY",
    "BING_SEARCH",
    "DV360_DISPLAY_OR_OLV",
    "FACEBOOK_SOCIAL",
    "GOOGLE_DISPLAY_OR_OLV",
    "GOOGLE_SEARCH",
    "LINKEDIN_SOCIAL",
    "META_APP",
    "REDDIT_SOCIAL",
    "TAPTICA_APP",
    "TWITTER_SOCIAL",
    "COMMISSIONS_AFFILIATE",
    "PLACEMENT_AFFILIATE",
    "REDBOX_APP",
    "LIFTOFF_APP",
    "BRAND_SPEND",
]
control_cols = [
    "NEWS_ANOMALY",
    "HOLIDAY_FLAG",
    "SALE_FLAG",
    "WSJ_EMAILS_TOTAL",
]
target_col = "ORDERS"
date_col = "WEEK_START"

In [0]:
# -------------------------------
# Sort the data by the date column and build numpy arrays for media, control, target and cost
# -------------------------------
Train_df = Train.sort_values(date_col).reset_index(drop = True)
Test_df = Test.sort_values(date_col).reset_index(drop = True)

# -------------------------------
# Media variables
# -------------------------------
media_data_train = Train_df[media_cols].astype(float).to_numpy()
media_data_test = Test_df[media_cols].astype(float).to_numpy()
print("Media shapes:", media_data_train.shape, media_data_test.shape)

# -------------------------------
# Control variables
# -------------------------------
control_data_train = Train_df[control_cols].astype(float).to_numpy()
control_data_test  = Test_df[control_cols].astype(float).to_numpy()
# Report train AND test (the train shape was previously printed twice)
print("Control shapes:", control_data_train.shape, control_data_test.shape)

# -------------------------------
# Target variable
# -------------------------------
target_train = Train_df[target_col].astype(float).to_numpy()
target_test = Test_df[target_col].astype(float).to_numpy()
print("Target shapes:", target_train.shape, target_test.shape)

# -------------------------------
# Costs variables: per-channel totals of the media columns
# -------------------------------
costs_train = Train_df[media_cols].sum(axis = 0).to_numpy()
costs_test = Test_df[media_cols].sum(axis = 0).to_numpy()
print("Costs shapes:", costs_train.shape, costs_test.shape)


In [0]:
# -------------------------------
# Define scalers (one per data group, each configured with divide_operation=jnp.mean)
# -------------------------------
media_scaler   = preprocessing.CustomScaler(divide_operation=jnp.mean)
control_scaler = preprocessing.CustomScaler(divide_operation=jnp.mean)
target_scaler  = preprocessing.CustomScaler(divide_operation=jnp.mean)
cost_scaler    = preprocessing.CustomScaler(divide_operation=jnp.mean)

# -------------------------------
# Scale media data (all columns)
# The scaler is fitted on the training data only and re-used for the test data.
# -------------------------------
media_data_train_scaled = media_scaler.fit_transform(media_data_train)
media_data_test_scaled  = media_scaler.transform(media_data_test)

# -------------------------------
# Scale only the 4th column of control data (index 3).
# With the column order of control_cols this is WSJ_EMAILS_TOTAL; the other
# control columns are left unscaled.
# -------------------------------
# Training
control_data_train_scaled = control_data_train.copy()
control_data_train_scaled[:, 3] = control_scaler.fit_transform(control_data_train[:, 3].reshape(-1, 1)).flatten()

# Test (re-uses the scaler fitted on the training column)
control_data_test_scaled = control_data_test.copy()
control_data_test_scaled[:, 3] = control_scaler.transform(control_data_test[:, 3].reshape(-1, 1)).flatten()

# -------------------------------
# Scale target variable
# -------------------------------
target_train_scaled = target_scaler.fit_transform(target_train)
target_test_scaled  = target_scaler.transform(target_test)

# -------------------------------
# Scale costs (all columns)
# -------------------------------
cost_train_scaled = cost_scaler.fit_transform(costs_train)
cost_test_scaled  = cost_scaler.transform(costs_test)


In [0]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
import joblib

# Save all fitted scalers locally. The file names are relative, so the files
# are written to the current working directory; the next cell uploads them
# as MLflow artifacts.
joblib.dump(media_scaler, "media_scaler.pkl")
joblib.dump(target_scaler, "target_scaler.pkl")
joblib.dump(cost_scaler, "cost_scaler.pkl")
joblib.dump(control_scaler, "control_scaler.pkl")

In [0]:
# Create (or select) a timestamped experiment and log the saved scalers to it.
exp_path="/Workspace/Users/ankur242199@exlservice.com/experiments/"
# Build the experiment name under the experiments folder. The original
# concatenated with `path` (the data-files folder) and left `exp_path` unused.
experiment_name = exp_path + f"lightweight_mmm_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
mlflow.set_experiment(experiment_name)
with mlflow.start_run(run_name=experiment_name, nested=True):
    # Same files, same order and same artifact folder as before
    for scaler_file in ("media_scaler.pkl", "target_scaler.pkl", "cost_scaler.pkl", "control_scaler.pkl"):
        mlflow.log_artifact(scaler_file, artifact_path="scalers")


In [0]:
# Define hyperparameter search space
model_names = ["hill_adstock"]
warmup_values = [200, 300]
samples_values = [400, 500]
degree = [1, 2, 3]

# -------------------------------
# Train one model per (model, warmup, samples, seasonality degree) combination,
# evaluate it on train and test data, and log everything to its own MLflow run.
# -------------------------------
# The loop variable is `mmm_model_name` so the global `model_name` used by the
# model-registration cell is not overwritten.
for mmm_model_name, n_warmup, n_samples, n_deg in itertools.product(model_names, warmup_values, samples_values, degree):

    # Include the seasonality degree so every combination gets a unique run name
    run_name = f"{mmm_model_name}_warmup{n_warmup}_samples{n_samples}_deg{n_deg}"

    with mlflow.start_run(run_name=run_name):
        # Log params
        mlflow.log_param("model_name", mmm_model_name)
        mlflow.log_param("n_warmup", n_warmup)
        mlflow.log_param("n_samples", n_samples)
        mlflow.log_param("n_degree", n_deg)
        # The count goes under n_media_channels; the names get their own param
        mlflow.log_param("n_media_channels", len(media_cols))
        mlflow.log_param("media_channels", ",".join(media_cols))

        # Train the model
        mmm = lightweight_mmm.LightweightMMM(model_name=mmm_model_name)
        mmm.fit(
            media=media_data_train_scaled,
            media_prior=cost_train_scaled,
            target=target_train_scaled,
            extra_features=control_data_train_scaled,
            number_warmup=n_warmup,
            number_samples=n_samples,
            number_chains=4,
            degrees_seasonality=n_deg,
            seasonality_frequency=52,
            seed=1
        )

        # Train predictions and the fitted target, both inverse-transformed
        # with the target scaler. A local name is used so the global
        # `target_train` array is not replaced on every iteration.
        fitted_target = target_scaler.inverse_transform(mmm._target)
        posterior_pred = target_scaler.inverse_transform(mmm.trace["mu"])
        posterior_pred_mean = posterior_pred.mean(axis=0)

        # Train R2 (first entry of the arviz result), MAPE in percent, and RMSE
        train_r2 = az.r2_score(y_true=fitted_target, y_pred=posterior_pred).iloc[0]
        train_mape = 100 * metrics.mean_absolute_percentage_error(
            y_true=fitted_target,
            y_pred=posterior_pred_mean
        )
        train_rmse = np.sqrt(metrics.mean_squared_error(
            y_true=fitted_target,
            y_pred=posterior_pred_mean
        ))

        # Logging Train metrics and model
        mlflow.log_metric("train_R2", train_r2)
        mlflow.log_metric("train_mape", train_mape)
        mlflow.log_metric("train_rmse", train_rmse)
        mlflow.sklearn.log_model(mmm, "model")

        # Predict on Test data (scaled inputs)
        posterior_pred_test = mmm.predict(
            media=media_data_test_scaled,
            extra_features=control_data_test_scaled
        )

        # Inverse transform predictions; target_test is already unscaled
        posterior_pred_test_unscaled = target_scaler.inverse_transform(posterior_pred_test)
        posterior_pred_test_mean = posterior_pred_test_unscaled.mean(axis=0)

        # Test R2, MAPE in percent, and RMSE
        test_r2 = az.r2_score(
            y_true=target_test,
            y_pred=posterior_pred_test_unscaled
        ).iloc[0]
        test_mape = 100 * metrics.mean_absolute_percentage_error(
            y_true=target_test,
            y_pred=posterior_pred_test_mean
        )
        test_rmse = np.sqrt(metrics.mean_squared_error(
            y_true=target_test,
            y_pred=posterior_pred_test_mean
        ))

        # Logging Test metrics
        mlflow.log_metric("test_R2", test_r2)
        mlflow.log_metric("test_mape", test_mape)
        mlflow.log_metric("test_rmse", test_rmse)

In [0]:
# -------------------------------
# Load the runs of the current experiment
# -------------------------------
print(experiment_name)
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    # Fail with a clear message instead of an AttributeError on `.experiment_id`
    raise ValueError(f"MLflow experiment not found: {experiment_name}")

# -------------------------------
# List all runs of that experiment as a pandas DataFrame
# -------------------------------
runs_df = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    output_format="pandas")

# -------------------------------
# Ensure numeric conversion for all required metrics.
# Column names must match the logged metric keys exactly: R2 is logged as
# "train_R2" / "test_R2" (upper-case R), so the lower-case names never matched.
# -------------------------------
for col in ["metrics.train_R2", "metrics.test_R2", "metrics.train_mape", "metrics.test_mape",
            "metrics.train_rmse", "metrics.test_rmse"]:
    if col in runs_df.columns:
        runs_df[col] = pd.to_numeric(runs_df[col], errors="coerce")

In [0]:
# -------------------------------
# Below logic is to select the best model from all the iterations
# 1. Keep only models where the absolute difference between train R² and test R² is ≤ 5% of train R²
# 2. a. From these, pick the model with the highest train R²
#    b. If there are no models in Step 1, find models whose train R² is at least 95% of the maximum train R².
#       Among these, pick the model with the smallest relative drop from train R² to test R².
# -------------------------------

# -------------------------------
# Step 1: Filter models with abs(train R2 - test R2) <= 5% of train R2
# Adds the helper column "r2_diff_pct" to runs_df. Rows whose metrics are NaN
# produce NaN here and therefore fail the <= 5 comparison.
# NOTE(review): the ratio divides by train R2 itself, so a negative or zero
# train R2 gives a negative or infinite percentage. Confirm that cannot occur.
# -------------------------------
runs_df["r2_diff_pct"] = abs(runs_df["metrics.train_R2"] - runs_df["metrics.test_R2"]) / runs_df["metrics.train_R2"] * 100

filtered_candidates = runs_df[runs_df["r2_diff_pct"] <= 5].copy()

if not filtered_candidates.empty:
    # Step 2a: Pick model with maximum train R²
    best_run = filtered_candidates.loc[filtered_candidates["metrics.train_R2"].idxmax()]
else:
    # -------------------------------
    # Step 2b: Fallback: previous logic
    # -------------------------------
    max_train_r2 = runs_df["metrics.train_R2"].max()
    threshold = max_train_r2 * 0.95  # within 5% of max

    candidates = runs_df[runs_df["metrics.train_R2"] >= threshold].copy()
    # Signed relative drop (not absolute), so a test R² above train R² yields a negative value
    candidates["r2_drop_pct"] = ((candidates["metrics.train_R2"] - candidates["metrics.test_R2"]) / candidates["metrics.train_R2"]) * 100

    best_run = candidates.loc[candidates["r2_drop_pct"].idxmin()]

# -------------------------------
# Display the final selected run
# -------------------------------
print("Final selected run:")
print(best_run[["run_id", "metrics.train_R2", "metrics.test_R2", "r2_diff_pct"]])


In [0]:
# Run id of the run selected in the previous cell
run_id = best_run['run_id']

# Load back the model that the training loop logged under the artifact path "model"
loaded_model = mlflow.sklearn.load_model(f"runs:/{run_id}/model")
loaded_model

In [0]:
# -------------------------------
# Plotting Train results and metrics for the selected model
# -------------------------------

# target_scaler is passed to the plotting helper together with the model
plot.plot_model_fit(loaded_model, target_scaler=target_scaler)

In [0]:
# -------------------------------
# Plotting Test results and metrics
# -------------------------------
# Predict on the scaled test inputs with the selected model
posterior_pred_test = loaded_model.predict(
    media=media_data_test_scaled,
    extra_features=control_data_test_scaled
)

# Inverse transform the predictions only; target_test was never scaled
posterior_pred_test_unscaled = target_scaler.inverse_transform(posterior_pred_test)
plot.plot_out_of_sample_model_fit(out_of_sample_predictions=posterior_pred_test_unscaled,
                                  out_of_sample_target=target_test)