In [0]:
# -------------------------------
# Installing all required libraries and dependencies
# The core numerical / modelling packages are pinned to exact versions.
# NOTE(review): mlflow and openpyxl are not version-pinned, and arviz
# (imported further down as `az`) is not listed here at all — presumably it
# arrives as a transitive dependency; confirm and pin if reproducibility matters.
# -------------------------------
%pip install --force-reinstall --no-cache-dir \
  numpy==1.25.2 \
  pandas==2.1.3 \
  scikit-learn==1.3.2 \
  jax==0.4.25 \
  jaxlib==0.4.25 \
  numpyro==0.13.2 \
  lightweight-mmm==0.1.9 \
  mlflow \
  openpyxl

In [0]:
# -------------------------------
# Restarting python after installing new libraries
# NOTE(review): `dbutils` is not imported anywhere in this notebook —
# presumably it is injected by the Databricks notebook runtime; this cell
# will fail with a NameError outside that environment.
# -------------------------------
dbutils.library.restartPython()

In [0]:
# -------------------------------
# Importing all relevant libraries
# -------------------------------
from sklearn import metrics
from datetime import datetime

import mlflow
import numpyro
import warnings
import itertools
import arviz as az
import numpy as np
import pandas as pd
import mlflow.pyfunc
import mlflow.sklearn 
import jax.numpy as jnp
import databricks.connect as db_connect
import mlflow.tracking._model_registry.utils

warnings.filterwarnings("ignore")

In [0]:
# -------------------------------
# Import the relevant modules of the lightweight mmm library
# -------------------------------
from lightweight_mmm import lightweight_mmm
from lightweight_mmm import optimize_media
from lightweight_mmm import plot
from lightweight_mmm import preprocessing
from lightweight_mmm import utils

In [0]:
# -------------------------------
# Load the test dataset from a CSV file in the workspace into a pandas DataFrame
# -------------------------------

Test = pd.read_csv("/Workspace/Users/himanshu164137@exlservice.com/Test_Dataset.csv")

In [0]:
# Normalise every column name to upper case, then preview the first rows
Test.columns = list(map(str.upper, Test.columns))
Test.head()

In [0]:
# -------------------------------
# Checking the min and max of the date column (displayed as a tuple)
# NOTE(review): WEEK_START is only converted with pd.to_datetime in a later
# cell, so if it was read from the CSV as text this min/max is a string
# comparison — confirm the date format sorts correctly as text.
# -------------------------------
Test['WEEK_START'].min(), Test['WEEK_START'].max()

In [0]:
# -------------------------------
# Creating month and week-number variables.
# A week whose start date is on day 28 or later is assigned to the following
# month, and the week counter restarts at 1 whenever the custom month changes.
# NOTE(review): shift() and cumcount() below operate on row order, so this
# assumes Test is already in chronological order — confirm for the input file.
# -------------------------------
Test["WEEK_START"] = pd.to_datetime(Test["WEEK_START"])

# Calendar month, plus 1 when the week starts on day >= 28, wrapped into 1..12.
# The wrap (December + 1 -> January) must happen BEFORE the month counter is
# built: otherwise a week starting Dec 28-31 (value 13) and the January weeks
# that follow (value 1) are counted as two different months and the week
# counter restarts twice.
Test["CUSTOM_MONTH"] = (
    (Test["WEEK_START"].dt.month + (Test["WEEK_START"].dt.day >= 28) - 1) % 12 + 1
).astype(int)

# Sequential month counter (1, 2, 3, …): increments each time CUSTOM_MONTH
# differs from the previous row
Test["MONTH_NUM"] = (Test["CUSTOM_MONTH"] != Test["CUSTOM_MONTH"].shift()).cumsum()

# Week counter within each sequential month, starting at 1
Test["WEEK_NO_IN_MONTH"] = Test.groupby("MONTH_NUM").cumcount() + 1

In [0]:
# -------------------------------
# Sanity check: print the min and max of each derived calendar variable
# -------------------------------
for derived_col in ('CUSTOM_MONTH', 'MONTH_NUM', 'WEEK_NO_IN_MONTH'):
    print(Test[derived_col].min(), Test[derived_col].max())

# MONTH_NUM was only needed to build the week counter, so remove it
Test = Test.drop('MONTH_NUM', axis=1)

In [0]:
# Descriptive statistics of Test, requesting extra percentiles in both tails
# (1%, 2.5%, 5%, 10% and 90%, 95%, 97.5%, 99%) alongside the quartiles
Test.describe(percentiles = [0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.90, 0.95, 0.975, 0.99])

In [0]:
# -------------------------------
# Column configuration: media channels, control variables, target and date
# -------------------------------

# Media channel columns (order matters: it defines the column order of the
# media array built from this list)
media_cols = [
    'ASA_APP',
    'BING_DISPLAY',
    'BING_SEARCH',
    'DV360_DISPLAY_OR_OLV',
    'FACEBOOK_SOCIAL',
    'GOOGLE_DISPLAY_OR_OLV',
    'GOOGLE_SEARCH',
    'LINKEDIN_SOCIAL',
    'META_APP',
    'REDDIT_SOCIAL',
    'TAPTICA_APP',
    'TWITTER_SOCIAL',
    'COMMISSIONS_AFFILIATE',
    'PLACEMENT_AFFILIATE',
    'REDBOX_APP',
    'LIFTOFF_APP',
    'BRAND_SPEND',
]

# Control (extra feature) columns, in the order used for the control array
control_cols = [
    'NEWS_ANOMALY',
    'HOLIDAY_FLAG',
    'SALE_FLAG',
    'WSJ_EMAILS_TOTAL',
]

# Target column and the date column used for chronological ordering
target_col = 'ORDERS'
date_col = 'WEEK_START'

In [0]:
# -------------------------------
# Order the rows by the date column (with a fresh 0..n-1 index), then extract
# the media, control, target and cost inputs as NumPy arrays
# -------------------------------
Test_df = Test.sort_values(by=date_col, ignore_index=True)

# -------------------------------
# Media variables: one column per entry of media_cols, cast to float
# -------------------------------
media_data_test = Test_df.loc[:, media_cols].astype(float).to_numpy()
print("Media shapes:", media_data_test.shape)

# -------------------------------
# Control variables: one column per entry of control_cols, cast to float
# -------------------------------
control_data_test = Test_df.loc[:, control_cols].astype(float).to_numpy()
print("Control shapes:", control_data_test.shape)

# -------------------------------
# Target variable: 1-D float array
# -------------------------------
target_test = Test_df.loc[:, target_col].astype(float).to_numpy()
print("Target shapes:", target_test.shape)

# -------------------------------
# Costs variables: column-wise total of each media column over all rows
# -------------------------------
costs_test = Test_df.loc[:, media_cols].sum().to_numpy()
print("Costs shapes:", costs_test.shape)


In [0]:
# MLflow run that holds the fitted scalers as artifacts
scaler_run_id = "0936a3efc66449bda432f7ba055e6c91"

# Download the four scaler pickles (media, target, cost, control — in that
# order) and keep the local path returned for each one
(
    media_scaler_path,
    target_scaler_path,
    cost_scaler_path,
    control_scaler_path,
) = (
    mlflow.artifacts.download_artifacts(run_id=scaler_run_id, artifact_path=scaler_artifact)
    for scaler_artifact in (
        "scalers/media_scaler.pkl",
        "scalers/target_scaler.pkl",
        "scalers/cost_scaler.pkl",
        "scalers/control_scaler.pkl",
    )
)


In [0]:

# Deserialise the downloaded scaler files back into Python objects.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code
# from the file, so only load artifacts from MLflow runs you trust.
import joblib
media_scaler, target_scaler, cost_scaler, control_scaler = map(
    joblib.load,
    (media_scaler_path, target_scaler_path, cost_scaler_path, control_scaler_path),
)

In [0]:
# -------------------------------
# Scale media data (all columns) with the previously fitted media scaler
# -------------------------------
media_data_test_scaled = media_scaler.transform(media_data_test)

# -------------------------------
# Scale only the 4th control column (index 3, i.e. WSJ_EMAILS_TOTAL in
# control_cols); the remaining control columns are copied through unchanged
# -------------------------------
control_data_test_scaled = control_data_test.copy()
control_data_test_scaled[:, 3] = control_scaler.transform(
    control_data_test[:, [3]]  # keep it 2-D: one row per observation, one column
).flatten()

# -------------------------------
# Scale target variable
# -------------------------------
target_test_scaled = target_scaler.transform(target_test)

# -------------------------------
# Scale costs (one total per media column)
# -------------------------------
cost_test_scaled = cost_scaler.transform(costs_test)


In [0]:
# Use the run_id from when you logged the model
# (this is a different MLflow run from the one that stores the scalers)
run_id = "ae68467358f14d35b7de54ea7d36ac6f"

# Load back the model stored under the run's "model" artifact path,
# using MLflow's sklearn flavor, and display the loaded object
loaded_model = mlflow.sklearn.load_model(f"runs:/{run_id}/model")
loaded_model


In [0]:
# Each execution creates a new MLflow experiment whose name ends in the
# current timestamp; the same string is reused as the run name.
experiment_name = f"/Users/himanshu164137@exlservice.com/lightweight_mmm_inference_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
mlflow.set_experiment(experiment_name)
with mlflow.start_run(run_name=experiment_name, nested=True):
    # Posterior predictions for the test period, on the scaled target scale
    posterior_pred_test = loaded_model.predict(
        media=media_data_test_scaled,
        extra_features=control_data_test_scaled,
    )

    # Bring the predictions back to the original target scale
    posterior_pred_test_unscaled = target_scaler.inverse_transform(posterior_pred_test)

    # Point forecast: average over the first axis of the predictions
    # (presumably the posterior-sample axis — TODO confirm)
    test_point_forecast = posterior_pred_test_unscaled.mean(axis=0)

    # Test R2: computed on the full set of predictions; only the first entry
    # of the result returned by az.r2_score is kept
    r2_test = az.r2_score(y_true=target_test, y_pred=posterior_pred_test_unscaled)
    test_r2 = r2_test.iloc[0]

    # Test MAPE, expressed as a percentage
    test_mape = 100 * metrics.mean_absolute_percentage_error(
        y_true=target_test, y_pred=test_point_forecast
    )

    # Test RMSE: square root of the mean squared error
    test_rmse = np.sqrt(
        metrics.mean_squared_error(y_true=target_test, y_pred=test_point_forecast)
    )

    # Log the three test metrics to the active run
    for metric_name, metric_value in (
        ("test_R2", test_r2),
        ("test_mape", test_mape),
        ("test_rmse", test_rmse),
    ):
        mlflow.log_metric(metric_name, metric_value)

In [0]:
# Inverse transform the predictions back to the original target scale.
# The actuals (target_test) were never scaled, so they are passed as-is.
posterior_pred_test_unscaled = target_scaler.inverse_transform(posterior_pred_test)
# Plot the out-of-sample predictions against the actual test target
plot.plot_out_of_sample_model_fit(out_of_sample_predictions=posterior_pred_test_unscaled,
                                  out_of_sample_target=target_test)

In [0]:
# Average the unscaled predictions over their first axis and store the result
# as a single-column DataFrame named 'Test_orders_prediction'
mean_predictions = posterior_pred_test_unscaled.mean(axis=0)
output_df = pd.DataFrame(mean_predictions, columns=['Test_orders_prediction'])

In [0]:
# Save persistently: write the mean predictions to a CSV file in the workspace
# folder, without the DataFrame index column
output_df.to_csv("/Workspace/Users/himanshu164137@exlservice.com/Test_Predictions.csv", index=False)