##### Environment versions
- Python 3.9.18
- PyCaret 3.2.0

In [10]:
pip show pycaret

Name: pycaretNote: you may need to restart the kernel to use updated packages.

Version: 3.2.0
Summary: PyCaret - An open source, low-code machine learning library in Python.
Home-page: https://github.com/pycaret/pycaret
Author: Moez Ali
Author-email: moez.ali@queensu.ca
License: MIT
Location: c:\users\zeesh\anaconda3\envs\forecasting_env\lib\site-packages
Requires: category-encoders, cloudpickle, deprecation, imbalanced-learn, importlib-metadata, ipython, ipywidgets, jinja2, joblib, kaleido, lightgbm, markupsafe, matplotlib, nbformat, numba, numpy, pandas, plotly, plotly-resampler, pmdarima, psutil, pyod, requests, schemdraw, scikit-learn, scikit-plot, scipy, sktime, statsmodels, tbats, tqdm, xxhash, yellowbrick
Required-by: 


## Experiment with PyCaret

In [11]:
import pickle
import mlflow
import pandas as pd
from pycaret.time_series import *
from pycaret.datasets import get_data
import matplotlib.pyplot as plt

# Load the sample series and index it by its parsed 'Date' column
data = (
    get_data('pycaret_downloads')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)


Unnamed: 0,Date,Total
0,7/8/2021,3773
1,7/9/2021,3172
2,7/10/2021,2079
3,7/11/2021,2725
4,7/12/2021,4542


# EDA

In [12]:
import plotly.express as px

# Move the 'Date' index back into a regular column so it can be used as the x axis
dt = data.reset_index()
df_test = dt.copy()

# Line chart of 'Total' over 'Date', with red point markers
fig = px.line(df_test, x='Date', y='Total', markers=True)
fig.update_traces(marker={'color': 'red'})

# Range-selector buttons for quick zooming (1 day, 1 week, 1 month, 1 year, everything)
zoom_buttons = [
    {'count': 1, 'label': "1d", 'step': "day", 'stepmode': "backward"},
    {'count': 7, 'label': "1w", 'step': "day", 'stepmode': "backward"},
    {'count': 1, 'label': "1m", 'step': "month", 'stepmode': "backward"},
    {'count': 1, 'label': "1y", 'step': "year", 'stepmode': "backward"},
    {'step': "all"},
]

# Attach the buttons and a range slider to a date-typed x axis
fig.update_layout(
    xaxis={
        'rangeselector': {'buttons': zoom_buttons},
        'rangeslider': {'visible': True},
        'type': "date",
    }
)

fig.show()

# Modeling

In [13]:
# Train and compare PyCaret forecasters inside a single MLflow run.

# Defined once so that setup() and the params logged to MLflow cannot drift apart
FORECAST_HORIZON = 12
SESSION_ID = 123

mlflow.autolog()

# The experiment must be selected BEFORE the run is opened: start_run() binds the
# run to whichever experiment is active at that moment, so calling
# set_experiment() inside the `with` block leaves this run (and the params
# logged below) in the previously active experiment.
mlflow.set_experiment("Pycaret_experiment")

with mlflow.start_run():
    # Set up PyCaret
    s = setup(data, fh=FORECAST_HORIZON, session_id=SESSION_ID)

    # Return value is not displayed here; the "Check statistics" cell renders the table
    check_stats()

    # Train and rank the candidate models; `best` is reused by later cells
    best = compare_models()

    # Record the experiment configuration on the run
    mlflow.log_params({'fh': FORECAST_HORIZON, 'session_id': SESSION_ID})


2024/03/07 15:02:19 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2024/03/07 15:02:19 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2024/03/07 15:02:19 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.


2024/03/07 15:02:20 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


Unnamed: 0,Description,Value
0,session_id,123
1,Target,Total
2,Approach,Univariate
3,Exogenous Variables,Not Present
4,Original data shape,"(90, 1)"
5,Transformed data shape,"(90, 1)"
6,Transformed train set shape,"(78, 1)"
7,Transformed test set shape,"(12, 1)"
8,Rows with missing values,0.0%
9,Fold Generator,ExpandingWindowSplitter




Unnamed: 0,Model,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2,TT (Sec)
theta,Theta Forecaster,1.2803,1.1409,736.2266,898.0371,0.1556,0.1523,0.7302,0.0433
knn_cds_dt,K Neighbors w/ Cond. Deseasonalize & Detrending,1.346,1.2833,737.3306,934.8335,0.143,0.1419,0.7346,0.29
huber_cds_dt,Huber w/ Cond. Deseasonalize & Detrending,1.3698,1.3083,738.8352,922.5881,0.1326,0.1401,0.7353,0.2033
en_cds_dt,Elastic Net w/ Cond. Deseasonalize & Detrending,1.4299,1.3469,777.9459,964.8748,0.1413,0.1466,0.7146,0.6967
ridge_cds_dt,Ridge w/ Cond. Deseasonalize & Detrending,1.4299,1.3469,777.9461,964.8752,0.1413,0.1466,0.7146,0.67
lasso_cds_dt,Lasso w/ Cond. Deseasonalize & Detrending,1.4299,1.3469,777.9457,964.8746,0.1413,0.1466,0.7146,0.3
llar_cds_dt,Lasso Least Angular Regressor w/ Cond. Deseasonalize & Detrending,1.4299,1.3469,777.9457,964.8746,0.1413,0.1466,0.7146,0.1833
lr_cds_dt,Linear w/ Cond. Deseasonalize & Detrending,1.4299,1.3469,777.9461,964.8752,0.1413,0.1466,0.7146,1.4133
lightgbm_cds_dt,Light Gradient Boosting w/ Cond. Deseasonalize & Detrending,1.468,1.3076,795.8164,936.367,0.1531,0.1538,0.7309,1.82
br_cds_dt,Bayesian Ridge w/ Cond. Deseasonalize & Detrending,1.4741,1.3402,804.2119,968.4707,0.152,0.1547,0.7141,0.19


# Check statistics

In [14]:
# Show PyCaret's summary statistics and statistical tests for the series passed to setup()
check_stats()

2024/03/07 15:07:21 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '445ae369eec14a408fe168d97826ec12', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current statsmodels workflow
2024/03/07 15:07:28 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '4184aa1011174ff4bb3ea78309b8505d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current statsmodels workflow
2024/03/07 15:07:34 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'f2e874a0a259498b91a8dda3ce2c958d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current statsmodels workflow
2024/03/07 15:07:41 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '9093e0bedc8d455082a69ff84cdf0c56', which will track hyperparameters, performance metrics, model artifacts, a

Unnamed: 0,Test,Test Name,Data,Property,Setting,Value
0,Summary,Statistics,Transformed,Length,,90.0
1,Summary,Statistics,Transformed,# Missing Values,,0.0
2,Summary,Statistics,Transformed,Mean,,4960.366667
3,Summary,Statistics,Transformed,Median,,4776.5
4,Summary,Statistics,Transformed,Standard Deviation,,1939.000116
5,Summary,Statistics,Transformed,Variance,,3759721.448315
6,Summary,Statistics,Transformed,Kurtosis,,-0.920113
7,Summary,Statistics,Transformed,Skewness,,0.22622
8,Summary,Statistics,Transformed,# Distinct Values,,89.0
9,White Noise,Ljung-Box,Transformed,Test Statictic,"{'alpha': 0.05, 'K': 24}",293.08932


## Compare Models

In [15]:
# Re-runs the model comparison and rebinds `best`
# (the modeling cell already performed the same comparison inside the MLflow run)
best = compare_models()

Unnamed: 0,Model,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2,TT (Sec)


In [16]:
# final_model = pull(best)
# final_model.fit()

In [17]:
# Show the hyperparameters of the model selected by compare_models()
best.get_params()

{'deseasonalize': True, 'initial_level': None, 'sp': 7}

In [18]:
# best_mae_models_top3 = compare_models(sort = 'R2', n_select = 3)
# list of top 3 models by MAE
# best_mae_models_top3


## Analyze Model

In [19]:
# Fit and cross-validate the naive forecaster; per-cutoff metrics are displayed as output
naive=create_model('naive')

Unnamed: 0,cutoff,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2
0,2021-08-18,3.9079,4.2981,1788.25,2390.4059,0.6371,0.3749,-0.4494
1,2021-08-30,1.9968,1.6706,1288.8333,1617.7194,0.3451,0.2642,-0.0072
2,2021-09-11,5.2294,4.0201,3244.1667,3662.0296,0.464,0.6339,-2.9818
Mean,NaT,3.7114,3.3296,2107.0833,2556.7183,0.4821,0.4243,-1.1462
SD,NaT,1.327,1.1786,829.487,842.8309,0.1199,0.1549,1.3105


In [20]:
# Fit and cross-validate the theta forecaster; per-cutoff metrics are displayed as output
theta=create_model('theta')

Unnamed: 0,cutoff,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2
0,2021-08-18,1.2785,1.3734,585.0244,763.8488,0.1576,0.1421,0.852
1,2021-08-30,1.3553,1.1081,874.7426,1073.0322,0.163,0.1787,0.5568
2,2021-09-11,1.2072,0.941,748.9128,857.2304,0.1462,0.1362,0.7818
Mean,NaT,1.2803,1.1409,736.2266,898.0371,0.1556,0.1523,0.7302
SD,NaT,0.0605,0.178,118.6166,129.4797,0.007,0.0188,0.1259


In [21]:
# Hyperparameter search starting from the theta model created above.
# NOTE(review): in the saved output the CV metrics are identical to the untuned
# model, so the search does not appear to have found an improvement — confirm.
tuned_theta= tune_model(theta)

Unnamed: 0,cutoff,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2
0,2021-08-18,1.2785,1.3734,585.0244,763.8488,0.1576,0.1421,0.852
1,2021-08-30,1.3553,1.1081,874.7426,1073.0322,0.163,0.1787,0.5568
2,2021-09-11,1.2072,0.941,748.9128,857.2304,0.1462,0.1362,0.7818
Mean,NaT,1.2803,1.1409,736.2266,898.0371,0.1556,0.1523,0.7302
SD,NaT,0.0605,0.178,118.6166,129.4797,0.007,0.0188,0.1259


Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.0s finished


## EDA (second pass): PyCaret plots

In [22]:
# No estimator is passed, so this plots the data itself: the train/test split
plot_model(plot='train_test_split')

In [23]:
# PyCaret's 'cv' plot (cross-validation splits) for the data passed to setup()
plot_model(plot='cv')

In [24]:
# PyCaret's 'decomp' plot (classical decomposition) with its default settings
plot_model(plot='decomp')

In [25]:
# Classical decomposition with a multiplicative model.
# The value must be lowercase: PyCaret documents 'multiplicative', and the capitalised
# spelling is, as far as I can tell, not recognised downstream (the plot would
# fall back to an additive decomposition).
plot_model(plot='decomp', data_kwargs={'type': 'multiplicative'})

In [26]:
# Classical decomposition with an explicit seasonal period.
# The key is 'seasonal_period'; the previous misspelling ('sesonal_period') was
# an unrecognised key, so the requested period of 24 was never applied.
plot_model(plot='decomp', data_kwargs={'seasonal_period': 24})

In [27]:
# PyCaret's 'acf' plot (autocorrelation) of the series
plot_model(plot='acf')

In [28]:
# PyCaret's 'pacf' plot (partial autocorrelation) of the series
plot_model(plot='pacf')


In [29]:
# PyCaret's 'diagnostics' plot for the model selected by compare_models()
plot_model(best, plot = 'diagnostics')


In [30]:
# plot the forecast of the selected model using the forecast horizon configured in setup()
plot_model(best, plot = 'forecast')

In [31]:
# plot forecast 36 periods into the future
# (the series is daily, so this is 36 days — not 36 months)
plot_model(best, plot = 'forecast', data_kwargs = {'fh' : 36})

In [32]:
# No `plot` argument is given, so PyCaret picks its default plot for an estimator
# (presumably the forecast plot — TODO confirm), here with a 36-period horizon.
plot_model(estimator=tuned_theta, data_kwargs={"fh":36})
# plot_model(estimator=naive, data_kwargs={"fh":36})
# plot_model([theta, tuned_theta], data_kwargs={"labels": ["Baseline", "Tuned"]})

In [33]:
# residuals plot
# plot_model(best, plot = 'insample')

In [34]:
# residuals plot for the model selected by compare_models()
plot_model(best, plot = 'residuals')

In [35]:
# predict on the test set held out by setup(); the metrics table is displayed as output
# and the predictions are kept in `holdout_pred`
holdout_pred = predict_model(best)

Unnamed: 0,Model,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2
0,Theta Forecaster,0.4662,0.4499,308.2574,414.0053,0.0463,0.0476,0.9552


## Predicting

In [36]:
# generate a forecast for 36 periods into the future with the selected model
# (the forecast index in the output is daily)
predict_model(best, fh = 36)

Unnamed: 0,y_pred
2021-09-24,7111.4964
2021-09-25,3578.8096
2021-09-26,4123.7043
2021-09-27,7779.4459
2021-09-28,8493.7955
2021-09-29,8468.0194
2021-09-30,8467.1125
2021-10-01,7243.3989
2021-10-02,3645.0132
2021-10-03,4199.7866


In [37]:
# Same 36-period forecast, from the tuned theta model.
# NOTE(review): the rows shown in the saved output match the forecast of `best` — confirm
# whether tuning changed the model at all.
predict_model(tuned_theta, fh = 36)


Unnamed: 0,y_pred
2021-09-24,7111.4964
2021-09-25,3578.8096
2021-09-26,4123.7043
2021-09-27,7779.4459
2021-09-28,8493.7955
2021-09-29,8468.0194
2021-09-30,8467.1125
2021-10-01,7243.3989
2021-10-02,3645.0132
2021-10-03,4199.7866


## MLflow tracking

In [39]:
import pandas as pd
import mlflow.tracking

# Build a table with one row per MLflow run of the experiment: run id, the
# "model_name" param (if it was logged) and every logged metric.

# Replace with your experiment ID
experiment_id = "257795701779964022"

# Get all runs within the experiment (the API expects a list of experiment IDs)
client = mlflow.tracking.MlflowClient()
runs = client.search_runs(experiment_ids=[experiment_id])

# Collect one plain dict per run and build the DataFrame once at the end.
# Growing a DataFrame with `DataFrame.append` inside a loop is quadratic, and the
# method is deprecated (removed in pandas 2.0).
# search_runs() already returns full Run objects (params + metrics), so no extra
# get_run() round-trip per run is needed.
run_records = []
for run in runs:
    record = {
        "Run ID": run.info.run_id,
        # Assuming "model_name" is logged as a parameter; None when it is not
        "Model Name": run.data.params.get("model_name"),
    }
    record.update(run.data.metrics)  # capture all logged metrics
    run_records.append(record)

# Keep the two identifying columns even when the experiment has no runs
if run_records:
    df_runs = pd.DataFrame(run_records)
else:
    df_runs = pd.DataFrame(columns=["Run ID", "Model Name"])

# Replace missing metric values with "NA"
# (note: this turns metric columns that had gaps into object dtype)
df_runs = df_runs.fillna("NA")

# Show the dataframe
display(df_runs)


Unnamed: 0,Run ID,Model Name,aic,bic,centered_tss,condition_number,df_model,df_resid,ess,fvalue,...,rsquared,rsquared_adj,scale,ssr,uncentered_tss,training_mean_absolute_error,training_mean_squared_error,training_r2_score,training_root_mean_squared_error,training_score
0,23ddb3d9a5664e95aefbad471848ff04,,1022.437956,1027.151374,,,,,,,...,,,,,,,,,,
1,ab1ae28319f34c27aaf965bd9e690af9,,1022.437956,1027.151374,,,,,,,...,,,,,,,,,,
2,2277b92f1c294c61b0b9b8f9d49f8744,,1022.437956,1027.151374,,,,,,,...,,,,,,,,,,
3,ecb9342004dd4fb28b0cb06c510a887b,,1022.437956,1027.151374,,,,,,,...,,,,,,,,,,
4,918ab134e35e49858d5d736645293278,,1584.835013,1589.834633,334615208.9,105.697069,1.0,88.0,110702669.591015,43.507322,...,0.330836,0.323232,2544460.673966,223912539.308985,2549086581.0,,,,,
5,2081bde40e5b4dfdab69633c5e201c88,,1319.729608,1341.390081,231369581.02439,26158.572416,8.0,73.0,193728992.824041,46.96465,...,0.837314,0.819485,515624.495895,37640588.200349,231651494.0,,,,,
6,3fb19747eb62434dbf3f992885369726,,1244.26518,1277.078456,225249930.987013,29319.840747,13.0,63.0,192592595.951007,28.579593,...,0.855017,0.8251,518370.397397,32657335.036006,225550150.0,,,,,
7,73e59fc1ec46405aad1c457f4db10e53,,1251.090002,1281.559472,225249930.987013,29065.390263,12.0,64.0,188626890.679261,27.469322,...,0.837412,0.806926,572235.004809,36623040.307752,225550150.0,,,,,
8,dcc185933bc946299fcde56560e1d484,,1249.181658,1277.307323,225249930.987013,28777.381253,11.0,65.0,188583270.897905,30.391524,...,0.837218,0.80967,564102.462909,36666660.089108,225550150.0,,,,,
9,6c61e821af644b01b29c06f50070908e,,1247.498392,1273.280251,225249930.987013,28579.796848,10.0,66.0,188432134.547205,33.778558,...,0.836547,0.811781,557845.400603,36817796.439808,225550150.0,,,,,


## Model registration

- **Fetching the top 12 runs**

In [60]:
import mlflow

# Experiment whose runs are ranked (replace with your own experiment ID)
experiment_id = "257795701779964022"

# Metric used for the ranking; runs are sorted by it in descending order
metric_r2 = "rsquared"
ranking_clause = f"metric.{metric_r2} DESC"

# Ask MLflow for at most 12 runs, best first
top_12_best_run = mlflow.search_runs(
    experiment_ids=experiment_id,
    order_by=[ranking_clause],
    max_results=12,
)

# Show only the identifying columns and the headline metrics
summary_columns = ['run_id', 'artifact_uri', 'metrics.aic', 'metrics.bic', 'metrics.rsquared']
top_12_best_run.loc[:, summary_columns]

Unnamed: 0,run_id,artifact_uri,metrics.aic,metrics.bic,metrics.rsquared
0,3fb19747eb62434dbf3f992885369726,file:///c:/Users/zeesh/OneDrive%20-%20ORMAE/Do...,1244.26518,1277.078456,0.855017
1,bf8f85432f754329a6e22241ff2e7002,file:///c:/Users/zeesh/OneDrive%20-%20ORMAE/Do...,1244.26518,1277.078456,0.855017
2,73e59fc1ec46405aad1c457f4db10e53,file:///c:/Users/zeesh/OneDrive%20-%20ORMAE/Do...,1251.090002,1281.559472,0.837412
3,e6b548b070ac441a8cef4a78b25f6bc9,file:///c:/Users/zeesh/OneDrive%20-%20ORMAE/Do...,1251.090002,1281.559472,0.837412
4,2081bde40e5b4dfdab69633c5e201c88,file:///c:/Users/zeesh/OneDrive%20-%20ORMAE/Do...,1319.729608,1341.390081,0.837314
5,fd126a2edf744210803fab7529e53c6f,file:///c:/Users/zeesh/OneDrive%20-%20ORMAE/Do...,1319.729608,1341.390081,0.837314
6,dcc185933bc946299fcde56560e1d484,file:///c:/Users/zeesh/OneDrive%20-%20ORMAE/Do...,1249.181658,1277.307323,0.837218
7,c53607def6ea426bae2f8bae7acbbb9e,file:///c:/Users/zeesh/OneDrive%20-%20ORMAE/Do...,1249.181658,1277.307323,0.837218
8,6c61e821af644b01b29c06f50070908e,file:///c:/Users/zeesh/OneDrive%20-%20ORMAE/Do...,1247.498392,1273.280251,0.836547
9,0062b2b6cc2d41b7a812567acd3d6ca2,file:///c:/Users/zeesh/OneDrive%20-%20ORMAE/Do...,1247.498392,1273.280251,0.836547


- ### (a) Getting the run_id of the best run

In [46]:
import mlflow

# Experiment to search (replace with your own experiment ID)
experiment_id = "257795701779964022"

# Metric that defines "best"; the highest value wins
metric = "rsquared"

# Request only the top-ranked run, then take it out of the one-row result frame
best_run_frame = mlflow.search_runs(
    experiment_ids=experiment_id,
    order_by=[f"metric.{metric} DESC"],
    max_results=1,
)
best_run = best_run_frame.iloc[0]
best_run

run_id                                            3fb19747eb62434dbf3f992885369726
experiment_id                                                   257795701779964022
status                                                                    FINISHED
artifact_uri                     file:///c:/Users/zeesh/OneDrive%20-%20ORMAE/Do...
start_time                                        2024-03-07 09:38:43.033000+00:00
end_time                                          2024-03-07 09:38:51.335000+00:00
metrics.mse_model                                                  14814815.073154
metrics.f_pvalue                                                               0.0
metrics.condition_number                                              29319.840747
metrics.bic                                                            1277.078456
metrics.centered_tss                                              225249930.987013
metrics.llf                                                             -608.13259
metr

In [47]:
# Extract the identifier of the best run and display it
run_id = best_run["run_id"]
run_id

'3fb19747eb62434dbf3f992885369726'

- ### (b) Registering the model

In [None]:
# model_name='PyCaret_model'
# model_path=best_run.artifact_uri
# mlflow.register_model(f"{model_path}/model",model_name)
# time.sleep(15)

- ### (c) Changing stage to production

In [None]:
# from mlflow.tracking import MlflowClient

# model_version = 1  # replace with the actual model version
# client = MlflowClient()
# client.transition_model_version_stage(
#     name=model_name,
#     version=model_version,
#     stage="Production"
# )

# Testing the best model saved in the MLflow artifacts folder

In [None]:
# Load the best run's logged model through the MLflow pyfunc flavor and predict with it.
import mlflow
import pandas as pd  # imported before first use (it was previously imported after `pd.to_datetime`)
from pycaret.datasets import get_data

#...data preparation and setup
model_name='PyCaret_model'

# Input frame: the downloads dataset with parsed dates and the target column removed
test_set = (
    get_data('pycaret_downloads')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .drop(columns='Total')
)

# Point at the model logged by the best run instead of the 'runs:/xx/model'
# placeholder, which can never resolve. `run_id` is set in the
# "Getting top 1 best model run_id" section of this notebook.
logged_model = f'runs:/{run_id}/model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on a Pandas DataFrame.
# NOTE(review): the frame contains only the 'Date' column — confirm this matches
# the input the logged model expects.
loaded_model.predict(test_set)

In [45]:
import pickle
import pandas as pd

# First 20 rows of the downloads dataset, indexed by the parsed 'Date' column
first_rows = get_data('pycaret_downloads').head(20)
test_set = first_rows.assign(Date=pd.to_datetime(first_rows['Date'])).set_index('Date')

# Pickled model taken straight from the MLflow artifacts folder of one run.
# NOTE(review): absolute, machine-specific path; and pickle.load executes arbitrary
# code, so only load files you produced yourself.
model_path=r'C:\Users\zeesh\OneDrive - ORMAE\Documents\Projects\Self_Programming_Practice\Zee_github_projects\sales_forecasting\Pycaret_forecast\mlruns\257795701779964022\31224a34430b4bfc9fbf4be33a9e5acf\artifacts\model\model.pkl'
with open(model_path, 'rb') as pickle_handle:
    loaded_model = pickle.load(pickle_handle)

# Run the unpickled model on the prepared frame.
# NOTE(review): the saved output is the same value repeated 20 times — check that
# this artifact and this input are the intended ones.
predictions = loaded_model.predict(test_set)

# Display the predictions
predictions


Unnamed: 0,Date,Total
0,7/8/2021,3773
1,7/9/2021,3172
2,7/10/2021,2079
3,7/11/2021,2725
4,7/12/2021,4542


array([57.29213483, 57.29213483, 57.29213483, 57.29213483, 57.29213483,
       57.29213483, 57.29213483, 57.29213483, 57.29213483, 57.29213483,
       57.29213483, 57.29213483, 57.29213483, 57.29213483, 57.29213483,
       57.29213483, 57.29213483, 57.29213483, 57.29213483, 57.29213483])