In [0]:
!pip install holidays==0.24
!pip install prophet==1.1.2

In [0]:
# Restart the notebook's Python process so the packages installed in the
# previous cell are the ones that get imported from here on.
# NOTE: this clears all Python state, so it must run before any other cell.
dbutils.library.restartPython()

In [0]:
%sql
-- Create the catalog used by this notebook unless it is already present.
CREATE CATALOG IF NOT EXISTS forecasting_poc

In [0]:
%sql
-- Make forecasting_poc the current catalog for the statements that follow.
USE CATALOG forecasting_poc

In [0]:
%sql
-- Create the schema that will hold this notebook's tables unless it is already present.
CREATE SCHEMA IF NOT EXISTS prophet

In [0]:
%sql
-- Make prophet the current schema so the unqualified table names below resolve in it.
USE prophet

In [0]:
import pandas as pd
from prophet import Prophet
import matplotlib.pyplot as plt

# Location of the example CSV in Prophet's GitHub repository
url = "https://raw.githubusercontent.com/facebook/prophet/master/examples/example_wp_log_peyton_manning.csv"

# Read the CSV into pandas first, then hand the frame over to Spark
df = pd.read_csv(url)
spark_df = spark.createDataFrame(df)

# Persist as table time_series_bronze, replacing any copy from a previous run
(
    spark_df.write
    .mode('overwrite')
    .saveAsTable("time_series_bronze")
)

In [0]:
%sql
-- Preview the bronze table written by the previous cell.
SELECT * FROM time_series_bronze

In [0]:
import pandas as pd
import numpy as np

# Load the base dataset from the bronze table into pandas
base_df = spark.table("time_series_bronze").toPandas()

# Ensure the 'ds' column is in datetime format
base_df['ds'] = pd.to_datetime(base_df['ds'])

# Today's date, normalized to midnight so the shift below is a whole number of days
# for midnight-aligned input dates.
# NOTE: a later cell imports pyspark's `current_date`, which rebinds this name.
current_date = pd.to_datetime("today").normalize()

# Shift every date by the same offset so that the latest date in base_df becomes today
max_base_date = base_df['ds'].max()
date_shift = current_date - max_base_date
base_df['ds'] = base_df['ds'] + date_shift

# Generate `num_series` time series by replicating the base data with slight
# modifications on 'y'
num_series = 150

# Seeded generator: re-running the notebook reproduces the same noise
rng = np.random.default_rng(42)

dfs = []
for i in range(1, num_series + 1):
    df_copy = base_df.copy()
    df_copy['time_series'] = f"time_series_{i}"
    # Add small random noise (mean=0, std=0.1) to slightly vary the target variable,
    # then round to the nearest hundredth
    noise = rng.normal(loc=0, scale=0.1, size=len(df_copy))
    df_copy['y'] = (df_copy['y'] + noise).round(2)
    dfs.append(df_copy)

# Combine all replicated series into one DataFrame
df_multigrain = pd.concat(dfs, ignore_index=True)

# Convert the pandas DataFrame to a Spark DataFrame
spark_df = spark.createDataFrame(df_multigrain)

# Save the Spark DataFrame as a table called time_series_curated (overwrite if exists)
spark_df.write.mode("overwrite").saveAsTable("time_series_curated")

In [0]:
%sql
-- Preview the curated multi-series table written by the previous cell.
SELECT * FROM time_series_curated

In [0]:
from mlflow.models.signature import infer_signature
from prophet import Prophet
from pyspark.sql import SparkSession
import mlflow
import pandas as pd
from pyspark.sql.functions import col

In [0]:
from pyspark.sql.functions import col, lit, concat
import pyspark.sql.functions as F

# Daily history per series: one row per (id, calendar date) with y summed,
# restricted to dates on or after 2021-01-01.
#
# The GROUP BY spells out the grouping expressions instead of reusing the
# select aliases: `ds` is both a source column and an alias here, and a bare
# `GROUP BY ds` would be resolved against the source column rather than the
# casted date (NOTE(review): confirm against Spark's alias-resolution rules).
sql_statement = '''
  SELECT
    time_series AS id,
    CAST(ds AS date) AS ds,
    SUM(y) AS y
  FROM time_series_curated
  WHERE ds >= '2021-01-01'
  GROUP BY time_series, CAST(ds AS date)
  ORDER BY id, ds
  '''

# Lazy Spark DataFrame; nothing is executed until an action is triggered
id_history = spark.sql(sql_statement)

In [0]:
from pyspark.sql.types import *
 
# Columns (and their Spark types) of the frame returned by the per-series
# forecasting function; this schema is what gets passed to applyInPandas.
result_schema = StructType([
  StructField(column_name, column_type)
  for column_name, column_type in (
    ('ds', DateType()),
    ('id', StringType()),
    ('y', FloatType()),
    ('yhat', FloatType()),
    ('yhat_upper', FloatType()),
    ('yhat_lower', FloatType()),
  )
])

In [0]:
def forecast( history_pd: pd.DataFrame ) -> pd.DataFrame:
  """Fit a Prophet model on one series and return its history plus a 365-day forecast.

  Parameters
  ----------
  history_pd : pd.DataFrame
    Rows of a single series with columns 'id', 'ds' and 'y'. Rows containing
    missing values are dropped before fitting.

  Returns
  -------
  pd.DataFrame
    Columns 'ds', 'id', 'y', 'yhat', 'yhat_upper', 'yhat_lower': one row per
    date in the history and per day of the 365-day horizon. 'y' is NaN on
    dates that have no observed value; 'id' is the series id on every row.

  NOTE(review): this function is handed to applyInPandas, which is understood
  to choose its calling convention from the number of positional parameters,
  so keep the single-argument signature -- confirm against the PySpark docs.
  """
  history_pd = history_pd.dropna()

  # 'ds' may arrive as plain date objects (the upstream query casts it to date).
  # Convert it explicitly so the join against Prophet's datetime output below
  # does not depend on pandas promoting the index implicitly.
  history_pd = history_pd.assign( ds=pd.to_datetime(history_pd['ds']) )

  # configure the model
  model = Prophet(
    interval_width=0.95,
    growth='linear',
    daily_seasonality=False,
    weekly_seasonality=True,
    yearly_seasonality=True,
    seasonality_mode='multiplicative'
    )

  # train the model
  model.fit( history_pd )

  # make predictions over the history plus the next 365 days
  # ('D' is the documented pandas alias for calendar-day frequency; newer
  # pandas releases warn on the lowercase spelling)
  future_pd = model.make_future_dataframe(
    periods=365,
    freq='D',
    include_history=True
    )
  forecast_pd = model.predict( future_pd )

  # get relevant fields from forecast
  f_pd = forecast_pd[ ['ds', 'yhat', 'yhat_upper', 'yhat_lower'] ].set_index('ds')

  # get the observed values from history ('id' is filled in afterwards for all rows)
  h_pd = history_pd[ ['ds', 'y'] ].set_index('ds')

  # left join keeps every forecast date; 'y' stays NaN where there is no history
  results_pd = f_pd.join( h_pd, how='left' ).reset_index()

  # stamp the series id from the incoming data set on every row, future dates included
  results_pd['id'] = history_pd['id'].iloc[0]

  # return expected dataset
  return results_pd[ ['ds', 'id', 'y', 'yhat', 'yhat_upper', 'yhat_lower'] ]

In [0]:
from pyspark.sql.functions import current_date
 
# Run `forecast` once per `id` group and stamp each output row with the
# date on which this run was made.
results = (
  id_history
    .groupBy('id')
      .applyInPandas(forecast, schema=result_schema)
    .withColumn('training_date', current_date() )
    )

# Persist the forecasts, replacing the output of any previous run
results.write.mode("overwrite").saveAsTable("forecast")

# `results` is lazy, so displaying it directly would execute the whole plan a
# second time and refit every model. Show the table that was just written.
display(spark.table("forecast"))