## Multiple Time Series Model Using Apache Spark and Facebook Prophet


### Yogesh Awdhut Gadade

#### Goal: Analysis and forecasting of multiple time series.

### Platform: Google Colab. Databricks or AWS can be used as well.

In [None]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from fbprophet import Prophet
# Notebook-wide matplotlib defaults: (15, 8) figures with the axes grid switched off.
plt.rcParams.update({
    'figure.figsize': (15, 8),
    'axes.grid': False,
})

In [None]:
# Install PySpark into the notebook runtime (`!` hands the line to the shell; IPython-only syntax).
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession
import pyspark
# Start (or reuse) a local Spark session.  'local[*]' gives Spark one worker
# thread per available core; plain 'local' runs everything on a single thread,
# so the per-store forecasts further down could never execute in parallel.
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [None]:
# Load the sales CSV into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer="/content/sample_data/weekly_sales_data.csv")
# (rows, columns) of the raw data
df.shape

In [None]:
# Inspect the dtype pandas inferred for each CSV column.
df.dtypes

In [None]:
# Convert the 'date' column to datetime values.  The former
# `infer_datetime_format=True` argument is dropped on purpose: pandas 2.0
# deprecated it (it only triggers a warning now) because strict format
# inference is the default behaviour of `to_datetime`.
df['date'] = pd.to_datetime(df['date'])
df.head()

In [None]:
# Index the frame by date so plots get a time axis, then chart the sales of store 25.
item_df = df.set_index('date')
(
    item_df
    .query('store_id == 25')
    .loc[:, ['sales']]
    .plot()
)
plt.show()

In [None]:
# Same chart for store 41, with the figure size passed explicitly.
(
    item_df
    .query('store_id == 41')
    .loc[:, ['sales']]
    .plot(figsize=(15, 8))
)
plt.show()

In [None]:
# Hand the pandas frame over to Spark; the following cells work on this Spark DataFrame.
sdf = spark.createDataFrame(df)
sdf.printSchema()  # column names and the Spark data type of each column
sdf.show(5)  # print the first 5 rows (Spark counterpart of pandas' head)
sdf.count()  # total number of rows (original note: 500 records)

In [None]:
# Rows per store, computed with the DataFrame API.
(
    sdf
    .select('store_id')
    .groupBy('store_id')
    .agg({'store_id': 'count'})
    .show()
)

In [None]:
# Register the Spark DataFrame as the SQL view "sales" and repeat the
# per-store row count in SQL, ordered by store id.
sdf.createOrReplaceTempView("sales")
(
    spark
    .sql("select store_id, count(*) from sales group by store_id order by store_id")
    .show()
)

In [None]:
# Total sales per store and date, sorted by store and date.  The aliases
# rename date -> ds and sum(sales) -> y, which are the column names the
# forecasting function further down reads.
sql = "SELECT store_id, date as ds, sum(sales) as y FROM sales GROUP BY store_id, ds ORDER BY store_id, ds"
spark.sql(sql).show()  # preview the aggregated rows

In [None]:
# Repartition the aggregated rows by store_id so that all rows of one store
# sit in the same partition, and cache the result for reuse.
store_part = (
    spark.sql(sql)
    .repartition(spark.sparkContext.defaultParallelism, ['store_id'])
    .cache()
)
# Show the plan of the DataFrame that was just built.  The previous
# `sdf.explain()` printed the plan of the raw input frame instead, which
# contains neither the aggregation nor the repartition step.
store_part.explain()

In [None]:
from pyspark.sql.types import *
# Schema of the rows returned by the per-store forecast: timestamp, store id,
# observed value (y) and the prediction with its upper/lower bounds.
result_schema = (
    StructType()
    .add('ds', TimestampType())
    .add('store_id', IntegerType())
    .add('y', DoubleType())
    .add('yhat', DoubleType())
    .add('yhat_upper', DoubleType())
    .add('yhat_lower', DoubleType())
)

In [None]:
from pyspark.sql.functions import pandas_udf, PandasUDFType
# NOTE: PandasUDFType.GROUPED_MAP is deprecated since Spark 3.0 in favour of
# `groupBy(...).applyInPandas(func, schema)`; the decorator form is kept here
# because the calling cell still uses `.apply(forecast_sales)`.
@pandas_udf(result_schema, PandasUDFType.GROUPED_MAP)
def forecast_sales(store_pd):
    """Fit a Prophet model on one store's history and forecast 5 weeks ahead.

    Runs as a grouped-map pandas UDF, i.e. it is called once per group with
    that group's rows as a pandas DataFrame.

    Args:
        store_pd: pandas DataFrame holding the rows of a single store with
            the columns ``ds``, ``store_id`` and ``y``.

    Returns:
        pandas DataFrame with the columns ``ds``, ``store_id``, ``y``,
        ``yhat``, ``yhat_upper`` and ``yhat_lower`` (matching
        ``result_schema``).  It contains one row per historical date plus the
        5 forecast periods; ``y`` is NaN on rows that have no matching
        historical observation.
    """
    model = Prophet(
        interval_width=0.95,
        seasonality_mode='multiplicative',
        weekly_seasonality=True,
        yearly_seasonality=True,
    )
    model.fit(store_pd)

    # 'W' is the canonical pandas alias for weekly frequency; the lowercase
    # spelling 'w' is a legacy form that recent pandas releases deprecate.
    future_pd = model.make_future_dataframe(periods=5, freq='W')
    forecast_pd = model.predict(future_pd)

    # Left-join the observations onto the forecast by date so the future rows
    # are kept (with missing y) next to the fitted history.
    f_pd = forecast_pd[['ds', 'yhat', 'yhat_upper', 'yhat_lower']].set_index('ds')
    st_pd = store_pd[['ds', 'store_id', 'y']].set_index('ds')
    result_pd = f_pd.join(st_pd, how='left')
    result_pd.reset_index(level=0, inplace=True)

    # The join leaves store_id empty on the future rows; fill the whole column
    # with this group's store id.
    result_pd['store_id'] = store_pd['store_id'].iloc[0]
    return result_pd[['ds', 'store_id', 'y', 'yhat', 'yhat_upper', 'yhat_lower']]

In [None]:
from pyspark.sql.functions import current_date
# Run forecast_sales once per store, stamp every row with the current date
# and cache the outcome (cache() returns the same DataFrame it is called on).
results = (
    store_part
    .groupby('store_id')
    .apply(forecast_sales)
    .withColumn('training_date', current_date())
    .cache()
)
results.show()

In [None]:
# Spark DataFrames are immutable: coalesce() returns a new DataFrame, so the
# result must be bound to a name or the call has no effect at all.
results = results.coalesce(1)
print(results.count())
# Register the forecasts as the SQL view "forecasted" and count rows per store.
results.createOrReplaceTempView('forecasted')
spark.sql("SELECT store_id, count(*) FROM  forecasted GROUP BY store_id").show()

In [None]:
# Collect the Spark forecast results into a single pandas DataFrame.
final_df = results.toPandas()

In [None]:
# Use the timestamp column as the index so the plots below get a time axis.
final_df = final_df.set_index('ds')

In [None]:
# One chart per store: observed values (y) against the model output (yhat).
for store_id in final_df.store_id.unique():
    store_filter = 'store_id == {}'.format(store_id)
    final_df.query(store_filter).loc[:, ['y', 'yhat']].plot()
    plt.show()

Sources:



* https://towardsdatascience.com/implementing-facebook-prophet-efficiently-c241305405a3
* https://www.analyticsvidhya.com/blog/2022/01/apache-spark-and-facebook-prophet/

 