In [16]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from statsmodels.tsa.statespace.sarimax import SARIMAX
import matplotlib.pyplot as plt

In [2]:
df_cleaned = pd.read_csv('./df_cleaned.csv')

In [3]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34720691 entries, 0 to 34720690
Data columns (total 7 columns):
 #   Column      Dtype  
---  ------      -----  
 0   Unnamed: 0  int64  
 1   item_id     int64  
 2   store_id    int64  
 3   value       float64
 4   date        object 
 5   wm_yr_wk    int64  
 6   sell_price  float64
dtypes: float64(2), int64(4), object(1)
memory usage: 1.8+ GB


In [4]:
calendar_events = pd.read_csv('./calendar_events.csv')

In [5]:
calendar_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167 entries, 0 to 166
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        167 non-null    object
 1   event_name  167 non-null    object
 2   event_type  167 non-null    object
dtypes: object(3)
memory usage: 4.0+ KB


In [6]:
df_cleaned['date'] = pd.to_datetime(df_cleaned['date'], format='%Y-%m-%d')
df_cleaned['sell_price'] = df_cleaned['sell_price'].fillna(0)
calendar_events['date'] = pd.to_datetime(df_cleaned['date'], format='%Y-%m-%d')
df_cleaned = pd.merge(df_cleaned, calendar_events, left_on='date', right_on='date', how='left')

In [7]:
df_cleaned['rev'] = df_cleaned['value'] * df_cleaned['sell_price']

In [8]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34720691 entries, 0 to 34720690
Data columns (total 10 columns):
 #   Column      Dtype         
---  ------      -----         
 0   Unnamed: 0  int64         
 1   item_id     int64         
 2   store_id    int64         
 3   value       float64       
 4   date        datetime64[ns]
 5   wm_yr_wk    int64         
 6   sell_price  float64       
 7   event_name  object        
 8   event_type  object        
 9   rev         float64       
dtypes: datetime64[ns](1), float64(3), int64(4), object(2)
memory usage: 2.8+ GB


In [9]:
column_to_drop = 'Unnamed: 0'
df_cleaned = df_cleaned.drop(column_to_drop, axis=1)
df_cleaned

Unnamed: 0,item_id,store_id,value,date,wm_yr_wk,sell_price,event_name,event_type,rev
0,1437,0,0.0,2013-07-13,11325,9.58,SuperBowl,Sporting,0.0
1,1437,0,0.0,2013-07-14,11325,9.58,ValentinesDay,Cultural,0.0
2,1437,0,0.0,2013-07-15,11325,9.58,PresidentsDay,National,0.0
3,1437,0,0.0,2013-07-16,11325,9.58,LentStart,Religious,0.0
4,1437,0,0.0,2013-07-17,11325,9.58,LentWeek2,Religious,0.0
...,...,...,...,...,...,...,...,...,...
34720686,1436,9,3.0,2015-04-14,11511,1.00,,,3.0
34720687,1436,9,0.0,2015-04-15,11511,1.00,,,0.0
34720688,1436,9,4.0,2015-04-16,11511,1.00,,,4.0
34720689,1436,9,0.0,2015-04-17,11511,1.00,,,0.0


In [10]:
label_encoders = {}
categorical_cols = ['item_id', 'store_id']
for col in categorical_cols:
    le = LabelEncoder()
    df_cleaned[col] = le.fit_transform(df_cleaned[col])
    label_encoders[col] = le

In [11]:
time_series_df = df_cleaned.set_index('date')
daily_revenue = time_series_df.resample('D').sum()

  daily_revenue = time_series_df.resample('D').sum()


In [39]:
def forecast_total_revenue(input_date):
    
    # SARIMA Model Training
    order = (1, 1, 1)  # Non-seasonal order
    seasonal_order = (1, 1, 1, 7)  # Seasonal order with a 7-day period
    model = SARIMAX(daily_revenue['rev'], order=order, seasonal_order=seasonal_order)
    model_fit = model.fit(disp=False)
    
    # Create a date range for the next 7 days
    forecast_start_date = input_date
    forecast_end_date = input_date + pd.DateOffset(days=6)
    forecast_dates = pd.date_range(forecast_start_date, forecast_end_date)
    
    total_revenue = 0  # Initialize total revenue
    
    # Forecast revenue for each day in the date range
    for forecast_date in forecast_dates:
        # Create a new SARIMA model for each forecast
        model = SARIMAX(daily_revenue['rev'], order=order, seasonal_order=seasonal_order)
        model_fit = model.fit(disp=False)
        
        # Forecast revenue for the current day
        forecast = model_fit.get_forecast(steps=1)
        forecasted_value = forecast.predicted_mean[0]
        
        # Add the forecasted value to total revenue
        total_revenue += forecasted_value
    
    return total_revenue

In [41]:
input_date = pd.to_datetime('2020-1-30', format='%Y-%m-%d')  # Replace with your desired input date
total_revenue = forecast_total_revenue(input_date)

# Print the total forecasted revenue
print(f"Total Forecasted Revenue for the Next 7 Days: ${total_revenue:.2f}")

Total Forecasted Revenue for the Next 7 Days: $962692.41
