In [1]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from datetime import datetime, timedelta

## Exploratory Data Analysis

In [5]:
# Read data
oil_df = pd.read_csv('../data/oil.csv')
holiday_df = pd.read_csv('../data/holidays_events.csv')
stores_df = pd.read_csv('../data/stores.csv')
transactions_df = pd.read_csv('../data/transactions.csv')
train_df = pd.read_csv('../data/train.csv')

In [3]:
oil_df.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [6]:
holiday_df.head()

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False


In [8]:
stores_df.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [9]:
transactions_df.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [10]:
train_df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


## Forecast Development

In [43]:
import pandas as pd
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from datetime import datetime, timedelta
from sklearn.metrics import mean_absolute_error, mean_squared_error
from joblib import Parallel, delayed
import warnings
import os

warnings.filterwarnings('ignore')

In [81]:
def read_and_preprocess_data(file_path):
    df = pd.read_csv(file_path)
    return df

# Function to split data into training and test sets
def split_data(df, test_size=30, max_train_size=365):
    df['Date'] = pd.to_datetime(df['Date'])
    train = df[df['Date'] < df['Date'].max() - pd.Timedelta(days=test_size)]
    train = train[train['Date'] >= train['Date'].max() - pd.Timedelta(days=max_train_size)]
    test = df[df['Date'] >= df['Date'].max() - pd.Timedelta(days=test_size)]
    return train, test

# Fallback forecasting method
def simple_moving_average(data, window=7):
    return data.rolling(window=window, min_periods=1).mean().iloc[-1]

# Function to train model and forecast for a single group
def train_and_forecast_group(name, train_data, test_data):
    if len(train_data) < 14:  # Require at least 14 data points for forecasting
        forecast = pd.Series([train_data['Sales'].mean()] * len(test_data), index=test_data.index)
    else:
        try:
            model = ExponentialSmoothing(train_data['Sales'], 
                                         seasonal_periods=7, 
                                         trend='add', 
                                         seasonal='add').fit()
            forecast = model.forecast(len(test_data))
            
            # If forecast contains NaN, use fallback method
            if forecast.isnull().any():
                forecast = pd.Series([simple_moving_average(train_data['Sales'])] * len(test_data), 
                                     index=test_data.index)
        except:
            # If model fitting fails, use fallback method
            forecast = pd.Series([simple_moving_average(train_data['Sales'])] * len(test_data), 
                                 index=test_data.index)
    
    # Ensure non-negative forecasts
    forecast = forecast.clip(lower=0)
    
    result = pd.DataFrame({
        'Date': test_data.index,
        'Forecast': forecast.values,
        'Actual': test_data['Sales'].values
    })
    for i, col in enumerate(name):
        result[f'Group_{i+1}'] = col
    
    return result

# Main function to generate forecasts
def generate_forecasts(train_df, test_df, group_columns):
    grouped_train = train_df.groupby(group_columns)
    grouped_test = test_df.groupby(group_columns)
    
    forecasts = Parallel(n_jobs=-1)(
        delayed(train_and_forecast_group)(name, 
                                          train_group.set_index('Date'), 
                                          grouped_test.get_group(name).set_index('Date'))
        for name, train_group in grouped_train
        if name in grouped_test.groups
    )
    
    return pd.concat(forecasts, ignore_index=True)

# Function to evaluate forecasts
def evaluate_forecasts(forecasts):
    mae = mean_absolute_error(forecasts['Actual'], forecasts['Forecast'])
    rmse = np.sqrt(mean_squared_error(forecasts['Actual'], forecasts['Forecast']))
    mape = np.mean(np.abs((forecasts['Actual'] - forecasts['Forecast']) / forecasts['Actual'].clip(lower=1e-5))) * 100
    return mae, rmse, mape

In [82]:
# Main execution
if __name__ == "__main__":
    # Read the data
    full_train_data_df = read_and_preprocess_data('../data/train.csv')
    print("read in data complete")
    full_train_data_df.rename({'date':'Date', 'sales': 'Sales'}, axis=1, inplace=True)
    full_train_data_df.columns
    
    # Split into training and test sets
    train_df, test_df = split_data(full_train_data_df, 30)
    print("split data complete")
    
    # Generate forecasts
    group_columns = ['store_nbr', 'family']  # Replace with your grouping columns
    forecasts = generate_forecasts(train_df, test_df, group_columns)
    print("generate forecasts complete")
    
    # Save results to CSV file
    # forecasts.to_csv('forecasts_with_actual.csv', index=False)
    # print("\nForecasts have been saved to 'forecasts_with_actual.csv'.")

read in data complete
split data complete
generate forecasts complete
Sample of forecasts:
        Date  Forecast  Actual  Group_1     Group_2
0 2017-07-16  3.318018     2.0        1  AUTOMOTIVE
1 2017-07-17  2.880604     2.0        1  AUTOMOTIVE
2 2017-07-18  3.839021     3.0        1  AUTOMOTIVE
3 2017-07-19  4.474109     7.0        1  AUTOMOTIVE
4 2017-07-20  4.069853     4.0        1  AUTOMOTIVE
5 2017-07-21  4.899122    10.0        1  AUTOMOTIVE
6 2017-07-22  4.581126     8.0        1  AUTOMOTIVE
7 2017-07-23  3.319247     0.0        1  AUTOMOTIVE
8 2017-07-24  2.881833     4.0        1  AUTOMOTIVE
9 2017-07-25  3.840250    10.0        1  AUTOMOTIVE

Evaluation Metrics:
Mean Absolute Error: 77.86
Root Mean Squared Error: 284.47
Mean Absolute Percentage Error: 866891.64%


## Output Evaluation

In [None]:
if __name__ == "__main__":
    
    # Evaluate forecasts
    mae, rmse, mape = evaluate_forecasts(forecasts)

    # Print results
    print("Sample of forecasts:")
    print(forecasts.head(10))
    print(f"\nEvaluation Metrics:")
    print(f"Mean Absolute Error: {mae:.2f}")
    print(f"Root Mean Squared Error: {rmse:.2f}")
    print(f"Mean Absolute Percentage Error: {mape:.2f}%")


## Data Visualization

In [84]:

import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import os

In [106]:
def plot_forecasts(forecasts, group_columns, output_dir='forecast_plots'):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for name, group in forecasts.groupby(group_columns):
        fig, ax = plt.subplots(figsize=(12, 6))
        
        ax.plot(group['Date'], group['Actual'], label='Actual', color='blue')
        ax.plot(group['Date'], group['Forecast'], label='Forecast', color='red', linestyle='--')
        
        output_name = str(name[0]) + "_" + name[1].replace('/', '')
        ax.set_title(f"Forecast vs Actual for {output_name}")
        ax.set_xlabel('Date')
        ax.set_ylabel('Sales')
        ax.legend()
        
        ax.xaxis.set_major_formatter(DateFormatter("%Y-%m-%d"))
        plt.xticks(rotation=45)
        plt.tight_layout()
        
        group_name = '_'.join(str(n) for n in name)
        plt.savefig(os.path.join(output_dir, f'forecast_{output_name}.png'))
        plt.close()

In [104]:
def plot_forecasts(forecasts, group_columns, output_dir='forecast_plots'):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    for name, group in forecasts.groupby(group_columns):
        output_name = str(name[0]) + "_" + name[1].replace('/', '')
        print(output_name)
        # print(group)

In [107]:
plot_forecasts(forecasts, ['Group_1', 'Group_2'])