In [775]:
# Import required libraries
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from datetime import datetime
import matplotlib.pyplot as plt
import warnings
from sklearn.metrics import mean_absolute_error

# Suppress warnings for cleaner outputs.
# NOTE(review): this filter is global and has no category restriction, so every warning
# raised by the cells below is hidden as well — consider narrowing it.
warnings.filterwarnings("ignore")

# Ensure plots are displayed inline
%matplotlib inline

### Prepare the Dataset

In [None]:
# Load the dataset and index it by date.
df = pd.read_excel('Dataset_palm_oil_forecasting.xlsx')
df['Date'] = pd.to_datetime(df['Date'])
# The label-based slices below rely on a sorted date index.
df = df.set_index('Date').sort_index()

# Split the dataset into training and testing sets.
train_end_date = '2023-12-01'
# Testing starts one month after training ends ...
testing_start_date = (pd.to_datetime(train_end_date) + pd.DateOffset(months=1)).strftime('%Y-%m-%d')
# ... and spans 12 months in total: the start month plus the 11 following ones
# (label slices on the date index include both end points).
testing_end_date = (pd.to_datetime(testing_start_date) + pd.DateOffset(months=11)).strftime('%Y-%m-%d')

train_data = df.loc[:train_end_date]  # Training data
test_data = df.loc[testing_start_date:testing_end_date]  # Testing data

# Load additional areas data for later calculations.
# The dates are parsed exactly like those of `df`, because the areas table is later merged
# onto the forecast comparison table index-on-index and both indexes must be datetimes.
areas = pd.read_excel('Oil_Production_Raw_Data.xlsx', sheet_name='Area')
areas['Date'] = pd.to_datetime(areas['Date'])
areas = areas.set_index('Date')

### Load and Align Exogenous Variables

In [777]:
# Exogenous regressors used by this run.
# Alternative feature sets (kept for reference, currently disabled):
#   ['Potash_Qty_kg/ha', 'Days_temp_below_weighted_avg', 'Days_temp_above_weighted_avg', 'Evapo_weighted_avg']
#   ['Evapo_weighted_avg']
#   ['Potash_Qty_kg/ha', 'Days_temp_below_weighted_avg', 'Days_temp_above_weighted_avg', 'Precip_sum_weighted_avg']
#   ['Potash_Qty_kg/ha', 'Days_temp_below_weighted_avg', 'Days_temp_above_weighted_avg']
#   ['Precip_sum_weighted_avg']
#   ['Evapo_weighted_avg', 'Potash_Qty_kg/ha']
#   ['Precip_sum_weighted_avg', 'Potash_Qty_kg/ha']
exog_columns = [
    'Evapo_weighted_avg',
    'Days_temp_below_weighted_avg',
    'Days_temp_above_weighted_avg',
]
exog_data = df[exog_columns]

# Cut the regressors into the same training / testing windows as the target series.
exog_train = exog_data.loc[:train_end_date]
exog_test = exog_data.loc[testing_start_date:testing_end_date]

### Forecast Exogenous Variables

In [778]:
# Basic SARIMA configuration shared by all exogenous-variable models.
order = (1, 1, 1)  # ARIMA components (p, d, q)
seasonal_order = (0, 1, 1, 12)  # Seasonal components (P, D, Q, m); m=12 for monthly seasonality

# Fit one univariate SARIMA model per exogenous variable and forecast it 12 steps ahead.
#
# The models are fitted on the training window only (`exog_train`). Fitting on the complete
# series would (a) leak test-period observations into the regressors used for the final
# forecast and (b) make the 12-step forecast start after the last row of the dataset, even
# though the values are indexed from `testing_start_date` below.
forecasted_exog = {}
for var in exog_data.columns:
    try:
        model = SARIMAX(exog_train[var],
                        order=order,
                        seasonal_order=seasonal_order)
        fit = model.fit(disp=False)

        # Forecast the 12 months that follow the end of the training window
        forecasted_exog[var] = fit.forecast(steps=12).values
    except Exception as e:
        # A missing regressor forecast would leave a column out of the table built below,
        # so report which variable failed and stop here instead of continuing.
        print(f"Failed to forecast {var}: {e}")
        raise

# Collect the forecasts in a DataFrame indexed by the first day of each forecast month
forecasted_exog_df = pd.DataFrame(forecasted_exog,
                                  index=pd.date_range(start=testing_start_date, periods=12, freq='MS'))

forecasted_exog_df

In [779]:
# Draw one figure per exogenous variable: observed series in black, SARIMA forecast in red.
for var in exog_data.columns:
    try:
        # Complete observed series of the variable (no rows are excluded here)
        observed = exog_data[var]

        fig, ax = plt.subplots(figsize=(12, 6))

        # Observed values
        ax.plot(observed.index, observed, label=f'Actual {var}', color='black')

        # Forecasted values
        ax.plot(forecasted_exog_df.index, forecasted_exog_df[var], label=f'Forecasted {var}', color='red')

        # Chart styling
        ax.set_title(f'SARIMA: Forecast vs Actuals for {var}')
        ax.set_xlabel('Date')
        ax.set_ylabel(f'{var} Level')
        ax.legend()
        ax.grid(True)

        # Show the figure
        plt.show()

    except Exception as e:
        print(f"Failed to plot {var}: {e}")

### Visualize ACF and PACF

##### This code visualizes the Autocorrelation Function (ACF) and Partial Autocorrelation Function (PACF) for the training data to help determine SARIMA parameters.

In [780]:
# ACF (top panel) and PACF (bottom panel) of the training target, 20 lags each
fig, (acf_ax, pacf_ax) = plt.subplots(2, 1, figsize=(12, 8))
palm_oil_train = train_data['Palm_Oil']

plot_acf(palm_oil_train, lags=20, ax=acf_ax, title="ACF - Autocorrelation Function")
acf_ax.grid(True)

plot_pacf(palm_oil_train, lags=20, ax=pacf_ax, title="PACF - Partial Autocorrelation Function")
pacf_ax.grid(True)

fig.tight_layout()
plt.show()

### Train SARIMA Model
##### Train the SARIMAX model (SARIMA with exogenous regressors) using the specified non-seasonal parameters (p, d, q) and seasonal parameters (P, D, Q, m), then forecast values for the next 12 months.

In [781]:
# Initialize the results list.
# The "Store Results" cell further down adds one dict per forecast month to this list;
# re-run this cell before re-running that one, otherwise the rows accumulate.
results = []

#### SARIMA Parameters Breakdown for Palm Oil Production Data

##### 1. Non-Seasonal Parameters (`order=(3, 1, 11)`):
These parameters account for **trends** and **short-term patterns** in the data.

###### **p=3 (Auto-Regressive Order):**
- This means the model incorporates the last **3 previous palm oil production values (lags)** to predict the current production.  
- **Example:** To predict the current month's production, the values for the last 3 months will be used (e.g., production in January, February, and March is used to predict April's production).

###### **d=1 (Differencing):**
- Differencing helps remove **trends** from the data and makes the time series stationary.  
- Differencing of `d=1` means the model uses the **first difference** of the palm oil data:  
  $Y^{\text{diff}}_t = Y_t - Y_{t-1}$.  
- **Example:** To predict April's production, the model considers the difference in production between March and February instead of the absolute values.

###### **q=11 (Moving Average Order):**
- This parameter refers to how the model uses **past forecast errors** (residuals) to predict the current value. Specifically, it uses the **last 11 residuals (errors)** from previous months for corrections in the forecast.  
- **Example:** Since production may have monthly fluctuations due to various factors like weather, pests, or market demand, errors from the past 11 months are considered.

---

##### 2. Seasonal Parameters (`seasonal_order=(2, 1, 2, 12)`):
These parameters account for **seasonal influences** (patterns that happen every 12 months).

###### **P=2 (Seasonal Auto-Regressive Order):**
- This means that the model considers palm oil production values from **the last 2 seasonal periods** to make predictions.  
- **Example:** To predict April 2025, the model may use production from April 2024 and April 2023 (12 and 24 months ago).

###### **D=1 (Seasonal Differencing):**
- Differencing is applied at the **seasonal level** to eliminate variations across years.  
- Seasonal differencing of `D=1` computes the **difference** between **same months** across consecutive years:  
  $Y^{\text{seasonal diff}}_t = Y_t - Y_{t-m}$, where $m = 12$.  
- **Example:** Instead of modeling April 2025's production level directly, the model works with the year-over-year change (April 2025 minus April 2024): it forecasts that change and adds it back to April 2024's value.

###### **Q=2 (Seasonal Moving Average Order):**
- This means the model incorporates **seasonal forecast errors** from the **last 2 seasonal periods** into its predictions.  
- **Example:** Errors from April 2024 and April 2023 will be considered to predict April 2025.

###### **m=12 (Seasonal Cycle Length):**
- Indicates the length of the **seasonal cycle** is **12 months**.  
- **Explanation:** Palm oil production exhibits clear annual seasonality due to factors like harvest seasons, pests, rainfall patterns, and other yearly cycles.

---

##### Summary
The SARIMA model effectively combines **short-term patterns** from recent months and **long-term seasonal influences** tied to palm oil's annual production cycles to generate accurate forecasts.


In [782]:
# Specify SARIMA parameters
order = (3, 1, 11)  # p, d, q
seasonal_order = (2, 1, 2, 12)  # P, D, Q, m (assumes monthly seasonality)

# Exogenous regressors: observed values for the training window, forecasted values for the horizon
exog_train = exog_data.loc[:train_data.index[-1]]
exog_forecast = forecasted_exog_df

# Fail early with a clear message if a training regressor has no forecast, and put the forecast
# columns in the same order as the training columns so the two sets of regressors line up.
missing_exog = [col for col in exog_train.columns if col not in exog_forecast.columns]
if missing_exog:
    raise ValueError(f"No forecast available for exogenous variables: {missing_exog}")
exog_forecast = exog_forecast[exog_train.columns]

try:
    # Train the SARIMA model with exogenous variables
    sarima_model = SARIMAX(train_data['Palm_Oil'],
                           order=order,
                           seasonal_order=seasonal_order,
                           exog=exog_train)
    sarima_fit = sarima_model.fit(disp=False)

    # Forecast the next 12 months using the forecasted exogenous variables
    forecast = sarima_fit.forecast(steps=12, exog=exog_forecast)
except Exception as e:
    # Report the problem, then re-raise: swallowing the error would leave `forecast_df`
    # undefined (or stale from an earlier run) for every cell below.
    print(f"Error occurred during SARIMA model training or forecast: {e}")
    raise

# Index the forecast by the first day of each month following the training window
forecast_index = pd.date_range(start=train_data.index[-1] + pd.DateOffset(months=1), periods=12, freq='MS')
forecast_df = pd.DataFrame({'ForecastValue': forecast.values}, index=forecast_index)

forecast_df

### Compare Forecast with Actual Values

In [783]:
# Line up forecasts with the observed test values; the inner join keeps only dates present in both
actuals = test_data[['Palm_Oil']].rename(columns={'Palm_Oil': 'ActualValue'})
comparison_df = forecast_df.join(actuals, how='inner')

# Per-row error metrics, derived from a single signed error series (actual minus forecast)
forecast_error = comparison_df['ActualValue'] - comparison_df['ForecastValue']
comparison_df['AbsoluteError'] = forecast_error.abs()
comparison_df['APE'] = (forecast_error / comparison_df['ActualValue']).abs() * 100

# Aggregate error metrics over the compared rows
mse = np.mean(forecast_error ** 2)
rmse = np.sqrt(mse)

### Merge additional calculations

In [784]:
# Attach the area columns for each comparison date (left join on the date index)
comparison_df = pd.merge(comparison_df, areas, left_index=True, right_index=True, how='left')

# Area used for scaling: mature area plus immature area weighted by 0.75
weighted_area = comparison_df['Mature area'] + 0.75 * comparison_df['Immature Area']

# CPO (T/ha) values: forecast and actuals multiplied by the 'OER CPO' column
cpo_per_ha_forecast = comparison_df['ForecastValue'] * comparison_df['OER CPO']
cpo_per_ha_actuals = comparison_df['ActualValue'] * comparison_df['OER CPO']

comparison_df['TotalProductionForecast'] = cpo_per_ha_forecast * weighted_area
comparison_df['TotalProductionActuals'] = cpo_per_ha_actuals * weighted_area

# Only the total-production columns are kept; the merged helper columns are dropped again
comparison_df = comparison_df.drop(columns=['Area', 'Mature area', 'Immature Area', 'OER CPO'])

### Store Results

In [785]:
# Record one result row per compared month for this SARIMAX run.
# Note: rows are added to the existing `results` list, so re-running this cell without
# resetting `results` adds the same rows again.
training_start_date = train_data.index[0].strftime('%Y-%m-%d')
creation_date = datetime.now().strftime('%Y-%m-%d')

# Per-row columns, in the order they are unpacked below
row_columns = ['ForecastValue', 'ActualValue', 'AbsoluteError', 'APE',
               'TotalProductionForecast', 'TotalProductionActuals']

results.extend(
    {
        'Date': date.strftime('%Y-%m-%d'),
        'ForecastValue': forecast_value,
        'ActualValue': actual_value,
        'ModelName': 'SARIMAX',
        'Variables': ', '.join(exog_data.columns),
        'ModelParameters': f'order={order}, seasonal_order={seasonal_order}',
        'TrainingStartDate': training_start_date,
        'TrainingEndDate': train_end_date,
        'TestingStartDate': testing_start_date,
        'TestingEndDate': testing_end_date,
        'RMSE': rmse,
        'MAPE': ape/100,  # this row's absolute percentage error, stored as a fraction
        'MSE': mse,
        'AbsoluteError': absolute_error,
        'TotalProductionForecast': total_forecast,
        'TotalProductionActuals': total_actuals,
        'Creation_Date': creation_date,
        'Comment': '-'
    }
    for date, forecast_value, actual_value, absolute_error, ape, total_forecast, total_actuals in zip(
        comparison_df.index, *(comparison_df[column] for column in row_columns))
)

### Consolidate and Display Results

In [786]:
# Turn the collected result rows into a table
result_table = pd.DataFrame(results)

# Render every date column as DD.MM.YYYY text for display
date_columns = ['Date', 'Creation_Date', 'TrainingStartDate', 'TrainingEndDate',
                'TestingStartDate', 'TestingEndDate']
for date_column in date_columns:
    result_table[date_column] = pd.to_datetime(result_table[date_column]).dt.strftime('%d.%m.%Y')

# Fixed column order of the output table
column_order = [
    'Date', 'ForecastValue', 'ActualValue', 'ModelName', 'Variables',
    'ModelParameters', 'TrainingStartDate', 'TrainingEndDate', 'TestingStartDate',
    'TestingEndDate', 'RMSE', 'MAPE', 'MSE', 'AbsoluteError', 'TotalProductionForecast',
    'TotalProductionActuals', 'Creation_Date', 'Comment',
]
result_table = result_table[column_order]

# Display the result table
result_table

In [787]:
# Actual (solid green) against forecasted (dashed black) values over the compared months
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(comparison_df.index, comparison_df['ActualValue'], label='Actual Values', color='green')
ax.plot(comparison_df.index, comparison_df['ForecastValue'], label='Forecasted Values', color='black', linestyle='--')
ax.legend(loc='best')
ax.set_xlabel('Date')
ax.set_ylabel('Palm Oil Production')
ax.set_title('Actual vs Forecasted Palm Oil Production (SARIMA)')
ax.grid()
plt.show()

# Mean of the per-row 'MAPE' column of the result table
print(f"MAPE: {result_table['MAPE'].mean()}")

In [788]:
# Put this run's rows on top of the results accumulated by earlier runs in the same kernel
# session. On a fresh kernel `all_results_df` does not exist yet, so it is started from the
# current table instead of failing with a NameError.
if 'all_results_df' in globals():
    all_results_df = pd.concat([result_table, all_results_df], ignore_index=True)
else:
    all_results_df = result_table.copy()
all_results_df

In [None]:
# Write the accumulated results table to an Excel workbook in the working directory.
# With pandas' default settings the DataFrame index is written as the first column.
all_results_df.to_excel('SARIMAX_results.xlsx')