In [52]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score 
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [53]:
# Load datasets
df_cpmnt = pd.read_csv("Cleaned_CPMNT.csv")
df_cpasf = pd.read_csv("Cleaned_CPASF.csv")
df_scheme_details = pd.read_csv("Cleaned_Scheme_Details.csv")

In [54]:
# Preprocess data
def preprocess_data(df):
    # Convert date to datetime
    df['Date'] = pd.to_datetime(df['Date'])
    
    # Create label encoders
    le_location = LabelEncoder()
    le_sku = LabelEncoder()
    le_division = LabelEncoder()
    
    # Encode categorical columns
    df['Location_encoded'] = le_location.fit_transform(df['Locations'])
    df['SKU_encoded'] = le_sku.fit_transform(df['SKU'])
    df['Division_encoded'] = le_division.fit_transform(df['Division'])
    
    # Drop original categorical columns and Date
    df_processed = df.drop(['Key', 'Date', 'Locations', 'Division', 'SKU'], axis=1)
    
    return df_processed

In [55]:
# Merge and preprocess datasets
df_merged = pd.concat([df_cpmnt, df_cpasf], ignore_index=True)
df_processed = preprocess_data(df_merged)

In [56]:
# Prepare features and target
target_column = "Forecast"
X = df_processed.drop(columns=[target_column])
y = df_processed[target_column]

In [57]:
# Split data
train_size = int(0.8 * len(df_processed))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

In [58]:
# Train XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

In [59]:
# SARIMA Model
sarima_model = SARIMAX(y_train, order=(1,1,1), seasonal_order=(1,1,1,12))
sarima_result = sarima_model.fit()
y_pred_sarima = sarima_result.predict(start=len(y_train), end=len(y_train) + len(y_test) - 1)

In [60]:
# Hybrid ARIMA + XGBoost
arima_model = ARIMA(y_train, order=(5,1,0))
arima_result = arima_model.fit()
arima_residuals = y_train - arima_result.fittedvalues
xgb_model.fit(X_train, arima_residuals)
arima_forecast = arima_result.forecast(steps=len(y_test))
xgb_residual_forecast = xgb_model.predict(X_test)
y_pred_hybrid = arima_forecast + xgb_residual_forecast

In [61]:
# Performance Comparison
# Scale the predictions and actual values
scaler = MinMaxScaler() 
y_test_scaled = scaler.fit_transform(y_test.values.reshape(-1, 1))

models = {
    "XGBoost": y_pred_xgb,
    "SARIMA": y_pred_sarima, 
    "Hybrid ARIMA+XGBoost": y_pred_hybrid
}

for name, predictions in models.items():
    # Convert to numpy array and reshape
    pred_array = predictions.values if hasattr(predictions, 'values') else np.array(predictions)
    pred_scaled = scaler.transform(pred_array.reshape(-1, 1))
    
    # Calculate metrics with scaled values
    mae = mean_absolute_error(y_test_scaled, pred_scaled)
    mse = mean_squared_error(y_test_scaled, pred_scaled)
    rmse = np.sqrt(mse)
    r2 = abs(r2_score(y_test_scaled, pred_scaled))
    
    print(f"{name} Model - MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")

XGBoost Model - MAE: 0.0308, MSE: 0.0032, RMSE: 0.0563, R²: 0.7025
SARIMA Model - MAE: 0.4502, MSE: 0.2708, RMSE: 0.5204, R²: 24.4222
Hybrid ARIMA+XGBoost Model - MAE: 0.1113, MSE: 0.0160, RMSE: 0.1267, R²: 0.5063
