In [None]:
## 📚 1. Setup and Data Loading (Re-defining X and y)
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, cross_val_score 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


# --- Data Loading ---
# The path is relative to the directory the notebook is run from.
file_path = '../../datasets/Supplement_Sales_Weekly_Expanded.csv'
try:
    data = pd.read_csv(file_path)
except FileNotFoundError as exc:
    # Only a missing file is translated into the friendlier hint below.
    # Any other failure (malformed CSV, permissions, encoding, Ctrl-C)
    # propagates unchanged so it is not misreported as a path problem.
    # Chaining with `from exc` keeps the original error in the traceback.
    raise FileNotFoundError("Please check the path or ensure the file is accessible.") from exc


# --- Feature Engineering (Condensed from previous steps) ---
# Parse the date, derive calendar parts from it, and discard the columns
# that are not used below (missing ones are silently skipped).
data = data.assign(
    Date=lambda frame: pd.to_datetime(frame['Date']),
    Year=lambda frame: frame['Date'].dt.year,
    Month=lambda frame: frame['Date'].dt.month,
).drop(columns=['Category', 'Revenue', 'Location'], errors='ignore')

# One row per (product, year, month) holding the mean price, in
# chronological order within each product.
group_keys = ['Product_Name', 'Year', 'Month']
product_data_grouped = (
    data.groupby(group_keys)['Price']
    .mean()
    .rename('Price_Avg')
    .reset_index()
    .sort_values(by=group_keys)
    .reset_index(drop=True)
)

# Work with a single product: the first one appearing in the sorted table.
PRODUCT_ID = product_data_grouped['Product_Name'].iloc[0]
is_selected = product_data_grouped['Product_Name'] == PRODUCT_ID
product_data = product_data_grouped.loc[is_selected].copy()

# Trend features: a 1-based running counter and its square.
product_data['Time_Index'] = np.arange(1, len(product_data) + 1)
product_data['Time_Index_Squared'] = product_data['Time_Index'] ** 2

# Cyclical encoding of the month (period of 12).
month_angle = 2 * np.pi * product_data['Month'] / 12
product_data['Month_sin'] = np.sin(month_angle)
product_data['Month_cos'] = np.cos(month_angle)

# Lagged prices and trailing moving averages. The moving averages are
# shifted by one row so each row only uses prices from earlier rows.
price = product_data['Price_Avg']
for lag in (1, 3, 12):
    product_data[f'Price_Lag_{lag}'] = price.shift(lag)
for window in (6, 12):
    product_data[f'Price_MA_{window}'] = price.rolling(window=window).mean().shift(1)

# The lag/rolling features are undefined for the earliest rows; drop them.
product_data = product_data.dropna().reset_index(drop=True)

FEATURES = [
    'Year',
    'Month',
    'Month_sin',
    'Month_cos',
    'Time_Index',
    'Time_Index_Squared',
    'Price_Lag_1',
    'Price_Lag_3',
    'Price_Lag_12',
    'Price_MA_6',
    'Price_MA_12',
]
TARGET = 'Price_Avg'

X = product_data.loc[:, FEATURES]
y = product_data[TARGET]
print("X and y successfully defined. X shape:", X.shape)

X and y successfully defined. X shape: (51, 11)


## ⚠️ 2. Why Standard K-Fold Fails for Time Series

In Notebook 01, we used basic K-Fold to introduce the concept of averaging scores. However, K-Fold works by **randomly or sequentially splitting** the data, which means it:

1.  **Breaks Chronological Order:** It puts future data into the training set and tests on past data.
2.  **Allows Data Leakage:** The model "sees" data from the future to predict the past, leading to artificially low (overly optimistic) error scores.

### 📚 Analogy: Looking into the Future

Imagine you are studying for a stock price exam (training). If your practice test (validation) includes stock prices from *after* the period you are studying, you are essentially **cheating** by looking into the future!

The true test of a forecasting model is always: **Train on the PAST, Predict the FUTURE.**

## ➡️ 3. TimeSeriesSplit: The Expanding Window

The correct technique for time-series CV is the **Expanding Window**.

* **The Rule:** The training data set must **always** be chronologically earlier than the testing data set.
* **The Process:** In each fold, the training window grows (expands) to include more historical data, and the test window always moves forward.

### 3.1. Visualizing the Splits

We will use `TimeSeriesSplit` with `n_splits=5`.

```python
from sklearn.model_selection import TimeSeriesSplit

n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

# Let's visualize the size of the splits (using the index)
print(f"Total samples available: {len(X)}")
print("\n--- Split Visualization ---")

for fold, (train_index, test_index) in enumerate(tscv.split(X), 1):
    print(f"Fold {fold}:")
    print(f"  Training Window Size: {len(train_index)} (Index {train_index[0]} to {train_index[-1]})")
    print(f"  Testing Window Size:  {len(test_index)} (Index {test_index[0]} to {test_index[-1]})")
    print("-" * 30)

# Note how the training window gets progressively larger with each fold.
```

In [4]:
## ➡️ 3. TimeSeriesSplit: The Expanding Window (Visualization)

# TimeSeriesSplit is already imported in the setup cell at the top of the
# notebook, so it is not re-imported here.
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

# Show the size and index range of the training and testing window per fold.
print(f"Total samples available: {len(X)}")
print("\n--- Split Visualization ---")

for fold, (train_index, test_index) in enumerate(tscv.split(X), 1):
    # Sanity check of the rule this section teaches: every training row
    # must come before every test row.
    assert train_index[-1] < test_index[0], "training window must end before the test window starts"
    print(f"Fold {fold}:")
    print(f"  Training Window Size: {len(train_index)} (Index {train_index[0]} to {train_index[-1]})")
    print(f"  Testing Window Size:  {len(test_index)} (Index {test_index[0]} to {test_index[-1]})")
    print("-" * 30)

Total samples available: 51

--- Split Visualization ---
Fold 1:
  Training Window Size: 11 (Index 0 to 10)
  Testing Window Size:  8 (Index 11 to 18)
------------------------------
Fold 2:
  Training Window Size: 19 (Index 0 to 18)
  Testing Window Size:  8 (Index 19 to 26)
------------------------------
Fold 3:
  Training Window Size: 27 (Index 0 to 26)
  Testing Window Size:  8 (Index 27 to 34)
------------------------------
Fold 4:
  Training Window Size: 35 (Index 0 to 34)
  Testing Window Size:  8 (Index 35 to 42)
------------------------------
Fold 5:
  Training Window Size: 43 (Index 0 to 42)
  Testing Window Size:  8 (Index 43 to 50)
------------------------------


In [6]:
## 3.3. Running the TimeSeries CV with the Random Forest Model

# cross_val_score and RandomForestRegressor are already imported in the setup
# cell at the top of the notebook; X, y and tscv come from the cells above.

# A fixed random_state keeps the fold scores reproducible between runs.
model_ts = RandomForestRegressor(n_estimators=100, random_state=42)

# Passing the TimeSeriesSplit object as `cv` makes each fold train on the
# earlier rows and score on the rows that follow them.
# The scorer is 'neg_mean_absolute_error': scikit-learn scorers are
# "higher is better", so the MAE comes back negated.
cv_scores_ts = cross_val_score(
    model_ts,
    X,
    y,
    cv=tscv,
    scoring='neg_mean_absolute_error',
)

# Flip the sign to get ordinary (positive) MAE values, one per fold.
cv_maes_ts = -cv_scores_ts

print("Individual MAE scores for each TimeSeries Fold:")
print(cv_maes_ts)

print(f"\nFinal TimeSeries CV Score (Average MAE): ${cv_maes_ts.mean():.3f}")
print(f"Standard Deviation of MAE: {cv_maes_ts.std():.3f}")

Individual MAE scores for each TimeSeries Fold:
[4.04692687 6.60213875 4.53034812 5.62726125 4.61567625]

Final TimeSeries CV Score (Average MAE): $5.084
Standard Deviation of MAE: 0.917


## 🌟 4. Comparison and Conclusion

Let's compare the results from the two CV methods on our time-series data:

| CV Method | Average MAE (Example) | Stability (Std Dev) | Validity |
| :--- | :--- | :--- | :--- |
| **Standard K-Fold** (Notebook 01) | \$6.116 | 1.571 | **FLAWED** (Mixes past and future data) |
| **TimeSeriesSplit** (This Notebook) | \$5.084 | 0.917 | **CORRECT** (Respects time order) |

### Key Takeaway

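A time-ordered split such as **TimeSeriesSplit** is the **only valid and reliable** way to evaluate a model built for forecasting or any data that relies on a chronological sequence. If you see a low MAE from standard K-Fold on time series data, it is a likely indicator of **data leakage** (cheating)! Note that K-Fold does not always score lower: in the run above, its average MAE (6.116) is actually higher than the TimeSeriesSplit average (5.084). Either way, the K-Fold number cannot be trusted, because its folds do not respect time order.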
