# FULL_02: Train Final Models (Production Pipeline)

**Purpose:** Train XGBoost and LSTM on full Guayas dataset, compare performance at scale  
**Input:** `data/processed/full_featured_data.pkl` (4.8M rows, 33 features)  
**Output:** Production artifacts for best model

**Key Decisions Applied:**
- DEC-013: 7-day train/test gap
- DEC-014: 33 features
- DEC-016: Q4 2013 + Q1 2014 training (temporal consistency)

**Week 3 Baseline (300K sample):**
- XGBoost Tuned: RMSE 6.4860
- LSTM: RMSE 6.2552

**Metrics:** RMSE, MAE, MAPE (non-zero), Bias

**Environment:** WSL2 Ubuntu 22.04, Python 3.11, TensorFlow 2.20.0 (GPU)

In [2]:
### Section 1: Environment Setup
# Source: w03_d01_MODEL_baseline.ipynb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import time
import warnings
warnings.filterwarnings('ignore')

# ML libraries
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb

# Deep learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# MLflow
import mlflow
import mlflow.sklearn
import mlflow.keras

# Locate the project root: step up one level when launched from notebooks/
_cwd = Path.cwd()
PROJECT_ROOT = _cwd.parent if _cwd.name == 'notebooks' else _cwd

# Everything below hangs off the project root
DATA_PROCESSED = PROJECT_ROOT / 'data' / 'processed'
ARTIFACTS_DIR = PROJECT_ROOT / 'artifacts'
OUTPUTS_DIR = PROJECT_ROOT / 'outputs' / 'figures' / 'full_pipeline'

# Make sure every write target exists before anything is saved
for _write_dir in (ARTIFACTS_DIR, OUTPUTS_DIR):
    _write_dir.mkdir(parents=True, exist_ok=True)

# Report library versions and GPU visibility
print("Environment Setup:")
print(f"  TensorFlow version: {tf.__version__}")
_gpu_devices = tf.config.list_physical_devices('GPU')
print(f"  GPU available: {_gpu_devices}")
print(f"  XGBoost version: {xgb.__version__}")
print(f"  MLflow version: {mlflow.__version__}")

# Echo the resolved locations so a wrong working directory is obvious
print("\nPaths:")
for _label, _location in (
    ("Data", DATA_PROCESSED),
    ("Artifacts", ARTIFACTS_DIR),
    ("Outputs", OUTPUTS_DIR),
):
    print(f"  {_label}: {_location}")

2025-11-24 13:33:40.345880: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Environment Setup:
  TensorFlow version: 2.20.0
  GPU available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
  XGBoost version: 3.1.2
  MLflow version: 3.6.0

Paths:
  Data: /home/berto/Demand-forecasting-in-retail/data/processed
  Artifacts: /home/berto/Demand-forecasting-in-retail/artifacts
  Outputs: /home/berto/Demand-forecasting-in-retail/outputs/figures/full_pipeline


In [4]:
### Section 2: Load Data
# Source: FULL_01 output

print("Loading full featured data...")
start_time = time.time()

# NOTE(review): unpickling executes arbitrary code. Only load files written by
# our own FULL_01 pipeline; never point this at an untrusted download.
df = pd.read_pickle(DATA_PROCESSED / 'full_featured_data.pkl')

load_time = time.time() - start_time

print(f"\nDataset loaded:")
print(f"  Shape: {df.shape}")
print(f"  Memory: {df.memory_usage(deep=True).sum() / 1e6:.1f} MB")
print(f"  Load time: {load_time:.1f} seconds")
print(f"  Date range: {df['date'].min().date()} to {df['date'].max().date()}")

# Define 33 features per DEC-014
FEATURE_COLUMNS = [
    # Temporal (8)
    'unit_sales_lag1', 'unit_sales_lag7', 'unit_sales_lag14', 'unit_sales_lag30',
    'unit_sales_7d_avg', 'unit_sales_14d_avg', 'unit_sales_30d_avg',
    'unit_sales_lag1_7d_corr',
    
    # Calendar (7)
    'year', 'month', 'day', 'dayofweek', 'dayofyear', 'weekofyear', 'quarter',
    
    # Holiday (4)
    'holiday_proximity', 'is_holiday', 'holiday_period', 'days_to_next_holiday',
    
    # Promotion (2)
    'onpromotion', 'promo_item_interaction',
    
    # Store/Item (7)
    'cluster', 'store_avg_sales', 'item_avg_sales', 'item_store_avg',
    'cluster_avg_sales', 'family_avg_sales', 'city_avg_sales',
    
    # Derived (5)
    'perishable', 'weekend', 'month_start', 'month_end', 'is_payday'
]

TARGET = 'unit_sales'

# Fail fast on schema drift: a mistyped or missing column would otherwise only
# surface later as an opaque KeyError when the feature matrix is extracted.
EXPECTED_FEATURE_COUNT = 33  # DEC-014
duplicate_features = sorted(
    {name for name in FEATURE_COLUMNS if FEATURE_COLUMNS.count(name) > 1}
)
if len(FEATURE_COLUMNS) != EXPECTED_FEATURE_COUNT or duplicate_features:
    raise ValueError(
        f"FEATURE_COLUMNS must list {EXPECTED_FEATURE_COUNT} unique features "
        f"(DEC-014); got {len(FEATURE_COLUMNS)} entries, duplicates: {duplicate_features}"
    )

missing_columns = [name for name in [*FEATURE_COLUMNS, TARGET] if name not in df.columns]
if missing_columns:
    raise KeyError(f"Columns missing from full_featured_data.pkl: {missing_columns}")

print(f"\nFeature configuration:")
print(f"  Features: {len(FEATURE_COLUMNS)}")
print(f"  Target: {TARGET}")

Loading full featured data...

Dataset loaded:
  Shape: (4801160, 42)
  Memory: 2578.5 MB
  Load time: 2.7 seconds
  Date range: 2013-10-01 to 2014-03-31

Feature configuration:
  Features: 33
  Target: unit_sales


In [5]:
### Section 3: Train/Test Split
# DEC-016: Q4 2013 + Q1 2014 training (temporal consistency)
# DEC-013: 7-day gap between train end and test start

# Define date boundaries
TRAIN_START = '2013-10-01'
TRAIN_END = '2014-02-21'
GAP_START = '2014-02-22'
GAP_END = '2014-02-28'
TEST_START = '2014-03-01'
TEST_END = '2014-03-31'
GAP_DAYS = 7  # DEC-013: required separation between train end and test start

# Derive the gap length from the boundaries instead of trusting a hard-coded
# label, and check the three windows are contiguous. Editing one date without
# the others would otherwise silently break DEC-013 or leak/drop days.
_one_day = pd.Timedelta(days=1)
gap_days = (pd.Timestamp(GAP_END) - pd.Timestamp(GAP_START)).days + 1
if (
    pd.Timestamp(GAP_START) - pd.Timestamp(TRAIN_END) != _one_day
    or pd.Timestamp(TEST_START) - pd.Timestamp(GAP_END) != _one_day
    or gap_days != GAP_DAYS
):
    raise ValueError(
        f"Split boundaries violate DEC-013: expected a contiguous {GAP_DAYS}-day gap, "
        f"got train end {TRAIN_END}, gap {GAP_START}..{GAP_END}, test start {TEST_START}"
    )

print("Data Split (DEC-016 + DEC-013):")
print("-" * 50)
print(f"  Training: {TRAIN_START} to {TRAIN_END}")
print(f"  Gap:      {GAP_START} to {GAP_END} ({gap_days} days)")
print(f"  Test:     {TEST_START} to {TEST_END}")

# Apply splits (boundaries are inclusive on both ends)
train_mask = (df['date'] >= TRAIN_START) & (df['date'] <= TRAIN_END)
test_mask = (df['date'] >= TEST_START) & (df['date'] <= TEST_END)
gap_mask = (df['date'] >= GAP_START) & (df['date'] <= GAP_END)

df_train = df[train_mask].copy()
df_test = df[test_mask].copy()

# An empty split means the date windows do not match the loaded data; stop
# here rather than fail later inside model training.
if df_train.empty or df_test.empty:
    raise ValueError(
        f"Empty split: {len(df_train):,} training rows, {len(df_test):,} test rows"
    )

print(f"\nSplit results:")
print(f"  Training rows: {len(df_train):,}")
print(f"  Test rows: {len(df_test):,}")
# Counting True values in the mask avoids copying the gap rows just to call len()
print(f"  Gap rows excluded: {int(gap_mask.sum()):,}")

print(f"\nTraining period:")
print(f"  Days: {df_train['date'].nunique()}")
print(f"  Stores: {df_train['store_nbr'].nunique()}")
print(f"  Items: {df_train['item_nbr'].nunique()}")

print(f"\nTest period:")
print(f"  Days: {df_test['date'].nunique()}")
print(f"  Stores: {df_test['store_nbr'].nunique()}")
print(f"  Items: {df_test['item_nbr'].nunique()}")

Data Split (DEC-016 + DEC-013):
--------------------------------------------------
  Training: 2013-10-01 to 2014-02-21
  Gap:      2014-02-22 to 2014-02-28 (7 days)
  Test:     2014-03-01 to 2014-03-31

Split results:
  Training rows: 3,798,720
  Test rows: 817,780
  Gap rows excluded: 184,660

Training period:
  Days: 144
  Stores: 10
  Items: 2638

Test period:
  Days: 31
  Stores: 10
  Items: 2638


The data split is working correctly:

- **Training:** 3.8M rows (144 days)
- **Test:** 818K rows (31 days)
- **Gap:** 185K rows excluded (7 days)

This aligns with expectations from the handoff document.

In [10]:
### Section 4: Prepare Features and Target
# Source: w03_d04_MODEL_lstm.ipynb

print("Preparing features and target...")

# Show the dtype mix up front (weekofyear arrives as nullable UInt32)
print("\nFeature data types:")
print(df_train[FEATURE_COLUMNS].dtypes.value_counts())

# Nullable pandas integer dtypes are named 'UInt*' / 'Int*'; cast those columns
# to plain float64 in both frames so the numpy extraction below stays numeric.
_nullable_int_prefixes = ('UInt', 'Int')
for feature in FEATURE_COLUMNS:
    source_dtype = df_train[feature].dtype
    if not source_dtype.name.startswith(_nullable_int_prefixes):
        continue
    print(f"  Converting {feature} from {source_dtype} to float64")
    df_train[feature] = df_train[feature].astype('float64')
    df_test[feature] = df_test[feature].astype('float64')


def _as_float32(frame, columns):
    """Return the selected column(s) of `frame` as a float32 numpy array."""
    return frame[columns].values.astype(np.float32)


# Feature matrices and target vectors for both splits
X_train = _as_float32(df_train, FEATURE_COLUMNS)
y_train = _as_float32(df_train, TARGET)
X_test = _as_float32(df_test, FEATURE_COLUMNS)
y_test = _as_float32(df_test, TARGET)

print("\nShapes:")
for _label, _arr in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"  {_label}: {_arr.shape}, dtype: {_arr.dtype}")

# Count NaN cells left in each feature matrix
train_nan = np.isnan(X_train).sum()
test_nan = np.isnan(X_test).sum()
print("\nNaN check:")
print(f"  X_train NaN: {train_nan}")
print(f"  X_test NaN: {test_nan}")

# Location and spread of the target in each split
print("\nTarget statistics:")
for _label, _arr in (("Train", y_train), ("Test ", y_test)):
    print(
        f"  {_label} - mean: {_arr.mean():.2f}, "
        f"std: {_arr.std():.2f}, median: {np.median(_arr):.2f}"
    )

# Rows with positive sales (the subset MAPE is computed on later)
train_nonzero = (y_train > 0).sum()
test_nonzero = (y_test > 0).sum()
print("\nNon-zero sales:")
for _label, _count, _arr in (
    ("Train", train_nonzero, y_train),
    ("Test", test_nonzero, y_test),
):
    print(f"  {_label}: {_count:,} ({_count/len(_arr)*100:.1f}%)")

Preparing features and target...

Feature data types:
float64    15
int64      11
int32       6
UInt32      1
Name: count, dtype: int64
  Converting weekofyear from UInt32 to float64
float64    15
int64      11
int32       6
UInt32      1
Name: count, dtype: int64
  Converting weekofyear from UInt32 to float64

Shapes:
  X_train: (3798720, 33), dtype: float32
  y_train: (3798720,), dtype: float32
  X_test: (817780, 33), dtype: float32
  y_test: (817780,), dtype: float32

NaN check:
  X_train NaN: 0
  X_test NaN: 0

Target statistics:

Shapes:
  X_train: (3798720, 33), dtype: float32
  y_train: (3798720,), dtype: float32
  X_test: (817780, 33), dtype: float32
  y_test: (817780,), dtype: float32

NaN check:
  X_train NaN: 0
  X_test NaN: 0

Target statistics:
  Train - mean: 2.75, std: 11.62, median: 0.00
  Test  - mean: 3.74, std: 12.18, median: 0.00

Non-zero sales:
  Train: 1,388,533 (36.6%)
  Test: 375,806 (46.0%)
  Train - mean: 2.75, std: 11.62, median: 0.00
  Test  - mean: 3.74, s

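- 3.8M training rows and 818K test rows.
- No NaN values in either feature matrix.
- 36.6% of training rows and 46.0% of test rows have non-zero sales; the test period also has a higher mean (3.74 vs 2.75).
- Features and target are stored as float32 to reduce memory use.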

In [11]:
### Section 5: Define Evaluation Metrics
# RMSE, MAE, MAPE (non-zero), Bias

def calculate_metrics(y_true, y_pred):
    """Calculate RMSE, MAE, bias and MAPE (positive targets only).

    Both inputs are flattened to 1-D float64 before any arithmetic. Without
    this, a column-vector prediction of shape (n, 1) — the shape a
    single-output Keras model returns — subtracted from an (n,) target
    broadcasts to an (n, n) matrix, producing wrong bias/MAPE values and a
    huge temporary allocation.

    Args:
        y_true: Observed values; any array-like (list, ndarray, Series).
        y_pred: Predicted values; array-like with the same number of elements.

    Returns:
        dict with Python floats under the keys:
            'rmse'         - root mean squared error
            'mae'          - mean absolute error
            'bias'         - mean of (prediction - actual); positive means
                             over-forecasting
            'mape_nonzero' - mean absolute percentage error in percent, over
                             rows where y_true > 0; NaN when no such row exists

    Raises:
        ValueError: If the inputs are empty, differ in length, or contain
            NaN/inf values.
    """
    actual = np.asarray(y_true, dtype=np.float64).ravel()
    predicted = np.asarray(y_pred, dtype=np.float64).ravel()

    if actual.size == 0:
        raise ValueError("calculate_metrics requires at least one observation")
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements; "
            f"got {actual.size} and {predicted.size}"
        )
    # Reject NaN/inf explicitly so a broken model cannot report NaN metrics silently
    if not (np.isfinite(actual).all() and np.isfinite(predicted).all()):
        raise ValueError("y_true and y_pred must not contain NaN or infinite values")

    # All four metrics come from one residual vector, so they always describe
    # the same aligned data.
    residual = predicted - actual
    rmse = float(np.sqrt(np.mean(residual ** 2)))
    mae = float(np.mean(np.abs(residual)))
    bias = float(np.mean(residual))

    # MAPE on rows with positive sales only (avoids division by zero)
    positive = actual > 0
    if positive.any():
        mape = float(np.mean(np.abs(residual[positive] / actual[positive])) * 100)
    else:
        mape = float('nan')

    return {
        'rmse': rmse,
        'mae': mae,
        'bias': bias,
        'mape_nonzero': mape
    }

def print_metrics(metrics, model_name):
    """Write a short, aligned performance report for one model to stdout.

    Args:
        metrics: Mapping with the keys 'rmse', 'mae', 'bias' and
            'mape_nonzero' (the dict returned by calculate_metrics).
        model_name: Label used in the report heading.
    """
    print(f"\n{model_name} Performance:")
    print("-" * 40)

    # (label incl. padding, metrics key, format spec, suffix)
    report_rows = (
        ("  RMSE:          ", 'rmse', ".4f", ""),
        ("  MAE:           ", 'mae', ".4f", ""),
        ("  Bias:          ", 'bias', ".4f", ""),
        ("  MAPE (non-zero): ", 'mape_nonzero', ".2f", "%"),
    )
    for label, key, spec, suffix in report_rows:
        print(f"{label}{format(metrics[key], spec)}{suffix}")

print("Metrics function defined: RMSE, MAE, Bias, MAPE (non-zero)")

Metrics function defined: RMSE, MAE, Bias, MAPE (non-zero)


In [12]:
### Section 6: MLflow Setup
# Source: w03_d02_MODEL_mlflow-features.ipynb

# Set MLflow tracking
MLFLOW_DIR = PROJECT_ROOT / 'mlflow_results'
MLFLOW_DIR.mkdir(parents=True, exist_ok=True)

# Build the file:// URI with pathlib rather than string concatenation:
# as_uri() percent-encodes spaces and reserved characters and emits the
# correct number of slashes, so the URI stays valid for any project location.
# as_uri() requires an absolute path, hence absolute().
MLFLOW_TRACKING_URI = MLFLOW_DIR.absolute().as_uri()

# Single source of truth for the names that are both configured and reported
MLFLOW_EXPERIMENT = "full_pipeline_model_comparison"
MLFLOW_RUN_NAMES = ["xgboost_full_q4q1", "lstm_full_q4q1"]

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT)

print("MLflow Configuration:")
print(f"  Tracking URI: {MLFLOW_TRACKING_URI}")
print(f"  Experiment: {MLFLOW_EXPERIMENT}")
print(f"\nRuns to log:")
for _position, _run_name in enumerate(MLFLOW_RUN_NAMES, start=1):
    print(f"  {_position}. {_run_name}")

2025/11/24 13:53:08 INFO mlflow.tracking.fluent: Experiment with name 'full_pipeline_model_comparison' does not exist. Creating a new experiment.


MLflow Configuration:
  Tracking URI: file:///home/berto/Demand-forecasting-in-retail/mlflow_results
  Experiment: full_pipeline_model_comparison

Runs to log:
  1. xgboost_full_q4q1
  2. lstm_full_q4q1
