# Sales Forecasting - Distributed Training


In [None]:
%pip install -q kubeflow-training mlflow yamlmagic
%load_ext yamlmagic


## Configuration


In [None]:
%%yaml parameters

# =============================================================================
# Cluster Configuration
# =============================================================================
# namespace:  used for the trainer client and to build the MLflow service URL
# shared_pvc: PersistentVolumeClaim mounted into the training pods at /shared
# runtime:    name of the training runtime looked up before the job is submitted
namespace: feast-trainer-demo
shared_pvc: feast-pvc
runtime: torch-distributed

# =============================================================================
# Training Hyperparameters
# =============================================================================
# Read by train_sales_model. Note: if `epochs` is omitted the training function
# falls back to 50, not the 30 configured here.
epochs: 30
batch_size: 256
learning_rate: 0.001

# =============================================================================
# Model Architecture
# =============================================================================
# hidden_dims: one Linear + BatchNorm + ReLU + Dropout stage per entry
# dropout:     dropout probability applied after every hidden stage
model:
  hidden_dims: [256, 128, 64]
  dropout: 0.2

# =============================================================================
# Feature Columns (must match Feast features)
# =============================================================================
# Columns listed here that are not present in features.parquet are dropped from
# the list by the training function rather than causing an error.
features:
  - lag_1
  - lag_2
  - lag_4
  - lag_8
  - lag_52
  - rolling_mean_4w
  - store_size
  - temperature
  - fuel_price
  - cpi
  - unemployment

# =============================================================================
# Distributed Training (1 worker for quickstart, increase for production)
# =============================================================================
# num_workers is passed to the trainer as the number of nodes;
# resources_per_worker becomes the per-node resource request.
num_workers: 1
resources_per_worker:
  cpu: 4
  memory: 16Gi
  
# GPU Configuration: "none", "nvidia", or "amd"
# "nvidia" adds an nvidia.com/gpu request and "amd" an amd.com/gpu request of
# gpu_count devices; any other gpu_type (or gpu_count of 0) requests no GPU.
gpu_type: nvidia
gpu_count: 1

# =============================================================================
# MLflow Tracking
# =============================================================================
# experiment_name is used both when logging from the training job and when
# querying recent runs at the end of the notebook.
mlflow:
  experiment_name: sales-forecasting

# =============================================================================
# Data Paths (PVC mounted at /shared)
# =============================================================================
# data_dir:  training reads <data_dir>/features.parquet
# model_dir: rank 0 writes best_model.pt, scalers.joblib and model_metadata.json here
paths:
  data_dir: /shared/data
  model_dir: /shared/models

In [None]:
# Lift the cluster settings out of the YAML config into notebook-level constants.
NAMESPACE, SHARED_PVC, RUNTIME = (
    parameters[key] for key in ('namespace', 'shared_pvc', 'runtime')
)

# MLflow is addressed through its Kubernetes service DNS name inside NAMESPACE.
MLFLOW_URI = f"http://mlflow.{NAMESPACE}.svc.cluster.local:5000"


## Authentication


In [None]:
import os

# Credentials come from the environment only -- do not paste a token into the notebook.
# The defaults are empty strings (falsy) on purpose: the previous "<YOUR_TOKEN>" /
# "<YOUR_API_SERVER>" placeholders were truthy, so the token-auth branch below was
# always taken and the client was pointed at a placeholder host.
# When either value is unset, the client is created without an explicit configuration.
K8S_TOKEN = os.getenv("K8S_TOKEN", "").strip()
K8S_API_SERVER = os.getenv("K8S_API_SERVER", "").strip()


In [None]:
from kubernetes import client as k8s
from kubeflow.training import TrainerClient, CustomTrainer
from kubeflow.training.types import KubernetesBackendConfig
from kubeflow.training.types import (
    PodTemplateOverrides, PodTemplateOverride,
    PodSpecOverride, ContainerOverride,
    Labels, Annotations
)

# Token authentication is used only when BOTH the token and the API server are
# really set. Empty values and the "<YOUR_...>" template placeholders count as
# unset, so an unedited notebook does not try to connect to a placeholder host.
use_token_auth = all(
    value and not value.startswith("<YOUR_")
    for value in (K8S_TOKEN, K8S_API_SERVER)
)

cfg = k8s.Configuration()
if use_token_auth:
    cfg.host = K8S_API_SERVER
    # NOTE(review): TLS certificate verification is disabled here, which exposes the
    # bearer token to man-in-the-middle attacks. Prefer setting cfg.ssl_ca_cert to the
    # cluster CA bundle and leaving verify_ssl enabled outside of demo environments.
    cfg.verify_ssl = False
    cfg.api_key = {"authorization": f"Bearer {K8S_TOKEN}"}

# The same condition decides whether cfg is populated and whether it is passed on;
# previously a token without an API server handed over an unconfigured Configuration.
trainer_client = TrainerClient(
    KubernetesBackendConfig(
        namespace=NAMESPACE,
        client_configuration=cfg if use_token_auth else None
    )
)


## Training Runtime


In [None]:
# Look up the training runtime named by the `runtime` config key; the returned
# object is passed to `trainer_client.train(...)` when the job is submitted.
runtime = trainer_client.get_runtime(RUNTIME)

## Training Function


In [None]:
def train_sales_model(parameters):
    """
    Distributed training function for sales forecasting model.
    Supports CPU, NVIDIA GPU (CUDA), and AMD GPU (ROCm).

    The function is handed to ``CustomTrainer(func=...)``; all imports and helper
    definitions live inside its body so that it is self-contained (presumably the
    trainer ships only this function to the workers -- confirm before extracting
    anything to notebook level).

    Args:
        parameters: dict of settings. Every key is optional and has a default:
            ``paths.data_dir`` ('/shared/data'), ``paths.model_dir`` ('/shared/models'),
            ``epochs`` (50), ``batch_size`` (256), ``learning_rate`` (0.001),
            ``features`` ([]), ``model.hidden_dims`` ([256, 128, 64]),
            ``model.dropout`` (0.2), ``mlflow.experiment_name`` ('sales-forecasting').

    Environment variables read:
        LOCAL_RANK (GPU index for this process), MLFLOW_TRACKING_URI, RUN_NAME.

    Side effects:
        Rank 0 writes ``best_model.pt`` (state dict of the epoch with the lowest
        validation loss, stored as CPU tensors), ``scalers.joblib`` and
        ``model_metadata.json`` into ``model_dir`` and logs to MLflow on a
        best-effort basis (MLflow failures are printed, never raised).

    Raises:
        ValueError: if none of the requested feature columns exists in
            ``features.parquet``.
    """
    import os
    import json
    import torch
    import torch.nn as nn
    import torch.distributed as dist
    import pandas as pd
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    from pathlib import Path
    from datetime import datetime

    # =========================================================================
    # Device Detection (CPU / NVIDIA CUDA / AMD ROCm)
    # =========================================================================
    def detect_device():
        """Detect available compute device and return (device, backend, device_type)"""
        if torch.cuda.is_available():
            # ROCm builds of PyTorch also report through torch.cuda; torch.version.hip
            # is what tells the two apart.
            is_rocm = hasattr(torch.version, 'hip') and torch.version.hip is not None
            device_type = "rocm" if is_rocm else "cuda"
            backend = "nccl"  # nccl works for both CUDA and ROCm
            local_rank = int(os.getenv("LOCAL_RANK", 0))
            torch.cuda.set_device(local_rank)
            device = torch.device("cuda", local_rank)
            return device, backend, device_type
        else:
            return torch.device("cpu"), "gloo", "cpu"

    device, backend, device_type = detect_device()

    # =========================================================================
    # Distributed Setup
    # =========================================================================
    dist.init_process_group(backend=backend)
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    if rank == 0:
        gpu_info = ""
        if device_type != "cpu":
            # Report the device this process actually uses, not always GPU 0.
            gpu_name = torch.cuda.get_device_name(device)
            gpu_mem = torch.cuda.get_device_properties(device).total_memory / 1e9
            gpu_info = f" ({gpu_name}, {gpu_mem:.1f}GB)"
        print(f"Device: {device_type.upper()}{gpu_info} | Workers: {world_size}")

    # =========================================================================
    # Configuration
    # =========================================================================
    mlflow_uri = os.getenv("MLFLOW_TRACKING_URI", "http://mlflow:5000")
    data_dir = parameters.get('paths', {}).get('data_dir', '/shared/data')
    model_dir = parameters.get('paths', {}).get('model_dir', '/shared/models')
    epochs = parameters.get('epochs', 50)
    batch_size = parameters.get('batch_size', 256)
    lr = parameters.get('learning_rate', 0.001)
    feature_cols = parameters.get('features', [])
    hidden_dims = parameters.get('model', {}).get('hidden_dims', [256, 128, 64])
    dropout = parameters.get('model', {}).get('dropout', 0.2)

    # =========================================================================
    # Model Definition
    # =========================================================================
    class SalesMLP(nn.Module):
        """MLP regressor: (Linear -> BatchNorm -> ReLU -> Dropout) per hidden dim, then Linear(…, 1)."""

        def __init__(self, input_dim, hidden_dims, dropout):
            super().__init__()
            layers = []
            prev_dim = input_dim
            for h_dim in hidden_dims:
                layers.extend([
                    nn.Linear(prev_dim, h_dim),
                    nn.BatchNorm1d(h_dim),
                    nn.ReLU(),
                    nn.Dropout(dropout),
                ])
                prev_dim = h_dim
            layers.append(nn.Linear(prev_dim, 1))
            self.net = nn.Sequential(*layers)

        def forward(self, x):
            # Drop the trailing size-1 dimension so predictions line up with the 1-D targets.
            return self.net(x).squeeze(-1)

    # =========================================================================
    # Load Data
    # =========================================================================
    df = pd.read_parquet(f"{data_dir}/features.parquet")
    available_features = [c for c in feature_cols if c in df.columns]
    missing_features = [c for c in feature_cols if c not in df.columns]
    if not available_features:
        # Without this check the job would build a model with zero inputs.
        raise ValueError(
            f"None of the requested features {feature_cols} are present in "
            f"{data_dir}/features.parquet"
        )
    if missing_features and rank == 0:
        print(f"Warning: features not found in data and skipped: {missing_features}")

    df = df.dropna(subset=available_features + ["weekly_sales"])
    X = df[available_features].values
    y = df["weekly_sales"].values

    if rank == 0:
        print(f"Data: {len(df):,} samples, {len(available_features)} features")

    # =========================================================================
    # Preprocessing
    # =========================================================================
    # Split first, then fit the scalers on the training rows only; fitting on the
    # full data set would leak held-out statistics into training. The split is
    # index-based with a fixed seed, so the train/test membership is unchanged.
    X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_train = scaler_X.fit_transform(X_train_raw)
    X_test = scaler_X.transform(X_test_raw)
    y_train = scaler_y.fit_transform(y_train_raw.reshape(-1, 1)).flatten()
    y_test = scaler_y.transform(y_test_raw.reshape(-1, 1)).flatten()

    # =========================================================================
    # Model + DDP
    # =========================================================================
    model = SalesMLP(len(available_features), hidden_dims, dropout).to(device)
    if world_size > 1:
        model = nn.parallel.DistributedDataParallel(model, device_ids=[device.index] if device_type != "cpu" else None)

    train_dataset = torch.utils.data.TensorDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
    sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    # BatchNorm1d rejects a single-sample batch in training mode, so drop the
    # trailing batch only in the case where it would contain exactly one row.
    drop_last = len(sampler) > 1 and len(sampler) % batch_size == 1
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=sampler, drop_last=drop_last
    )

    X_test_tensor = torch.FloatTensor(X_test).to(device)
    y_test_tensor = torch.FloatTensor(y_test).to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    best_loss = float('inf')
    best_state = None

    def snapshot_state():
        """Return a detached CPU copy of the (unwrapped) model's state dict."""
        base_model = model.module if hasattr(model, 'module') else model
        return {k: v.detach().cpu().clone() for k, v in base_model.state_dict().items()}

    # =========================================================================
    # MLflow (rank 0)
    # =========================================================================
    # Tracks whether a run was started, so later logging never touches an
    # unimported `mlflow` name and failures can be reported instead of swallowed.
    mlflow_active = False
    if rank == 0:
        try:
            import mlflow
            os.environ["MLFLOW_TRACKING_URI"] = mlflow_uri
            exp_name = parameters.get('mlflow', {}).get('experiment_name', 'sales-forecasting')
            run_name = os.getenv("RUN_NAME", f"sales-{datetime.now().strftime('%m%d-%H%M')}")
            mlflow.set_experiment(exp_name)
            mlflow.start_run(run_name=run_name)
            mlflow_active = True
            mlflow.log_params({
                "epochs": epochs, "batch_size": batch_size, "learning_rate": lr,
                "world_size": world_size, "device_type": device_type,
                "hidden_dims": hidden_dims, "dropout": dropout
            })
        except Exception as e:
            print(f"MLflow: {e}")

    # =========================================================================
    # Training Loop
    # =========================================================================
    for epoch in range(epochs):
        # Reseed the sampler so each epoch uses a different shuffle.
        sampler.set_epoch(epoch)
        model.train()

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            loss = criterion(model(X_batch), y_batch)
            loss.backward()
            optimizer.step()

        scheduler.step()

        model.eval()
        with torch.no_grad():
            test_loss = criterion(model(X_test_tensor), y_test_tensor).item()

        if test_loss < best_loss:
            best_loss = test_loss
            if rank == 0:
                # state_dict() tensors alias the live parameters, so a shallow
                # dict.copy() would end up holding the final weights, not the best.
                best_state = snapshot_state()

        if rank == 0 and (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{epochs} | Loss: {test_loss:.6f} | Best: {best_loss:.6f}")
            if mlflow_active:
                try:
                    mlflow.log_metrics({"val_loss": test_loss, "best_loss": best_loss}, step=epoch)
                except Exception as e:
                    print(f"MLflow: {e}")

    # =========================================================================
    # Save Model (rank 0)
    # =========================================================================
    dist.barrier()

    if rank == 0:
        import joblib
        Path(model_dir).mkdir(parents=True, exist_ok=True)

        if best_state is None:
            # No epoch improved on +inf (zero epochs or a non-finite loss):
            # save the current weights rather than pickling None.
            best_state = snapshot_state()

        torch.save(best_state, f"{model_dir}/best_model.pt")
        joblib.dump({"scaler_X": scaler_X, "scaler_y": scaler_y, "feature_cols": available_features}, f"{model_dir}/scalers.joblib")

        with open(f"{model_dir}/model_metadata.json", "w") as f:
            json.dump({
                "input_dim": len(available_features),
                "hidden_dims": hidden_dims,
                "feature_columns": available_features,
                "best_loss": best_loss,
                "device_type": device_type
            }, f, indent=2)

        if mlflow_active:
            try:
                mlflow.log_metric("best_loss", best_loss)
                mlflow.log_artifacts(model_dir)
                mlflow.end_run()
            except Exception as e:
                print(f"MLflow: {e}")

        print(f"Done! Best loss: {best_loss:.6f} | Model: {model_dir}")

    # Hold the other ranks until rank 0 has finished writing, then shut down.
    dist.barrier()
    dist.destroy_process_group()

## Submit Training Job


In [None]:
from datetime import datetime

# Minute-resolution timestamp (MMDD-HHMM) that tags this submission.
job_id = f"{datetime.now():%m%d-%H%M}"
job_name = "sales-training-" + job_id


In [None]:
# Per-worker resource request: CPU and memory always, plus a GPU entry when configured
worker_spec = parameters['resources_per_worker']
resources = {"cpu": worker_spec['cpu'], "memory": worker_spec['memory']}

gpu_type = parameters.get('gpu_type', 'none')
gpu_count = parameters.get('gpu_count', 0)

# GPU vendor -> Kubernetes resource name; any other gpu_type requests no GPU
gpu_resource_names = {'nvidia': "nvidia.com/gpu", 'amd': "amd.com/gpu"}
if gpu_type in gpu_resource_names and gpu_count > 0:
    resources[gpu_resource_names[gpu_type]] = gpu_count

# Training function plus the packages and environment it needs on each node
trainer = CustomTrainer(
    func=train_sales_model,
    num_nodes=parameters['num_workers'],
    resources_per_node=resources,
    packages_to_install=["scikit-learn", "pandas", "pyarrow", "joblib", "mlflow", "matplotlib"],
    env={"MLFLOW_TRACKING_URI": MLFLOW_URI, "RUN_NAME": f"sales-forecast-{job_id}"},
)

# Mount the shared PVC at /shared in the "node" container of the "node" job
shared_volume = {"name": "shared", "persistentVolumeClaim": {"claimName": SHARED_PVC}}
shared_mount = {"name": "shared", "mountPath": "/shared"}
pod_overrides = PodTemplateOverrides(
    PodTemplateOverride(
        target_jobs=["node"],
        spec=PodSpecOverride(
            volumes=[shared_volume],
            containers=[ContainerOverride(name="node", volume_mounts=[shared_mount])],
        ),
    )
)

# Submit job
job = trainer_client.train(
    trainer=trainer,
    runtime=runtime,
    parameters=parameters,
    options=[
        Labels({"app": "sales-forecasting", "job-type": "training", "run-id": job_id}),
        Annotations({"description": f"Sales forecasting - {job_id}"}),
        pod_overrides,
    ],
)

## Monitor Progress


In [None]:
# Block until the submitted job reports the "Running" status
# (timeout=300 -- presumably seconds; confirm against the client documentation).
trainer_client.wait_for_job_status(name=job, status={"Running"}, timeout=300)


In [None]:
# Follow the job's logs; the call's return value is assigned to `_` and not used.
_ = trainer_client.get_job_logs(name=job, follow=True)


In [None]:
# Wait for a terminal status. Both "Complete" and "Failed" end the wait, so
# check which one was reached before relying on the saved model or MLflow run.
trainer_client.wait_for_job_status(name=job, status={"Complete", "Failed"}, timeout=3600)


## MLflow


In [None]:
import mlflow

mlflow.set_tracking_uri(MLFLOW_URI)

experiment_name = parameters['mlflow']['experiment_name']
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    # get_experiment_by_name returns None for an unknown experiment, e.g. when the
    # training job could not reach MLflow; fail with a message that says so.
    raise RuntimeError(
        f"MLflow experiment '{experiment_name}' not found at {MLFLOW_URI}. "
        "Check the training job logs for MLflow errors."
    )

# Five most recent runs of the experiment, newest first.
runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    max_results=5,
    order_by=["start_time DESC"]
)
# reindex (rather than runs[[...]]) keeps the cell working when no run has logged
# best_loss yet: the missing column is shown as NaN instead of raising KeyError.
runs.reindex(columns=["tags.mlflow.runName", "metrics.best_loss"])


## Cleanup


In [None]:
# Uncomment to delete the training job once you have finished inspecting it:
# trainer_client.delete_job(name=job)
