# Model Training with Kubeflow and MLflow

This notebook demonstrates how to:
1. **Fetch features** from Feast feature store
2. **Train locally** for quick validation
3. **Submit Kubeflow TrainJob** for distributed training
4. **Track experiments** with MLflow
5. **Register model** in MLflow Model Registry

## Prerequisites
- Completed `02-feast-features.ipynb` (features registered in Feast)
- MLflow server running
- Kubeflow Training Operator installed


---
## 1. Setup and Configuration


In [None]:
import os
import warnings

# Suppress all Python warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Configuration - aligned with example manifests.
# Each setting can be overridden through an environment variable of the same name.
NAMESPACE = os.environ.get("NAMESPACE", "feast-mlops-demo")
# Honour a tracking URI that is already configured (for example a port-forwarded
# MLflow server); otherwise fall back to the in-cluster service address.
MLFLOW_TRACKING_URI = (
    os.environ.get("MLFLOW_TRACKING_URI")
    or f"http://mlflow.{NAMESPACE}.svc.cluster.local:5000"
)
SHARED_DIR = os.environ.get("SHARED_DIR", "/shared")

# Set MLflow tracking: export the resolved URI for code that reads the variable.
os.environ["MLFLOW_TRACKING_URI"] = MLFLOW_TRACKING_URI

print(f"""
Configuration:
  Namespace: {NAMESPACE}
  MLflow: {MLFLOW_TRACKING_URI}
  Shared Storage: {SHARED_DIR}
""")


---
## 2. Fetch Features from Feast


In [None]:
from pathlib import Path
from datetime import datetime, timedelta, timezone
import pandas as pd
import numpy as np
from feast import FeatureStore

# Open the Feast feature repository that lives on shared storage
REPO_DIR = Path(SHARED_DIR).joinpath("feature_repo")
fs = FeatureStore(repo_path=str(REPO_DIR))

# Quick sanity output: which project was loaded and which feature views it exposes
print(f"Feast project: {fs.project}")
feature_view_names = [view.name for view in fs.list_feature_views()]
print(f"Feature views: {feature_view_names}")


In [None]:
# Create training entity dataframe
num_samples = 10000
np.random.seed(42)

# Capture "now" once so every synthetic timestamp is an offset from the same
# reference instant, instead of drifting by the time spent building the list.
reference_time = datetime.now(timezone.utc)

entity_df = pd.DataFrame({
    "store_id": np.random.randint(1, 51, num_samples),
    "dept_id": np.random.randint(1, 13, num_samples),
    "event_timestamp": [
        # int(...) keeps timedelta happy even if NumPy hands back a NumPy integer.
        reference_time - timedelta(days=int(np.random.randint(1, 365)))
        for _ in range(num_samples)
    ]
})

# Fetch historical features
print(f"🚀 Fetching features for {len(entity_df):,} training samples...")

# "<feature view>:<feature>" references; weekly_sales is used as the target later on.
feature_refs = [
    "sales_features:weekly_sales",
    "sales_features:lag_1",
    "sales_features:lag_2",
    "sales_features:lag_4",
    "sales_features:rolling_mean_4w",
    "store_features:store_size",
    "store_features:temperature",
    "store_features:fuel_price",
    "store_features:cpi",
    "store_features:unemployment",
]

training_df = fs.get_historical_features(
    entity_df=entity_df,
    features=feature_refs,
).to_df()

print(f"✅ Retrieved {len(training_df):,} rows")
training_df.head()


---
## 3. Local Training with MLflow Tracking


In [None]:
import torch
import torch.nn as nn
import mlflow
import mlflow.pytorch
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Prepare data
training_df_clean = training_df.dropna()
feature_cols = ["lag_1", "lag_2", "lag_4", "rolling_mean_4w",
                "store_size", "temperature", "fuel_price", "cpi", "unemployment"]
target_col = "weekly_sales"

# Fail early with a readable message instead of an opaque scikit-learn error.
if len(training_df_clean) < 2:
    raise ValueError(
        f"Only {len(training_df_clean)} complete row(s) left after dropna(); "
        "check the Feast feature retrieval above."
    )

# The TrainJob script in section 4 reads "<DATA_PATH>/training_data.parquet" with
# DATA_PATH set to "<SHARED_DIR>/data", and no cell visible in this notebook writes
# that file. Export the cleaned frame when it is missing so the job has data to load;
# an existing file is left untouched.
training_data_path = Path(SHARED_DIR) / "data" / "training_data.parquet"
if not training_data_path.exists():
    training_data_path.parent.mkdir(parents=True, exist_ok=True)
    training_df_clean.to_parquet(training_data_path, index=False)

X = training_df_clean[feature_cols].values
y = training_df_clean[target_col].values

# Split BEFORE fitting the scalers so test-set statistics never leak into the
# preprocessing that is applied to the training data.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler_X = StandardScaler()
scaler_y = StandardScaler()
X_train = scaler_X.fit_transform(X_train_raw)
y_train = scaler_y.fit_transform(y_train_raw.reshape(-1, 1)).flatten()
X_test = scaler_X.transform(X_test_raw)
y_test = scaler_y.transform(y_test_raw.reshape(-1, 1)).flatten()

# Full-dataset views, kept under their original names for any later use.
# They are scaled with the train-fitted scalers.
X_scaled = scaler_X.transform(X)
y_scaled = scaler_y.transform(y.reshape(-1, 1)).flatten()

print(f"Train: {len(X_train):,} | Test: {len(X_test):,}")


In [None]:
# Define model
class SalesForecastModel(nn.Module):
    """Feed-forward regressor that maps a feature vector to one scalar prediction.

    Layer stack: Linear(input_dim, 128) -> BatchNorm -> ReLU -> Dropout(0.2)
    -> Linear(128, 64) -> BatchNorm -> ReLU -> Linear(64, 32) -> ReLU -> Linear(32, 1).
    """

    def __init__(self, input_dim=9):
        super().__init__()
        # Layers are created in this exact order and stored under `network`,
        # which fixes the state_dict key names ("network.0.weight", ...).
        stack = [
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        ]
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Run the stack and drop the trailing singleton output dimension."""
        raw_output = self.network(x)
        return raw_output.squeeze(-1)

# Seed PyTorch's global RNG so weight initialisation (and later random draws made
# through it, such as DataLoader shuffling) repeat across kernel restarts.
torch.manual_seed(42)

model = SalesForecastModel(input_dim=len(feature_cols))
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")


In [None]:
# Training function with MLflow logging
def train_model(model, X_train, y_train, X_test, y_test, epochs=50, lr=0.001, batch_size=256):
    """Train ``model`` and log per-epoch metrics through ``mlflow.log_metrics``.

    Uses MSE loss, AdamW (weight_decay=1e-5) and cosine learning-rate annealing
    over ``epochs``. The evaluation set is scored in one pass after every epoch.

    Args:
        model: ``nn.Module`` whose output shape matches the targets.
        X_train, y_train: training features / targets (anything ``torch.FloatTensor`` accepts).
        X_test, y_test: evaluation features / targets.
        epochs: number of passes over the training data.
        lr: initial learning rate.
        batch_size: mini-batch size.

    Returns:
        ``(model, history)``; ``history`` holds the "train_loss" and "test_loss"
        lists with one entry per epoch.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    X_train_t = torch.FloatTensor(X_train).to(device)
    y_train_t = torch.FloatTensor(y_train).to(device)
    X_test_t = torch.FloatTensor(X_test).to(device)
    y_test_t = torch.FloatTensor(y_test).to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    train_dataset = torch.utils.data.TensorDataset(X_train_t, y_train_t)
    # A trailing batch holding a single sample makes BatchNorm layers raise in
    # training mode, so drop that one leftover sample per epoch when it occurs.
    n_train = len(train_dataset)
    drop_last = n_train > batch_size and n_train % batch_size == 1
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last
    )

    history = {"train_loss": [], "test_loss": []}

    for epoch in range(epochs):
        # Read the LR before the scheduler advances, so the logged value is the
        # rate actually applied during this epoch (not the next one).
        epoch_lr = optimizer.param_groups[0]["lr"]

        model.train()
        epoch_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            pred = model(X_batch)
            loss = criterion(pred, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        scheduler.step()

        # Eval
        model.eval()
        with torch.no_grad():
            test_pred = model(X_test_t)
            test_loss = criterion(test_pred, y_test_t).item()

        # Mean of the per-batch losses for this epoch.
        train_loss = epoch_loss / len(train_loader)
        history["train_loss"].append(train_loss)
        history["test_loss"].append(test_loss)

        # Log to MLflow
        mlflow.log_metrics({
            "train_loss": train_loss,
            "test_loss": test_loss,
            "lr": epoch_lr
        }, step=epoch)

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1:3d} | Train: {train_loss:.4f} | Test: {test_loss:.4f}")

    return model, history


In [None]:
# Run training with MLflow tracking
import joblib

mlflow.set_experiment("sales-forecasting")

# Hyperparameters are declared once and used for BOTH the logged params and the
# train_model() call, so the values recorded in MLflow cannot drift from what ran.
EPOCHS = 50
LEARNING_RATE = 0.001
BATCH_SIZE = 256

with mlflow.start_run(run_name="local-pytorch-training"):
    # Log hyperparameters
    mlflow.log_params({
        "model_type": "SalesForecastModel",
        "epochs": EPOCHS,
        "learning_rate": LEARNING_RATE,
        "batch_size": BATCH_SIZE,
        "optimizer": "AdamW",
        "features": feature_cols,
        "train_samples": len(X_train),
        "test_samples": len(X_test)
    })

    # Train
    trained_model, history = train_model(
        model, X_train, y_train, X_test, y_test,
        epochs=EPOCHS, lr=LEARNING_RATE, batch_size=BATCH_SIZE,
    )

    # Log model
    mlflow.pytorch.log_model(trained_model, "model")

    # Save locally too (weights only) and attach the file to the run
    model_path = Path(SHARED_DIR) / "models" / "sales_forecast_model.pt"
    model_path.parent.mkdir(parents=True, exist_ok=True)
    torch.save(trained_model.state_dict(), model_path)
    mlflow.log_artifact(str(model_path))

    # Save scalers: predictions are in scaled units, so the fitted scalers are
    # stored alongside the weights to allow mapping inputs/outputs back and forth.
    scalers_path = Path(SHARED_DIR) / "models" / "scalers.joblib"
    joblib.dump({"scaler_X": scaler_X, "scaler_y": scaler_y}, scalers_path)
    mlflow.log_artifact(str(scalers_path))

    run_id = mlflow.active_run().info.run_id
    print(f"\n✅ Training complete! MLflow run_id: {run_id}")


In [None]:
# Plot training history
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 4))

# One curve per recorded loss series; the evaluation curve is drawn dashed.
for series_key, series_label, series_style in (
    ("train_loss", "Train Loss", {}),
    ("test_loss", "Test Loss", {"linestyle": "--"}),
):
    ax.plot(history[series_key], label=series_label, linewidth=2, **series_style)

ax.set(xlabel="Epoch", ylabel="Loss (MSE)", title="Training Progress")
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()


---
## 4. Distributed Training with Kubeflow

For larger datasets or models, use the Kubeflow Training Operator for distributed training.


In [None]:
# Create training script for Kubeflow
# The script must be self-contained: it is handed over as plain text (saved to
# disk and passed as `script_code` at submission), so it cannot use notebook state.
training_script = '''
import os
import torch
import torch.nn as nn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, TensorDataset, DistributedSampler
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import mlflow
import mlflow.pytorch

# Setup distributed
def setup_distributed():
    """Join the process group when WORLD_SIZE is set; return (rank, world_size)."""
    if "WORLD_SIZE" in os.environ:
        dist.init_process_group(backend="nccl" if torch.cuda.is_available() else "gloo")
        return dist.get_rank(), dist.get_world_size()
    return 0, 1

def cleanup_distributed():
    """Tear down the process group if one was initialised."""
    if dist.is_initialized():
        dist.destroy_process_group()

class SalesForecastModel(nn.Module):
    """Same architecture as the notebook's local model (state_dict compatible)."""

    def __init__(self, input_dim=9):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.network(x).squeeze(-1)

def run_training(rank, world_size):
    """Train on this process's data shard; rank 0 logs to MLflow and saves the model."""
    # Select the GPU by LOCAL_RANK (index within this node). The global rank
    # exceeds the number of local devices as soon as the job spans several nodes.
    use_cuda = torch.cuda.is_available()
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    if use_cuda:
        torch.cuda.set_device(local_rank)
    device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu")

    # Load data
    data_path = os.environ.get("DATA_PATH", "/mnt/shared/data")
    df = pd.read_parquet(f"{data_path}/training_data.parquet")

    feature_cols = ["lag_1", "lag_2", "lag_4", "rolling_mean_4w",
                    "store_size", "temperature", "fuel_price", "cpi", "unemployment"]
    # Drop incomplete rows: a single NaN would turn the loss, and then every weight, into NaN.
    df = df.dropna(subset=feature_cols + ["weekly_sales"])
    X = df[feature_cols].values
    y = df["weekly_sales"].values

    # NOTE(review): only the features are standardised here; the target stays in raw
    # units, so this loss is not on the same scale as a run that also scales the target.
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    dataset = TensorDataset(torch.FloatTensor(X), torch.FloatTensor(y))
    sampler = DistributedSampler(dataset) if world_size > 1 else None
    loader = DataLoader(dataset, batch_size=256, sampler=sampler, shuffle=(sampler is None))

    model = SalesForecastModel().to(device)
    if world_size > 1:
        model = DDP(model, device_ids=[local_rank] if use_cuda else None)

    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()

    # MLflow on rank 0 only
    if rank == 0:
        mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI"))
        mlflow.set_experiment("sales-forecasting")
        mlflow.start_run(run_name="kubeflow-distributed-training")
        mlflow.log_params({"world_size": world_size, "epochs": 50})

    for epoch in range(50):
        if sampler:
            # Re-seed the sampler so each epoch sees a different shuffle.
            sampler.set_epoch(epoch)

        epoch_loss = 0
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            loss = criterion(model(X_batch), y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        if rank == 0 and (epoch + 1) % 10 == 0:
            avg_loss = epoch_loss / len(loader)
            mlflow.log_metric("train_loss", avg_loss, step=epoch)
            print(f"Epoch {epoch+1}: Loss = {avg_loss:.4f}")

    if rank == 0:
        # Unwrap DDP so the saved weights carry no "module." prefix.
        model_to_save = model.module if hasattr(model, "module") else model
        # MODEL_DIR overrides the output location; by default use the "models"
        # directory next to the data directory, and make sure it exists.
        model_dir = os.environ.get("MODEL_DIR") or os.path.join(
            os.path.dirname(data_path.rstrip("/")), "models"
        )
        os.makedirs(model_dir, exist_ok=True)
        torch.save(model_to_save.state_dict(), os.path.join(model_dir, "distributed_model.pt"))
        mlflow.pytorch.log_model(model_to_save, "model")
        mlflow.end_run()
        print("Training complete!")

def train():
    """Entry point: set up the process group, train, and always clean up."""
    rank, world_size = setup_distributed()
    try:
        run_training(rank, world_size)
    finally:
        cleanup_distributed()

if __name__ == "__main__":
    train()
'''

# Save script
script_path = Path(SHARED_DIR) / "scripts" / "train_distributed.py"
script_path.parent.mkdir(parents=True, exist_ok=True)
# Explicit encoding so the written file does not depend on the kernel's locale.
script_path.write_text(training_script, encoding="utf-8")
print(f"✅ Training script saved to {script_path}")


In [None]:
# Submit TrainJob using Kubeflow SDK
from kubeflow.training import TrainerClient

trainer = TrainerClient(namespace=NAMESPACE)

# Environment variables passed to the job. The training script reads both names.
# NOTE(review): DATA_PATH assumes the shared volume is mounted at the same path
# inside the training pods as in this notebook - confirm against the job runtime.
job_env_vars = {
    "MLFLOW_TRACKING_URI": MLFLOW_TRACKING_URI,
    "DATA_PATH": f"{SHARED_DIR}/data"
}

# Submit training job
job_name = trainer.train(
    script_code=training_script,
    packages_to_install=["torch", "pandas", "pyarrow", "scikit-learn", "mlflow"],
    num_nodes=2,
    gpus_per_node=0,  # Set to 1+ for GPU training
    env_vars=job_env_vars,
    confirmed=True
)

print(f"🚀 Submitted TrainJob: {job_name}")


In [None]:
# Monitor training progress
import time

POLL_INTERVAL_SECONDS = 10
MAX_POLLS = 30  # 30 polls x 10 s = check for 5 minutes
TERMINAL_STATES = ('Complete', 'Failed')

for _ in range(MAX_POLLS):
    status = trainer.get_job(job_name)
    job_state = status.get('status', 'Unknown')
    print(f"Job status: {job_state}")

    if job_state in TERMINAL_STATES:
        break
    time.sleep(POLL_INTERVAL_SECONDS)
else:
    # The loop ran out without seeing a terminal state; say so explicitly so the
    # logs below are not mistaken for the output of a finished job.
    print(f"Job did not finish within {MAX_POLLS * POLL_INTERVAL_SECONDS} seconds; showing logs so far.")

# Get logs
logs = trainer.get_training_logs(job_name)
print("\n--- Training Logs ---")
# `or` also covers a present-but-empty/None "logs" entry, which cannot be sliced.
log_text = logs.get('logs') or 'No logs available'
print(log_text[-2000:])


---
## 5. Register Model in MLflow


In [None]:
# Register best model
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Get the best run from experiment
experiment = client.get_experiment_by_name("sales-forecasting")
if experiment is None:
    # The experiment does not exist yet; treat it the same as "no runs".
    runs = []
else:
    runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=["metrics.test_loss ASC"],
        max_results=1
    )

if runs:
    best_run = runs[0]
    # The selected run may not have logged test_loss; applying a float format
    # spec to the "N/A" fallback would raise ValueError, so format conditionally.
    best_test_loss = best_run.data.metrics.get('test_loss')
    test_loss_text = f"{best_test_loss:.4f}" if best_test_loss is not None else "N/A"
    print(f"Best run: {best_run.info.run_id}")
    print(f"  Test Loss: {test_loss_text}")

    # Register model
    model_uri = f"runs:/{best_run.info.run_id}/model"
    model_version = mlflow.register_model(model_uri, "sales-forecast-model")

    # Transition to production
    # NOTE(review): registry stages are deprecated in recent MLflow releases in
    # favour of aliases - confirm against the installed MLflow version.
    client.transition_model_version_stage(
        name="sales-forecast-model",
        version=model_version.version,
        stage="Production"
    )

    print(f"\n✅ Model registered: sales-forecast-model v{model_version.version} (Production)")
else:
    print("❌ No runs found")


---
## Summary

✅ **What we accomplished:**
1. Fetched historical features from Feast
2. Trained a PyTorch model locally with MLflow tracking
3. Submitted distributed training via Kubeflow
4. Registered the best model in MLflow Model Registry

**Next:** `04-inference.ipynb` - Use online features for real-time predictions
