# Sales Forecasting - Inference


## Setup


In [None]:
%pip install -q kserve requests numpy yamlmagic
%load_ext yamlmagic


## Configuration


In [None]:
%%yaml parameters

# =============================================================================
# Cluster Configuration
# =============================================================================
namespace: feast-trainer-demo
shared_pvc: feast-pvc

# =============================================================================
# Model Configuration
# =============================================================================
model_name: sales-forecast

# =============================================================================
# Serving Configuration (use training image - has PyTorch)
# =============================================================================
serving_image: quay.io/modh/training:py311-cuda124-torch251

resources:
  cpu_request: "500m"
  cpu_limit: "2"
  memory_request: 1Gi
  memory_limit: 4Gi

# =============================================================================
# Data Paths (must match training - PVC mounted at /shared)
# =============================================================================
paths:
  model_dir: /shared/models

In [None]:
# Unpack the parsed YAML `parameters` mapping into notebook-level constants.
NAMESPACE, MODEL_NAME, SHARED_PVC, SERVING_IMAGE = (
    parameters[key]
    for key in ('namespace', 'model_name', 'shared_pvc', 'serving_image')
)

# Container resource settings live under the nested `resources` mapping.
CPU_REQUEST, CPU_LIMIT, MEMORY_REQUEST, MEMORY_LIMIT = (
    parameters['resources'][key]
    for key in ('cpu_request', 'cpu_limit', 'memory_request', 'memory_limit')
)


## Authentication


In [None]:
import os, time, urllib3
from datetime import datetime

# Silence urllib3's warnings for the rest of the session.
urllib3.disable_warnings()

# Cluster credentials are read from the environment; when a variable is
# unset the placeholder string is used as-is and must be edited by hand.
K8S_TOKEN = os.environ.get("K8S_TOKEN", "<YOUR_TOKEN>")
K8S_API_SERVER = os.environ.get("K8S_API_SERVER", "<YOUR_API_SERVER>")


In [None]:
from kubernetes import client as k8s
from kserve import KServeClient, V1beta1InferenceService, V1beta1InferenceServiceSpec, V1beta1PredictorSpec

# Build a Kubernetes client configuration from the token and API server
# address read earlier, and install it as the process-wide default.
cfg = k8s.Configuration()
cfg.host = K8S_API_SERVER
# Certificate verification is turned off for API-server connections.
cfg.verify_ssl = False
# The token is sent as an "authorization: Bearer <token>" header value.
cfg.api_key = {"authorization": f"Bearer {K8S_TOKEN}"}
k8s.Configuration.set_default(cfg)

# API handles used by the rest of the notebook. The two kubernetes clients
# are created after the default configuration has been set.
core_api = k8s.CoreV1Api()
custom_api = k8s.CustomObjectsApi()
# NOTE(review): KServeClient() is constructed without an explicit
# configuration — confirm which credentials it picks up in this environment.
kserve_client = KServeClient()


## Serving Script


In [None]:
# Source text of the Flask inference server that runs inside the serving
# container. It is kept as a string so it can be shipped to the cluster as a
# file; nothing in it executes in the notebook kernel.
SERVE_SCRIPT = '''#!/usr/bin/env python3
"""Sales forecasting inference server - loads model architecture from metadata"""
import os, json, torch, torch.nn as nn, joblib, numpy as np
from flask import Flask, request, jsonify

app = Flask(__name__)
# Flask sorts JSON object keys by default, which would discard the ranking
# produced by explain(); keep insertion order where the provider allows it.
if hasattr(app, "json"):
    app.json.sort_keys = False

# Name used in the route paths; overridable through the MODEL_NAME env var.
MODEL_NAME = os.getenv("MODEL_NAME", "sales-forecast")
model, scalers, feature_cols, metadata = None, None, None, None


def build_model(input_dim, hidden_dims, dropout=0.2):
    """Build model dynamically from metadata"""
    layers = []
    prev_dim = input_dim
    for h_dim in hidden_dims:
        layers.extend([nn.Linear(prev_dim, h_dim), nn.BatchNorm1d(h_dim), nn.ReLU(), nn.Dropout(dropout)])
        prev_dim = h_dim
    layers.append(nn.Linear(prev_dim, 1))
    return nn.Sequential(*layers)


class SalesMLP(nn.Module):
    """MLP wrapper whose layers are stored under the net attribute."""

    def __init__(self, input_dim, hidden_dims):
        super().__init__()
        self.net = build_model(input_dim, hidden_dims)

    def forward(self, x):
        return self.net(x).squeeze(-1)


def load_model():
    """Load metadata, weights and scalers from MODEL_DIR into module globals."""
    global model, scalers, feature_cols, metadata
    model_dir = os.getenv("MODEL_DIR", "/shared/models")

    with open(f"{model_dir}/model_metadata.json") as f:
        metadata = json.load(f)

    hidden_dims = metadata.get("hidden_dims", [256, 128, 64])
    model = SalesMLP(metadata["input_dim"], hidden_dims)
    model.load_state_dict(torch.load(f"{model_dir}/best_model.pt", map_location="cpu", weights_only=True))
    model.eval()

    # NOTE: joblib.load unpickles the file; only point MODEL_DIR at trusted storage.
    scalers = joblib.load(f"{model_dir}/scalers.joblib")
    feature_cols = metadata["feature_columns"]
    print(f"Loaded: {len(feature_cols)} features, arch={hidden_dims}")


@app.route("/health", methods=["GET"])
def health():
    return jsonify({"status": "healthy", "model": MODEL_NAME})


@app.route(f"/v1/models/{MODEL_NAME}", methods=["GET"])
def model_info():
    return jsonify({
        "name": MODEL_NAME,
        "features": feature_cols,
        "hidden_dims": metadata.get("hidden_dims"),
        "best_loss": metadata.get("best_loss"),
        "device_trained": metadata.get("device_type", "unknown")
    })


@app.route(f"/v1/models/{MODEL_NAME}:predict", methods=["POST"])
def predict():
    """Score instances given as feature dicts or positional feature lists."""
    payload = request.get_json(silent=True)
    instances = payload.get("instances") if isinstance(payload, dict) else None
    # An empty batch would reach the scaler as a 1-D array and fail with a 500.
    if not isinstance(instances, list) or not instances:
        return jsonify({"error": "body must be JSON with a non-empty instances list"}), 400
    try:
        X = np.array([
            [inst.get(c, 0) if isinstance(inst, dict) else inst[i] for i, c in enumerate(feature_cols)]
            for inst in instances
        ])
    except (IndexError, KeyError, TypeError) as exc:
        return jsonify({"error": f"malformed instance: {exc}"}), 400
    X_scaled = scalers["scaler_X"].transform(X)
    with torch.no_grad():
        preds = model(torch.FloatTensor(X_scaled)).numpy()
    return jsonify({"predictions": scalers["scaler_y"].inverse_transform(preds.reshape(-1, 1)).flatten().tolist()})


@app.route(f"/v1/models/{MODEL_NAME}:explain", methods=["POST"])
def explain():
    """Rank features by the mean absolute weight of the first linear layer."""
    weights = model.net[0].weight.abs().mean(dim=0).detach().numpy()
    # Plain Python floats keep the result JSON-serializable; guard a zero sum.
    total = float(weights.sum())
    importance = {f: (float(w) / total if total else 0.0) for f, w in zip(feature_cols, weights)}
    return jsonify({"feature_importance": dict(sorted(importance.items(), key=lambda x: -x[1]))})


if __name__ == "__main__":
    load_model()
    app.run(host="0.0.0.0", port=8080)
'''

## Deploy Model


In [None]:
# Label set shared by the resources created in this run; deploy-id is a
# month/day-hour/minute stamp taken from the local clock.
deploy_id = datetime.now().strftime("%m%d-%H%M")
labels = {"app": "sales-forecasting", "deploy-id": deploy_id}
cm_name = f"{MODEL_NAME}-serve"

# Drop a ConfigMap left over from an earlier run so the create below does not
# conflict. Only "not found" is ignored; any other API error (authentication,
# permissions, connectivity) is surfaced instead of being silently swallowed.
try:
    core_api.delete_namespaced_config_map(cm_name, NAMESPACE)
except k8s.rest.ApiException as exc:
    if exc.status != 404:
        raise

# Publish the serving script as the file "serve.py" inside the ConfigMap.
core_api.create_namespaced_config_map(
    NAMESPACE,
    k8s.V1ConfigMap(
        metadata=k8s.V1ObjectMeta(name=cm_name, labels=labels),
        data={"serve.py": SERVE_SCRIPT}
    )
)


In [None]:
# Best-effort removal of an InferenceService left by a previous run. The
# delete is allowed to fail (presumably because nothing exists yet); a genuine
# API problem will resurface on the create call below. `except Exception`
# rather than a bare `except` so Ctrl-C still interrupts the cell.
try:
    kserve_client.delete(MODEL_NAME, namespace=NAMESPACE)
except Exception:
    pass
else:
    # Wait up to ~60s (30 polls, 2s apart) for the old object to disappear;
    # the lookup raising is taken as the signal that it is gone.
    for _ in range(30):
        try:
            kserve_client.get(MODEL_NAME, namespace=NAMESPACE)
        except Exception:
            break
        time.sleep(2)
    else:
        print(f"Warning: InferenceService {MODEL_NAME} still exists after 60s; create may fail")

# Model directory from the notebook configuration (paths.model_dir), falling
# back to the previous hard-coded location when the key is absent.
MODEL_DIR = parameters.get("paths", {}).get("model_dir", "/shared/models")

isvc = V1beta1InferenceService(
    api_version="serving.kserve.io/v1beta1",
    kind="InferenceService",
    metadata=k8s.V1ObjectMeta(name=MODEL_NAME, namespace=NAMESPACE, labels=labels),
    spec=V1beta1InferenceServiceSpec(
        predictor=V1beta1PredictorSpec(
            containers=[k8s.V1Container(
                name="kserve-container",
                image=SERVING_IMAGE,
                # Install the server's Python dependencies at start-up, then run the mounted script.
                command=["/bin/bash", "-c", "pip install -q flask joblib numpy scikit-learn && python /scripts/serve.py"],
                env=[
                    k8s.V1EnvVar(name="MODEL_DIR", value=MODEL_DIR),
                    # Exposed so a serving script can derive its route names from the
                    # configured model name; harmless if the script does not read it.
                    k8s.V1EnvVar(name="MODEL_NAME", value=MODEL_NAME),
                ],
                ports=[k8s.V1ContainerPort(container_port=8080, protocol="TCP")],
                volume_mounts=[
                    k8s.V1VolumeMount(name="model-storage", mount_path="/shared"),
                    k8s.V1VolumeMount(name="serve-script", mount_path="/scripts")
                ],
                resources=k8s.V1ResourceRequirements(
                    limits={"cpu": CPU_LIMIT, "memory": MEMORY_LIMIT},
                    requests={"cpu": CPU_REQUEST, "memory": MEMORY_REQUEST}
                )
            )],
            volumes=[
                # Shared PVC at /shared and the script ConfigMap at /scripts.
                k8s.V1Volume(name="model-storage", persistent_volume_claim=k8s.V1PersistentVolumeClaimVolumeSource(claim_name=SHARED_PVC)),
                k8s.V1Volume(name="serve-script", config_map=k8s.V1ConfigMapVolumeSource(name=cm_name))
            ]
        )
    )
)
kserve_client.create(isvc, namespace=NAMESPACE)


In [None]:
# Block until the InferenceService reports ready (300-second timeout), then
# fetch the object and read the URL recorded in its status section; an empty
# string is used when either level is missing.
kserve_client.wait_isvc_ready(
    MODEL_NAME,
    namespace=NAMESPACE,
    timeout_seconds=300,
)
isvc_status = kserve_client.get(
    MODEL_NAME,
    namespace=NAMESPACE,
)
internal_url = isvc_status.get("status", {}).get("url", "")


In [None]:
def _delete_if_present(delete_call, *args):
    """Run a Kubernetes delete call, ignoring only a 404 (object absent).

    Any other API error is re-raised so authentication or permission
    problems are not hidden behind the subsequent create.
    """
    try:
        delete_call(*args)
    except k8s.rest.ApiException as exc:
        if exc.status != 404:
            raise


# Create non-headless service for route (KServe creates headless service by default)
external_svc_name = f"{MODEL_NAME}-external"
_delete_if_present(core_api.delete_namespaced_service, external_svc_name, NAMESPACE)

core_api.create_namespaced_service(
    NAMESPACE,
    k8s.V1Service(
        metadata=k8s.V1ObjectMeta(name=external_svc_name, labels=labels),
        spec=k8s.V1ServiceSpec(
            # Selects pods by the "app" label value built from the model name.
            selector={"app": f"isvc.{MODEL_NAME}-predictor"},
            ports=[k8s.V1ServicePort(name="http", port=8080, target_port=8080)],
            type="ClusterIP"
        )
    )
)

# OpenShift Route in front of the service: TLS terminated at the edge, plain
# HTTP requests redirected.
route = {
    "apiVersion": "route.openshift.io/v1",
    "kind": "Route",
    "metadata": {"name": MODEL_NAME, "namespace": NAMESPACE, "labels": labels},
    "spec": {
        "to": {"kind": "Service", "name": external_svc_name, "weight": 100},
        "port": {"targetPort": "http"},
        "tls": {"termination": "edge", "insecureEdgeTerminationPolicy": "Redirect"}
    }
}

_delete_if_present(
    custom_api.delete_namespaced_custom_object,
    "route.openshift.io", "v1", NAMESPACE, "routes", MODEL_NAME,
)

custom_api.create_namespaced_custom_object("route.openshift.io", "v1", NAMESPACE, "routes", route)
route_info = custom_api.get_namespaced_custom_object("route.openshift.io", "v1", NAMESPACE, "routes", MODEL_NAME)
_route_host = route_info.get('spec', {}).get('host', '')
EXTERNAL_URL = f"https://{_route_host}"
# Fail here with a clear message rather than later with an invalid-URL error.
if not _route_host:
    raise RuntimeError(f"Route {MODEL_NAME} in namespace {NAMESPACE} has no host assigned")


## Test Inference


In [None]:
import requests, numpy as np

class InferenceClient:
    """Small HTTP client for the model server's REST endpoints.

    Args:
        url: Base URL of the service; trailing slashes are removed.
        token: Optional bearer token sent in the Authorization header.
        model_name: Name used in the ``/v1/models/<name>`` paths. Defaults to
            the notebook-level ``MODEL_NAME`` when omitted.

    Every request method returns the decoded JSON body and raises
    ``requests.HTTPError`` on a 4xx/5xx response, so an error page is reported
    as an HTTP failure rather than as a JSON decoding error.
    """

    def __init__(self, url, token=None, model_name=None):
        self.url = url.rstrip("/")
        self.model_name = MODEL_NAME if model_name is None else model_name
        self.session = requests.Session()
        # Certificate verification is disabled for every request on this session.
        self.session.verify = False
        if token:
            self.session.headers["Authorization"] = f"Bearer {token}"

    def _model_url(self, verb=""):
        """Return the model endpoint URL, optionally with a ``:verb`` suffix."""
        return f"{self.url}/v1/models/{self.model_name}{verb}"

    @staticmethod
    def _decode(response):
        """Raise on an HTTP error status, otherwise return the JSON body."""
        response.raise_for_status()
        return response.json()

    def health(self):
        """GET /health."""
        return self._decode(self.session.get(f"{self.url}/health", timeout=10))

    def info(self):
        """GET the model description endpoint."""
        return self._decode(self.session.get(self._model_url(), timeout=10))

    def predict(self, instances):
        """POST ``instances`` to the ``:predict`` endpoint."""
        return self._decode(
            self.session.post(self._model_url(":predict"), json={"instances": instances}, timeout=30)
        )

    def explain(self):
        """POST an empty JSON object to the ``:explain`` endpoint."""
        return self._decode(self.session.post(self._model_url(":explain"), json={}, timeout=30))

# Client bound to the route URL built earlier in the notebook.
# NOTE(review): the cluster bearer token is attached to every request sent to
# the model endpoint — confirm the endpoint actually needs it.
client = InferenceClient(EXTERNAL_URL, K8S_TOKEN)


In [None]:
# Poll the health endpoint instead of sleeping a fixed 10 seconds: return as
# soon as the server answers, and allow up to ~60s (12 tries, 5s apart) when
# it is slow to come up. ValueError covers a non-JSON (error page) response.
for _attempt in range(12):
    try:
        health_status = client.health()
        break
    except (requests.RequestException, ValueError) as exc:
        _last_health_error = exc
        time.sleep(5)
else:
    raise RuntimeError(f"Service at {EXTERNAL_URL} did not become healthy within 60s") from _last_health_error
health_status


In [None]:
# Sample prediction
# One example instance, keyed by feature name, sent as a single-item batch.
sample = {
    # Lagged and rolling values
    "lag_1": 25000,
    "lag_2": 24000,
    "lag_4": 23000,
    "lag_8": 22000,
    "lag_52": 20000,
    "rolling_mean_4w": 24500,
    # Remaining features
    "store_size": 150000,
    "temperature": 65.0,
    "fuel_price": 2.8,
    "cpi": 215.0,
    "unemployment": 5.5,
}
client.predict([sample])


In [None]:
# Latency test (10 requests)
import time

# time.perf_counter() is a monotonic, high-resolution clock meant for
# measuring intervals; time.time() can jump when the wall clock is adjusted.
times = []  # per-request round-trip latency in milliseconds
for _ in range(10):
    t0 = time.perf_counter()
    client.predict([sample])
    times.append((time.perf_counter() - t0) * 1000)
f"Mean: {np.mean(times):.0f}ms, P95: {np.percentile(times, 95):.0f}ms"


## Cleanup


In [None]:
# Uncomment to delete:
# kserve_client.delete(MODEL_NAME, namespace=NAMESPACE)
# custom_api.delete_namespaced_custom_object("route.openshift.io", "v1", NAMESPACE, "routes", MODEL_NAME)
# core_api.delete_namespaced_service(external_svc_name, NAMESPACE)
# core_api.delete_namespaced_config_map(cm_name, NAMESPACE)


In [None]:
# Final summary: the model name and the external URL, shown as the cell's value.
f"Model: {MODEL_NAME} | URL: {EXTERNAL_URL}"
