In [None]:
%load_ext autoreload
%autoreload 2
import os
import sys
import torch
import matplotlib.pyplot as plt
import numpy as np
import wandb

# Expose <cwd>/src on the import path so the project-local modules imported
# below (env, inventory_model, train_inventory_model) can be resolved.
project_root = os.getcwd()
src_path = os.path.join(project_root, 'src')
already_importable = src_path in sys.path
if not already_importable:
    # Appending (not prepending) keeps any earlier path entries at higher priority.
    sys.path.append(src_path)

from env import KitchenEnv
from inventory_model import InventoryModelConfig
from train_inventory_model import PPOAgent, compute_gae

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Training Configuration
# The environment is built first because the model's input sizes are derived from it.
env = KitchenEnv()

# Width of the dynamic per-item feature slice: the longest entry found across
# the raw-ingredient and prepared-dish inventories (0 if both are empty).
max_shelf_life = max(
    (len(slots) for group in (env.raw_inv, env.prep_inv) for slots in group),
    default=0,
)

global_dim = 1  # Budget
item_feat_dim = 5 + max_shelf_life  # static(5) + dynamic(max_shelf)

config = InventoryModelConfig(
    item_input_dim=item_feat_dim,
    global_input_dim=global_dim,
    per_item_action_dim=1,
    item_hidden_dim=64,
    actor_hidden_dim=128,
    critic_hidden_dim=128,
)

agent = PPOAgent(env, config)

# Echo the derived dimensions so a mismatch with the model is visible immediately.
print(
    "Agent initialized.",
    f"Item Feature Dim: {item_feat_dim}",
    f"Global Feature Dim: {global_dim}",
    sep="\n",
)

Agent initialized.
Item Feature Dim: 12
Global Feature Dim: 1




In [6]:
# Training Loop
# One PPO update phase per episode: roll out a full episode, compute GAE
# advantages/returns, then run `update_epochs` passes over that rollout.
num_episodes = 500
update_epochs = 4
gamma = 0.99       # discount factor handed to compute_gae
gae_lambda = 0.95  # GAE lambda handed to compute_gae
log_every = 25     # console progress interval, in episodes

episode_rewards = []
losses = []

wandb.init(project="kitchen-inventory-rl", name="ppo_run_v1")
try:
    print(f"Starting training for {num_episodes} episodes...")

    for ep in range(num_episodes):
        obs, _ = env.reset()
        terminated = False
        truncated = False

        obs_buffer = []
        action_buffer = []
        log_prob_buffer = []
        reward_buffer = []
        value_buffer = []

        # Keep `terminated` and `truncated` separate: both end the rollout,
        # but only truncation should bootstrap from the critic afterwards.
        while not (terminated or truncated):
            # Get action from agent
            action, value, log_prob = agent.get_action(obs)

            # Step environment
            next_obs, reward, terminated, truncated, _ = env.step(action)

            # Store experience
            obs_buffer.append(obs)
            action_buffer.append(action)
            log_prob_buffer.append(log_prob)
            reward_buffer.append(reward)
            value_buffer.append(value)

            obs = next_obs

        # Bootstrap value for GAE
        _, next_value, _ = agent.get_action(obs)
        if terminated:
            # A genuinely terminal state has no future return. Multiplying by
            # zero (rather than assigning 0.0) keeps whatever type get_action
            # returns for the value, so compute_gae sees a consistent input.
            next_value = next_value * 0

        # Compute GAE
        advantages, returns = compute_gae(
            reward_buffer, value_buffer, next_value, gamma, gae_lambda
        )

        # Normalize advantages (epsilon guards against a zero std)
        adv_mean = np.mean(advantages)
        adv_std = np.std(advantages) + 1e-8
        advantages = [(a - adv_mean) / adv_std for a in advantages]

        # Prepare rollouts
        rollouts = list(
            zip(obs_buffer, action_buffer, log_prob_buffer, returns, advantages)
        )

        # Update Policy
        # NOTE(review): the output recorded with this cell is
        # "AttributeError: 'PPOAgent' object has no attribute 'train_step'".
        # Confirm the update method's name and return signature in
        # train_inventory_model.PPOAgent before re-running.
        ep_loss = 0
        for _ in range(update_epochs):
            loss, _, _, _ = agent.train_step(rollouts)
            ep_loss += loss

        avg_loss = ep_loss / update_epochs
        ep_reward = sum(reward_buffer)

        episode_rewards.append(ep_reward)
        losses.append(avg_loss)

        # Wandb Logging
        wandb.log({
            "episode_reward": ep_reward,
            "loss": avg_loss,
            "episode": ep
        })

        if (ep + 1) % log_every == 0:
            avg_rew = np.mean(episode_rewards[-log_every:])
            print(f"Ep {ep+1} | Avg Reward: {avg_rew:.2f} | Loss: {losses[-1]:.4f}")
finally:
    # Always close the run, even if training raises, so a failed cell does not
    # leave a dangling W&B run behind.
    wandb.finish()

# Plot Training Curve (Local)
# Left panel: per-episode reward plus a trailing moving average.
# Right panel: mean loss per episode.
ma_window = 20

fig, (reward_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))

reward_ax.plot(episode_rewards, label='Episode Reward')
# Only draw the moving average once a full window exists: np.convolve raises on
# an empty input, and with fewer than `ma_window` points mode='valid' returns
# values that are not a moving average of the rewards.
if len(episode_rewards) >= ma_window:
    moving_avg = np.convolve(
        episode_rewards, np.ones(ma_window) / ma_window, mode='valid'
    )
    # Each averaged point summarises the window ending at that episode, so the
    # curve starts at episode index ma_window - 1 rather than 0.
    ma_episodes = np.arange(ma_window - 1, len(episode_rewards))
    reward_ax.plot(
        ma_episodes, moving_avg, label=f'Moving Avg ({ma_window})', color='orange'
    )
reward_ax.set_title("Training Rewards")
reward_ax.set_xlabel("Episode")
reward_ax.legend()

loss_ax.plot(losses, color='red')
loss_ax.set_title("Training Loss")
loss_ax.set_xlabel("Episode")

fig.tight_layout()
plt.show()

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Starting training for 500 episodes...




AttributeError: 'PPOAgent' object has no attribute 'train_step'

In [None]:
# Evaluation / Testing with WandB Inventory Plots
def evaluate(agent, env, run_name="evaluation_run"):
    """Run one evaluation episode and report rewards and inventory levels.

    Actions are taken from ``agent.get_action(obs, deterministic=True)``. After
    every environment step the summed contents of each ``env.raw_inv`` and
    ``env.prep_inv`` entry are recorded and logged to Weights & Biases together
    with the step reward. When the episode ends, a combined line-series chart is
    logged, the W&B run is closed, and the same data is plotted locally with
    matplotlib.

    Args:
        agent: Object exposing ``get_action(obs, deterministic=True)`` that
            returns a 3-tuple whose first element is the action.
        env: Environment providing ``reset()``, a 5-tuple ``step(action)``, and
            the ``ingredients``, ``dishes``, ``raw_inv`` and ``prep_inv``
            attributes read below.
        run_name: Name given to the W&B run.

    Returns:
        The sum of the per-step rewards collected during the episode.
    """
    wandb.init(project="kitchen-inventory-rl", name=run_name)
    try:
        obs, _ = env.reset()
        total_reward = 0

        # Per-item history, keyed by item name, one entry per step.
        raw_levels = {ing.name: [] for ing in env.ingredients}
        prep_levels = {dish.name: [] for dish in env.dishes}
        rewards = []

        day = 0
        done = False
        while not done:
            action, _, _ = agent.get_action(obs, deterministic=True)
            obs, reward, terminated, truncated, _ = env.step(action)
            # Either signal ends the evaluation episode.
            done = terminated or truncated

            total_reward += reward
            rewards.append(reward)

            # Track inventory levels and assemble this step's scalar log.
            step_log = {"eval/daily_reward": reward, "eval/day": day}
            for i, ing in enumerate(env.ingredients):
                qty = env.raw_inv[i].sum()
                raw_levels[ing.name].append(qty)
                step_log[f"inventory/raw_{ing.name}"] = qty
            for j, dish in enumerate(env.dishes):
                qty = env.prep_inv[j].sum()
                prep_levels[dish.name].append(qty)
                step_log[f"inventory/prep_{dish.name}"] = qty

            wandb.log(step_log)
            day += 1

        print(f"Test Episode Reward: {total_reward:.2f}")

        days = list(range(len(rewards)))

        # One combined chart with a line per item; the per-step scalars logged
        # above can also be plotted individually in the W&B UI.
        wandb.log({
            "inventory_levels_plot": wandb.plot.line_series(
                xs=days,
                ys=list(raw_levels.values()) + list(prep_levels.values()),
                keys=list(raw_levels.keys()) + list(prep_levels.keys()),
                title="Inventory Levels",
                xname="day"
            )
        })
    finally:
        # Close the run even if the rollout or logging fails, so a failed
        # evaluation does not leave the W&B run open.
        wandb.finish()

    # Local Plotting (Matplotlib)
    fig, (inv_ax, reward_ax) = plt.subplots(1, 2, figsize=(14, 6))

    for name, levels in raw_levels.items():
        inv_ax.plot(days, levels, label=f"Raw: {name}")
    for name, levels in prep_levels.items():
        inv_ax.plot(days, levels, label=f"Dish: {name}", linestyle='--')
    inv_ax.set_title("Inventory Levels over Time")
    inv_ax.set_xlabel("Day")
    inv_ax.set_ylabel("Stock Units")
    inv_ax.legend()

    reward_ax.plot(days, rewards, color='green')
    reward_ax.set_title("Daily Reward")
    reward_ax.set_xlabel("Day")
    reward_ax.set_ylabel("Reward")

    fig.tight_layout()
    plt.show()
    return total_reward

# Run Evaluation
# Uses the default run name; as the cell's last expression, the returned total
# reward is also displayed as the cell output.
evaluate(agent=agent, env=env)