In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import defaultdict
import os
import random
from tqdm import tqdm

# Experiment settings shared by the environment, the model and the training
# loop below. Keys that the code in this file does not read are kept so the
# dictionary stays a drop-in replacement for other consumers.
config = dict(
    # Per-unit cost parameters handed to the environment.
    underage_cost=28,
    holding_cost=73,
    # Demand-description entries (presumably for a synthetic demand
    # generator; not read by the code in this file -- verify before relying
    # on them).
    mean=33,
    coef_of_var=92,
    lead_time=41,
    demand=57,
    initial_inventory=4839,
    lost_demand=True,
    maximize_profit=True,
    # Episode lengths (in periods) for the train / dev / test splits.
    train_periods=104,
    dev_periods=50,
    test_periods=30,
    n_samples=32,  # episodes per epoch
    max_order_quantity=5,
    n_products=10,  # number of products to use
    holding_cost_factor=5.0,  # holding costs are multiplied by this factor
)

# Select the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Load the weekly sales tensor (the original note gives its shape as
# [33945, 1, 240]), keep the first `n_products` rows and drop the singleton
# middle axis, then cast to float32 and move it to the selected device.
# Resulting shape: [n_products, 240].
sales_data = torch.load('data_files/favorita/weekly_sales.pt')
sales_data = sales_data[:config['n_products'], 0, :]
sales_data = sales_data.float().to(device)

class InventoryActorCritic(nn.Module):
    """Actor-critic network with one categorical policy head per product.

    A shared two-layer MLP encodes the state (width ``2 * n_products``).
    On top of it sit ``n_products`` independent actor heads, each producing
    a probability vector over the order quantities ``0..max_order_quantity``,
    and a single critic head producing a scalar state value.
    """

    def __init__(self, n_products, hidden_size=128, max_order_quantity=20):
        super().__init__()
        self.n_products = n_products
        self.max_order_quantity = max_order_quantity

        half_width = hidden_size // 2
        n_choices = max_order_quantity + 1

        def make_head(out_features):
            # Two-layer MLP used for both the actor heads and the critic.
            return nn.Sequential(
                nn.Linear(hidden_size, half_width),
                nn.ReLU(),
                nn.Linear(half_width, out_features),
            )

        # Submodules are created in the order shared -> actors -> critic so
        # that parameter initialisation consumes the RNG in a fixed order.
        self.shared = nn.Sequential(
            nn.Linear(2 * n_products, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
        )
        self.actor_heads = nn.ModuleList(
            [make_head(n_choices) for _ in range(n_products)]
        )
        self.critic = make_head(1)

    def forward(self, state):
        """Return ``(action_dists, value)`` for a batched ``state``.

        ``action_dists`` is a list with one tensor per product holding the
        softmax (over dim 1) of that head's logits; ``value`` is the critic
        output. The softmax over dim 1 requires a 2-D, batch-first input.
        """
        encoded = self.shared(state)
        action_dists = [
            torch.softmax(head(encoded), dim=1) for head in self.actor_heads
        ]
        return action_dists, self.critic(encoded)

class InventoryEnvironment:
    """Multi-product inventory simulator driven by a historical demand tensor.

    Each episode replays ``periods`` consecutive columns of ``sales_data``
    (indexed as ``[product, period]``), starting at a random offset when the
    history is longer than the episode. Orders placed in ``step`` arrive
    ``lead_time`` steps later. Demand that cannot be served from on-hand
    inventory is charged ``underage_cost`` per unit; inventory left after
    sales is charged ``holding_cost * holding_cost_factor`` per unit.

    Args:
        sales_data: 2-D tensor of demand, one row per product.
        underage_cost: per-unit cost of unmet demand.
        holding_cost: per-unit cost of end-of-period inventory.
        lead_time: number of steps between placing and receiving an order;
            ``0`` means the order is available in the same step.
        initial_inventory: starting on-hand inventory for every product.
        lost_demand: stored on the instance. NOTE(review): the cost
            computation is currently the same for both values; backlogging
            is not modelled.
        maximize_profit: when true the reward is revenue minus costs,
            otherwise the negative of the costs.
        periods: episode length in steps.
        holding_cost_factor: multiplier applied to ``holding_cost``.
    """

    def __init__(self, sales_data, underage_cost, holding_cost, lead_time, initial_inventory, lost_demand, maximize_profit, periods, holding_cost_factor=1.0):
        self.sales_data_full = sales_data
        self.n_products = sales_data.shape[0]
        self.underage_cost = underage_cost
        self.holding_cost = holding_cost
        self.holding_cost_factor = holding_cost_factor
        self.lead_time = lead_time
        self.initial_inventory = initial_inventory
        self.lost_demand = lost_demand
        self.maximize_profit = maximize_profit
        self.periods = periods

        # Revenue = selling_price * sales. The selling price is assumed to be
        # underage_cost + holding_cost for now.
        self.selling_price = self.underage_cost + self.holding_cost

        # reset() creates the inventory, order pipeline and demand history.
        self.reset()

    def reset(self):
        """Start a new episode and return the initial state tensor."""
        # Randomly select a start index so episodes do not always replay the
        # same slice. If the history is not longer than the episode, the
        # episode starts at 0 (and stepping past the end of the history
        # raises IndexError from the tensor indexing in step()).
        max_start = self.sales_data_full.shape[1] - self.periods
        self.start_index = random.randint(0, max_start) if max_start > 0 else 0

        self.current_step = 0
        self.inventory = torch.ones(self.n_products) * self.initial_inventory
        # One slot of zero orders per lead-time step still "in transit".
        self.order_pipeline = [torch.zeros(self.n_products) for _ in range(self.lead_time)]
        self.demand_history = torch.zeros(self.n_products)
        return self._get_state()

    def _get_state(self):
        """Return on-hand inventory concatenated with the last seen demand."""
        return torch.cat([self.inventory, self.demand_history])

    def step(self, actions):
        """Place ``actions`` as an order, serve one period of demand.

        Args:
            actions: tensor holding one order quantity per product; any
                shape with ``n_products`` elements is flattened to 1-D.

        Returns:
            ``(state, reward, done, info)`` where ``reward`` is a Python
            float and ``info`` holds the period's ``holding_costs``,
            ``stockout_costs`` and ``sales_revenue`` (revenue when
            ``maximize_profit`` is set, otherwise units sold).
        """
        # Enqueue the new order before dequeuing the arriving one. For
        # lead_time > 0 this delivers an order exactly lead_time steps after
        # it was placed (same timing as pop-then-append); for lead_time == 0
        # the pipeline is otherwise empty and the order arrives immediately
        # instead of pop() failing on an empty list.
        self.order_pipeline.append(actions.reshape(-1).float())
        incoming_order = self.order_pipeline.pop(0)
        # Out-of-place add so no previously handed-out tensor is mutated.
        self.inventory = self.inventory + incoming_order

        # Demand for the current period, served from on-hand inventory.
        current_period = self.start_index + self.current_step
        demand = self.sales_data_full[:, current_period]
        sales = torch.min(self.inventory, demand)

        self.inventory = self.inventory - sales
        lost_units = demand - sales

        # Scale the holding cost to penalize leftover stock more.
        scaled_holding_cost = self.holding_cost * self.holding_cost_factor

        holding_costs = scaled_holding_cost * self.inventory
        # Unmet demand is charged the same way regardless of `lost_demand`.
        stockout_costs = self.underage_cost * lost_units

        total_holding_cost = holding_costs.sum()
        total_stockout_cost = stockout_costs.sum()
        total_cost = total_holding_cost + total_stockout_cost
        units_sold = sales.sum()

        if self.maximize_profit:
            revenue = self.selling_price * units_sold
            reward = revenue - total_cost
        else:
            reward = -total_cost

        self.demand_history = demand
        self.current_step += 1
        done = self.current_step >= self.periods

        info = {
            'holding_costs': total_holding_cost.item(),
            'stockout_costs': total_stockout_cost.item(),
            'sales_revenue': (self.selling_price * units_sold).item() if self.maximize_profit else units_sold.item()
        }

        return self._get_state(), reward.item(), done, info




class A2CTrainer:
    """Advantage actor-critic trainer that updates once per full episode.

    The model is expected to return ``(action_dists, value)`` where
    ``action_dists`` is a list of per-product probability tensors, and the
    environment is expected to expose ``reset()`` and
    ``step(actions) -> (state, reward, done, info)``. The model and all
    training tensors live on the module-level ``device``; actions are moved
    to CPU before being handed to the environment.

    Args:
        model: actor-critic network to optimise.
        env: environment to roll episodes out in.
        lr: Adam learning rate.
        gamma: discount factor used for the returns.
        entropy_coef: weight of the entropy bonus in the loss.
        value_loss_coef: weight of the critic loss in the loss.
    """

    def __init__(self, model, env, lr=1e-4, gamma=0.99, entropy_coef=0.01, value_loss_coef=0.5):
        self.model = model.to(device)  # Move model to the selected device
        self.env = env
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.gamma = gamma
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef

    def _to_state_tensor(self, observation):
        """Convert an environment observation to a float32 tensor on ``device``."""
        return torch.as_tensor(observation, dtype=torch.float32, device=device)

    def train_episode(self):
        """Roll out one episode, apply one gradient step, return its metrics.

        Returns:
            A ``defaultdict(float)`` with every ``info`` entry summed over
            the episode plus ``'total_reward'``, the undiscounted reward sum.
        """
        state = self._to_state_tensor(self.env.reset())
        done = False
        episode_rewards = []
        values = []
        log_probs = []
        entropy = 0
        metrics = defaultdict(float)

        while not done:
            action_dists, value = self.model(state.unsqueeze(0))

            step_actions = []
            step_log_probs = []
            for probs in action_dists:
                distribution = torch.distributions.Categorical(probs)
                action = distribution.sample()
                step_actions.append(action)
                step_log_probs.append(distribution.log_prob(action))
                # Accumulated over every product and every step.
                entropy = entropy + distribution.entropy().mean()

            actions = torch.stack(step_actions)
            next_state, reward, done, info = self.env.step(actions.cpu())
            state = self._to_state_tensor(next_state)

            episode_rewards.append(reward)
            values.append(value)
            # Joint log-probability of this step's per-product actions.
            log_probs.append(torch.stack(step_log_probs).sum())

            for k, v in info.items():
                metrics[k] += v

        returns = self._compute_returns(episode_rewards).to(device)
        # reshape(-1) gives one value per step regardless of the critic's
        # trailing singleton dimensions (a bare squeeze() would also collapse
        # the step axis for single-step episodes).
        advantages = returns - torch.cat(values).reshape(-1)

        value_loss = advantages.pow(2).mean()
        policy_loss = -(advantages.detach() * torch.stack(log_probs)).mean()
        # NOTE(review): `entropy` is a sum over steps while the other two
        # terms are per-step means, so the bonus grows with episode length.
        # Kept as-is to preserve training behaviour -- confirm intent.
        entropy_loss = -entropy

        total_loss = (policy_loss +
                      self.value_loss_coef * value_loss +
                      self.entropy_coef * entropy_loss)

        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()

        metrics['total_reward'] = sum(episode_rewards)
        return metrics

    def _compute_returns(self, rewards):
        """Return the discounted return for every step as a float32 tensor.

        ``returns[t] = rewards[t] + gamma * returns[t + 1]``, computed in a
        single backward pass (append + reverse instead of repeated
        ``insert(0, ...)``, which is quadratic in the episode length).
        """
        returns = []
        running = 0.0
        for r in reversed(rewards):
            running = r + self.gamma * running
            returns.append(running)
        returns.reverse()
        return torch.tensor(returns, dtype=torch.float32)

def prepare_training(sales_data, config):
    """Build the environment, the network and the trainer that ties them together.

    The environment receives a CPU copy of ``sales_data`` and its cost /
    episode settings from ``config`` (episodes last ``config['train_periods']``
    steps). The network gets one actor head per row of ``sales_data``. The
    trainer hyper-parameters are fixed here.

    Returns:
        The configured ``A2CTrainer``.
    """
    # (constructor argument, config key) pairs forwarded to the environment.
    env_arg_to_config_key = (
        ('underage_cost', 'underage_cost'),
        ('holding_cost', 'holding_cost'),
        ('lead_time', 'lead_time'),
        ('initial_inventory', 'initial_inventory'),
        ('lost_demand', 'lost_demand'),
        ('maximize_profit', 'maximize_profit'),
        ('periods', 'train_periods'),
        ('holding_cost_factor', 'holding_cost_factor'),
    )
    # Environment uses CPU for step calculations.
    cpu_sales = sales_data.cpu()
    env_kwargs = {arg: config[key] for arg, key in env_arg_to_config_key}
    environment = InventoryEnvironment(sales_data=cpu_sales, **env_kwargs)

    n_products = sales_data.shape[0]
    network = InventoryActorCritic(
        n_products=n_products,
        hidden_size=128,
        max_order_quantity=config['max_order_quantity'],
    )

    return A2CTrainer(
        model=network,
        env=environment,
        lr=1e-4,
        gamma=0.99,
        entropy_coef=0.01,
        value_loss_coef=0.5,
    )

def train(trainer, config, n_epochs=5):
    """Run ``n_epochs`` epochs of ``config['n_samples']`` training episodes each.

    After every epoch the per-episode metrics returned by
    ``trainer.train_episode()`` are averaged over the epoch, printed, and
    stored.

    Returns:
        A list with one averaged-metrics ``defaultdict`` per epoch.
    """
    episodes_per_epoch = config['n_samples']
    history = []

    for epoch in range(n_epochs):
        epoch_metrics = defaultdict(float)

        # Accumulate the metrics of every episode in this epoch.
        for _ in tqdm(range(episodes_per_epoch)):
            for name, amount in trainer.train_episode().items():
                epoch_metrics[name] += amount

        # Turn the sums into per-episode averages.
        for name in epoch_metrics:
            epoch_metrics[name] /= episodes_per_epoch

        history.append(epoch_metrics)

        print(f"Epoch {epoch + 1}")
        print(f"Average Reward: {epoch_metrics['total_reward']:.2f}")
        print(f"Average Holding Cost: {epoch_metrics['holding_costs']:.2f}")
        print(f"Average Stockout Cost: {epoch_metrics['stockout_costs']:.2f}")
        print(f"Average Sales Revenue: {epoch_metrics['sales_revenue']:.2f}\n")

    return history

# Build the trainer (environment + model + optimizer) from the loaded sales
# data and the config, then run five training epochs. `metrics_history`
# receives the list of per-epoch averaged metrics returned by `train`.
trainer = prepare_training(sales_data, config)
metrics_history = train(trainer, config, n_epochs=5)


  sales_data = torch.load('/content/weekly_sales.pt')[:config['n_products'], 0, :].float().to(device)  # shape: [n_products, 240]


Using device: cuda


100%|██████████| 32/32 [00:53<00:00,  1.66s/it]


Epoch 1
Average Reward: -1178825838.31
Average Holding Cost: 1181637172.27
Average Stockout Cost: 261716.10
Average Sales Revenue: 3073049.83



100%|██████████| 32/32 [00:52<00:00,  1.64s/it]


Epoch 2
Average Reward: -1176725828.08
Average Holding Cost: 1179503175.92
Average Stockout Cost: 260428.36
Average Sales Revenue: 3037776.24



100%|██████████| 32/32 [00:53<00:00,  1.67s/it]


Epoch 3
Average Reward: -1161739223.44
Average Holding Cost: 1164525537.92
Average Stockout Cost: 285328.46
Average Sales Revenue: 3071642.39



100%|██████████| 32/32 [00:53<00:00,  1.68s/it]


Epoch 4
Average Reward: -1168918787.16
Average Holding Cost: 1171712711.55
Average Stockout Cost: 269188.00
Average Sales Revenue: 3063112.86



100%|██████████| 32/32 [00:53<00:00,  1.67s/it]

Epoch 5
Average Reward: -1174306426.77
Average Holding Cost: 1177084384.09
Average Stockout Cost: 263074.31
Average Sales Revenue: 3041032.19




