<a href="https://colab.research.google.com/github/aymuos/masters-practise-repo/blob/main/TERM3/ReinforcementLearning/ReinforcementLearning_aprn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from typing import List, Tuple, Optional
import numpy as np

class InventoryEnv:
    """
    Inventory management environment with volume constraints, lead times, and stochastic or deterministic demand.

    The number of products is ``len(initial_inventory)`` (three with the default
    arguments); every per-product list is indexed by product position.

    This environment simulates:
    - Warehouse inventory evolution with lead-time-based ordering.
    - Daily customer demand and fulfillment.
    - Cost computation due to holding, ordering, stockouts, and discards.

    Attributes:
        volume_capacity (float): Max warehouse volume capacity.
        initial_inventory (List[int]): Initial stock for each product.
        product_volumes (List[float]): Volume per unit of each product.
        holding_cost_per_volume (float): Cost per unit volume per day for storing inventory.
        stockout_costs (List[float]): Penalty per unit of unfulfilled demand for each product.
        ordering_costs (List[float]): Fixed cost per order placed for each product.
        discard_costs (List[float]): Cost per unit discarded due to over-capacity.
        lead_times (List[int]): Days before an order arrives for each product.
        simulation_days (int): Episode length in days.
        demand_sequences (Optional[List[List[int]]]): Predefined demand, indexed by day, for evaluation.
        demand_lambda (List[float]): Poisson mean for training demand generation.
        n_products (int): Number of products, derived from ``initial_inventory``.
    """

    def __init__(
        self,
        volume_capacity: float = 1000.0,
        initial_inventory: List[int] = [100, 100, 100],
        product_volumes: List[float] = [2.0, 3.0, 1.5],
        holding_cost_per_volume: float = 5.0,  # Updated holding cost
        stockout_costs: List[float] = [400.0, 500.0, 300.0],
        ordering_costs: List[float] = [80.0, 200.0, 120.0],
        discard_costs: List[float] = [200.0, 250.0, 150.0],  # New discard penalties
        lead_times: List[int] = [3, 2, 1],
        simulation_days: int = 50,
        demand_sequences: Optional[List[List[int]]] = None,
        demand_lambda: List[float] = [30, 25, 35],
        seed: int = 42
    ):
        self.volume_capacity = volume_capacity
        # Every per-product list is copied. The defaults in the signature are
        # single shared list objects, so storing them by reference would let a
        # mutation on one instance leak into every instance created afterwards
        # (and into the caller's own lists).
        self.initial_inventory = list(initial_inventory)
        self.product_volumes = list(product_volumes)
        self.holding_cost_per_volume = holding_cost_per_volume
        self.stockout_costs = list(stockout_costs)
        self.ordering_costs = list(ordering_costs)
        self.discard_costs = list(discard_costs)
        self.lead_times = list(lead_times)
        self.simulation_days = simulation_days
        self.demand_sequences = demand_sequences
        self.demand_lambda = list(demand_lambda)
        self.random_state = np.random.RandomState(seed)
        # Single source of truth for the product count used by reset/step/_get_state.
        self.n_products = len(self.initial_inventory)

        self.reset()

    def reset(self) -> List[int]:
        """
        Reset environment to initial state for a new episode.

        The random generator is NOT re-seeded here, so successive episodes draw
        different Poisson demand.

        Returns:
            List[int]: The initial observation state.
        """
        self.day = 0
        self.inventory = self.initial_inventory[:]  # fresh copy so the initial stock is never mutated
        # One list of (day_due, quantity) tuples per product.
        self.pending_orders = [[] for _ in range(self.n_products)]
        return self._get_state()

    def step(self, action: List[int]) -> Tuple[List[int], float, bool, dict]:
        """
        Executes one simulation step.

        Args:
            action (List[int]): List of order quantities for each product. Each value must be in {0, 10, ..., 100}.

        Returns:
            state (List[int]): Updated state after taking the action.
            reward (float): Scaled negative cost for the step (total cost / 100, negated).
            done (bool): True if the episode is over.
            info (dict): Additional information (cost breakdown, demand, fulfillment, discards).

        Raises:
            AssertionError: If any action value is outside {0, 10, ..., 100}.
        """
        assert all(a in range(0, 101, 10) for a in action), "Actions must be in {0, 10, ..., 100}" # Invalid actions are rejected

        products = range(self.n_products)

        # 1. Receive due orders and add them to current inventory
        for i in products:
            arrived = sum(qty for due, qty in self.pending_orders[i] if due == self.day)
            self.inventory[i] += arrived
            self.pending_orders[i] = [(due, qty) for due, qty in self.pending_orders[i] if due > self.day]

        # 2. Place new orders and add them to pending orders
        order_cost = 0
        for i in products:
            if action[i] > 0:
                # The ordering cost is a fixed fee per order, independent of quantity.
                order_cost += self.ordering_costs[i]
                self.pending_orders[i].append((self.day + self.lead_times[i], action[i]))

        # 3. Generate demand if not provided
        if self.demand_sequences:
            demand = self.demand_sequences[self.day]
        else:
            demand = [self.random_state.poisson(lam) for lam in self.demand_lambda]

        # 4. Enforce volume capacity and compute discards
        total_volume = sum(self.inventory[i] * self.product_volumes[i] for i in products)
        discarded = [0] * self.n_products
        if total_volume > self.volume_capacity:
            overflow = total_volume - self.volume_capacity
            # discard from highest-volume items first
            for i in sorted(products, key=lambda j: self.product_volumes[j], reverse=True):
                # Floor division: only whole units that fit inside the overflow are
                # removed, so a residual overflow smaller than one unit can remain.
                max_remove = int(overflow // self.product_volumes[i])
                remove_qty = min(max_remove, self.inventory[i])
                discarded[i] = remove_qty
                self.inventory[i] -= remove_qty
                overflow -= remove_qty * self.product_volumes[i]
                if overflow <= 0:
                    break

        # 5. Fulfill demand and compute stockouts
        fulfilled = [min(self.inventory[i], demand[i]) for i in products]
        unfulfilled = [demand[i] - fulfilled[i] for i in products]
        self.inventory = [self.inventory[i] - fulfilled[i] for i in products]

        # 6. Compute costs and reward (holding cost is charged on end-of-day stock)
        holding_cost = sum(self.inventory[i] * self.product_volumes[i] * self.holding_cost_per_volume for i in products)
        stockout_cost = sum(unfulfilled[i] * self.stockout_costs[i] for i in products)
        discard_cost = sum(discarded[i] * self.discard_costs[i] for i in products)
        total_cost = holding_cost + stockout_cost + order_cost + discard_cost
        reward = - total_cost / 100.0  # scaled for stability

        # 7. Update state
        self.day += 1
        done = self.day >= self.simulation_days # True if episode ends
        info = {
            "day": self.day,
            "inventory": self.inventory[:],
            "demand": list(demand),
            "fulfilled": fulfilled,
            "unfulfilled": unfulfilled,
            "discarded": discarded,
            "order_cost": order_cost,
            "holding_cost": holding_cost,
            "stockout_cost": stockout_cost,
            "discard_cost": discard_cost,
            "total_cost": total_cost
        }

        return self._get_state(), reward, done, info

    def _get_state(self) -> List[int]:
        """
        Constructs the state vector including inventory levels and outstanding orders.

        Returns:
            List[int]: ``inventory + outstanding order totals + [day]``, i.e.
            ``2 * n_products + 1`` values (7 with the default three products).
        """
        outstanding_orders = [sum(qty for _, qty in self.pending_orders[i]) for i in range(self.n_products)]
        return self.inventory + outstanding_orders + [self.day]



In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque


In [3]:
class QNetwork(nn.Module):
    """Fully connected network that maps a state vector to one Q-value per action.

    Architecture: two hidden linear layers of ``hidden_size`` units, each
    followed by ReLU, then a linear output layer of ``action_size`` units with
    no activation.
    """

    def __init__(self, state_size, action_size, hidden_size=128):
        super().__init__()
        # Layers are registered input-to-output; the attribute names are the
        # keys that appear in the saved state_dict.
        self.fc1 = nn.Linear(in_features=state_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.out = nn.Linear(in_features=hidden_size, out_features=action_size)

    def forward(self, x):
        """Return the raw Q-value estimates for input ``x``."""
        activations = x
        for hidden_layer in (self.fc1, self.fc2):
            activations = torch.relu(hidden_layer(activations))
        return self.out(activations)


In [4]:
class DQNAgent:
    """Deep Q-learning agent with an experience replay buffer and a target network.

    Args:
        state_size: Length of the state vector fed to the Q-network.
        action_size: Number of discrete actions (width of the Q-network output).
        gamma: Discount factor applied to bootstrapped next-state values.
        lr: Learning rate for the Adam optimizer.
        batch_size: Number of transitions sampled per ``replay`` call.
        buffer_size: Maximum number of transitions kept in the replay buffer.
        epsilon_start: Initial exploration probability.
        epsilon_end: Floor below which epsilon is no longer decayed.
        epsilon_decay: Multiplicative decay applied to epsilon after each
            training step performed by ``replay``.
    """

    def __init__(self, state_size, action_size, gamma=0.99, lr=1e-3,
                 batch_size=64, buffer_size=50000, epsilon_start=1.0,
                 epsilon_end=0.05, epsilon_decay=0.995):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.batch_size = batch_size
        self.memory = deque(maxlen=buffer_size)  # oldest transitions are evicted first
        self.epsilon = epsilon_start
        self.epsilon_min = epsilon_end
        self.epsilon_decay = epsilon_decay

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.qnetwork = QNetwork(state_size, action_size).to(self.device)
        self.target_network = QNetwork(state_size, action_size).to(self.device)
        # Both networks are randomly initialised independently; copy the online
        # weights across so bootstrap targets are consistent from the very first
        # replay step instead of coming from an unrelated random network.
        self.update_target_network()
        self.optimizer = optim.Adam(self.qnetwork.parameters(), lr=lr)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action index epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the index with the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = torch.as_tensor(
            np.asarray(state, dtype=np.float32), device=self.device
        ).unsqueeze(0)  # add the batch dimension
        with torch.no_grad():
            q_values = self.qnetwork(state)
        return torch.argmax(q_values).item()

    def replay(self):
        """Run one gradient step on a random minibatch from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        After the step, epsilon is decayed once if it is above ``epsilon_min``.
        """
        if len(self.memory) < self.batch_size:
            return
        minibatch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Go through contiguous NumPy arrays so each batch is converted in a
        # single copy with an explicit dtype.
        states = torch.as_tensor(np.asarray(states, dtype=np.float32), device=self.device)
        actions = torch.as_tensor(np.asarray(actions, dtype=np.int64), device=self.device)
        rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32), device=self.device)
        next_states = torch.as_tensor(np.asarray(next_states, dtype=np.float32), device=self.device)
        dones = torch.as_tensor(np.asarray(dones, dtype=np.float32), device=self.device)

        # Q(s, a) for the actions that were actually taken.
        q_values = self.qnetwork(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        # Targets are constants for the regression: computing them without
        # autograd avoids building a graph through the target network and
        # accumulating gradients on parameters that are never optimised.
        with torch.no_grad():
            next_q_values = self.target_network(next_states).max(1)[0]
            # (1 - dones) removes the bootstrap term for terminal transitions.
            targets = rewards + (self.gamma * next_q_values * (1 - dones))

        loss = nn.MSELoss()(q_values, targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_network.load_state_dict(self.qnetwork.state_dict())


In [5]:
# Train a DQN agent on the default InventoryEnv configuration.
env = InventoryEnv()
state = env.reset()
state_size = len(state)  # length of the vector returned by env.reset()
action_size = 11 ** 3  # since 3 products, each has 11 discrete order options

agent = DQNAgent(state_size, action_size)

episodes = 1000
target_update_freq = 10  # measured in episodes, not environment steps

for e in range(episodes):
    state = env.reset()
    total_reward = 0
    done = False

    while not done:
        action_idx = agent.act(state)

        # Decode the flat action index into one index in 0..10 per product,
        # then scale by 10 to get order quantities in {0, 10, ..., 100}.
        orders = np.unravel_index(action_idx, (11, 11, 11))
        orders = [o * 10 for o in orders]

        next_state, reward, done, _ = env.step(orders)
        # The flat index (not the decoded orders) is stored, since that is what
        # the Q-network's output is indexed by.
        agent.remember(state, action_idx, reward, next_state, done)
        agent.replay()  # one training step per environment step once the buffer is large enough
        state = next_state
        total_reward += reward

    # Sync the target network at the end of every target_update_freq-th episode
    # (including episode 0).
    if e % target_update_freq == 0:
        agent.update_target_network()

    print(f"Episode {e}, Total Reward: {total_reward}")


Episode 0, Total Reward: -15704.599999999999
Episode 1, Total Reward: -16589.274999999998
Episode 2, Total Reward: -10365.0
Episode 3, Total Reward: -15022.725
Episode 4, Total Reward: -11563.375000000002
Episode 5, Total Reward: -17194.825
Episode 6, Total Reward: -13265.900000000003
Episode 7, Total Reward: -12802.149999999998
Episode 8, Total Reward: -15920.525
Episode 9, Total Reward: -16484.05
Episode 10, Total Reward: -10413.150000000001
Episode 11, Total Reward: -19996.374999999996
Episode 12, Total Reward: -15458.85
Episode 13, Total Reward: -20300.050000000003
Episode 14, Total Reward: -16827.950000000004
Episode 15, Total Reward: -17461.475
Episode 16, Total Reward: -18817.299999999996
Episode 17, Total Reward: -15347.425
Episode 18, Total Reward: -15755.025000000007
Episode 19, Total Reward: -15549.850000000004
Episode 20, Total Reward: -19391.825
Episode 21, Total Reward: -21262.15
Episode 22, Total Reward: -12965.224999999997
Episode 23, Total Reward: -15452.899999999998
E

In [6]:
# Write the online Q-network's weights (its state_dict) to "dqn_inventory.pth".
torch.save(agent.qnetwork.state_dict(), "dqn_inventory.pth")

Uncomment the cells below and run them in a new project.

In [None]:
# # Save the trained model
# torch.save(agent.qnetwork.state_dict(), "dqn_inventory.pth")

# # Required run_policy function for leaderboard
# def run_policy(state):
#     # state: list of floats
#     state_tensor = torch.FloatTensor(state).unsqueeze(0)
#     with torch.no_grad():
#         q_values = agent.qnetwork(state_tensor)
#     action_idx = torch.argmax(q_values).item()
#     orders = np.unravel_index(action_idx, (11, 11, 11))
#     return [o * 10 for o in orders]


In [None]:
# #rl_agent.py
# #import gym
# import subprocess
# import sys
# try:
#     import torch
# except ImportError:
#     subprocess.check_call([sys.executable, "-m", "pip", "install", "torch"])
# import torch
# import torch.nn as nn
# import numpy as np
# import os

# # Get the current directory of submission.py
# CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))

# # Path to the model file
# model_path = os.path.join(CURRENT_DIR, "dqn_inventory.pth")

# # # Load model content
# # with open(model_path, 'r') as f:
# #     model_data = f.read()

# # # Optionally, process the model data
# # print("Loaded model data:", model_data)

# class QNetwork(nn.Module):
#     def __init__(self, state_size, action_size, hidden_size=128):
#         super(QNetwork, self).__init__()
#         self.fc1 = nn.Linear(state_size, hidden_size)
#         self.fc2 = nn.Linear(hidden_size, hidden_size)
#         self.out = nn.Linear(hidden_size, action_size)

#     def forward(self, x):
#         x = torch.relu(self.fc1(x))
#         x = torch.relu(self.fc2(x))
#         return self.out(x)

# class RLAgent:
#     def __init__(self):
#         pass

#     def flatten_state(self, state):
#       if isinstance(state, dict):
#           return np.concatenate([np.array(v, dtype=np.float32) for v in state.values()])
#       return np.array(state, dtype=np.float32)

#     def run_policy(self,state):
#         state = self.flatten_state(state)
#         STATE_SIZE = len(state)  # Example: adjust to your actual flattened state size
#         ACTION_SIZE = 11 ** 3  # 3 products, 11 discrete actions each

#         policy_net = QNetwork(STATE_SIZE, ACTION_SIZE)
#         policy_net.load_state_dict(torch.load(model_path))
#         policy_net.eval()
#         state_tensor = torch.FloatTensor(state).unsqueeze(0)

#         with torch.no_grad():
#             q_values = policy_net(state_tensor)
#         action_idx = torch.argmax(q_values).item()

#         # Convert flat index → orders for 3 products
#         orders = np.unravel_index(action_idx, (11, 11, 11))
#         return [o * 10 for o in orders]  # since action space is {0,10,...,100}



NameError: name '__file__' is not defined