<a href="https://colab.research.google.com/github/aymuos/masters-practise-repo/blob/main/TERM3/ReinforcementLearning/cleanup_rlproj0408.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Starting point


Cleaned | D3QN

---------------------------------------------------------------

In [4]:
import random
import torch
import numpy as np

class SumTree:
    """
    A SumTree data structure for efficient storage and sampling of priorities.
    Used internally by PrioritizedReplayBuffer.
    """
    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)
        self.data = np.zeros(capacity, dtype=object)
        self.data_pointer = 0

    def add(self, priority, data):
        """Add data and update the tree."""
        tree_index = self.data_pointer + self.capacity - 1
        self.data[self.data_pointer] = data
        self.update(tree_index, priority)
        self.data_pointer += 1
        if self.data_pointer >= self.capacity:
            self.data_pointer = 0

    def update(self, tree_index, priority):
        """Update the priority of a data point."""
        change = priority - self.tree[tree_index]
        self.tree[tree_index] = priority
        while tree_index != 0:
            tree_index = (tree_index - 1) // 2
            self.tree[tree_index] += change

    def get_leaf(self, v):
        """Retrieve a data point and its priority from the tree."""
        parent_index = 0
        while True:
            left_child_index = 2 * parent_index + 1
            right_child_index = left_child_index + 1
            if left_child_index >= len(self.tree):
                leaf_index = parent_index
                break
            else:
                if v <= self.tree[left_child_index]:
                    parent_index = left_child_index
                else:
                    v -= self.tree[left_child_index]
                    parent_index = right_child_index
        data_index = leaf_index - self.capacity + 1
        return leaf_index, self.tree[leaf_index], self.data[data_index]

    @property
    def total_priority(self):
        """Get the sum of all priorities."""
        return self.tree[0]

class PrioritizedReplayBuffer:
    """
    Prioritized Experience Replay Buffer for DQN.
    """
    def __init__(self, buffer_size, alpha=0.6, seed=42):
        self.sum_tree = SumTree(buffer_size)
        self.alpha = alpha
        self.buffer_size = buffer_size
        self.experience = [None] * buffer_size  # Use a list to store experiences
        self.current_size = 0 # To track the actual number of experiences stored
        self.random_state = np.random.RandomState(seed)
        self.max_priority = 1.0 # Initial max priority for new experiences

    def add(self, state, action, reward, next_state, done):
        """Adds an experience to the buffer with maximum priority."""
        experience = (state, action, reward, next_state, done)
        self.sum_tree.add(self.max_priority, self.current_size) # Store index in SumTree
        self.experience[self.current_size] = experience # Store experience in list
        self.current_size = (self.current_size + 1) % self.buffer_size

    def sample(self, batch_size, beta=0.4):
        """Samples a batch of experiences based on priorities."""
        minibatch = []
        indices = []
        weights = []
        total_priority = self.sum_tree.total_priority
        segment = total_priority / batch_size

        for i in range(batch_size):
            a = segment * i
            b = segment * (i + 1)
            v = self.random_state.uniform(a, b)
            tree_index, priority, data_index = self.sum_tree.get_leaf(v) # data_index is the index in the experience list

            # Calculate importance sampling weight
            sampling_probability = priority / total_priority
            weight = (self.buffer_size * sampling_probability) ** -beta
            weights.append(weight)
            indices.append(tree_index) # Store the tree index for priority updates
            minibatch.append(self.experience[data_index]) # Retrieve experience from list

        # Normalize weights
        max_weight = max(weights) if weights else 1.0
        weights = [w / max_weight for w in weights]

        states, actions, rewards, next_states, dones = zip(*minibatch)

        return (torch.FloatTensor(states),
                torch.LongTensor(actions),
                torch.FloatTensor(rewards),
                torch.FloatTensor(next_states),
                torch.FloatTensor(dones),
                torch.FloatTensor(weights),
                indices)

    def update_priorities(self, tree_indices, td_errors):
        """Updates the priorities of sampled experiences based on TD errors."""
        for tree_index, td_error in zip(tree_indices, td_errors):
            priority = abs(td_error) ** self.alpha
            self.sum_tree.update(tree_index, priority)
            self.max_priority = max(self.max_priority, priority) # Update max priority

ModuleNotFoundError: No module named 'torch'

In [20]:
class DQNAgent:
    def __init__(self, state_size, action_size, gamma=0.99, lr=5e-4,
                 batch_size=128, buffer_size=100000, epsilon_start=1.0,
                 epsilon_end=0.01, epsilon_decay=0.99, alpha=0.6, beta_start=0.4, beta_frames=1000):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.batch_size = batch_size
        self.memory = PrioritizedReplayBuffer(buffer_size, alpha)
        self.epsilon = epsilon_start
        self.epsilon_min = epsilon_end
        self.epsilon_decay = epsilon_decay

        self.beta = beta_start
        self.beta_increment_per_frame = (1.0 - beta_start) / beta_frames

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.qnetwork = DuelingQNetwork(state_size, action_size).to(self.device)
        self.target_network = DuelingQNetwork(state_size, action_size).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork.parameters(), lr=lr)

    def remember(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        self.qnetwork.eval()
        with torch.no_grad():
            q_values = self.qnetwork(state)
        self.qnetwork.train()
        return torch.argmax(q_values).item()

    def replay(self):
        if self.memory.current_size < self.batch_size: # Check if enough experiences are in the buffer
             return

        states, actions, rewards, next_states, dones, weights, indices = self.memory.sample(self.batch_size, self.beta)

        # Move tensors to the correct device
        states = states.to(self.device)
        actions = actions.to(self.device)
        rewards = rewards.to(self.device)
        next_states = next_states.to(self.device)
        dones = dones.to(self.device)
        weights = weights.to(self.device)

        q_values = self.qnetwork(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        next_actions = self.qnetwork(next_states).max(1)[1].unsqueeze(1)
        next_q_values = self.target_network(next_states).gather(1, next_actions).squeeze(1)

        targets = rewards + (self.gamma * next_q_values * (1 - dones))

        td_errors = targets - q_values.detach()
        # Apply .detach() before converting to numpy
        self.memory.update_priorities(indices, td_errors.abs().cpu().detach().numpy())

        loss = (nn.MSELoss(reduction='none')(q_values, targets) * weights).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        self.beta = min(1.0, self.beta + self.beta_increment_per_frame)

    def update_target_network(self):
        self.target_network.load_state_dict(self.qnetwork.state_dict())

# Re-initialize agent with new class definition
agent = DQNAgent(state_size, action_size)

episodes = 500
target_update_freq = 10

for e in range(episodes):
    state = env.reset()
    total_reward = 0
    done = False

    while not done:
        action_idx = agent.act(state)

        # Convert flat index to 3 product orders
        orders = np.unravel_index(action_idx, (11, 11, 11))
        orders = [o * 10 for o in orders]

        next_state, reward, done, _ = env.step(orders)
        agent.remember(state, action_idx, reward, next_state, done)
        agent.replay()
        state = next_state
        total_reward += reward

    if e % target_update_freq == 0:
        agent.update_target_network()

    # print(f"Episode {e}, Total Reward: {total_reward}") # Optionally keep print for monitoring

In [None]:

# training loop
episodes = 500
target_update_freq = 10

for e in range(episodes):
    state = env.reset()
    total_reward = 0
    done = False

    while not done:
        action_idx = agent.act(state)

        # Convert flat index to 3 product orders
        orders = np.unravel_index(action_idx, (11, 11, 11))
        orders = [o * 10 for o in orders]

        next_state, reward, done, _ = env.step(orders)
        agent.remember(state, action_idx, reward, next_state, done)
        agent.replay()
        state = next_state
        total_reward += reward

    if e % target_update_freq == 0:
        agent.update_target_network()

    print(f"Episode {e}, Total Reward: {total_reward}")

Episode 0, Total Reward: -2010.2499999999998
Episode 1, Total Reward: -1883.9749999999997
Episode 2, Total Reward: -1820.5999999999997
Episode 3, Total Reward: -2115.7
Episode 4, Total Reward: -1740.9750000000001
Episode 5, Total Reward: -1954.4249999999995
Episode 6, Total Reward: -1760.9750000000001
Episode 7, Total Reward: -1869.4000000000003
Episode 8, Total Reward: -1615.1500000000003
Episode 9, Total Reward: -1614.6500000000008
Episode 10, Total Reward: -1679.4750000000001
Episode 11, Total Reward: -1678.6750000000002
Episode 12, Total Reward: -1892.4499999999996
Episode 13, Total Reward: -1254.7
Episode 14, Total Reward: -1905.3250000000005
Episode 15, Total Reward: -2059.7999999999997
Episode 16, Total Reward: -1553.7
Episode 17, Total Reward: -1757.0749999999998
Episode 18, Total Reward: -2047.4999999999998
Episode 19, Total Reward: -1462.8499999999995
Episode 20, Total Reward: -1514.55
Episode 21, Total Reward: -1516.8749999999998
Episode 22, Total Reward: -2377.1
Episode 23,

In [25]:
import random
import torch
import numpy as np

class SumTree:
    """
    A SumTree data structure for efficient storage and sampling of priorities.
    Used internally by PrioritizedReplayBuffer.
    """
    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)
        self.data = np.zeros(capacity, dtype=object)
        self.data_pointer = 0

    def add(self, priority, data):
        """Add data and update the tree."""
        tree_index = self.data_pointer + self.capacity - 1
        self.data[self.data_pointer] = data
        self.update(tree_index, priority)
        self.data_pointer += 1
        if self.data_pointer >= self.capacity:
            self.data_pointer = 0

    def update(self, tree_index, priority):
        """Update the priority of a data point."""
        change = priority - self.tree[tree_index]
        self.tree[tree_index] = priority
        while tree_index != 0:
            tree_index = (tree_index - 1) // 2
            self.tree[tree_index] += change

    def get_leaf(self, v):
        """Retrieve a data point and its priority from the tree."""
        parent_index = 0
        while True:
            left_child_index = 2 * parent_index + 1
            right_child_index = left_child_index + 1
            if left_child_index >= len(self.tree):
                leaf_index = parent_index
                break
            else:
                if v <= self.tree[left_child_index]:
                    parent_index = left_child_index
                else:
                    v -= self.tree[left_child_index]
                    parent_index = right_child_index
        data_index = leaf_index - self.capacity + 1
        return leaf_index, self.tree[leaf_index], self.data[data_index]

    @property
    def total_priority(self):
        """Get the sum of all priorities."""
        return self.tree[0]

class PrioritizedReplayBuffer:
    """
    Prioritized Experience Replay Buffer for DQN.
    """
    def __init__(self, buffer_size, alpha=0.6, seed=42):
        self.sum_tree = SumTree(buffer_size)
        self.alpha = alpha
        self.buffer_size = buffer_size
        self.experience = [None] * buffer_size  # Use a list to store experiences
        self.current_size = 0 # To track the actual number of experiences stored
        self.random_state = np.random.RandomState(seed)
        self.max_priority = 1.0 # Initial max priority for new experiences

    def add(self, state, action, reward, next_state, done):
        """Adds an experience to the buffer with maximum priority."""
        experience = (state, action, reward, next_state, done)
        self.sum_tree.add(self.max_priority, self.current_size) # Store index in SumTree
        self.experience[self.current_size] = experience # Store experience in list
        self.current_size = (self.current_size + 1) % self.buffer_size

    def sample(self, batch_size, beta=0.4):
        """Samples a batch of experiences based on priorities."""
        minibatch = []
        indices = []
        weights = []
        total_priority = self.sum_tree.total_priority
        segment = total_priority / batch_size

        for i in range(batch_size):
            a = segment * i
            b = segment * (i + 1)
            v = self.random_state.uniform(a, b)
            tree_index, priority, data_index = self.sum_tree.get_leaf(v) # data_index is the index in the experience list

            # Calculate importance sampling weight
            sampling_probability = priority / total_priority
            weight = (self.buffer_size * sampling_probability) ** -beta
            weights.append(weight)
            indices.append(tree_index) # Store the tree index for priority updates
            minibatch.append(self.experience[data_index]) # Retrieve experience from list

        # Normalize weights
        max_weight = max(weights) if weights else 1.0
        weights = [w / max_weight for w in weights]

        states, actions, rewards, next_states, dones = zip(*minibatch)

        return (torch.FloatTensor(states),
                torch.LongTensor(actions),
                torch.FloatTensor(rewards),
                torch.FloatTensor(next_states),
                torch.FloatTensor(dones),
                torch.FloatTensor(weights),
                indices)

    def update_priorities(self, tree_indices, td_errors):
        """Updates the priorities of sampled experiences based on TD errors."""
        for tree_index, td_error in zip(tree_indices, td_errors):
            priority = abs(td_error) ** self.alpha
            self.sum_tree.update(tree_index, priority)
            self.max_priority = max(self.max_priority, priority) # Update max priority

In [21]:
# # Calculate and display the average reward per episode
# episode_rewards = []

# for e in range(episodes):
#     state = env.reset()
#     total_reward = 0
#     done = False

#     while not done:
#         action_idx = agent.act(state)
#         orders = np.unravel_index(action_idx, (11, 11, 11))
#         orders = [o * 10 for o in orders]
#         next_state, reward, done, _ = env.step(orders)
#         state = next_state
#         total_reward += reward

#     episode_rewards.append(total_reward)

# average_reward = np.mean(episode_rewards)
# print(f"Average Reward over {episodes} episodes: {average_reward:.2f}")

Average Reward over 500 episodes: -1397.03


In [None]:

model_path = 'dueling_dqn_policy_net.pth'
torch.save(agent.qnetwork.state_dict(), model_path)

print(f"Policy network saved to {model_path}")

Policy network saved to dueling_dqn_policy_net.pth


----------------------------------------------------


In [None]:
# rl agent

#rl_agent.py
#import gym
import subprocess
import sys
try:
    import torch
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "torch"])
import torch
import torch.nn as nn
import numpy as np
import os



# Get the current directory of submission.py
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))

# Path to the model file
model_path = os.path.join(CURRENT_DIR, "dueling_dqn_policy_net.pth")


class DuelingQNetwork(nn.Module):
    def __init__(self, state_size, action_size, hidden_size=128):
        super(DuelingQNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)

        self.value_stream = nn.Linear(hidden_size, 1)
        self.advantage_stream = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))

        value = self.value_stream(x)
        advantage = self.advantage_stream(x)

        q_values = value + (advantage - advantage.mean(dim=1, keepdim=True))
        return q_values


class RLAgent:
    def __init__(self):
        pass

    def flatten_state(self, state):

      # idea is to convert the environment state into a flat numpy array that the neural network can process

      if isinstance(state, dict):
          return np.concatenate([np.array(v, dtype=np.float32) for v in state.values()])
      return np.array(state, dtype=np.float32)

    def run_policy(self,state):
        ''' policy execution function '''
        state = self.flatten_state(state)
        STATE_SIZE = len(state)
        ACTION_SIZE = 11 ** 3  # 3 products, 11 discrete actions each

        # Load the DuelingQNetwork model
        policy_net = DuelingQNetwork(STATE_SIZE, ACTION_SIZE)
        policy_net.load_state_dict(torch.load(model_path,map_location='cpu'))

        # loading a pretrained DQN model


        policy_net.eval()
        state_tensor = torch.FloatTensor(state).unsqueeze(0)

        with torch.no_grad():
            q_values = policy_net(state_tensor)
        action_idx = torch.argmax(q_values).item()

        # Convert flat index → orders for 3 products
        orders = np.unravel_index(action_idx, (11, 11, 11))
        return [o * 10 for o in orders]  # since action space is {0,10,...,100}

NameError: name '__file__' is not defined