In [24]:
import numpy as np
import json
import matplotlib.pyplot as plt
import psutil
import pynvml
from collections import Counter
from gym.envs.registration import register
from torchrl.envs import RewardSum, TransformedEnv
from torchrl.envs.libs.vmas import VmasEnv
from torchrl.envs.utils import check_env_specs
from gym import spaces

class VMASWrapper:
    """Thin adapter exposing a ``VmasEnv`` through gym-style method names.

    The wrapper builds one continuous-action environment (``num_envs=1``) on
    the CPU and re-exports its ``observation_space`` and ``action_space``.

    NOTE(review): ``step`` forwards ``action`` untouched and unpacks exactly
    four values from the result.  The traceback captured with this cell
    (``'int' object has no attribute 'batch_size'``) suggests the wrapped
    environment expects a different input type -- presumably a TensorDict;
    confirm against the TorchRL documentation.
    """

    def __init__(self, scenario_name, num_agents, max_steps):
        """Create the wrapped environment and cache its spaces.

        Args:
            scenario_name: Forwarded to ``VmasEnv`` as ``scenario``.
            num_agents: Forwarded to ``VmasEnv`` as ``n_agents``.
            max_steps: Forwarded to ``VmasEnv`` as ``max_steps``.
        """
        env_kwargs = dict(
            scenario=scenario_name,
            num_envs=1,
            continuous_actions=True,
            max_steps=max_steps,
            device="cpu",  # CPU only for now; change here to use another device
            n_agents=num_agents,
        )
        self.env = VmasEnv(**env_kwargs)

        wrapped = self.env
        self.observation_space = wrapped.observation_space
        self.action_space = wrapped.action_space

        # Debugging output: show the spaces reported by the environment.
        print("Action space:", self.action_space)
        print("Observation space:", self.observation_space)

    def step(self, action):
        """Advance the wrapped environment and return its 4-tuple result."""
        observation, step_reward, finished, extra = self.env.step(action)
        return (observation, step_reward, finished, extra)

    def reset(self):
        """Reset the wrapped environment and return whatever it returns."""
        initial = self.env.reset()
        return initial

    def render(self):
        """Ask the wrapped environment to render; the result is discarded."""
        self.env.render()

    def close(self):
        """Close the wrapped environment."""
        self.env.close()


class ProblemSolver:
    """Tabular value learner: a Q-table plus same-shaped eligibility traces.

    ``state`` arguments are used directly as row indices into ``Q_values``
    and ``action`` arguments as column indices.
    """

    def __init__(self, num_actions, env, lambda_=0.9):
        """Allocate zero-filled tables sized from ``env.observation_space``.

        Args:
            num_actions: Number of columns in the Q-table; also the upper
                bound used when sampling a random action.
            env: Object whose ``observation_space`` is iterable over spaces.
            lambda_: Trace-decay factor multiplied with ``gamma`` in
                ``update_Q``.
        """
        self.num_actions = num_actions
        self.lambda_ = lambda_

        # Each Box contributes the product of its shape; any other space
        # contributes its ``n`` attribute.
        row_count = 0
        for space in env.observation_space:
            if isinstance(space, spaces.Box):
                row_count = row_count + np.prod(space.shape)
            else:
                row_count = row_count + space.n

        self.Q_values = np.zeros((row_count, num_actions))
        self.eligibility_traces = np.zeros_like(self.Q_values)

    def choose_action(self, state, epsilon):
        """Pick an action epsilon-greedily for ``state``.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the argmax of the Q-table row for ``state``.
        """
        exploring = np.random.uniform(0, 1) < epsilon
        if not exploring:
            return np.argmax(self.Q_values[state])  # greedy choice
        return np.random.choice(self.num_actions)  # random exploration

    def update_Q(self, state, action, reward, next_state, next_action, alpha, gamma):
        """Apply one trace-weighted temporal-difference update in place.

        Args:
            state, action: Indices of the transition's start entry.
            reward: Reward observed for the transition.
            next_state, next_action: Indices of the bootstrapping entry.
            alpha: Learning rate.
            gamma: Discount factor.
        """
        q_table = self.Q_values
        traces = self.eligibility_traces

        # TD error: bootstrapped target minus the current estimate.
        target = reward + gamma * q_table[next_state, next_action]
        td_error = target - q_table[state, action]

        # Decay every trace, then bump the entry that was just visited.
        decay = gamma * self.lambda_
        traces *= decay
        traces[state, action] += 1

        # Every entry moves in proportion to its trace.
        q_table += alpha * td_error * traces

class Case:
    """A stored (problem, solution) pair with a trust score.

    The static methods implement the retrieve / reuse / revise / retain steps
    used by the agent's case base.
    """

    # States already added to a case base.  This is a class attribute, so it
    # is shared by every user of ``Case`` in the process.
    added_states = set()

    def __init__(self, problem, solution, trust_value=1):
        """Store the case.

        Args:
            problem: State description; converted to a numpy array.
            solution: Action associated with the problem.
            trust_value: Initial trust score (``revise`` keeps it in [0, 1]).
        """
        self.problem = np.array(problem)
        self.solution = solution
        self.trust_value = trust_value

    @staticmethod
    def sim_q(state1, state2):
        """Return the similarity of two states.

        The summed per-object distance is subtracted from the maximum total
        distance ``CNDMaxDist * v`` and normalised by that maximum, so
        identical states score 1.
        """
        state1 = np.atleast_1d(state1)  # accept scalars as 1-element states
        state2 = np.atleast_1d(state2)
        CNDMaxDist = 6  # Maximum distance between two nodes in the CND
        v = state1.size  # Total number of objects the agent can perceive
        max_total = CNDMaxDist * v
        DistQ = np.sum([Case.Dmin_phi(Objic, Objip) for Objic, Objip in zip(state1, state2)])
        return (max_total - DistQ) / max_total

    @staticmethod
    def Dmin_phi(X1, X2):
        """Return the largest absolute element-wise difference of X1 and X2."""
        return np.max(np.abs(X1 - X2))

    @staticmethod
    def retrieve(state, case_base, threshold=0.2):
        """Return the first case whose similarity to ``state`` meets ``threshold``.

        Cases are checked in ``case_base`` order; ``None`` is returned when no
        case qualifies (including an empty case base).
        """
        return next(
            (case for case in case_base if Case.sim_q(state, case.problem) >= threshold),
            None,
        )

    @staticmethod
    def reuse(c, temporary_case_base):
        """Append case ``c`` to the episode's temporary case base."""
        temporary_case_base.append(c)

    @staticmethod
    def revise(case_base, temporary_case_base, successful_episodes):
        """Adjust trust of the episode's cases and clamp it to [0, 1].

        Cases that are also in ``case_base`` gain 0.1 trust after a successful
        episode and lose 0.1 otherwise.  Every case in ``temporary_case_base``
        is clamped, whether or not it was adjusted.
        """
        delta = 0.1 if successful_episodes else -0.1
        for case in temporary_case_base:
            if case in case_base:
                case.trust_value += delta
            case.trust_value = max(0, min(case.trust_value, 1))

    @staticmethod
    def retain(case_base, temporary_case_base, successful_episodes, threshold=0):
        """Merge the episode's new cases into ``case_base`` after a success.

        After an unsuccessful episode ``case_base`` is returned untouched.
        Otherwise cases whose state has not been added before are appended
        (newest first), and a new list containing only the cases with
        ``trust_value >= threshold`` is returned.
        """
        if not successful_episodes:
            return case_base

        for case in reversed(temporary_case_base):
            # Flatten before building the key: for a multi-dimensional problem
            # ``tuple(array)`` would contain ndarrays, which are unhashable and
            # made the set lookup raise TypeError.  1-D and scalar problems
            # produce the same key as before.
            state = tuple(np.ravel(case.problem))
            if state not in Case.added_states:
                case_base.append(case)
                Case.added_states.add(state)

        return [case for case in case_base if case.trust_value >= threshold]

class QCBRL:
    """Agent combining a tabular ``ProblemSolver`` with a case base.

    Holds the environment, the learner, the persistent ``case_base`` and the
    per-episode ``temporary_case_base``, and provides training, persistence
    and plotting helpers.
    """

    def __init__(self, num_actions, env):
        """Create the agent with empty case bases.

        Args:
            num_actions: Number of discrete action indices to sample from.
            env: Environment offering ``reset``, ``step`` and ``render``.
        """
        self.num_actions = num_actions
        self.env = env
        self.problem_solver = ProblemSolver(num_actions, env)
        self.case_base = []
        self.temporary_case_base = []

    def run(self, episodes=100, max_steps=100, alpha=0.1, gamma=0.9, epsilon=0.1, render=False):
        """Train for ``episodes`` episodes and save the case bases to disk.

        Args:
            episodes: Number of episodes to run.
            max_steps: Maximum number of steps per episode.
            alpha: Learning rate passed to the problem solver.
            gamma: Discount factor passed to the problem solver.
            epsilon: Exploration rate passed to ``take_action``.
            render: When true, render the environment before every step.

        Returns:
            ``(rewards, success_rate, memory_usage, gpu_memory_usage)``:
            per-episode total rewards, percentage of episodes with a positive
            total reward (0.0 when no episode was run), per-episode system
            memory percentage, and per-episode GPU memory in MB (NaN when GPU
            monitoring is unavailable).
        """
        rewards = []
        memory_usage = []
        gpu_memory_usage = []
        num_successful_episodes = 0

        gpu_handle = self._open_gpu_handle()
        try:
            for episode in range(episodes):
                # Reset per episode: an episode that ends by exhausting
                # max_steps must not inherit the previous episode's verdict.
                successful_episodes = False
                state = self.env.reset()
                episode_reward = 0
                self.temporary_case_base = []

                for _ in range(max_steps):
                    if render:
                        self.env.render()
                    action = self.take_action(state, epsilon)
                    next_state, reward, done, _ = self.env.step(action)

                    Case.reuse(Case(state, action), self.temporary_case_base)

                    next_action = self.take_action(next_state, epsilon)
                    self.problem_solver.update_Q(
                        state, action, reward, next_state, next_action, alpha, gamma
                    )

                    state = next_state
                    episode_reward += reward

                    if done:
                        # Success is judged on the terminal step's reward.
                        successful_episodes = reward > 0
                        break

                if episode_reward > 0:
                    num_successful_episodes += 1

                rewards.append(episode_reward)
                print(f"Episode {episode + 1}, Total Reward: {episode_reward}")

                Case.revise(self.case_base, self.temporary_case_base, successful_episodes)
                self.case_base = Case.retain(
                    self.case_base, self.temporary_case_base, successful_episodes
                )

                memory_usage.append(psutil.virtual_memory().percent)
                gpu_memory_usage.append(self._gpu_memory_mb(gpu_handle))
        finally:
            # Release NVML only if it was initialised successfully.
            if gpu_handle is not None:
                pynvml.nvmlShutdown()

        self.save_case_base_temporary()
        self.save_case_base()

        # Guard against episodes == 0, which used to raise ZeroDivisionError.
        success_rate = (num_successful_episodes / episodes) * 100 if episodes > 0 else 0.0

        return rewards, success_rate, memory_usage, gpu_memory_usage

    @staticmethod
    def _open_gpu_handle():
        """Initialise NVML and return a handle for GPU 0, or None if unavailable."""
        try:
            pynvml.nvmlInit()
        except pynvml.NVMLError as err:
            print(f"GPU monitoring disabled: {err}")
            return None
        try:
            return pynvml.nvmlDeviceGetHandleByIndex(0)
        except pynvml.NVMLError as err:
            pynvml.nvmlShutdown()
            print(f"GPU monitoring disabled: {err}")
            return None

    @staticmethod
    def _gpu_memory_mb(gpu_handle):
        """Return used GPU memory in MB for ``gpu_handle``, or NaN without one."""
        if gpu_handle is None:
            return float("nan")
        return pynvml.nvmlDeviceGetMemoryInfo(gpu_handle).used / 1024**2

    def take_action(self, state, epsilon):
        """Return a uniformly random action index.

        ``state`` and ``epsilon`` are currently unused.
        TODO: re-enable case retrieval (``Case.retrieve``) with a fallback to
        ``ProblemSolver.choose_action``; that logic was disabled in the
        original code.
        """
        return np.random.choice(self.num_actions)

    @staticmethod
    def _dump_cases(cases, filename):
        """Serialise ``cases`` to ``filename`` as a JSON list of dicts."""
        case_base_data = [
            {
                "problem": case.problem.tolist()
                if isinstance(case.problem, np.ndarray)
                else int(case.problem),
                "solution": int(case.solution),
                # float, not int: trust lives in [0, 1] and int() truncated
                # every fractional value to 0 on save.
                "trust_value": float(case.trust_value),
            }
            for case in cases
        ]
        with open(filename, "w", encoding="utf-8") as file:
            json.dump(case_base_data, file)

    def save_case_base_temporary(self):
        """Write the temporary case base to ``case_base_temporary.json``."""
        self._dump_cases(self.temporary_case_base, "case_base_temporary.json")
        print("Temporary case base saved successfully.")

    def save_case_base(self):
        """Write the persistent case base to ``case_base.json``."""
        self._dump_cases(self.case_base, "case_base.json")
        print("Case base saved successfully.")

    def load_case_base(self):
        """Load ``case_base.json`` into ``self.case_base``.

        A missing file is reported and leaves the current case base unchanged.
        """
        filename = "case_base.json"
        try:
            with open(filename, "r", encoding="utf-8") as file:
                case_base_data = json.load(file)
        except FileNotFoundError:
            print("Case base file not found. Starting with an empty case base.")
            return
        self.case_base = [
            Case(np.array(case["problem"]), case["solution"], case["trust_value"])
            for case in case_base_data
        ]
        print("Case base loaded successfully.")

    def display_success_rate(self, success_rate):
        """Print the success rate as a percentage."""
        print(f"Success rate: {success_rate}%")

    def plot_rewards(self, rewards):
        """Plot total reward per episode."""
        plt.plot(rewards)
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')
        plt.title('Rewards over Episodes')
        plt.grid(True)
        plt.show()

    def plot_resources(self, memory_usage, gpu_memory_usage):
        """Plot system memory (%) and GPU memory (MB) per episode."""
        plt.plot(memory_usage, label='Memory (%)')
        plt.plot(gpu_memory_usage, label='GPU Memory (MB)')
        plt.xlabel('Episode')
        plt.ylabel('Resource Usage')
        plt.title('Resource Usage over Episodes')
        plt.legend()
        plt.grid(True)
        plt.show()

if __name__ == "__main__":
    # Three-agent "navigation" scenario, capped at 100 environment steps.
    env = VMASWrapper(scenario_name="navigation", num_agents=3, max_steps=100)

    # Sum of the first shape entry of every per-agent action space.
    num_actions = sum(agent_space.shape[0] for agent_space in env.action_space)
    agent = QCBRL(num_actions, env)

    # Training hyper-parameters, gathered in one place.
    training_kwargs = dict(episodes=1000, max_steps=1000, alpha=0.1, gamma=0.9, epsilon=0.1)
    rewards, success_rate, memory_usage, gpu_memory_usage = agent.run(**training_kwargs)

    # Report the outcome and the resource usage recorded during training.
    agent.display_success_rate(success_rate)
    agent.plot_rewards(rewards)
    agent.plot_resources(memory_usage, gpu_memory_usage)


Action space: Tuple(Box(-1.0, 1.0, (2,), float32), Box(-1.0, 1.0, (2,), float32), Box(-1.0, 1.0, (2,), float32))
Observation space: Tuple(Box(-inf, inf, (18,), float32), Box(-inf, inf, (18,), float32), Box(-inf, inf, (18,), float32))


AttributeError: 'int' object has no attribute 'batch_size'