In [22]:
import numpy as np
import json
import gym
import matplotlib.pyplot as plt
import psutil
import pynvml
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


class DQN(nn.Module):
    def __init__(self, input_size, output_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


class ProblemSolver:
    def __init__(self, actions, input_size, epsilon=0.1, gamma=0.99, alpha=0.1, lambd=0.9):
        self.actions = actions
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = alpha
        self.lambd = lambd

        self.policy_net = DQN(input_size, actions)
        self.target_net = DQN(input_size, actions)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=alpha)
        self.loss_fn = nn.MSELoss()

    def choose_action(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.actions)
        else:
            with torch.no_grad():
                state = torch.FloatTensor(state).view(1, -1)
                q_values = self.policy_net(state)
                return q_values.argmax().item()

    def update_Q(self, state, action, reward, next_state, next_action):
        state = torch.FloatTensor(state).view(1, -1)
        next_state = torch.FloatTensor(next_state).view(1, -1)
        action = torch.LongTensor([[action]])
        reward = torch.FloatTensor([reward])
        done = torch.BoolTensor([False])

        # Q-value of the action taken
        q_value = self.policy_net(state).gather(1, action)

        # Q-value of the best action in the next state according to the policy network
        next_q_value = self.target_net(next_state).max(1)[0].detach()
        target = reward + self.gamma * next_q_value

        # Update Q-value of the action taken
        loss = self.loss_fn(q_value, target.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.soft_update_target_network()

    def soft_update_target_network(self):
        for target_param, policy_param in zip(self.target_net.parameters(), self.policy_net.parameters()):
            target_param.data.copy_(self.lambd * target_param.data + (1.0 - self.lambd) * policy_param.data)


class Case:
    added_states = set()  # Class attribute to store states already added to the case base

    def __init__(self, problem, solution, trust_value=1):
        self.problem = np.array(problem)  # Convert problem to numpy array
        self.solution = solution
        self.trust_value = trust_value

    @staticmethod
    def retrieve(state, case_base, threshold=0.5):
        similarities = {}
        for case in case_base:
            similarities[case] = Case.sim_q(state, case.problem)  # Compare state with the problem part of the case

        sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

        if sorted_similarities:
            most_similar_case = sorted_similarities[0][0] if sorted_similarities[0][1] >= threshold else None
        else:
            most_similar_case = None

        return most_similar_case

    @staticmethod
    def reuse(c, case_base, temporary_case_base):
        # Store the new case from the problem solver
        if c not in temporary_case_base:
            temporary_case_base.append(c)

        # Check if there are similar cases in case_base
        similar_cases = [case for case in case_base if np.array_equal(case.problem, c.problem)]
        for similar_case in similar_cases:
            if similar_case not in temporary_case_base:
                temporary_case_base.append(similar_case)

    @staticmethod
    def revise(case_base, temporary_case_base, episode_ended_successfully):
        for case in temporary_case_base:
            if episode_ended_successfully and case in case_base:
                case.trust_value += 0.1  # Increment trust value if the episode ended successfully and the case is in the case base
            elif not episode_ended_successfully and case in case_base:
                case.trust_value -= 0.1  # Decrement trust value if the episode ended unsuccessfully and the case is in the case base
            case.trust_value = max(0, min(case.trust_value, 1))  # Ensure trust value is within[0,1]

    @staticmethod
    def retain(case_base, temporary_case_base, episode_ended_successfully, threshold):
        if episode_ended_successfully:
            # Iterate through the temporary case base to find the last occurrence of each unique state
            for case in reversed(temporary_case_base):
                state = tuple(np.atleast_1d(case.problem))
                # Check if the state is already in the case base or has been added previously
                if state not in Case.added_states:
                    # Add the case to the case base if the state is new
                    case_base.append(case)
                    Case.added_states.add(state)

            # Filter case_base based on trust_value
            filtered_case_base = []
            for case in case_base:
                if case.trust_value >= threshold:
                    filtered_case_base.append(case)

            return filtered_case_base
        else:
            return case_base  # Return original case_base if episode is not successful

    @staticmethod
    def sim_q(state1, state2):
        state1 = np.atleast_1d(state1)  # Ensure state1 is at least 1-dimensional
        state2 = np.atleast_1d(state2)  # Ensure state2 is at least 1-dimensional
        CNDMaxDist = 6  # Maximum distance between two nodes in the CND
        v = state1.size  # Total number of objects the agent can perceive
        DistQ = np.sum([Case.Dmin_phi(Objic, Objip) for Objic, Objip in zip(state1, state2)])
        similarity = (CNDMaxDist * v - DistQ) / (CNDMaxDist * v)
        return similarity

    @staticmethod
    def Dmin_phi(X1, X2):
        return np.max(np.abs(X1 - X2))


class QCBRL:
    def __init__(self, actions, input_size, threshold=0.2, epsilon=0.1, gamma=0.99, alpha=0.1, lambd=0.9):
        self.problem_solver = ProblemSolver(actions, input_size, epsilon, gamma, alpha, lambd)
        self.case_base = []
        self.threshold = threshold
        self.temporary_case_base = []
        # self.load_case_base()  # Load case base at initialization

    def train(self, episodes, max_steps, render=False):
        env = gym.make('FrozenLake-v1')
        rewards = []
        episode_rewards = []
        memory_usage = []
        gpu_memory_usage = []
        successful_episodes = 0  # Initialize counter for successful episodes

        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # GPU Index

        for episode in range(1, episodes + 1):
            state = env.reset()
            total_reward = 0
            self.temporary_case_base = []

            for step in range(max_steps):
                if render:
                    env.render()
                action, next_state = self.take_action(state)
                next_state, reward, done, _ = env.step(action)
                next_state = np.array(next_state)
                total_reward += reward

                state = next_state

                if done:
                    rewards.append(total_reward)
                    episode_rewards.append(total_reward)
                    if reward == 1.0:
                        episode_ended_successfully = True
                        successful_episodes += 1  # Increment successful episode counter
                    else:
                        episode_ended_successfully = False

                    print(f"Episode {episode} ended after {step + 1} steps with total reward: {total_reward}")
                    break

            Case.revise(self.case_base, self.temporary_case_base, episode_ended_successfully)
            self.case_base = Case.retain(self.case_base, self.temporary_case_base, episode_ended_successfully,
                                          self.threshold)

            memory_usage.append(psutil.virtual_memory().percent)
            gpu_memory_usage.append(pynvml.nvmlDeviceGetMemoryInfo(handle).used / 1024 ** 2)

        self.save_case_base_temporary()
        self.save_case_base()

        env.close()
        self.plot_rewards(episode_rewards)
        self.plot_resources(memory_usage, gpu_memory_usage)

        # Calculate and display percentage of successful episodes
        success_percentage = (successful_episodes / episodes) * 100
        print(f"Percentage of Successful Episodes: {success_percentage}%")

    def take_action(self, state):
        similar_solution = Case.retrieve(state, self.case_base)
        if similar_solution is not None:
            action = similar_solution.solution
            next_state = state  # Assuming the next state remains the same
        else:
            state_tensor = torch.FloatTensor(state)  # Convert to tensor
            action = self.problem_solver.choose_action(state_tensor)
            next_state = state
        
        c = Case(state, action)
        Case.reuse(c, self.case_base, self.temporary_case_base)

        return action, next_state




    def save_case_base_temporary(self):
        filename = "case_base_temporary.json"
        case_base_data = [{"problem": case.problem.tolist(), "solution": case.solution, "trust_value": case.trust_value}
                          for case in self.temporary_case_base]
        with open(filename, 'w') as file:
            json.dump(case_base_data, file)

    def save_case_base(self):
        filename = "case_base.json"
        case_base_data = [{"problem": case.problem.tolist(), "solution": case.solution, "trust_value": case.trust_value}
                          for case in self.case_base]
        with open(filename, 'w') as file:
            json.dump(case_base_data, file)

        print("Case base saved successfully.")  # Add this line to check if the case base is being saved

    def load_case_base(self):
        filename = "case_base.json"
        try:
            with open(filename, 'r') as file:
                case_base_data = json.load(file)
                self.case_base = [Case(np.array(case["problem"]), case["solution"], case["trust_value"]) for case in
                                  case_base_data]
                print("Case base loaded successfully.")  # Add this line to check if the case base is being loaded
        except FileNotFoundError:
            print("Case base file not found. Starting with an empty case base.")

    def plot_rewards(self, rewards):
        plt.plot(rewards)
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')
        plt.title('Rewards over Episodes')
        plt.grid(True)
        plt.show()

    def plot_resources(self, memory_usage, gpu_memory_usage):
        plt.plot(memory_usage, label='Memory (%)')
        plt.plot(gpu_memory_usage, label='GPU Memory (MB)')
        plt.xlabel('Episode')
        plt.ylabel('Resource Usage')
        plt.title('Resource Usage over Episodes')
        plt.legend()
        plt.grid(True)
        plt.show()


if __name__ == "__main__":
    env = gym.make('FrozenLake-v1')
    if isinstance(env.observation_space, gym.spaces.Box):
        input_dim = env.observation_space.shape[0]
    else:
        input_dim = env.observation_space.n
    output_dim = env.action_space.n
    agent = QCBRL(output_dim, input_dim)
    agent.train(episodes=10000, max_steps=100000)




RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x0 and 16x128)