In [7]:
import numpy as np
import json
import gym
import matplotlib.pyplot as plt
import psutil
import pynvml
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Define Policy Network
class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, output_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.softmax(self.fc3(x))
        return x

def discount_returns(rewards, gamma=0.99):
    discounted_returns = []
    running_add = 0
    for t, r in enumerate(reversed(rewards)):
        # print(t)
        running_add = running_add * gamma ** t + r
        discounted_returns.insert(0, running_add)
    return discounted_returns


class ProblemSolver:
    def __init__(self, actions, epsilon=0.1, gamma=0.99, alpha=0.1, lambd=0.9):
        self.actions = actions
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = alpha
        self.lambd = lambd
        self.policy_network = PolicyNetwork(16, actions)
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=self.alpha)

    def choose_action(self, state):
        state_tensor = torch.tensor(state, dtype=torch.float32)
        # print("State size:", state_tensor.size())  # Add this line to check state tensor size
        action_probs = self.policy_network(state_tensor)
        action = torch.multinomial(action_probs, 1).item()
        return action

    def update_policy(self, rewards, log_probs):
        returns = []
        discounted_reward = 0
        for r in reversed(rewards):
            discounted_reward = r + self.gamma * discounted_reward
            returns.insert(0, discounted_reward)

        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + 1e-9)

        policy_gradient = []
        for log_prob, R in zip(log_probs, returns):
            policy_gradient.append(-log_prob * R)

        self.optimizer.zero_grad()
        policy_gradient = torch.stack(policy_gradient).sum()
        policy_gradient.backward()
        self.optimizer.step()

class Case:
    added_states = set()

    def __init__(self, problem, solution, trust_value=1):
        self.problem = np.array(problem)
        self.solution = solution
        self.trust_value = trust_value

    @staticmethod
    def retrieve(state, case_base, threshold=0.5):
        similarities = {}
        for case in case_base:
            similarities[case] = Case.sim_q(state, case.problem)
        
        sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
        
        if sorted_similarities:
            most_similar_case = sorted_similarities[0][0] if sorted_similarities[0][1] >= threshold else None
        else:
            most_similar_case = None
        
        return most_similar_case

    @staticmethod
    def reuse(c, case_base, temporary_case_base):
        if c not in temporary_case_base:
            temporary_case_base.append(c)
        
        similar_cases = [case for case in case_base if np.array_equal(case.problem, c.problem)]
        for similar_case in similar_cases:
            if similar_case not in temporary_case_base:
                temporary_case_base.append(similar_case)

    @staticmethod
    def revise(case_base, temporary_case_base, episode_ended_successfully):
        for case in temporary_case_base:
            if episode_ended_successfully and case in case_base:
                case.trust_value += 0.1
            elif not episode_ended_successfully and case in case_base:
                case.trust_value -= 0.1
            case.trust_value = max(0, min(case.trust_value,1))

    @staticmethod
    def retain(case_base, temporary_case_base, episode_ended_successfully, threshold):
        if episode_ended_successfully:
            for case in reversed(temporary_case_base):
                state = tuple(np.atleast_1d(case.problem))
                if state not in Case.added_states:
                    case_base.append(case)
                    Case.added_states.add(state)
            
            filtered_case_base = []
            for case in case_base:
                if case.trust_value >= threshold:
                    filtered_case_base.append(case)
            return filtered_case_base
        else:
            return case_base

    @staticmethod
    def sim_q(state1, state2):
        state1 = np.atleast_1d(state1)
        state2 = np.atleast_1d(state2)
        CNDMaxDist = 6
        v = state1.size
        DistQ = np.sum([Case.Dmin_phi(Objic, Objip) for Objic, Objip in zip(state1, state2)])
        similarity = (CNDMaxDist * v - DistQ) / (CNDMaxDist * v)
        return similarity

    @staticmethod
    def Dmin_phi(X1, X2):
        return np.max(np.abs(X1 - X2))

class QCBRL:
    def __init__(self, actions, input_size, epsilon, gamma, alpha, lambd, threshold):
        self.problem_solver = ProblemSolver(actions, epsilon, gamma, alpha, lambd)
        self.case_base = []
        self.threshold = threshold
        self.temporary_case_base = []
        self.policy_network = PolicyNetwork(input_size, actions)

    def take_action(self, state):
        # Convert state to a numpy array if it's not already
        state_array = np.asarray(state)
        # Convert state to a tensor and add batch dimension if needed
        state_tensor = torch.tensor(state_array, dtype=torch.float32).unsqueeze(0)

        similar_solution = Case.retrieve(state_tensor, self.case_base)
        if similar_solution is not None:
            action = similar_solution.solution
            next_state = state
        else:
            # Flatten the state tensor
            state_tensor_flattened = state_tensor.view(1, -1)
            # print (f"state tensor: {state_tensor}")
            # print("State tensor shape:", state_tensor.shape) 
            # print(f"state tensor flattened: {state_tensor_flattened}")
            # print("State tensor flattened shape:", state_tensor_flattened.shape) 
            action_probs = self.policy_network(state_tensor_flattened)
            action = torch.multinomial(action_probs, 1).item()
            next_state = state

        # Reuse case and return the selected action and the next state
        c = Case(state, action)
        Case.reuse(c, self.case_base, self.temporary_case_base)
        return action, next_state


    def train(self, episodes, max_steps, render=False):
        env = gym.make('FrozenLake-v1')
        rewards = []
        episode_rewards = []
        memory_usage = []
        gpu_memory_usage = []
        successful_episodes = 0

        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)

        for episode in range(1, episodes + 1):
            state = env.reset()
            total_reward = 0
            self.temporary_case_base = []

            for step in range(max_steps):
                if render:
                    env.render()

                # Convert state to a numpy array if it's not already
                state = np.array(state)

                action, next_state = self.take_action(state)
                next_state, reward, done, _ = env.step(action)
                total_reward += reward

                state = next_state

                if done:
                    rewards.append(total_reward)
                    episode_rewards.append(total_reward)
                    if reward == 1.0:
                        episode_ended_successfully = True
                        successful_episodes += 1
                    else:
                        episode_ended_successfully = False

                    print(f"Episode {episode} ended after {step + 1} steps with total reward: {total_reward}")
                    break

            Case.revise(self.case_base, self.temporary_case_base, episode_ended_successfully)
            self.case_base = Case.retain(self.case_base, self.temporary_case_base, episode_ended_successfully, self.threshold)

            memory_usage.append(psutil.virtual_memory().percent)
            gpu_memory_usage.append(pynvml.nvmlDeviceGetMemoryInfo(handle).used / 1024**2)

        self.save_case_base_temporary()
        self.save_case_base()

        env.close()
        self.plot_rewards(episode_rewards)
        self.plot_resources(memory_usage, gpu_memory_usage)

        success_percentage = (successful_episodes / episodes) * 100
        print(f"Percentage of Successful Episodes: {success_percentage}%")


        

    def save_case_base_temporary(self):
        filename = "case_base_temporary.json"
        case_base_data = [{"problem": case.problem.tolist(), "solution": case.solution, "trust_value": case.trust_value} for case in self.temporary_case_base]
        with open(filename, 'w') as file:
            json.dump(case_base_data, file)

    def save_case_base(self):
        filename = "case_base.json"
        case_base_data = [{"problem": case.problem.tolist(), "solution": case.solution, "trust_value": case.trust_value} for case in self.case_base]
        with open(filename, 'w') as file:
            json.dump(case_base_data, file)

            print("Case base saved successfully.")
        
    def load_case_base(self):
        filename = "case_base.json"
        try:
            with open(filename, 'r') as file:
                case_base_data = json.load(file)
                self.case_base = [Case(np.array(case["problem"]), case["solution"], case["trust_value"]) for case in case_base_data]
                print("Case base loaded successfully.")
        except FileNotFoundError:
            print("Case base file not found. Starting with an empty case base.")

    def plot_rewards(self, rewards):
        plt.plot(rewards)
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')
        plt.title('Rewards over Episodes')
        plt.grid(True)
        plt.show() 

    def plot_resources(self, memory_usage, gpu_memory_usage):
        plt.plot(memory_usage, label='Memory (%)')
        plt.plot(gpu_memory_usage, label='GPU Memory (MB)')
        plt.xlabel('Episode')
        plt.ylabel('Resource Usage')
        plt.title('Resource Usage over Episodes')
        plt.legend()
        plt.grid(True)
        plt.show()

if __name__ == "__main__":
    actions = 4
    input_size = 16  # The state space size in FrozenLake-v1 is 16
    epsilon = 0.1  # Define the epsilon parameter
    gamma = 0.9  # Define the gamma parameter
    alpha = 0.1  # Define the alpha parameter
    lambd = 0.5  # Define the lambda parameter
    threshold = 0.5  # Define the threshold parameter
    agent = QCBRL(actions, input_size, epsilon, gamma, alpha, lambd, threshold)
    agent.train(episodes=10000, max_steps=100000)




state tensor: tensor([0.])
State tensor shape: torch.Size([1])
state tensor flattened: tensor([[0.]])
State tensor flattened shape: torch.Size([1, 1])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x1 and 16x64)