In [9]:
import numpy as np
import json
import gym
import matplotlib.pyplot as plt
import psutil
import pynvml
from collections import Counter
from gym.envs.registration import register
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd

# Register the wrapper with a new environment ID
register(
    id='CustomRewardFrozenLake-v1',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name': '4x4', 'is_slippery': True},
    max_episode_steps=100,
    reward_threshold=1,  # Adjust the reward threshold if needed
)

class CustomRewardFrozenLake(gym.Env):
    def __init__(self):
        self.env = gym.make("CustomRewardFrozenLake-v1")
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space

    def step(self, action):
        state, reward, done, info = self.env.step(action)
        if reward == 0 and not done:
            reward = 0
        elif reward == 0 and done:
            reward = -5
        elif reward == 1:
            reward = 1
        return state, reward, done, info

    def reset(self):
        return self.env.reset()

    def render(self):
        self.env.render()

    def close(self):
        self.env.close()

import torch
import torch.nn as nn
import torch.optim as optim

class ActorCriticNetwork(nn.Module):
    def __init__(self, input_size, output_size):
        super(ActorCriticNetwork, self).__init__()
        self.fc1_actor = nn.Linear(input_size, 128)  
        self.fc2_actor = nn.Linear(128, output_size)
        
        self.fc1_critic = nn.Linear(input_size, 128)  
        self.fc2_critic = nn.Linear(128, 1)

    def forward_actor(self, x):
        x = F.relu(self.fc1_actor(x))
        x = self.fc2_actor(x)
        return F.softmax(x, dim=1)
    
    def forward_critic(self, x):
        x = F.relu(self.fc1_critic(x))
        x = self.fc2_critic(x)
        return x


class ProblemSolver:
    def __init__(self, num_states, num_actions, env):
        self.num_actions = num_actions
        self.num_states = num_states
        input_size = 1  # Input size for the state
        self.actor_critic_network = ActorCriticNetwork(input_size, num_actions)
        self.optimizer_actor = optim.Adam(self.actor_critic_network.parameters(), lr=0.01)
        self.optimizer_critic = optim.Adam(self.actor_critic_network.parameters(), lr=0.01)

    def choose_action(self, state):
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)  
        action_probs = self.actor_critic_network.forward_actor(state_tensor)
        action = torch.multinomial(action_probs, 1).item()
        return action

    def update_policy(self, states, actions, rewards, next_states, dones, gamma):
        log_probs = []
        values = []
        advantages = []
        returns = []

        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            action_probs = self.actor_critic_network.forward_actor(state_tensor)
            value = self.actor_critic_network.forward_critic(state_tensor)
            values.append(value)
            log_probs.append(torch.log(action_probs.squeeze(0)[action]))
            
            if done:
                returns.append(reward)
            else:
                next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
                next_value = self.actor_critic_network.forward_critic(next_state_tensor)
                returns.append(reward + gamma * next_value)

        values = torch.cat(values)
        returns = torch.tensor(returns, dtype=torch.float32)
        advantages = returns - values

        actor_loss = []
        critic_loss = []

        for log_prob, advantage, value, ret in zip(log_probs, advantages, values, returns):
            actor_loss.append(-log_prob * advantage)
            critic_loss.append(F.smooth_l1_loss(value, ret))
        
        self.optimizer_actor.zero_grad()
        actor_loss = torch.stack(actor_loss).sum()
        actor_loss.backward(retain_graph=True)
        self.optimizer_actor.step()

        self.optimizer_critic.zero_grad()
        critic_loss = torch.stack(critic_loss).sum()
        critic_loss.backward()
        self.optimizer_critic.step()


class Case:
    added_states = set()  

    def __init__(self, problem, solution, trust_value=1):
        self.problem = np.array(problem)  
        self.solution = solution
        self.trust_value = trust_value
    
    @staticmethod
    def sim_q(state1, state2):
        state1 = np.atleast_1d(state1)  
        state2 = np.atleast_1d(state2)  
        CNDMaxDist = 6  
        v = state1.size  
        DistQ = np.sum([Case.Dmin_phi(Objic, Objip) for Objic, Objip in zip(state1, state2)])
        similarity = (CNDMaxDist * v - DistQ) / (CNDMaxDist * v)
        return similarity

    @staticmethod
    def Dmin_phi(X1, X2):
        return np.max(np.abs(X1 - X2))
    

    @staticmethod
    def retrieve(state, case_base, threshold=0.2):
        similarities = {}
        for case in case_base:
            similarities[case] = Case.sim_q(state, case.problem)  
        
        sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
        
        if sorted_similarities:
            most_similar_case = sorted_similarities[0][0] if sorted_similarities[0][1] >= threshold else None
        else:
            most_similar_case = None
        
        return most_similar_case

    @staticmethod
    def reuse(c, temporary_case_base):
        temporary_case_base.append(c)

    @staticmethod
    def revise(case_base, temporary_case_base, successful_episodes):
        for case in temporary_case_base:
            if successful_episodes and case in case_base:
                case.trust_value += 0.1  
            elif not successful_episodes and case in case_base:
                case.trust_value -= 0.1  
            case.trust_value = max(0, min(case.trust_value,1))

    @staticmethod
    def retain(case_base, temporary_case_base, successful_episodes, threshold=0):
        if successful_episodes:
            for case in reversed(temporary_case_base):
                state = tuple(np.atleast_1d(case.problem))
                if state not in Case.added_states:
                    case_base.append(case)
                    Case.added_states.add(state)
            
            filtered_case_base = []
            for case in case_base:
                if case.trust_value >= threshold:
                    filtered_case_base.append(case)
                else:
                    pass

            return filtered_case_base
        else:
            return case_base  

class QCBRL:
    def __init__(self, num_states, num_actions, env):
        self.num_actions = num_actions
        self.num_states = num_states
        self.env = env
        self.problem_solver = ProblemSolver(num_states, num_actions, env)
        self.case_base = []
        self.temporary_case_base = []

    def run(self, episodes=100, max_steps=100, alpha=0.1, gamma=0.9, epsilon=0.1, render=False):
        rewards = []
        memory_usage = []
        gpu_memory_usage = []
        successful_episodes = False
        num_successful_episodes = 0

        for episode in range(episodes):
            state = self.env.reset()
            state = np.array([state], dtype=np.float32)  
            episode_reward = 0
            states = []
            actions = []
            rewards_episode = []
            next_states = []
            dones = []

            for _ in range(max_steps):
                if render:
                    self.env.render()
                
                action = self.take_action(state, epsilon)
                next_state, reward, done, _ = self.env.step(action)

                states.append(state)
                actions.append(action)
                rewards_episode.append(reward)
                next_states.append(next_state)
                dones.append(done)

                state = np.array([next_state], dtype=np.float32)  
                episode_reward += reward

                if done:
                    successful_episodes = reward > 0
                    break
                
            if successful_episodes:
                num_successful_episodes += 1

            rewards.append(episode_reward)
            self.problem_solver.update_policy(states, actions, rewards_episode, next_states, dones, gamma)
        
        success_rate = (num_successful_episodes / episodes) * 100
        return rewards, success_rate, memory_usage, gpu_memory_usage

    def take_action(self, state, epsilon):
        similar_solution = Case.retrieve(state, self.case_base)
        if similar_solution is not None:
            action = similar_solution.solution
        else:
            action = self.problem_solver.choose_action(state)
        return action
    
    def save_case_base_temporary(self):
        filename = "case_base_temporary.json"
        case_base_data = [{"problem": case.problem.tolist() if isinstance(case.problem, np.ndarray) else int(case.problem), 
                        "solution": int(case.solution), 
                        "trust_value": int(case.trust_value)} for case in self.temporary_case_base]
        with open(filename, 'w') as file:
            json.dump(case_base_data, file)
        print("Temporary case base saved successfully.")

    def save_case_base(self):
        filename = "case_base.json"
        case_base_data = [{"problem": case.problem.tolist() if isinstance(case.problem, np.ndarray) else int(case.problem), 
                        "solution": int(case.solution), 
                        "trust_value": int(case.trust_value)} for case in self.case_base]
        with open(filename, 'w') as file:
            json.dump(case_base_data, file)
            print("Case base saved successfully.")
        
    def load_case_base(self):
        filename = "case_base.json"
        try:
            with open(filename, 'r') as file:
                case_base_data = json.load(file)
                self.case_base = [Case(np.array(case["problem"]), case["solution"], case["trust_value"]) for case in case_base_data]
                print("Case base loaded successfully.")
        except FileNotFoundError:
            print("Case base file not found. Starting with an empty case base.")

    def display_success_rate(self, success_rate):
        print(f"Success rate: {success_rate}%")

    def plot_rewards(self, rewards):
        plt.plot(rewards)
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')
        plt.title('Rewards over Episodes')
        plt.grid(True)
        plt.show() 

    def plot_resources(self, memory_usage, gpu_memory_usage):
        plt.plot(memory_usage, label='Memory (%)')
        plt.plot(gpu_memory_usage, label='GPU Memory (MB)')
        plt.xlabel('Episode')
        plt.ylabel('Resource Usage')
        plt.title('Resource Usage over Episodes')
        plt.legend()
        plt.grid(True)
        plt.show()

if __name__ == "__main__":
    env = CustomRewardFrozenLake()
    
    num_states = env.observation_space
    print(f"num_states: {num_states}")
    num_actions = env.action_space.n

    agent = QCBRL(num_states, num_actions, env)
    rewards, success_rate, memory_usage, gpu_memory_usage = agent.run(episodes=1000, max_steps=1000, alpha=0.1, gamma=0.9, epsilon=0.1)

    agent.display_success_rate(success_rate)
    agent.plot_rewards(rewards)


num_states: Discrete(16)


  logger.warn(f"Overriding environment {id}")


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x0 and 1x128)