In [29]:
import numpy as np
import json
import gym
import matplotlib.pyplot as plt

class ProblemSolver:
    """Tabular SARSA(lambda)-style learner with epsilon-greedy action selection.

    Q-values and eligibility traces are kept in dictionaries keyed by state;
    each entry is a NumPy vector with one slot per action.

    Args:
        actions: Number of discrete actions (used as a vector length and as
            the upper bound for random action sampling).
        epsilon: Probability of picking a uniformly random action.
        gamma: Discount factor.
        alpha: Learning rate.
        lambd: Eligibility-trace decay factor.
    """

    def __init__(self, actions, epsilon=0.1, gamma=0.99, alpha=0.1, lambd=0.9):
        self.actions = actions
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = alpha
        self.lambd = lambd
        self.Q = {}  # state key -> vector of action values
        self.e = {}  # state key -> vector of eligibility traces

    @staticmethod
    def _state_key(sq):
        """Return a hashable dictionary key for the state ``sq``.

        NumPy arrays are not hashable, so they are flattened into a tuple of
        Python scalars; every other value is used unchanged.
        """
        if isinstance(sq, np.ndarray):
            return tuple(sq.ravel().tolist())
        return sq

    def choose_action(self, sq):
        """Pick an action for state ``sq`` using an epsilon-greedy policy.

        With probability ``epsilon``, or when the state has no Q entry yet, a
        uniformly random action is returned. Otherwise the action with the
        highest Q-value is returned, with ties broken at random so that an
        all-equal row does not permanently favour action 0.
        """
        key = self._state_key(sq)
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.actions)

        q_values = self.Q.get(key)
        if q_values is None:
            return np.random.choice(self.actions)

        best_actions = np.flatnonzero(q_values == q_values.max())
        return np.random.choice(best_actions)

    def update_Q(self, sq, action, reward, next_sq, next_action):
        """Apply one on-policy TD update with accumulating eligibility traces.

        Args:
            sq: State in which ``action`` was taken.
            action: Index of the action taken in ``sq``.
            reward: Reward observed for the transition.
            next_sq: Resulting state. A state without a Q entry contributes a
                bootstrap value of zero.
            next_action: Index of the action selected for ``next_sq``.
        """
        sq = self._state_key(sq)
        next_sq = self._state_key(next_sq)

        if sq not in self.Q:
            self.Q[sq] = np.zeros(self.actions)
            self.e[sq] = np.zeros(self.actions)

        next_q = self.Q[next_sq][next_action] if next_sq in self.Q else 0.0
        delta = reward + self.gamma * next_q - self.Q[sq][action]
        self.e[sq][action] += 1

        # Every known state is nudged in proportion to its trace, then the
        # trace decays; the arrays are updated in place.
        decay = self.gamma * self.lambd
        for state, q_values in self.Q.items():
            trace = self.e[state]
            q_values += self.alpha * delta * trace
            trace *= decay

    def solve_problem(self, sq, reward, next_sq):
        """Choose an action for ``sq`` and, if a reward is given, learn from it.

        When ``reward`` is ``None`` no update happens and ``(action, sq)`` is
        returned. Otherwise an action is also chosen for ``next_sq``, the
        Q-table is updated with the action sampled in this call, and
        ``(action, next_sq)`` is returned.
        """
        action = self.choose_action(sq)
        if reward is None:
            return action, sq

        next_action = self.choose_action(next_sq)
        self.update_Q(sq, action, reward, next_sq, next_action)
        return action, next_sq

def sim_q(sq, c):
    """Placeholder similarity score between a qualitative state and a case.

    Both arguments are currently ignored: the function draws a single value
    from ``np.random.rand()`` and returns it. Replace the body with a real
    similarity calculation.
    """
    return np.random.rand()

class QCBRL:
    """Agent that learns on FrozenLake-v1 and records every transition as a case.

    The agent delegates action selection and Q-value updates to a
    ``ProblemSolver`` and stores each observed
    ``(state, action, reward, next_state)`` tuple in the case base ``C_B``.

    Args:
        actions: Number of discrete actions, forwarded to ``ProblemSolver``.
        threshold: Stored on the instance; not read by any method of this class.
        epsilon, gamma, alpha, lambd: Forwarded to ``ProblemSolver``.
    """

    def __init__(self, actions, threshold=0.5, epsilon=0.1, gamma=0.99, alpha=0.1, lambd=0.9):
        self.problem_solver = ProblemSolver(actions, epsilon, gamma, alpha, lambd)
        self.C_B = {}  # case key -> (state, action, reward, next_state)
        self.threshold = threshold
        self.env = gym.make('FrozenLake-v1')

    def _reset_env(self):
        """Reset the environment and return only the initial observation.

        Newer gym releases return ``(observation, info)`` from ``reset()``
        while older ones return the bare observation; both are accepted.
        """
        result = self.env.reset()
        if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
            return result[0]
        return result

    def _step_env(self, action):
        """Advance the environment and return ``(next_state, reward, done)``.

        Accepts both the 4-tuple ``(obs, reward, done, info)`` and the 5-tuple
        ``(obs, reward, terminated, truncated, info)`` step results.
        """
        result = self.env.step(action)
        if len(result) == 5:
            next_state, reward, terminated, truncated, _ = result
            return next_state, reward, bool(terminated or truncated)
        next_state, reward, done, _ = result
        return next_state, reward, done

    def _normalize_case_keys(self):
        """Rewrite ``C_B`` so that every key is a tuple (bare keys get wrapped)."""
        self.C_B = {
            (key if isinstance(key, tuple) else (key,)): value
            for key, value in self.C_B.items()
        }

    def train(self, episodes, max_steps):
        """Run ``episodes`` episodes of at most ``max_steps`` steps and plot rewards.

        After every transition the case is stored and the problem solver's
        Q-table is updated with the action actually taken and the action
        selected for the next step, so the policy improves as training
        proceeds. Per-step information is printed, and a reward-per-episode
        plot is shown at the end.
        """
        total_rewards = []  # Total reward collected in each episode

        for episode in range(1, episodes + 1):
            state = self._reset_env()
            # Traces describe recently visited state-action pairs; they must
            # not carry credit over from the previous episode.
            for trace in self.problem_solver.e.values():
                trace.fill(0.0)

            total_reward = 0
            step = 0  # Keeps the summary line valid when max_steps < 1
            action, _ = self.take_action(state)

            for step in range(1, max_steps + 1):
                self.env.render()  # Render the environment
                print("State: {}".format(state))
                print("Action: {}".format(action))
                next_state, reward, done = self._step_env(action)
                print("next_state: {}".format(next_state))
                print("Reward: {}".format(reward))
                total_reward += reward

                self.reuse((state, action, reward, next_state))

                # The action chosen here is both the bootstrap target of the
                # update and the action executed on the next iteration.
                next_action, _ = self.take_action(next_state)
                self.problem_solver.update_Q(state, action, reward, next_state, next_action)

                # `done` only signals that the episode is over, not that the
                # goal was reached.
                if done:
                    self._normalize_case_keys()
                    break
                state, action = next_state, next_action

            total_rewards.append(total_reward)
            print(f"Episode {episode} finished after {step} steps with total reward: {total_reward}")

        # Plot the graph
        plt.plot(range(1, episodes + 1), total_rewards)
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')
        plt.title('Agent Performance Over Episodes')
        plt.grid(True)
        plt.show()

    def take_action(self, state):
        """Return ``(action, state)`` with the action picked by the problem solver.

        No learning happens here: the solver is called without a reward. The
        action is converted to a plain ``int`` so that stored cases hold
        built-in Python numbers.
        """
        action, _ = self.problem_solver.solve_problem(state, None, None)
        return int(action), state

    def reuse(self, c):
        """Store case ``c`` in the case base under a hashable key.

        The key is ``c`` with every NumPy array replaced by its raw bytes;
        the stored value is ``c`` itself.
        """
        hashed_c = tuple(
            item.tobytes() if isinstance(item, np.ndarray) else item
            for item in c
        )
        self.C_B[hashed_c] = c

    def convert_to_serializable(self, obj):
        """Recursively convert ``obj`` into something ``json.dump`` accepts.

        Arrays become lists, NumPy scalars become built-in Python scalars,
        tuple dictionary keys are joined with ``'_'`` into strings, and
        containers are converted element by element. Anything else is
        returned unchanged.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            # e.g. np.int64 actions, which the json module rejects
            return obj.item()
        if isinstance(obj, dict):
            converted_dict = {}
            for k, v in obj.items():
                if isinstance(k, tuple):
                    k = '_'.join(map(str, k))  # Convert tuple keys to strings
                converted_dict[self.convert_to_serializable(k)] = self.convert_to_serializable(v)
            return converted_dict
        if isinstance(obj, list):
            return [self.convert_to_serializable(item) for item in obj]
        if isinstance(obj, tuple):
            return tuple(self.convert_to_serializable(item) for item in obj)
        return obj

    def save_case_base(self, filename):
        """Write the case base to ``filename`` as JSON."""
        converted_case_base = self.convert_to_serializable(self.C_B)
        with open(filename, 'w') as file:
            json.dump(converted_case_base, file)

# Demo run: train an agent, then dump the collected cases to disk.
if __name__ == "__main__":
    # Size of the discrete action set handed to the agent.
    actions = 4
    agent = QCBRL(actions)

    # 200 episodes, each capped at 1000 environment steps.
    agent.train(episodes=200, max_steps=1000)

    # Persist the case base as JSON in the working directory.
    agent.save_case_base("case_base.json")


State: 0
Action: 0
next_state: 0
Reward: 0.0
State: 0
Action: 2
next_state: 0
Reward: 0.0
State: 0
Action: 2
next_state: 4
Reward: 0.0
State: 4
Action: 1
next_state: 8
Reward: 0.0
State: 8
Action: 2
next_state: 12
Reward: 0.0
Episode 1 finished after 5 steps with total reward: 0.0
State: 0
Action: 3
next_state: 0
Reward: 0.0
State: 0
Action: 3
next_state: 0
Reward: 0.0
State: 0
Action: 0
next_state: 0
Reward: 0.0
State: 0
Action: 1
next_state: 1
Reward: 0.0
State: 1
Action: 3
next_state: 0
Reward: 0.0
State: 0
Action: 2
next_state: 1
Reward: 0.0
State: 1
Action: 3
next_state: 0
Reward: 0.0
State: 0
Action: 1
next_state: 4
Reward: 0.0
State: 4
Action: 2
next_state: 5
Reward: 0.0
Episode 2 finished after 9 steps with total reward: 0.0
State: 0
Action: 0
next_state: 0
Reward: 0.0
State: 0
Action: 0
next_state: 0
Reward: 0.0
State: 0
Action: 2
next_state: 1
Reward: 0.0
State: 1
Action: 3
next_state: 2
Reward: 0.0
State: 2
Action: 1
next_state: 3
Reward: 0.0
State: 3
Action: 0
next_state: 2

KeyboardInterrupt: 

: 