In [33]:
import numpy as np
import matplotlib.pyplot as plt

class BaseEnvironment:
    def __init__(self) -> None:
        self.state = 0
        self.terminated = False

    def reset(self):
        self.state = 0
        self.terminated = True

    def new_ep(self):
        self.terminated = False

    def get_state(self):
        return self.state

    def take_step(self, action):
        raise NotImplementedError("Subclasses must implement this method.")

class NormalEnvironment(BaseEnvironment):
    def get_reward(self, action):
        if self.state == 0:
            reward = 0
        elif self.state == 1:
            rewards = [4, 0]
            reward = rewards[action]
        elif self.state == 2:
            rewards = [2, 3]
            reward = rewards[action]
        return reward

    def take_step(self, action):
        self.terminated = False
        reward = self.get_reward(action)
        if self.state == 0:
                if action < 2:
                    # there is no further state
                    self.reset()
                else:
                    #random transition to left or right if C is chosen
                    self.state = np.random.randint(1,3)
        else:
                # there is no further state
            self.reset()
        return reward

class MazeEnvironment(BaseEnvironment):
    def get_reward(self, action):
        if self.state == 0:
            reward = 0
        elif self.state == 1:
            rewards = [4,0]
            reward = rewards[action]
        elif self.state == 2:
            rewards = [2,3]
            reward = rewards[action]
        return reward

    def take_step(self,  action):
        self.terminated = False
        reward = self.get_reward(action)
        if self.state == 0:
            self.state = self.state + 1 + action
        else:
            # there is no further state
            self.reset()
        return reward

import numpy as np

class BaseAgent:
    def __init__(self, env, epsilon_a=0.075, policy_type="random", epsilon_c=0.2, decay=0, beta=1) -> None:
        self.vs = np.zeros(3)
        #self.policy = [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]
        self.policy_type = policy_type
        self.env = env
        self.epsilon_a = epsilon_a
        self.epsilon_c = epsilon_c
        self.decay = decay
        self.beta = beta

    def reset(self):
        # reset at the start of a new epoch
        #self.vs = np.zeros(3)
        #self.policy = [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]
        raise NotImplementedError("Subclasses must implement this method.")

    def critic(self):
        return self.vs

    def actor(self):
        raise NotImplementedError("Subclasses must implement this method.")

    def update_policy(self, state, reward, new_state, action):
        raise NotImplementedError("Subclasses must implement this method.")

    def update_weights(self, reward, state, new_state):
        raise NotImplementedError("Subclasses must implement this method.")

    def softmax(self, x):
        for i in range(len(x)):
            x[i] = self.beta * x[i]
        return np.exp(x) / np.exp(x).sum()

    def delta_func(self, a, b):
        return a == b

class ChoiceAgent(BaseAgent):
    def __init__(self, env, epsilon_a=0.075, policy_type="random", epsilon_c=0.2, decay=0, beta=1) -> None:
        super().__init__(env, epsilon_a, policy_type, epsilon_c, decay, beta)
        self.ms = [[0, 0, 0], [0, 0], [0, 0]]
        self.policy = [[1/3, 1/3, 1/3], [0.5, 0.5], [0.5, 0.5]]

    def reset(self):
        # Reset at the start of a new epoch
        self.values = np.zeros(3)
        self.memory = [[0, 0, 0], [0, 0], [0, 0]]
        self.policy = [[1/3, 1/3, 1/3], [0.5, 0.5], [0.5, 0.5]]

    def actor(self):
        state = self.env.get_state()
        if state == 0:
            action = np.random.choice(np.arange(3), p=self.policy[state])
        else:
            action = np.random.choice(np.arange(2), p=self.policy[state][0:2])  # Adjusted here
        return action

    def update_policy(self, state, reward, new_state, action):
        if action < 2:
            delta = reward - self.vs[state]
        else:
            delta = reward + self.vs[new_state] - self.vs[state]
        for a in range(len(self.ms[state])):
            self.ms[state][a] = (1 - self.decay) * self.ms[state][a] + self.epsilon_a * self.delta_func(action, a) * delta
            self.policy[state] = self.softmax(self.ms[state])
        return self.policy

    def update_weights(self, reward, state, new_state):
        if new_state != 0:
            delta = reward + self.vs[new_state] - self.vs[state]
        else:
            delta = reward - self.vs[state]

        self.vs[state] += self.epsilon_c * delta

        return self.vs

class MazeAgent(BaseAgent):
    def __init__(self, env, epsilon_a=0.075, policy_type="random", epsilon_c=0.2, decay=0, beta=1) -> None:
        super().__init__(env, epsilon_a, policy_type, epsilon_c, decay, beta)
        self.ms = [[0, 0], [0, 0], [0, 0]]
        self.policy = [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]

    def actor(self):
        state = self.env.get_state()
        action = np.random.choice(np.arange(2), p=self.policy[state])
        return action

    def update_policy(self, state, reward, new_state, action):
        if state == 0:
            delta = reward + self.vs[new_state] - self.vs[state]
        else:
            delta = reward - self.vs[state]
        for a in range(len(self.ms[state])):
            self.ms[state][a] = (1 - self.decay) * self.ms[state][a] + self.epsilon_a * self.delta_func(action, a) * delta
            self.policy[state] = self.softmax(self.ms[state])
        return self.policy

    def update_weights(self, reward, state, new_state):
        if state == 0:
            delta = reward + self.vs[new_state] - self.vs[state]
        else:
            delta = reward - self.vs[state]

        self.vs[state] += self.epsilon_c * delta

        return self.vs


# policy evaluation for "maze" environment
episodes = 1000
epochs = 1000
epsilon = 0.1

env_normal = NormalEnvironment()
agent_normal = ChoiceAgent(env=env_normal, epsilon_a=epsilon, policy_type="actor_critic")
v_hist = np.zeros((epochs, episodes, 3))
policy_hist = np.zeros((epochs, episodes, 7))

for epoch in range(epochs):
    agent_normal.reset()

    for episode in range(episodes):
        for step in range(2):
            if env_normal.terminated == False:
                state = env_normal.state
                v = agent_normal.vs
                if state == 0:
                    action = np.random.choice(np.arange(3), p=agent_normal.policy[state])
                else:
                    action = np.random.choice(np.arange(2), p=agent_normal.policy[state])
                reward = env_normal.take_step(action)
                new_state = env_normal.state
                policy = agent_normal.update_policy(state, reward, new_state, action)
                if new_state != 0:
                    delta = reward + agent_normal.values[new_state] - agent_normal.values[state]
                else:
                    delta = reward - agent_normal.values[state]

                agent_normal.values[state] += agent_normal.epsilon_c * delta

        v_hist[epoch,episode] = v
        policy_hist[epoch, episode] = np.concatenate((policy[0], policy[1], policy[2]))
        env_normal.new_ep()

# plot policy evaluation for "normal" environment
means_per_epoch_1_pe_normal = np.mean(v_hist[:, :, 0], axis=0)
plt.plot(range(epochs), v_hist[0, :, 0])
plt.plot(range(epochs), means_per_epoch_1_pe_normal)
plt.xlabel("Episode")
plt.ylabel("v(u=1) - Normal")
plt.show()

means_per_epoch_2_pe_normal = np.mean(v_hist[:, :, 1], axis=0)
plt.plot(range(epochs), v_hist[0, :, 1])
plt.plot(range(epochs), means_per_epoch_2_pe_normal)
plt.xlabel("Episode")
plt.ylabel("v(u=2) - Normal")
plt.show()

means_per_epoch_3_pe_normal = np.mean(v_hist[:, :, 2], axis=0)
plt.plot(range(epochs), v_hist[0, :, 2])
plt.plot(range(epochs), means_per_epoch_3_pe_normal)
plt.xlabel("Episode")
plt.ylabel("v(u=3) - Normal")
plt.show()

# policy evaluation for "maze" environment
episodes = 1000
epochs = 1000
epsilon = 0.1

env_maze = MazeEnvironment()
agent_maze = MazeAgent(env=env_maze, epsilon_a=epsilon, policy_type="actor_critic")
v_over_epochs_pe_maze = np.zeros((epochs, episodes, 3))
for epoch in range(epochs):
    agent_maze.reset()

    for episode in range(episodes):
        for step in range(2):
            if env_maze.terminated == False:
                state = env_maze.get_state()
                vs = agent_maze.critic()
                action = agent_maze.actor()
                reward = env_maze.take_step(action)
                new_state = env_maze.get_state()
                agent_maze.update_weights(reward, state, new_state)

        v_over_epochs_pe_maze[epoch, episode] = vs
        env_maze.new_ep()

# plot policy evaluation for "maze" environment
means_per_epoch_1_pe_maze = np.mean(v_over_epochs_pe_maze[:, :, 0], axis=0)
plt.plot(range(epochs), v_over_epochs_pe_maze[0, :, 0])
plt.plot(range(epochs), means_per_epoch_1_pe_maze)
plt.xlabel("Episode")
plt.ylabel("v(u=1) - Maze")
plt.show()

means_per_epoch_2_pe_maze = np.mean(v_over_epochs_pe_maze[:, :, 1], axis=0)
plt.plot(range(epochs), v_over_epochs_pe_maze[0, :, 1])
plt.plot(range(epochs), means_per_epoch_2_pe_maze)
plt.xlabel("Episode")
plt.ylabel("v(u=2) - Maze")
plt.show()

means_per_epoch_3_pe_maze = np.mean(v_over_epochs_pe_maze[:, :, 2], axis=0)
plt.plot(range(epochs), v_over_epochs_pe_maze[0, :, 2])
plt.plot(range(epochs), means_per_epoch_3_pe_maze)
plt.xlabel("Episode")
plt.ylabel("v(u=3) - Maze")
plt.show()


  return np.exp(x) / np.exp(x).sum()
  return np.exp(x) / np.exp(x).sum()


ValueError: probabilities contain NaN