In [167]:
import time
import random
import numpy as np

In [168]:
# from environments.two_round_rock_paper_scissors import RockPaperScissors
# from environments.line_world import LineWorld
# from environments.grid_world import GridWorld

In [169]:
import numpy as np

class GridWorld:
    def __init__(self):
        self.size = (6, 6)
        self.goal_state = 8
        self.rewards = {8: 10, 3: -5}
        self.actions = ['up', 'down', 'left', 'right']
        self.current_state = 0
        self.action_space = [0, 1, 2, 3]

    def get_next_state(self, state, action):
        row, col = divmod(state, self.size[1])

        if action == 'up':
            row = max(row - 1, 0)
        elif action == 'down':
            row = min(row + 1, self.size[0] - 1)
        elif action == 'left':
            col = max(col - 1, 0)
        elif action == 'right':
            col = min(col + 1, self.size[1] - 1)

        next_state = row * self.size[1] + col
        return next_state

    def get_reward(self, state):
        return self.rewards.get(state, -1)

    def reset(self):
        self.current_state = 0
        return self.current_state

    def is_game_over(self):
        return self.current_state == self.goal_state

    def step(self, action):
        next_state = self.get_next_state(self.current_state, action)
        reward = self.get_reward(next_state)
        done = self.is_game_over()
        self.current_state = next_state
        return next_state, reward, done, {}

    def select_action(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.environment.num_actions())
        else:
            return np.argmax(self.q_values[state])
    
    def num_states(self):
        return self.size[0] * self.size[1]

    def num_actions(self):
        return len(self.actions)

    def available_actions(self):
        return self.actions

    def state_id(self):
        return self.current_state

    def reward(self, state):
        return self.rewards.get(state, -1)

    def get_state_value(self, state):
        state_values = [
            [0, 1, 2, 3, 4, 5],
            [6, 7, 8, 9, 10, 11],
            [12, 13, 14, 15, 16, 17],
            [18, 19, 20, 21, 22, 23],
            [24, 25, 26, 27, 28, 29],
            [30, 31, 32, 33, 34, 35]
        ]
        return state_values[state // self.size[1]][state % self.size[1]]


In [170]:
class DynaQAgent:
    def __init__(self, environment, gamma=0.9, alpha=0.1, epsilon=0.1, n=10):
        self.environment = environment
        self.gamma = gamma
        self.alpha = alpha
        self.epsilon = epsilon
        self.n = n
        self.q_values = np.zeros((environment.num_states(), environment.num_actions()))
        self.model = {}
        self.initialize_model()

    def initialize_model(self):
        for state in range(self.environment.num_states()):
            for action in range(self.environment.num_actions()):
                self.model[(state, action)] = (0, state)

    def select_action(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.environment.action_space)
        else:
            return np.argmax(self.q_values[state])

    def update_model(self, state, action, reward, next_state):
        self.model[(state, action)] = (reward, next_state)

    def planning_step(self):
        for _ in range(self.n):
            state, action = random.choice(list(self.model.keys()))
            reward, next_state = self.model[(state, action)]
            self.q_values[state][action] += self.alpha * (reward + self.gamma * np.max(self.q_values[next_state]) - self.q_values[state][action])

    def dyna_q(self, episodes=100):
        for _ in range(episodes):
            state = self.environment.reset()
            done = False
            while not done:
                print("Iteration: ", _)  # Debugging
                action = self.select_action(state)
                next_state, reward, done, _ = self.environment.step(action)
                if next_state >= self.q_values.shape[0]:
                    next_state = self.q_values.shape[0] - 1
                self.q_values[state][action] += self.alpha * (reward + self.gamma * np.max(self.q_values[next_state]) - self.q_values[state][action])
                self.update_model(state, action, reward, next_state)
                self.planning_step()
                state = next_state


    def find_best_path_for_goal(self, start_state):
        state = start_state
        path = [state]
        action_map = {0: 'up', 1: 'down', 2: 'left', 3: 'right'}
        while True:
            action_idx = np.argmax(self.q_values[state])
            action = action_map[action_idx]
            next_state, reward, done, _ = self.environment.step(action)
            path.append(next_state)
            if done:
                break
            state = next_state
        return path

In [171]:
start_time = time.time()
print("Dyna Q - GridWorld")

environment = GridWorld()
agent = DynaQAgent(environment, gamma=0.9, alpha=0.1, epsilon=0.1, n=10)

agent.dyna_q(episodes=100)

print("\nQ-Values:")
print(agent.q_values)

print("\nPolicy:")
for state in range(environment.num_states()):
    print(f"State {state}: {agent.select_action(state)}")

print("\nScore:")
print(environment.score())
print("\nExecution Time: ", time.time() - start_time)

Dyna Q - GridWorld
Iteration:  0
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iteration:  {}
Iterati

KeyboardInterrupt: 

In [None]:
from RL_project.secret_envs_wrapper import SecretEnv0

print("Policy Iteration - Secret Env 0")

environment = SecretEnv0()
agent = DynaQAgent(environment)
agent.policy_iteration()

print("State Values:")
print(agent.state_values.reshape((3, 3)))

print("\nPolicy:")
action_symbols = ['↑', '↓', '←', '→', 'N/A']
for row in range(3):
    for col in range(3):
        state = row * 3 + col
        if state == 8:
            print(" G ", end=" ")
        else:
            action = agent.policy[state]
            print(f" {action_symbols[action]} ", end=" ")
    print()

start_state = 0
best_path = agent.find_best_path_for_goal(start_state)
print("\nBest Path from state 0 to goal:")
print(best_path)

In [None]:
param = Parameters((3, 3), 8, {8: 10, 3: -5}, 0.9)
environment = Environment(param.size, param.goal_state, param.rewards)
agent = DynaQAgent(environment, param.gamma)

agent.dyna_q()

print("State Values:")
print(agent.q_values.max(axis=1).reshape(param.size))
print("\nPolicy:")
for row in range(param.size[0]):
    for col in range(param.size[1]):
        state = row * param.size[1] + col
        if state == param.goal_state:
            print(" G ", end=" ")
        else:
            action_idx = np.argmax(agent.q_values[state])
            print(environment.actions[action_idx], end=" ")
    print()

start_state = 0
best_path = agent.find_best_path_for_goal(start_state)
print("\nBest Path from state 0 to goal:")
print(best_path)