# QLearningAgent

In [1]:
import random
from MazeGameEnv import MazeGameEnv

class QLearningAgent():
    def __init__(self, env, learning_rate=0.01, discount_factor=0.999, exploration_rate=1.0, exploration_decay=0.999, min_exploration_rate=0.01):
        self.env = env
        self.lr = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.min_exploration_rate = min_exploration_rate
        self.positions = self.create_positions()
        self.q = self.create_q_table(self.positions)

    def create_positions(self):
        positions = []
        for i in range(len(self.env.board)):
            for j in range(len(self.env.board[0])):
                positions.append((i, j))
        return positions
    
    def create_q_table(self, positions):
        q = []
        for i in range(len(positions)):
            row = []
            for j in range(len(self.env.actions)):
                row.append(0)
            q.append(row)
        return q

    def act(self, observation):
        valid_actions = [action for action in range(len(self.env.actions)) if self.env.is_valid_move(observation, action)]
        if random.uniform(0, 1) < self.exploration_rate:
            #print("Exploration")
            return random.choice(valid_actions)
        else:
            #print("Exploitation")
            posible_actions = self.q[self.obs_to_position(observation)]
            posible_valid_actions = []

            for i in range(len(posible_actions)):
                if i in valid_actions:
                    posible_valid_actions.append((posible_actions[i], i))

            action_t = max(posible_valid_actions)
            action = action_t[1]
            if action is None:
                return None
            return action
        
    def obs_to_position(self, observation):
        position = None
        for i in range(len(self.positions)):
            if self.positions[i] == observation:
                position = i
                break
        return position

    def learn(self, observation, action, reward, next_observation, done):
        if done:
            return
        
        best_next_action = self.predict(observation)
        if best_next_action is None:
            print("No best action")
            return
        td_target = reward + self.discount_factor*self.q[self.obs_to_position(next_observation)][best_next_action]
        td_error = td_target - self.q[self.obs_to_position(observation)][action]
        self.q[self.obs_to_position(observation)][action] += self.lr*td_error

        self.exploration_rate = max(self.min_exploration_rate, self.exploration_rate*self.exploration_decay)
        

    def predict(self, observation):
        valid_actions = [action for action in range(len(self.env.actions)) if self.env.is_valid_move(observation, action)]
        posible_actions = self.q[self.obs_to_position(observation)]
        posible_valid_actions = []
        for i in range(len(posible_actions)):
            if i in valid_actions:
                posible_valid_actions.append((posible_actions[i], i))

        action_t = max(posible_valid_actions)
        action = action_t[1]
        if action is None:
            return None
        return action
    

In [2]:
def evaluate_agent(agent, env, num_episodes=100):
    rewards = []
    for episode in range(num_episodes):
        env.reset()
        observation = env.get_pos(env.player)
        done = False
        total_reward = 0
        
        steps = 0
        while not done:
            action = agent.act(observation)
            next_observation, reward, done = env.step(action)
            agent.learn(observation, action, reward, next_observation, done)
            observation = next_observation
            total_reward += reward
            steps += 1
            if steps > 1000:
                break
        print(f"End episode {episode} with {total_reward} points")
        rewards.append(total_reward)
    return rewards

learning_rate = 0.1
discount_factor = 0.9
exploration_rate = 0.1
num_episodes = 1000
env = MazeGameEnv()
agent = QLearningAgent(env, learning_rate, discount_factor, exploration_rate)
average_reward = evaluate_agent(agent, env, num_episodes)
print(f"Average reward over {num_episodes} episodes: {average_reward}")

End episode 0 with 114 points
End episode 1 with 116 points
End episode 2 with 116 points
End episode 3 with 116 points
End episode 4 with 131 points
End episode 5 with 116 points
End episode 6 with 116 points
End episode 7 with 116 points
End episode 8 with 116 points
End episode 9 with 112 points
End episode 10 with 116 points
End episode 11 with 116 points
End episode 12 with 116 points
End episode 13 with 114 points
End episode 14 with 118 points
End episode 15 with 116 points
End episode 16 with 68 points
End episode 17 with 80 points
End episode 18 with 116 points
End episode 19 with 118 points
End episode 20 with 116 points
End episode 21 with 118 points
End episode 22 with 116 points
End episode 23 with 118 points
End episode 24 with 114 points
End episode 25 with 118 points
End episode 26 with 116 points
End episode 27 with 118 points
End episode 28 with 118 points
End episode 29 with 116 points
End episode 30 with 116 points
End episode 31 with 118 points
End episode 32 with 

In [3]:
def try_agent(agent, env):
    env.reset()
    observation = env.get_pos(env.player)
    done = False
    total_reward = 0
    
    steps = 0
    while not done:
        action = agent.predict(observation)
        next_observation, reward, done = env.step(action)
        observation = next_observation
        total_reward += reward
        steps += 1
        if steps > 2000:
            print("finish")
            break
    return total_reward


total_reward = try_agent(agent, env)
env.render()