In [11]:
import os
import sys
from utils import get_settings_dir

In [12]:
# Must run before the import below. Presumably get_settings_dir() makes the
# project's `settings` module importable (e.g. by extending sys.path) —
# TODO confirm against utils.get_settings_dir.
get_settings_dir()
# Grid dimensions, used further down to build the game-field arrays.
from settings import ROWS, COLS

In [13]:
# Setup code for Q-learning agent

# -------------------- Setup Callbacks (callbacks.py) --------------------

# Simulating the Q-learning agent callbacks and training environment
import os
import numpy as np
import pickle
from callbacks import QLearningModel, state_to_features

# Module-level model instance.
# NOTE(review): no visible cell reads `model` afterwards (the training class
# and the test cell each construct their own QLearningModel) — confirm it is
# still needed before relying on or removing it.
model = QLearningModel()

In [18]:
# -------------------- Setup Training (train.py) --------------------
from typing import List, Tuple
from objective import Objective
import numpy as np

class AgentTraining:
    """Tabular Q-learning trainer driven by per-step and end-of-round callbacks.

    The Q-table is a dict mapping ``tuple(features)`` to a NumPy array holding
    one Q-value per entry of ``self.actions``.

    Bookkeeping attributes:
        eps          -- number of rounds completed so far.
        total_reward -- sum of the step rewards of the round in progress.
        rewards      -- per-round returns, in order (read by ``log_progress``).
        graph        -- ``(round_index, return)`` pairs (written by ``save_metrics``).
    """

    # Number of completed rounds between two calls to save_metrics().
    SAVE_INTERVAL = 1000

    def __init__(self):
        """Initialise hyper-parameters, bookkeeping and the project helpers."""
        self.episodes = 1000
        self.max_steps = 100
        self.rewards = []
        self.eps = 0
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.995
        self.q_table = {}
        self.graph = []
        # Decayed after every round in end_of_round(); not read anywhere else
        # in this class.
        self.epsilon = 1.0
        self.learning_rate = 0.1
        self.discount_factor = 0.99
        self.model = QLearningModel()
        self.actions = self.model.actions
        # Objective is project-defined; it is only used here through
        # distance_objective(). The game_state passed in is a fixed placeholder.
        self.objective = Objective(task=1, game_state={'self': (None, None, None, (7, 7)),  # Player's current state (self_position)
                                                        'coins': [(5, 5), (2, 8)],            # List of coin positions
                                                        'field': np.zeros((ROWS, COLS))      # Game field representation (grid)
                                                        })
        self.total_reward = 0

    @staticmethod
    def _first_coin(game_state):
        """Return the first coin position of ``game_state`` or None if there is none."""
        coins = game_state['coins']
        return coins[0] if len(coins) > 0 else None

    def game_events_occurred(self, old_game_state, action, new_game_state, events):
        """Score one transition and apply the Q-learning update for it.

        Args:
            old_game_state: game-state dict before the action; reads
                ``['self'][3]`` (position), ``['coins']`` and ``['field']``.
            action: the action taken; must be an element of ``self.actions``.
            new_game_state: game-state dict after the action (same keys).
            events: list of event names produced by the step.
        """
        state = state_to_features(old_game_state)
        new_state = state_to_features(new_game_state)

        # Extract necessary parameters for reward calculation
        old_player_pos = old_game_state['self'][3]
        new_self_pos = new_game_state['self'][3]
        # The first coin is treated as the objective. The list is empty once
        # every coin has been collected, so fall back to None instead of
        # raising IndexError.
        old_objective = self._first_coin(old_game_state)
        new_objective = self._first_coin(new_game_state)
        old_field = old_game_state['field']
        new_field = new_game_state['field']

        reward = self.calculate_reward(
            old_player_pos, old_objective, new_self_pos, new_objective, old_field, new_field, events
        )
        # Accumulate the round's return so end_of_round() logs something
        # meaningful rather than only the terminal penalty.
        self.total_reward += reward
        self.update_q_table(state, action, reward, new_state)

    def calculate_reward(
        self,
        old_player_pos: Tuple[int, int], old_objective: Tuple[int, int],
        new_self_pos: Tuple[int, int], new_objective: Tuple[int, int],
        old_field: np.ndarray, new_field: np.ndarray,
        events: List[str]
        ) -> float:
        """Compute the reward of a single transition.

        While the objective is unchanged the reward is +1 when
        ``len(distance_objective(...))`` decreased and -3 otherwise (this
        discourages moving away or standing still). When the objective changed,
        or when either objective is ``None`` (no coin left), the shaping term
        is 0. ``"COIN_COLLECTED"`` in ``events`` adds +2 on top.

        Args:
            old_player_pos, new_self_pos: agent position before / after.
            old_objective, new_objective: objective position before / after,
                or ``None`` when there is no objective.
            old_field, new_field: field arrays forwarded to
                ``self.objective.distance_objective``.
            events: event names produced by the step.

        Returns:
            The numeric reward for the transition.
        """
        if old_objective is None or new_objective is None:
            # Nothing to measure a distance to.
            reward = 0
        else:
            old_distance = self.objective.distance_objective(start_pos=old_player_pos, objective_pos=old_objective, field=old_field)
            new_distance = self.objective.distance_objective(start_pos=new_self_pos, objective_pos=new_objective, field=new_field)

            if old_objective == new_objective:
                reward = 1 if len(new_distance) < len(old_distance) else -3  # discourages moving away or not making progress
            else:
                reward = 0  # no reward if objective has changed

        if "COIN_COLLECTED" in events:
            reward += 2

        return reward

    def update_q_table(self, state, action, reward, next_state):
        """Apply one Q-learning step.

        ``Q(s, a) += learning_rate * (reward + discount_factor * max_a' Q(s', a') - Q(s, a))``

        Unseen states get an all-zero row on first use.

        Raises:
            ValueError: if ``action`` is not in ``self.actions``.
        """
        state_tuple = tuple(state)
        if state_tuple not in self.q_table:
            self.q_table[state_tuple] = np.zeros(len(self.actions))

        next_state_tuple = tuple(next_state)
        if next_state_tuple not in self.q_table:
            self.q_table[next_state_tuple] = np.zeros(len(self.actions))

        action_index = self.actions.index(action)

        # Find the max Q-value for the next state (future reward estimation)
        max_next_q_value = np.max(self.q_table[next_state_tuple])

        # Q-learning update rule
        current_q_value = self.q_table[state_tuple][action_index]
        self.q_table[state_tuple][action_index] = current_q_value + self.learning_rate * (
            reward + self.discount_factor * max_next_q_value - current_q_value
        )

    def end_of_round(self, last_game_state, last_action, events):
        """Close the current round: record its return, decay epsilon, maybe save.

        Args:
            last_game_state: unused.
            last_action: unused.
            events: event names of the final step; ``'KILLED_SELF'`` costs 100.
        """
        # NOTE(review): the terminal penalty only enters the logged return; it
        # is not fed into update_q_table, so the agent does not learn from it.
        reward = -100 if 'KILLED_SELF' in events else 0
        self.total_reward += reward

        self.graph.append((self.eps, self.total_reward))
        # Keep the plain list of returns in sync; log_progress() averages it.
        self.rewards.append(self.total_reward)
        self.total_reward = 0

        # Count the round first, then checkpoint every SAVE_INTERVAL completed
        # rounds. Checking before the increment would save after the very
        # first round and skip the save at the end of a 1000-round run.
        self.eps += 1
        if self.eps % self.SAVE_INTERVAL == 0:
            self.save_metrics()

        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        self.log_progress(self.eps)

    def save_metrics(self):
        """Write the Q-table to ``q_table.pkl`` and the reward graph to ``graph.txt``.

        Both files are created in the current working directory and are
        overwritten on every call.
        """
        with open("q_table.pkl", "wb") as f:
            pickle.dump(self.q_table, f)
        with open("graph.txt", "w") as f:
            # (round_index, return) pairs collected by end_of_round().
            f.write(str(self.graph))

    def log_progress(self, episode):
        """Print the mean return over the most recent (at most 100) rounds.

        Prints an average of 0 while no round has been recorded yet.
        """
        recent = self.rewards[-100:]
        avg_reward = sum(recent) / len(recent) if recent else 0

        print(f"Episode {episode}: Average Reward: {avg_reward}")

In [19]:
# -------------------- Test Execution --------------------
# Smoke test: replay the same hand-written transition ten times through the
# trainer and print the resulting Q-table.

# Create instances of the agent and model
# NOTE(review): actions are chosen by `q_model`, while learning happens in
# `trainer.q_table`; whether the two share any state depends on
# QLearningModel, which is not visible here — confirm.
q_model = QLearningModel()
trainer = AgentTraining()

# Simulate a game loop
for episode in range(10):  # Run a limited number of episodes for testing
    # The field uses ROWS x COLS from settings, the same shape AgentTraining
    # uses when it builds its Objective, instead of a hard-coded 10 x 10 grid.
    old_game_state = {'self': (None, None, None, (7, 7)), 'coins': [(5, 5)], 'field': np.zeros((ROWS, COLS))}
    new_game_state = {'self': (None, None, None, (6, 6)), 'coins': [(5, 5)], 'field': np.zeros((ROWS, COLS))}
    state = state_to_features(old_game_state)

    # Choose an action using the Q-learning model
    action = q_model.choose_action(state)

    # Simulate events for the action
    events = ["COIN_COLLECTED"] if action == 'RIGHT' else []

    # Training step
    trainer.game_events_occurred(old_game_state, action, new_game_state, events)

    # End of round logic after some steps: close the round once ten steps
    # have been played. `episode % 10 == 0` would fire on the very first
    # iteration instead.
    if (episode + 1) % 10 == 0:
        trainer.end_of_round(new_game_state, action, events)

    # Log progress
    trainer.log_progress(episode)

# Print the Q-table for review
print("Q-Table:")
print(trainer.q_table)

ZeroDivisionError: division by zero

# Comparison with symmetry version