In [1]:
import numpy as np
import plotly.graph_objects as go
from SnakeEnvironment import SnakeEnv

In [2]:
env = SnakeEnv(grid_size=10)
print("Observation Space:", env.observation_space)
print("Action Space:", env.action_space)

Observation Space: Box(0.0, 1.0, (10, 10, 1), float32)
Action Space: Discrete(4)


In [3]:
action_size = env.action_space.n
state_size = env.observation_space.shape[0] * env.observation_space.shape[1]

In [4]:
total_episodes = 20000
max_steps_per_episode = 500
learning_rate = 0.05
gamma = 0.9
epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.0001

q_table = {}

In [5]:
def discretize_state_simple(state):
    snake_head = tuple(np.argwhere(state[:, :, 0] == 1)[0])
    food_position = tuple(np.argwhere(state[:, :, 0] == -1)[0])
    direction = 0
    return (snake_head, food_position, direction)

In [6]:
total_rewards = []
evaluation_interval = 500
average_rewards = []

In [7]:
for episode in range(total_episodes):
    state = env.reset()
    state_key = discretize_state_simple(state)
    done = False
    episode_reward = 0

    for _ in range(max_steps_per_episode):
        if done:
            break

        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table.get(state_key, np.zeros(env.action_space.n)))

        new_state, reward, done, _ = env.step(action)
        new_state_key = discretize_state_simple(new_state)

        if reward == -1:
            distance_before = np.linalg.norm(np.array(state_key[0]) - np.array(state_key[1]))
            distance_after = np.linalg.norm(np.array(new_state_key[0]) - np.array(new_state_key[1]))
            if distance_after < distance_before:
                reward = 1
            else:
                reward = -1
        elif reward == 10:
            reward = 200
        elif reward == -10:
            reward = -10

        if state_key not in q_table:
            q_table[state_key] = np.zeros(env.action_space.n)
        q_table[state_key][action] += learning_rate * (
            reward + gamma * np.max(q_table.get(new_state_key, np.zeros(env.action_space.n))) - q_table[state_key][action]
        )

        state_key = new_state_key
        episode_reward += reward

    epsilon = max(min_epsilon, epsilon * np.exp(-decay_rate * episode))
    total_rewards.append(episode_reward)

    if (episode + 1) % evaluation_interval == 0:
        average_reward = np.mean(total_rewards[-evaluation_interval:])
        average_rewards.append(average_reward)
        print(f"Episode {episode + 1}: Average Reward = {average_reward:.2f}, Epsilon = {epsilon:.4f}")

Episode 500: Average Reward = 118.21, Epsilon = 0.0100
Episode 1000: Average Reward = 220.20, Epsilon = 0.0100
Episode 1500: Average Reward = 234.52, Epsilon = 0.0100
Episode 2000: Average Reward = 251.18, Epsilon = 0.0100
Episode 2500: Average Reward = 252.88, Epsilon = 0.0100
Episode 3000: Average Reward = 255.13, Epsilon = 0.0100
Episode 3500: Average Reward = 263.65, Epsilon = 0.0100
Episode 4000: Average Reward = 270.07, Epsilon = 0.0100
Episode 4500: Average Reward = 267.71, Epsilon = 0.0100
Episode 5000: Average Reward = 274.75, Epsilon = 0.0100
Episode 5500: Average Reward = 278.32, Epsilon = 0.0100
Episode 6000: Average Reward = 260.38, Epsilon = 0.0100
Episode 6500: Average Reward = 279.33, Epsilon = 0.0100
Episode 7000: Average Reward = 274.60, Epsilon = 0.0100
Episode 7500: Average Reward = 278.84, Epsilon = 0.0100
Episode 8000: Average Reward = 276.49, Epsilon = 0.0100
Episode 8500: Average Reward = 285.75, Epsilon = 0.0100
Episode 9000: Average Reward = 284.44, Epsilon = 

In [8]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=list(range(evaluation_interval, total_episodes + 1, evaluation_interval)),
    y=average_rewards,
    mode='lines+markers',
    name='Average Reward'
))
fig.update_layout(
    title='Training Progress',
    xaxis_title='Episode',
    yaxis_title='Average Reward',
    template='plotly_dark'
)
fig.show()

In [9]:
def evaluate_agent_with_plot(q_table, env, test_episodes=100, max_steps_per_episode=500):
    
    rewards_per_episode = []

    for episode in range(test_episodes):
        state = env.reset()
        state_key = discretize_state_simple(state)
        done = False
        episode_reward = 0

        for _ in range(max_steps_per_episode):
            if done:
                break

            action = np.argmax(q_table.get(state_key, np.zeros(env.action_space.n)))
            new_state, reward, done, _ = env.step(action)
            new_state_key = discretize_state_simple(new_state)

            state_key = new_state_key
            episode_reward += reward

        rewards_per_episode.append(episode_reward)

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=list(range(1, test_episodes + 1)),
        y=rewards_per_episode,
        mode='lines+markers',
        name='Total Reward'
    ))
    fig.update_layout(
        title='Agent Performance during Testing',
        xaxis_title='Episode',
        yaxis_title='Total Reward',
        template='plotly_dark'
    )
    fig.show()

    average_test_reward = sum(rewards_per_episode) / test_episodes
    print(f"Average test reward over {test_episodes} episodes: {average_test_reward:.2f}")

    return rewards_per_episode, average_test_reward

test_rewards, average_test_reward = evaluate_agent_with_plot(q_table, env, test_episodes=100, max_steps_per_episode=500)
print(f"Final Average Test Reward: {average_test_reward:.2f}")

Average test reward over 100 episodes: -12.66
Final Average Test Reward: -12.66
