# Exercise Sheet 3: Reinforcement Learning

In [5]:
import numpy as np
import random
import matplotlib.pyplot as plt

class Environment:
    """Square grid world with states numbered row by row, starting at 1.

    State 1 is the cell in row 0, column 0 (printed top-left) and state
    ``grid_size ** 2`` is the last cell of the last row (printed
    bottom-right). Entering ``goal_state`` yields a reward of 100; every
    other state yields 0.
    """

    def __init__(self, grid_size=10, goal_state=None, max_steps=1000):
        """Create the grid world.

        Args:
            grid_size: Number of rows and of columns (default 10, i.e. 10x10).
            goal_state: State number of the goal. ``None`` (default) places
                the goal on the last cell, ``grid_size ** 2``.
            max_steps: Step limit an agent should respect per episode.

        Raises:
            ValueError: If ``grid_size`` is smaller than 1 or ``goal_state``
                lies outside ``1..grid_size ** 2``.
        """
        if grid_size < 1:
            raise ValueError(f"grid_size must be at least 1, got {grid_size}")

        last_state = grid_size * grid_size
        if goal_state is None:
            goal_state = last_state
        elif not 1 <= goal_state <= last_state:
            raise ValueError(
                f"goal_state must be within 1..{last_state}, got {goal_state}"
            )

        self.grid_size = grid_size
        self.goal_state = goal_state
        self.max_steps = max_steps

    def state_to_coordinates(self, state):
        """Converts a 1-based state number to zero-based (row, col)."""
        # divmod on the zero-based index gives the row and column in one step.
        row, col = divmod(state - 1, self.grid_size)
        return row, col

    def coordinates_to_state(self, row, col):
        """Converts zero-based (row, col) to the 1-based state number."""
        return row * self.grid_size + col + 1

    def transition(self, state, action):
        """Return the state reached by taking ``action`` in ``state``.

        ``action`` is one of "up", "down", "left", "right". A move that would
        leave the grid, or an unrecognised action, leaves the state unchanged.
        """
        row, col = self.state_to_coordinates(state)
        last_index = self.grid_size - 1

        if action == "up" and row > 0:
            row -= 1
        elif action == "down" and row < last_index:
            row += 1
        elif action == "left" and col > 0:
            col -= 1
        elif action == "right" and col < last_index:
            col += 1

        return self.coordinates_to_state(row, col)

    def reward(self, state):
        """Reward function: 100 if ``state`` is the goal, otherwise 0."""
        return 100 if state == self.goal_state else 0

    def print_matrix(self, agent_state):
        """Print the grid with 'O' for the agent and 'X' for the goal.

        The agent is drawn after the goal, so 'O' hides 'X' when the agent
        stands on the goal cell.
        """
        matrix = [[' '] * self.grid_size for _ in range(self.grid_size)]

        goal_row, goal_col = self.state_to_coordinates(self.goal_state)
        matrix[goal_row][goal_col] = 'X'

        agent_row, agent_col = self.state_to_coordinates(agent_state)
        matrix[agent_row][agent_col] = 'O'

        for row in matrix:
            print(' '.join(row))
        # Divider between steps; 2 * grid_size is 20 for the default 10x10
        # grid and keeps the divider in proportion for other sizes.
        print('\n' + '-' * (2 * self.grid_size))

class Agent:
    """Agent that walks the environment by picking uniformly random actions."""

    def __init__(self, environment, start_state=1):
        """Bind the agent to an environment.

        Args:
            environment: Object providing ``max_steps``, ``goal_state``,
                ``transition(state, action)``, ``reward(state)`` and
                ``print_matrix(state)``.
            start_state: State the agent starts in and returns to on
                ``reset`` (default 1).
        """
        self.env = environment
        self.start_state = start_state
        self.current_state = start_state
        self.actions = ["up", "down", "left", "right"]

    def choose_random_action(self):
        """Randomly select an action."""
        return random.choice(self.actions)

    def reset(self):
        """Reset the agent to the starting position."""
        self.current_state = self.start_state

    def run_episode(self, render_every=100):
        """Run an episode until the goal is reached or max steps are used up.

        Args:
            render_every: Print the grid on every ``render_every``-th
                iteration, counting from iteration 0 (default 100). A falsy
                value (``0`` or ``None``) disables printing.

        Returns:
            Tuple ``(total_reward, steps)`` with the summed rewards and the
            number of moves made. The agent is reset to its start state
            before returning.
        """
        total_reward = 0
        steps = 0

        for i in range(self.env.max_steps):
            action = self.choose_random_action()
            self.current_state = self.env.transition(self.current_state, action)
            total_reward += self.env.reward(self.current_state)
            steps = i + 1

            # The grid is shown after the move, so "iteration 0" already
            # reflects the agent's first step.
            if render_every and i % render_every == 0:
                print(f"iteration {i}")
                self.env.print_matrix(self.current_state)

            if self.current_state == self.env.goal_state:
                break

        self.reset()
        return total_reward, steps

def _report_statistics(rewards, steps_list):
    """Plot box plots of rewards and step counts and print summary figures."""
    avg_reward = np.mean(rewards)
    avg_steps = np.mean(steps_list)
    std_steps = np.std(steps_list)

    fig, ax = plt.subplots(1, 2, figsize=(12, 6))

    # Box plots are vertical by default, so no orientation argument is passed.
    ax[0].boxplot(rewards)
    ax[0].set_title("Rewards per Episode")
    ax[0].set_ylabel("Reward")

    ax[1].boxplot(steps_list)
    ax[1].set_title("Steps to Reach Goal")
    ax[1].set_ylabel("Steps")

    plt.show()

    print(f"Average Reward per Episode: {avg_reward}")
    print(f"Average Steps to Goal: {avg_steps}")
    print(f"Standard Deviation of Steps: {std_steps}")


# Run simulation for 30 episodes (by default)
def simulate(agent, num_episodes=30, show_stats=False):
    """Run ``num_episodes`` episodes and collect the per-episode results.

    Args:
        agent: Object whose ``run_episode()`` returns ``(reward, steps)``.
        num_episodes: Number of episodes to run (default 30).
        show_stats: When true, additionally plot box plots and print the mean
            reward, mean step count and standard deviation of the step
            counts. Off by default; skipped when no episode was run.

    Returns:
        Tuple ``(rewards, steps_list)``: two lists holding the reward and the
        step count of each episode, in run order.
    """
    rewards, steps_list = [], []

    for _ in range(num_episodes):
        reward, steps = agent.run_episode()
        rewards.append(reward)
        steps_list.append(steps)

    # Statistics over an empty sample are undefined, so only report when at
    # least one episode was run.
    if show_stats and rewards:
        _report_statistics(rewards, steps_list)

    return rewards, steps_list

# Build the grid world and place a random-walk agent in it
env = Environment()
agent = Agent(environment=env)

# Execute the default batch of 30 episodes
simulate(agent, num_episodes=30)


iteration 0
                   
O                  
                   
                   
                   
                   
                   
                   
                   
                  X

--------------------
iteration 100
                O  
                   
                   
                   
                   
                   
                   
                   
                   
                  X

--------------------
iteration 200
                   
                   
                   
                   
                   
                   
                   
                   
O                  
                  X

--------------------
iteration 300
                   
                   
                   
                   
                   
            O      
                   
                   
                   
                  X

--------------------
iteration 400
                   
                   
    