<a href="https://colab.research.google.com/github/anniewit/ML/blob/master/QLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import numpy.random as rand

def generate_field(x, y, num_rewards, max_reward):
    """
    Generate a random game field with rewards.

    The global NumPy random state is re-seeded with a fixed value on every
    call, so identical arguments always produce the identical field.

    Args:
        x (int):            x dimension of the field (number of columns)
        y (int):            y dimension of the field (number of rows)
        num_rewards (int):  the number of reward placements; placements may
                            hit the same cell, in which case the last one wins
        max_reward (int):   exclusive upper bound of a reward, i.e. rewards
                            are drawn from 0 .. max_reward - 1

    Returns:
        ndarray: A field of shape (y, x) with randomly initialized rewards,
        the rest of the entries is zero. The dtype is uint8 whenever every
        possible reward fits into it, otherwise the narrowest unsigned
        integer type that does.
    """

    # Change or comment out to get different random data in each run
    np.random.seed(42)

    # A fixed uint8 field silently wraps rewards above 255, so size the
    # dtype from the largest reward that can actually be drawn.
    largest_reward = max(int(max_reward) - 1, 0)
    field = np.zeros((y, x), dtype=np.min_scalar_type(largest_reward))

    for _ in range(num_rewards):
        # Draw the value first, then row, then column, so that the sequence
        # of random numbers consumed stays stable for a given seed.
        reward = rand.choice(max_reward)
        row = rand.randint(y)
        col = rand.randint(x)
        field[row, col] = reward

    return field

In [0]:
%matplotlib notebook

import numpy as np
import matplotlib.pyplot as plt

class QLearning:
    """
    This class contains all the necessary methods to navigate through
    a maze or game with the help of a little bit of Q-Learning.

    The agent moves with uniformly random actions over a 2D reward field and
    keeps one q-value per (action, row, column) triple.
    """

    def __init__(self, field, actions, gamma):
        """
        Initializes the QLearning Algorithm with the necessary parameters.
        All q values are stored in self.q - this is an array that has
        ACTIONS x map_y x map_x dimensions to store a value for each action
        in each field. The starting position self.pos is randomly initialized.

        A 3 x 3 matplotlib figure is created as well; it is used by plot().

        Args:
            field (ndarray):  the map; the entry at [y, x] is the reward
                              received for entering that cell
            actions (list):   the available actions
            gamma (float):    the gamma in the lecture slides (factor applied
                              to the best q-value of the next cell)

        Returns:
            QLearning: An instance that can be used for Q-Learning on the field
        """
        self.field = field
        self.actions = actions
        self.gamma = gamma

        # Remember the map extent for further navigation.
        self.map_y = self.field.shape[0]
        self.map_x = self.field.shape[1]

        # q stores the q-value for each action in each cell of the field.
        self.q = np.zeros((len(self.actions), self.map_y, self.map_x))

        # Start on a random position (row, column) in the field.
        self.pos = [np.random.randint(self.map_y), np.random.randint(self.map_x)]

        # One shared figure; every subplot starts with hidden axes.
        self.fig, self.axes = plt.subplots(3, 3, num='QLearning State')
        for ax in self.axes.flat:
            ax.axis('off')

    def get_coordinates(self, position, action):
        """
        Returns the coordinates that follow a certain action, depending
        on the current position of the learner. If the border is reached
        the agent just stops there.

        Note that 'up' increments the row index and 'down' decrements it.

        Args:
            position (pair):  the current position as (y, x)
            action (string):  the action that should be performed
                              (one of: 'up', 'down', 'right', 'left')

        Returns:
            pair of int: the updated coordinates; the unchanged ``position``
            object if the move would leave the field or the action is unknown
        """
        y, x = position

        if action == 'up' and y < self.map_y - 1:
            return (y + 1, x)
        if action == 'down' and y > 0:
            return (y - 1, x)
        if action == 'right' and x < self.map_x - 1:
            return (y, x + 1)
        if action == 'left' and x > 0:
            return (y, x - 1)

        # Cannot move (border reached or unknown action): stay at position.
        return position

    def action_index(self, action):
        """Convert an action to its index along the first axis of the q-value matrix."""
        return self.actions.index(action)

    def update(self):
        """
        Implementation of the update step. Closely follows the Algorithm described on
        ML-10 Sl.18.

        A random action is chosen. If it would run into the wall, nothing
        changes. Otherwise the q-value of that action at the current cell is
        set to the reward of the new cell plus gamma times the best q-value
        available in the new cell, and the agent moves there.
        """
        # Select a random action that should be performed next.
        action = np.random.choice(self.actions)

        old_y, old_x = self.pos
        new_y, new_x = self.get_coordinates(self.pos, action)

        # If the wall was hit the agent did not move: no update at all.
        if new_y == old_y and new_x == old_x:
            return

        # Receive the reward for the new position from the agent's own field
        # (not from a module-level variable that may not exist or may differ).
        reward = self.field[new_y, new_x]

        # Update the q-value for the performed action.
        best_next = self.q[:, new_y, new_x].max()
        self.q[self.action_index(action), old_y, old_x] = reward + self.gamma * best_next

        # Update the position of the player to the new field.
        self.pos = (new_y, new_x)

    def plot(self):
        """
        Plots the current state.

        The q-values of the i-th action are drawn into subplot 2*i + 1 of the
        3 x 3 grid (row-major), which places the first four actions in the
        top, left, right and bottom middle cells. The grid only offers such a
        slot for four actions.
        """
        for i, action in enumerate(self.actions):
            ax = self.axes.flat[2*i + 1]
            ax.set(title=action)
            ax.imshow(self.q[i,:,:], interpolation='None')

        self.fig.canvas.draw()