# Learning Q-Learning

In [2]:
import random
import numpy as np
from matplotlib import pyplot as plt

**Define the environment, R.**
Rows in R represent states and columns represent actions.
An action is defined as transferring to another state.
Our goal here is to get to state 5 in the graph. Every state/action pair can be assigned a
reward value according to a reward function.
Impossible transitions are assigned the value -1 in matrix R.

In [3]:
# Reward/environment matrix R.
# R[state, action] is the reward for moving from `state` to `action`;
# an entry of -1 marks a transition that is not possible.
R = np.array([
    # action (target state):
    #  0    1    2    3    4    5
    [ -1,  -1,  -1,  -1,   0,  -1],   # from state 0
    [ -1,  -1,  -1,   0,  -1, 100],   # from state 1
    [ -1,  -1,  -1,   0,  -1,  -1],   # from state 2
    [ -1,   0,   0,  -1,   0,  -1],   # from state 3
    [  0,  -1,  -1,   0,  -1, 100],   # from state 4
    [ -1,   0,  -1,  -1,   0, 100],   # from state 5 (the goal)
])

KeyboardInterrupt: 

**Define the softmax function**
It converts a vector of log-probabilities into probabilities. It is not generalized to matrices!

In [4]:
def softmax(Z):
    """Convert a vector of scores (log-probabilities) into probabilities.

    The scores are shifted by their maximum before exponentiation. This
    leaves the result mathematically unchanged but keeps ``np.exp`` from
    overflowing on large inputs.

    Parameters
    ----------
    Z : array-like of numbers
        Scores to normalize. The input is treated as one flat distribution;
        a matrix is normalized over all of its entries, not per row.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``Z`` whose entries sum to 1.
        An empty input yields an empty array.
    """
    Z = np.asarray(Z, dtype=float)
    if Z.size == 0:
        # Nothing to normalize; avoid calling max() on an empty array.
        return Z
    # Equal scores (including all zeros) naturally give a uniform result,
    # so no special case is needed for a zero-sum input.
    eZ = np.exp(Z - Z.max())
    return eZ / eZ.sum()

**An Agent**
has a Q matrix. The rows represent states, the columns represent actions.
Individual entries represent memories/expectations about the future reward associated with that particular state + action combination.
The attribute gamma is the weight (trade-off) assigned to the expected reward in the next step relative to the immediate reward.

In [5]:
class Agent:
    """Representation of an agent acting in an environment.

    Attributes
    ----------
    Q : numpy.ndarray
        Float memory matrix with the same shape as the environment; rows
        are states and columns are actions. It starts out as all zeros.
    gamma : float
        Weight applied to the expected reward of the next step when a Q
        entry is updated.
    """

    def __init__(self, gamma=0.8, environment=None):
        """Create an agent with an empty (all-zero) memory.

        Parameters
        ----------
        gamma : float, optional
            Learning hyperparameter stored on the agent.
        environment : array-like, optional
            Reward matrix whose shape Q should mirror. Defaults to the
            module-level matrix ``R``.
        """
        if environment is None:
            environment = R
        # Q must be float: an integer matrix would silently truncate the
        # discounted (gamma-weighted) values written into it.
        self.Q = np.zeros(np.shape(environment), dtype=float)  # Memory matrix
        self.gamma = gamma  # learning hyperparameter

KeyboardInterrupt: 

**This routine fits an agent to an environment.**
The initial state is chosen at random. Then, in each step, an action is drawn uniformly at random
from all valid actions in the current state.
The state/action pair is evaluated by the reward function, and the obtained value (the immediate
reward plus gamma times the expected reward of the next state) is recorded in the Q matrix.
The action is then carried out by setting the current state to the chosen action.

In [6]:
def fit(agent, episodes, environment, display=None, master=None):
    """Fit an agent's Q matrix to an environment by random exploration.

    Starting from a random state, one uniformly random valid action is taken
    per iteration and the corresponding Q entry is overwritten with

        Q[state, action] = reward + gamma * max(Q[next_state, valid actions])

    Parameters
    ----------
    agent : object
        Must expose a 2-D array ``Q`` (modified in place) and a number
        ``gamma``.
    episodes : int
        Number of update steps to perform (one action per iteration).
    environment : numpy.ndarray
        Reward matrix; entries below 0 mark impossible transitions.
    display, master : optional
        When both are given, ``display.set_data`` receives the (normalized)
        Q matrix after every step and ``master.pause(0.1)`` is called, which
        allows Q to be plotted in real time.

    Raises
    ------
    IndexError
        If the current state has no valid action (``random.choice`` on an
        empty list).
    """
    state = random.randrange(len(environment))

    for _ in range(episodes):
        # Filter valid actions (where p != -1)
        valid = [i for i, p in enumerate(environment[state]) if p >= 0]
        # Randomly select an action. COULD BE WEIGHED BY CURRENT Q...
        action = random.choice(valid)
        # Reward, directly observable from the environment
        immediate_reward = environment[state, action]
        # Expected Reward Given Chosen Action: the best Q value reachable
        # from the next state. This has to be a scalar; using the whole row
        # Q[action] cannot be stored in a single Q entry.
        next_valid = [i for i, p in enumerate(environment[action]) if p >= 0]
        if next_valid:
            ERGCA = max(agent.Q[action, i] for i in next_valid)
        else:
            ERGCA = 0.
        agent.Q[state, action] = immediate_reward + agent.gamma * ERGCA
        # Make the step
        state = action
        # These are used to plot Q in real time
        if display is not None and master is not None:
            mx = agent.Q.max()
            # Normalize so the largest entry is 1; skip while Q is all zeros.
            display.set_data(agent.Q / mx if mx > 0. else agent.Q)
            master.pause(0.1)

KeyboardInterrupt: 

**This routine runs an agent based solely on its Q matrix.**
Selecting actions while fitting does not rely on Q.
This routine uses the learned Q matrix to navigate the environment and measures the
average number of steps the agent needs to reach the destination (state 5).

In [7]:
def run(agent, environment, runs=100, goal=5, max_steps=10000):
    """Let the agent navigate using its Q matrix and report the average path length.

    Each run starts in a random state. Until the goal is reached, the next
    state is sampled from the valid actions of the current state with
    probabilities given by the softmax of their Q values (uniform while that
    part of Q is still all zeros).

    Parameters
    ----------
    agent : object
        Must expose a 2-D array ``Q`` indexed as ``Q[state, action]``.
    environment : numpy.ndarray
        Reward matrix; entries below 0 mark impossible transitions.
    runs : int, optional
        Number of independent runs to average over.
    goal : int, optional
        Index of the destination state.
    max_steps : int, optional
        Upper bound on the steps of a single run, so a run that never reaches
        the goal cannot loop forever. A run that hits the bound is counted
        with ``max_steps`` steps.

    Returns
    -------
    float
        The average number of steps per run (also printed).
    """
    steps_required = []
    for _ in range(runs):
        state = random.randrange(len(environment))
        steps = 0
        while state != goal and steps < max_steps:
            # Valid actions are defined by the environment. They must be
            # action *indices*, because they are used both to index Q and as
            # the candidate next states.
            valid = [i for i, p in enumerate(environment[state]) if p >= 0]
            q_values = np.asarray(agent.Q[state, valid], dtype=float)
            if q_values.any():
                # treat Q entries as log probabilities and compute softmax
                prob = softmax(q_values)
            else:
                # nothing learned for this state yet: uniform distribution
                prob = np.ones_like(q_values) / len(q_values)
            # select a valid action with their given probabilities
            state = int(np.random.choice(valid, p=prob))
            steps += 1
        steps_required.append(steps)
    average = sum(steps_required) / len(steps_required)
    print("Average from {} runs: {} steps".format(runs, average))
    return average

KeyboardInterrupt: 

We run the model on the above-defined environment, R.

In [8]:
if __name__ == '__main__':
    # Seed both random number generators used by fit() and run() so that the
    # experiment gives the same numbers on every execution.
    SEED = 42
    random.seed(SEED)
    np.random.seed(SEED)

    bob = Agent()
    # Baseline: evaluate the agent before any fitting (Q is still all zeros).
    run(bob, R)

    # Interactive mode lets the Q matrix be redrawn while fitting.
    plt.ion()
    mat = plt.imshow(np.zeros_like(bob.Q), vmin=0., vmax=1., interpolation="none")

    fit(bob, 100, R, display=mat, master=plt)

    plt.close()

    # Evaluate again, now using the fitted Q matrix.
    run(bob, R)

KeyboardInterrupt: 