In [1]:
# first, look at main.py, it is quite simple:
#from agent import Agent #import agent
#from monitor import interact #monitor is where things will run
import gym #this is the environment
import numpy as np


In [22]:

env = gym.make('Taxi-v2') #initializing the environment
agent = Agent() #initializing the agent
avg_rewards, best_avg_reward = interact(env, agent) #using the interact function in monitor.py, we will begin to generate average and best average rewards

Episode 20000/20000 || Best average reward -735.45



In [2]:
# second, we look at monitor and undersatnd how agent and environment interacts
from collections import deque #deque is like a list, optimized for 
import sys
import math
import numpy as np

def interact(env, agent, num_episodes=20000, window=100):
    """ Monitor agent's performance.
    
    Params
    ======
    - env: instance of OpenAI Gym's Taxi-v1 environment
    - agent: instance of class Agent (see Agent.py for details)
    - num_episodes: number of episodes of agent-environment interaction
    - window: number of episodes to consider when calculating average rewards

    Returns
    =======
    - avg_rewards: deque containing average rewards
    - best_avg_reward: largest value in the avg_rewards deque
    """
    # initialize average rewards. looks like an empty list
    avg_rewards = deque(maxlen=num_episodes)
    # initialize best average reward
    best_avg_reward = -math.inf # i guess the worst reward is negative infinity
    # initialize monitor for most recent rewards
    samp_rewards = deque(maxlen=window)
    # for each episode
    # each episode so far starts like this
    for i_episode in range(1, num_episodes+1): #standard
        # begin the episode
        state = env.reset() #standard
        # initialize the sampled reward
        samp_reward = 0 #just set the first value
        while True:
            # agent selects an action
            action = agent.select_action(state)
            # agent performs the selected action
            next_state, reward, done, _ = env.step(action)
            # agent performs internal updates based on sampled experience
            agent.step(state, action, reward, next_state, done)
            # update the sampled reward
            samp_reward += reward
            # update the state (s <- s') to next time step
            state = next_state
            if done:
                # save final sampled reward
                samp_rewards.append(samp_reward)
                break
        if (i_episode >= 100):
            # get average reward from last 100 episodes
            avg_reward = np.mean(samp_rewards)
            # append to deque
            avg_rewards.append(avg_reward)
            # update best average reward
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward
        # monitor progress
        print("\rEpisode {}/{} || Best average reward {}".format(i_episode, num_episodes, best_avg_reward), end="")
        sys.stdout.flush()
        # check if task is solved (according to OpenAI Gym)
        if best_avg_reward >= 9.7:
            print('\nEnvironment solved in {} episodes.'.format(i_episode), end="")
            break
        if i_episode == num_episodes: print('\n')
    return avg_rewards, best_avg_reward

In [20]:
#this is the place where things needs to be updated
import numpy as np
from collections import defaultdict

class Agent:

    def __init__(self, nA=6):
        """ Initialize agent.

        Params
        ======
        - nA: number of actions available to the agent
        """
        self.nA = nA
        self.Q = defaultdict(lambda: np.zeros(self.nA))
        self.gamma = 1.0
        self.eps_start = 1.0
        self.epsilon = self.eps_start
        self.eps_decay = 0.99999
        self.eps_min = 0.05
        self.eps = 0.005
        self.alpha = 1
        
    def get_probs(self):
        """ obtains the action probabilities corresponding to epsilon-greedy policy """
        policy_s = np.ones(self.nA) * self.epsilon / self.nA
        best_a = np.argmax(self.Q)
        policy_s[best_a] = 1 - self.epsilon + (self.epsilon / self.nA)
        return policy_s

    def select_action(self, state):
        """ Given the state, select an action.

        Params
        ======
        - state: the current state of the environment

        Returns
        =======
        - action: an integer, compatible with the task's action space
        """
        #return np.random.choice(self.nA) #just a random selection
        action = np.random.choice(np.arange(self.nA), p=self.get_probs()) \
                                    if state in self.Q else env.action_space.sample()
        return action

    def step(self, state, action, reward, next_state, done):
        """ Update the agent's knowledge, using the most recently sampled tuple.

        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False)
        """
        #i think this only updates Q
        #self.Q[state][action] += 1
        # prepare for discounting
        current = self.Q[state][action]
        policy_s = np.ones(self.nA) * self.eps / self.nA
        policy_s[np.argmax(self.Q[next_state])] = 1 - self.eps + (self.eps / self.nA)
        Qsa_next = np.dot(self.Q[next_state], policy_s) 
        new_value = current + (self.alpha * (reward + (self.gamma * Qsa_next) - current)) #update_Q_expsarsa
        self.Q[state][action] = new_value

In [None]:
def update_Q_expsarsa(alpha, gamma, nA, eps, Q, state, action, reward, next_state=None):
    """Returns updated Q-value for the most recent experience."""
    current = Q[state][action]         # estimate in Q-table (for current state, action pair)
    policy_s = np.ones(nA) * eps / nA  # current policy (for next state S')
    policy_s[np.argmax(Q[next_state])] = 1 - eps + (eps / nA) # greedy action
    Qsa_next = np.dot(Q[next_state], policy_s)         # get value of state at next time step

    new_value = current + (alpha * (reward + (gamma * Qsa_next) - current)) # get updated value 
    return new_value

In [21]:
num_episodes = 2
env = gym.make('Taxi-v2') #initializing the environment
agent = Agent() #initializing the agent

avg_rewards = deque(maxlen=num_episodes)
# initialize best average reward
best_avg_reward = -math.inf # i guess the worst reward is negative infinity
# initialize monitor for most recent rewards
window = 100
samp_rewards = deque(maxlen=window)
# for each episode
# each episode so far starts like this
for i_episode in range(1, num_episodes+1): #standard
    # begin the episode
    state = env.reset() #standard
    # initialize the sampled reward
    samp_reward = 0 #just set the first value
    print('episode {}'.format(i_episode))
    while True:
        # agent selects an action
        action = agent.select_action(state)
        print('action {}'.format(action))
        # agent performs the selected action
        next_state, reward, done, _ = env.step(action)
        print('next state {}, reward {}, done {}'.format(next_state, reward, done))
        # agent performs internal updates based on sampled experience
        agent.step(state, action, reward, next_state, done)
        # update the sampled reward
        samp_reward += reward
        # update the state (s <- s') to next time step
        state = next_state
        if done:
            # save final sampled reward
            samp_rewards.append(samp_reward)
            break
    if (i_episode >= 100):
        # get average reward from last 100 episodes
        avg_reward = np.mean(samp_rewards)
        # append to deque
        avg_rewards.append(avg_reward)
        # update best average reward
        if avg_reward > best_avg_reward:
            best_avg_reward = avg_reward
    # monitor progress
    print("\rEpisode {}/{} || Best average reward {}".format(i_episode, num_episodes, best_avg_reward), end="")
    sys.stdout.flush()
    # check if task is solved (according to OpenAI Gym)
    if best_avg_reward >= 9.7:
        print('\nEnvironment solved in {} episodes.'.format(i_episode), end="")
        break
    if i_episode == num_episodes: print('\n')


episode 1
action 5
next state 394, reward -10, done False
action 2
next state 394, reward -1, done False
action 4
next state 394, reward -10, done False
action 5
next state 394, reward -10, done False
action 1
next state 294, reward -1, done False
action 1
next state 194, reward -1, done False
action 4
next state 194, reward -10, done False
action 5
next state 194, reward -10, done False
action 3
next state 174, reward -1, done False
action 5
next state 174, reward -10, done False
action 1
next state 74, reward -1, done False
action 5
next state 74, reward -10, done False
action 1
next state 74, reward -1, done False
action 0
next state 174, reward -1, done False
action 4
next state 174, reward -10, done False
action 1
next state 74, reward -1, done False
action 1
next state 74, reward -1, done False
action 5
next state 74, reward -10, done False
action 5
next state 74, reward -10, done False
action 2
next state 94, reward -1, done False
action 5
next state 94, reward -10, done False
a

Episode 1/2 || Best average reward -infepisode 2
action 2
next state 187, reward -1, done False
action 1
next state 87, reward -1, done False
action 2
next state 87, reward -1, done False
action 3
next state 67, reward -1, done False
action 1
next state 67, reward -1, done False
action 2
next state 87, reward -1, done False
action 0
next state 187, reward -1, done False
action 0
next state 287, reward -1, done False
action 2
next state 287, reward -1, done False
action 5
next state 287, reward -10, done False
action 5
next state 287, reward -10, done False
action 3
next state 267, reward -1, done False
action 1
next state 167, reward -1, done False
action 0
next state 267, reward -1, done False
action 3
next state 247, reward -1, done False
action 3
next state 227, reward -1, done False
action 5
next state 227, reward -10, done False
action 4
next state 227, reward -10, done False
action 1
next state 127, reward -1, done False
action 1
next state 27, reward -1, done False
action 4
nex

Episode 2/2 || Best average reward -inf



In [15]:
agent.Q

defaultdict(<function __main__.Agent.__init__.<locals>.<lambda>>,
            {2: array([-2.,  0.,  0.,  0.,  0.,  0.]),
             34: array([ -1.,   0.,  -1.,   0., -30., -30.]),
             42: array([ 0.,  0., -1.,  0.,  0.,  0.]),
             54: array([-2.,  0.,  0.,  0.,  0.,  0.]),
             62: array([ -2.,  -1.,  -1.,  -1., -10.,   0.]),
             74: array([  0.,  -1.,  -1.,  -2., -10., -10.]),
             82: array([ -1.,   0.,  -1.,  -2.,   0., -10.]),
             94: array([ -2.,   0.,  -2.,  -2., -20., -30.]),
             102: array([ -3.,  -2.,  -1.,   0., -30., -20.]),
             114: array([ -1.,   0.,  -2.,   0., -10.,   0.]),
             122: array([-1.,  0.,  0., -1.,  0.,  0.]),
             134: array([ -1.,  -1.,  -2.,  -1., -10.,   0.]),
             142: array([ 0.,  0., -2.,  0.,  0.,  0.]),
             154: array([-1.,  0., -2., -2.,  0.,  0.]),
             162: array([ -3.,  -1.,  -3.,   0., -10.,   0.]),
             174: array([ -1.,  -1

In [None]:
#testing using the following
agent = Agent()
state = env.reset()
action = agent.select_action(state)
action, state

In [None]:
next_state, reward, done, _ = env.step(action)
agent.step(next_state, action, reward, next_state, done)
state = next_state

state


In [None]:
action = agent.select_action(state)
action, state