In [1]:
import sys
import numpy as np
import random
import math
from collections import defaultdict, deque


In [3]:
class Agent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Attributes:
        nA: number of discrete actions.
        nS: number of discrete states.
        Q: ``(nS, nA)`` array of action values, every entry initialised to 0.01.
        epsilon: current exploration rate used by ``select_action_greedy``.
        epsilon_decay, min_epsilon, alpha_decay: schedule parameters; they are
            only stored here and are meant to be read by the training loop.
        alpha: learning rate used by ``agent_update_q``.
        gamma: discount factor used by ``agent_update_q``.
    """

    def __init__(self, nA, nS, epsilon=1, epsilon_decay = 0.005, min_epsilon = 0.001, alpha = 0.8, 
                 alpha_decay = 0.99, gamma = 0.95):
        self.nA = nA
        self.nS = nS
        self.Q = np.full((nS, nA), 0.01)
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.alpha_decay = alpha_decay

    def select_action_greedy(self, state):
        """Sample an action for ``state`` from the epsilon-greedy policy.

        Every action gets probability ``epsilon / nA``; the action with the
        highest Q-value (first index on ties, per ``np.argmax``) gets the
        remaining ``1 - epsilon`` on top of that.

        Returns:
            The index of the sampled action.
        """
        explore_share = self.epsilon / self.nA
        policy_p = np.ones(self.nA) * explore_share
        greedy_action = np.argmax(self.Q[state])
        policy_p[greedy_action] = 1 - self.epsilon + explore_share
        return np.random.choice(self.nA, p=policy_p)

    def agent_update_q(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for the observed transition.

        The update is ``Q[s, a] <- (1 - alpha) * Q[s, a] + alpha * target``
        with ``target = reward + gamma * max_a' Q[s', a']``.

        When ``done`` is true the transition ended the episode, so there is
        no future return to bootstrap from and the target is ``reward`` alone.
        Without this, the never-updated initial values of terminal states
        would leak into the targets.

        This method draws no random numbers: the off-policy target depends
        only on the maximum over next-state values, not on a sampled action.
        """
        if done:
            target = reward
        else:
            target = reward + self.gamma * np.max(self.Q[next_state])
        self.Q[state][action] = (1 - self.alpha) * self.Q[state][action] + self.alpha * target

In [None]:
def interact(env, agent, num_episodes, window = 100, render_toggle = False, render_eps=1000):
    """Run ``num_episodes`` training episodes of ``agent`` on ``env``.

    Each step asks the agent for an action, applies it to the environment
    and passes the transition to ``agent.agent_update_q``. The code expects
    ``env.reset()`` to return the initial state and ``env.step(action)`` to
    return a 4-tuple ``(next_state, reward, done, info)``.

    After every episode:
      * if the episode return equals 1 it counts as a reached goal and
        ``agent.alpha`` is multiplied by ``agent.alpha_decay`` (floor 0.1);
      * ``agent.epsilon`` is set to
        ``min_epsilon + 0.99 * exp(-epsilon_decay * episode)``, capped at 1.0.

    Args:
        env: environment exposing ``reset``, ``step`` and ``render``.
        agent: object exposing ``select_action_greedy``, ``agent_update_q``,
            ``Q`` and the ``alpha`` / ``epsilon`` schedule attributes.
        num_episodes: number of episodes to run.
        window: number of most recent episode returns averaged per entry of
            the returned list; a progress line is printed every
            ``window * 10`` episodes.
        render_toggle: when true, render every step of selected episodes.
        render_eps: episodes whose number is a multiple of this are rendered.

    Returns:
        ``(windowed_rewards, agent.Q)`` where ``windowed_rewards`` holds, for
        every ``window``-th episode, 100 times the mean return of the last
        ``window`` episodes.
    """
    windowed_rewards = []
    recent_returns = deque(maxlen=window)
    report_every = window * 10
    goals = 0

    for i_episode in range(1, num_episodes + 1):
        state = env.reset()
        episode_return = 0
        # Whether to render is constant for the whole episode.
        render_this_episode = i_episode % render_eps == 0 and render_toggle

        done = False
        while not done:
            if render_this_episode:
                env.render()
            action = agent.select_action_greedy(state)
            next_state, reward, done, _ = env.step(action)
            agent.agent_update_q(state, action, reward, next_state, done)
            episode_return += reward
            state = next_state

        recent_returns.append(episode_return)
        if episode_return == 1:
            goals += 1
            # The learning rate is only annealed on successful episodes.
            agent.alpha = max(0.1, agent.alpha * agent.alpha_decay)

        # Cap at 1.0: for min_epsilon > 0.01 the raw schedule exceeds 1 in
        # early episodes, which would make the action probabilities invalid.
        agent.epsilon = min(1.0, agent.min_epsilon + 0.99 * np.exp(-agent.epsilon_decay * i_episode))

        if i_episode % window == 0:
            windowed_rewards.append(100 * np.mean(recent_returns))
            if i_episode % report_every == 0:
                print('Episode {}: goal reached in {} of last {} episodes'.format(i_episode, goals, window*10))
                goals = 0

    return windowed_rewards, agent.Q