- http://coach.nervanasys.com/algorithms/value_optimization/double_dqn/index.html
- https://medium.com/@awjuliani/simple-reinforcement-learning-with-tensorflow-part-4-deep-q-networks-and-beyond-8438a3e2b8df
- https://jaromiru.com/2016/11/07/lets-make-a-dqn-double-learning-and-prioritized-experience-replay/

In [1]:
import matplotlib.pyplot as plt
import gym
import cv2
import random
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import deque
from collections import namedtuple
import copy

In [2]:
SEED = 1234
CAPACITY = 10_000
BATCH_SIZE = 32
PROCESSED_SIZE = 84
GAME = 'Pong-v0'
N_ACTIONS = gym.make(GAME).action_space.n
PHI_LENGTH = 4
UPDATE_FREQ = 1
EPSILON_START = 1.0
EPSILON_END = 0.1
EPSILON_STEPS = 1_000_000
GAMMA = 0.99
TARGET_UPDATE = 1000
PRINT_UPDATE = 5_000

In [3]:
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
class ReplayMemory:
    def __init__(self, capacity, batch_size):
        """
        Replay memory that holds examples in the form of (s, a, r, s')
        
        args:
            capacity (int): the size of the memory
            batch_size (int): size of batches used for training model
        """
        
        self.batch_size = batch_size
        self.capacity = capacity
        self.memory = deque(maxlen=self.capacity)
        self.Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state'))
        self._available = False

    def put(self, state, action, reward, next_state):
        """
        Places an (s, a, r, s') example in the memory
        
        args:
            state (np.array): 
            action (list[int]):
            reward (list[int]):
            next_state (np.array or None):
        """
        
        state = torch.FloatTensor(state)
        action = torch.LongTensor([action])
        reward = torch.FloatTensor([reward])
        if next_state is not None:
            next_state = torch.FloatTensor(next_state)
        transition = self.Transition(state=state, action=action, reward=reward, next_state=next_state)
        self.memory.append(transition)

    def sample(self):
        """
        Gets a random sample of n = batch_size examples from the memory
            
        returns:
            Transitions (namedtuple): a tuple of (s, a, r, s')
        """
        
        transitions = random.sample(self.memory, self.batch_size)
        return self.Transition(*(zip(*transitions)))

    def size(self):
        """
        Returns the length of the memory
        
        returns:
            length (int): number of examples in the memory
        """
        return len(self.memory)

    def is_available(self):
        """
        Returns True if we have enough examples within the memory
        
        returns:
            available (bool): True if we have at least n = batch_size examples in the memory
        """
        if self._available:
            return True

        if len(self.memory) > self.batch_size:
            self._available = True
        return self._available

In [6]:
class Environment:
    def __init__(self, game, size, seed):
        """
        A class that has helpful wrappers around the Gym environment
        
        game (string): name of Atari game, i.e. Breakout-v0
        size (int): height and width of observation after preprocessing
        seed (int): random seed
        """
        
        self.size = size
        
        #init game
        self.game = gym.make(game)
        
        #set random seed for determinism
        self.game.seed(seed)
        
    def process(self, obs):
        """
        Process an observation (i.e. convert to grayscale, resize and normalize)
        
        args:
            obs (np.array): observation from gym of game screen, should be (height, width, channels)
        
        returns:
            output (np.array): (self.size, self.size) array with all values <= 1
        """
        
        assert len(obs.shape) == 3 #make sure image is correct shape
        
        gray = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) #convert to grayscale
        output = cv2.resize(gray, (self.size, self.size)) #resize
        output = output.astype(np.float32, copy=False) #convert to float32
        output /= 255.0 #normalize values between [0, 1]
                     
        assert (output <= 1.0).all() 
                        
        return output
    
    def get_obs(self):
        """
        Gets a processed observation
        
        returns:
            obs (np.array): (self.size, self.size) array with all values <= 1
        """
        
        obs = self.game.render('rgb_array')
        obs = self.process(obs)
        return obs
    
    def init(self):
        """
        Reset the environment and return the initial state (unprocessed)
        
        returns:
            obs (np.array): observation from gym of game screen, should be (height, width, channels)
        """
        obs = self.game.reset()
        return obs
    
    def reset(self):
        """
        Reset the environment and return the initial state (processed)
        
        returns:
            output (np.array): (self.size, self.size) array with all values <= 1
        """
        obs = self.game.reset()
        output = self.process(obs)
        return output

In [7]:
class DQN(nn.Module):

    def __init__(self, n_actions):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(7*7*64, 512)
        self.fc2 = nn.Linear(512, n_actions) #actions from from env.action_space.n

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.fc1(x.view(x.size(0), -1))) #flattens the (N, C, H, W) to (N, C*H*W)
        return self.fc2(x)

In [11]:
class Agent:
    def __init__(self, env, mem, model, phi_length, update_freq, e_start, e_end, e_steps, gamma, target_update, print_update):
        """
        An agent class that handles training the model

        args:
            mem (ReplayMemory): ReplayMemory object
            env (Environment): Environment object
            model (nn.Module): PyTorch model
            phi_length (int): number of observations to stack to make a state
            frame_skip (int): we only use every n = frame_skip observations to make a state
            e_start (int): initial value of epsilon
            e_end (int): minimum value of epsilon
            e_steps (int): number of steps for epsilon to go from e_start to e_end
            gamma (float): decay rate of rewards
            target_update (int): after how many steps (frames) to update target model
            print_update (int): after how many steps (frames) to print summary of performance
            
        """
        self.env = env
        self.mem = mem
        self.model = model
        self.phi_length = phi_length
        self.update_freq = update_freq
        self.e_start = e_start
        self.e_end = e_end
        self.e_steps = e_steps
        self.gamma = gamma
        self.target_update = target_update
        self.print_update = print_update
        
        self.steps = 0 #number of steps taken
        self.episodes = 0 #number of episodes
        self.obs_buffer = deque(maxlen=phi_length) #for holding observations to be turned into states
        
        #put model on gpu if available
        self.model = model.to(device)
        
        #create target model
        #TODO: this may need to be a copy.deepcopy or load state dict
        self.target = copy.deepcopy(self.model)
    
        #create optimizer
        #trying params from: https://github.com/hengyuan-hu/rainbow
        #self.optimizer = optim.Adam(self.model.parameters(), lr=6.25e-5, eps=1.5e-4)
        #from dqn paper
        self.optimizer = optim.RMSprop(self.model.parameters(), lr=0.00025, alpha=0.95, momentum=0.95)
        
    def get_epsilon(self):
        """
        Calculates the value of epsilon from the current number of frames
        
        returns:
            epsilon (int): the probability of doing a random action
        """
        epsilon = self.e_end + (self.e_start - self.e_end) * math.exp(-1. * self.steps / self.e_steps)
        return epsilon
        
    def get_action(self, state):
        """
        Selects action to perform, with probability = epsilon chooses a random action,
        else chooses the best predicted action of the model
        
        args:
            state (np.array): input state to the model
            
        returns:
            action (int): the index of the action 
        """
    
        #get value of epsilon
        epsilon = self.get_epsilon()
        
        #with probablity of epsilon, pick a random action
        if random.random() < epsilon:
            action = self.env.game.action_space.sample()
        
        else:
            #with probability of (1 - epsilon) pick predicted value
            with torch.no_grad():
                state = torch.FloatTensor(state).unsqueeze(0).to(device) #convert to tensor, reshape and add to gpu
                Qsa = self.model(state) #pass state through model to get Qa
                action = Qsa.max(1)[1].item() #action is max Qa value
                
        #make sure the value is an integer
        assert isinstance(action, int)
            
        return action
    
    def get_initial_state(self):
        """
        Get the initial state to the model, a stack of processed observations
        
        returns:
            state (np.array): a stack of n = phi_length processed observations
        """
        
        _ = self.env.reset() #reset environment
        obs = self.env.get_obs() #get a processed observation
        state = np.stack([obs for _ in range(self.phi_length)], axis=0) #stack n = phi_length times to make a state
        
        #also fill the 
        for _ in range(self.phi_length):
            self.obs_buffer.append(obs)
        
        return state
        
    def get_state(self):
        """
        Get a stack from the observation buffer
        
        returns:
            state (np.array): a stack of n = phi_length processed observations
        """
        
        state = np.array(self.obs_buffer)
        
        return state
        
    def train(self):
        
        training_done = False
        reward_per_episode = []
        rewards_all_episodes = []
        
        while not training_done:
            
            episode_done = False
            episode_reward = 0
            episode_steps = 0
            
            #get initial state
            state = self.get_initial_state()
            
            while not episode_done:
                                
                #get action
                action = self.get_action(state)
                               
                #apply action while skipping frames
                observation, reward, episode_done, info = self.env.game.step(action)

                #sum rewards
                episode_reward += reward
                    
                #append processed observation to a buffer of observations
                self.obs_buffer.append(self.env.get_obs())
                        
                #get the next state from the observation buffer
                next_state = self.get_state()
                
                #add to memory, for terminal states, set next_state to None
                if episode_done:
                    mem.put(state, action, reward, None)
                else:
                    mem.put(state, action, reward, next_state)
                    
                #make new state the old next_state
                state = next_state
                                
                #update model parameters
                if mem.is_available() and self.steps % self.update_freq == 0:
                    loss = self.optimize()
    
                #increase number of steps
                self.steps += 1
                episode_steps += 1 
            
                if self.steps % (self.target_update * self.update_freq) == 0:
                    self.target.load_state_dict(self.model.state_dict())
            
                if self.steps % self.print_update == 0:
                    avg_reward_per_episode = np.mean(reward_per_episode)
                    #rewards_all_episodes.extend(reward_per_episode)
                    reward_per_episode = []
                    print(f'Episodes: {self.episodes}, Steps: {self.steps}, Epsilon: {self.get_epsilon():.2f}, Avg. Reward per Ep: {avg_reward_per_episode:.2f}')

            #increase number of episodes
            self.episodes += 1
            reward_per_episode.append(episode_reward)
            
    def optimize(self):
        """
        Update model parameters
        """
        
        #get a batch
        transitions = mem.sample()
        
        #need to set the Q value of terminal states to 0
        #this mask will be 1 for non-terminal next_states and 0 for terminal next_states
        non_terminal_mask = torch.ByteTensor(list(map(lambda ns: ns is not None, transitions.next_state)))
        
        #this will be 1 for terminal next_states, and 0 for non-terminal next states
        terminal_mask = 1 - non_terminal_mask
        
        #state_batch = (N*C,H,W), where N is batch_size, C is phi_length, H and W are processed obs size
        state_batch = torch.cat(transitions.state).to(device)
        
        #action_batch = (N, 1)
        action_batch = torch.cat(transitions.action).unsqueeze(1).to(device)
        
        #reward_batch = (N, 1)
        reward_batch = torch.cat(transitions.reward).unsqueeze(1).to(device)
        
        #clip reward between +1 and -1
        reward_batch.data.clamp_(-1, 1)
        
        #next_state_batch = (V*C,H,W), where V is non_terminal next_state
        non_terminal_next_state_batch = torch.cat([ns for ns in transitions.next_state if ns is not None]).to(device)
        
        #reshape to (N,C,H,W)
        state_batch = state_batch.view(mem.batch_size, self.phi_length, self.env.size, self.env.size)
        
        #reshape to (V,C,H,W)
        non_terminal_next_state_batch = non_terminal_next_state_batch.view(-1, self.phi_length, self.env.size, self.env.size)
        
        #get predicted Q values from model
        Q_pred = self.model(state_batch)
        
        #get Q values of action taken, shape (N,1)
        Q_vals = Q_pred.gather(1, action_batch)
          
        """
        in double dqn, get q values from model and target
        """
        #get Q values from target model  
        model_pred = self.model(non_terminal_next_state_batch)
        target_pred = self.target(non_terminal_next_state_batch) 
            
        """
        then get the actions from the model predicted Q values, not the target
        """
        target_actions = model_pred.max(dim=1)[1].view(-1, 1)
        
        """
        then use these actions to get Q values from the target network
        """
        target_Q = target_pred.gather(1, target_actions) 
            
        #tensor for placing target values
        target_vals = torch.zeros(mem.batch_size, 1).to(device) 
            
        """
        don't forget to update this line
        """    
        #fill in target values for non_terminal states
        #the terminal states will stay initialized as zeros
        target_vals[non_terminal_mask] = reward_batch[non_terminal_mask] + target_Q * self.gamma
            
        #calculate loss between Q values and target values
        loss = F.smooth_l1_loss(Q_vals, target_vals.detach())
            
        #zero gradients
        self.optimizer.zero_grad()
        
        #calculate gradients 
        loss.backward()
        
        #clamp gradients
        for p in self.model.parameters():
            p.grad.data.clamp_(-1, 1)
            
        #update parameters
        self.optimizer.step()
        
        return loss.item()

IndentationError: unexpected indent (<ipython-input-11-bdf299954458>, line 97)

In [9]:
env = Environment(GAME, PROCESSED_SIZE, SEED)
mem = ReplayMemory(CAPACITY, BATCH_SIZE)
model = DQN(N_ACTIONS)
agent = Agent(env, mem, model, PHI_LENGTH, UPDATE_FREQ, EPSILON_START, EPSILON_END, EPSILON_STEPS, GAMMA, TARGET_UPDATE, PRINT_UPDATE)

In [10]:
agent.train()

Episodes: 3, Steps: 5000, Epsilon: 1.00, Avg. Reward per Ep: -19.33
Episodes: 7, Steps: 10000, Epsilon: 0.99, Avg. Reward per Ep: -20.00
Episodes: 11, Steps: 15000, Epsilon: 0.99, Avg. Reward per Ep: -20.00
Episodes: 15, Steps: 20000, Epsilon: 0.98, Avg. Reward per Ep: -19.75
Episodes: 19, Steps: 25000, Epsilon: 0.98, Avg. Reward per Ep: -20.25
Episodes: 24, Steps: 30000, Epsilon: 0.97, Avg. Reward per Ep: -20.80
Episodes: 27, Steps: 35000, Epsilon: 0.97, Avg. Reward per Ep: -20.00
Episodes: 32, Steps: 40000, Epsilon: 0.96, Avg. Reward per Ep: -21.00
Episodes: 36, Steps: 45000, Epsilon: 0.96, Avg. Reward per Ep: -20.00
Episodes: 40, Steps: 50000, Epsilon: 0.96, Avg. Reward per Ep: -20.50
Episodes: 44, Steps: 55000, Epsilon: 0.95, Avg. Reward per Ep: -20.50
Episodes: 48, Steps: 60000, Epsilon: 0.95, Avg. Reward per Ep: -20.50
Episodes: 52, Steps: 65000, Epsilon: 0.94, Avg. Reward per Ep: -19.00
Episodes: 56, Steps: 70000, Epsilon: 0.94, Avg. Reward per Ep: -21.00
Episodes: 60, Steps: 75

Episodes: 415, Steps: 580000, Epsilon: 0.60, Avg. Reward per Ep: -17.33
Episodes: 417, Steps: 585000, Epsilon: 0.60, Avg. Reward per Ep: -16.00
Episodes: 420, Steps: 590000, Epsilon: 0.60, Avg. Reward per Ep: -19.00
Episodes: 423, Steps: 595000, Epsilon: 0.60, Avg. Reward per Ep: -19.33
Episodes: 427, Steps: 600000, Epsilon: 0.59, Avg. Reward per Ep: -19.25
Episodes: 430, Steps: 605000, Epsilon: 0.59, Avg. Reward per Ep: -18.33
Episodes: 432, Steps: 610000, Epsilon: 0.59, Avg. Reward per Ep: -20.00
Episodes: 436, Steps: 615000, Epsilon: 0.59, Avg. Reward per Ep: -19.75
Episodes: 439, Steps: 620000, Epsilon: 0.58, Avg. Reward per Ep: -19.00
Episodes: 442, Steps: 625000, Epsilon: 0.58, Avg. Reward per Ep: -19.33
Episodes: 445, Steps: 630000, Epsilon: 0.58, Avg. Reward per Ep: -19.33
Episodes: 448, Steps: 635000, Epsilon: 0.58, Avg. Reward per Ep: -19.00
Episodes: 451, Steps: 640000, Epsilon: 0.57, Avg. Reward per Ep: -18.67
Episodes: 453, Steps: 645000, Epsilon: 0.57, Avg. Reward per Ep:

Episodes: 768, Steps: 1150000, Epsilon: 0.38, Avg. Reward per Ep: -18.67
Episodes: 770, Steps: 1155000, Epsilon: 0.38, Avg. Reward per Ep: -19.00
Episodes: 774, Steps: 1160000, Epsilon: 0.38, Avg. Reward per Ep: -19.00
Episodes: 777, Steps: 1165000, Epsilon: 0.38, Avg. Reward per Ep: -19.00
Episodes: 780, Steps: 1170000, Epsilon: 0.38, Avg. Reward per Ep: -18.67
Episodes: 784, Steps: 1175000, Epsilon: 0.38, Avg. Reward per Ep: -19.50
Episodes: 787, Steps: 1180000, Epsilon: 0.38, Avg. Reward per Ep: -20.33
Episodes: 791, Steps: 1185000, Epsilon: 0.38, Avg. Reward per Ep: -19.50
Episodes: 794, Steps: 1190000, Epsilon: 0.37, Avg. Reward per Ep: -20.33
Episodes: 797, Steps: 1195000, Epsilon: 0.37, Avg. Reward per Ep: -19.00
Episodes: 800, Steps: 1200000, Epsilon: 0.37, Avg. Reward per Ep: -18.67
Episodes: 803, Steps: 1205000, Epsilon: 0.37, Avg. Reward per Ep: -19.33
Episodes: 806, Steps: 1210000, Epsilon: 0.37, Avg. Reward per Ep: -20.00
Episodes: 810, Steps: 1215000, Epsilon: 0.37, Avg. 

Episodes: 1094, Steps: 1710000, Epsilon: 0.26, Avg. Reward per Ep: -18.00
Episodes: 1097, Steps: 1715000, Epsilon: 0.26, Avg. Reward per Ep: -18.33
Episodes: 1099, Steps: 1720000, Epsilon: 0.26, Avg. Reward per Ep: -18.50
Episodes: 1102, Steps: 1725000, Epsilon: 0.26, Avg. Reward per Ep: -16.00
Episodes: 1104, Steps: 1730000, Epsilon: 0.26, Avg. Reward per Ep: -17.50
Episodes: 1106, Steps: 1735000, Epsilon: 0.26, Avg. Reward per Ep: -20.50
Episodes: 1109, Steps: 1740000, Epsilon: 0.26, Avg. Reward per Ep: -19.00
Episodes: 1111, Steps: 1745000, Epsilon: 0.26, Avg. Reward per Ep: -16.50
Episodes: 1114, Steps: 1750000, Epsilon: 0.26, Avg. Reward per Ep: -18.33
Episodes: 1116, Steps: 1755000, Epsilon: 0.26, Avg. Reward per Ep: -14.50
Episodes: 1119, Steps: 1760000, Epsilon: 0.25, Avg. Reward per Ep: -17.67
Episodes: 1121, Steps: 1765000, Epsilon: 0.25, Avg. Reward per Ep: -17.00
Episodes: 1124, Steps: 1770000, Epsilon: 0.25, Avg. Reward per Ep: -16.33
Episodes: 1127, Steps: 1775000, Epsilo

Episodes: 1368, Steps: 2265000, Epsilon: 0.19, Avg. Reward per Ep: -17.00
Episodes: 1371, Steps: 2270000, Epsilon: 0.19, Avg. Reward per Ep: -18.00
Episodes: 1373, Steps: 2275000, Epsilon: 0.19, Avg. Reward per Ep: -17.00
Episodes: 1375, Steps: 2280000, Epsilon: 0.19, Avg. Reward per Ep: -15.00
Episodes: 1377, Steps: 2285000, Epsilon: 0.19, Avg. Reward per Ep: -16.00
Episodes: 1380, Steps: 2290000, Epsilon: 0.19, Avg. Reward per Ep: -18.67
Episodes: 1383, Steps: 2295000, Epsilon: 0.19, Avg. Reward per Ep: -17.67
Episodes: 1385, Steps: 2300000, Epsilon: 0.19, Avg. Reward per Ep: -16.00
Episodes: 1387, Steps: 2305000, Epsilon: 0.19, Avg. Reward per Ep: -17.00
Episodes: 1390, Steps: 2310000, Epsilon: 0.19, Avg. Reward per Ep: -18.67
Episodes: 1393, Steps: 2315000, Epsilon: 0.19, Avg. Reward per Ep: -17.00
Episodes: 1395, Steps: 2320000, Epsilon: 0.19, Avg. Reward per Ep: -18.50
Episodes: 1398, Steps: 2325000, Epsilon: 0.19, Avg. Reward per Ep: -18.33
Episodes: 1400, Steps: 2330000, Epsilo

Episodes: 1638, Steps: 2820000, Epsilon: 0.15, Avg. Reward per Ep: -17.00
Episodes: 1640, Steps: 2825000, Epsilon: 0.15, Avg. Reward per Ep: -16.50
Episodes: 1643, Steps: 2830000, Epsilon: 0.15, Avg. Reward per Ep: -18.00
Episodes: 1645, Steps: 2835000, Epsilon: 0.15, Avg. Reward per Ep: -15.00
Episodes: 1648, Steps: 2840000, Epsilon: 0.15, Avg. Reward per Ep: -16.00
Episodes: 1651, Steps: 2845000, Epsilon: 0.15, Avg. Reward per Ep: -17.00
Episodes: 1653, Steps: 2850000, Epsilon: 0.15, Avg. Reward per Ep: -15.00
Episodes: 1656, Steps: 2855000, Epsilon: 0.15, Avg. Reward per Ep: -18.00
Episodes: 1658, Steps: 2860000, Epsilon: 0.15, Avg. Reward per Ep: -14.50
Episodes: 1660, Steps: 2865000, Epsilon: 0.15, Avg. Reward per Ep: -17.50
Episodes: 1662, Steps: 2870000, Epsilon: 0.15, Avg. Reward per Ep: -15.50
Episodes: 1665, Steps: 2875000, Epsilon: 0.15, Avg. Reward per Ep: -15.33
Episodes: 1667, Steps: 2880000, Epsilon: 0.15, Avg. Reward per Ep: -19.00
Episodes: 1669, Steps: 2885000, Epsilo

Episodes: 1919, Steps: 3375000, Epsilon: 0.13, Avg. Reward per Ep: -16.67
Episodes: 1921, Steps: 3380000, Epsilon: 0.13, Avg. Reward per Ep: -16.00
Episodes: 1924, Steps: 3385000, Epsilon: 0.13, Avg. Reward per Ep: -18.33
Episodes: 1927, Steps: 3390000, Epsilon: 0.13, Avg. Reward per Ep: -17.67
Episodes: 1929, Steps: 3395000, Epsilon: 0.13, Avg. Reward per Ep: -17.50
Episodes: 1932, Steps: 3400000, Epsilon: 0.13, Avg. Reward per Ep: -17.33
Episodes: 1934, Steps: 3405000, Epsilon: 0.13, Avg. Reward per Ep: -14.50
Episodes: 1937, Steps: 3410000, Epsilon: 0.13, Avg. Reward per Ep: -18.33
Episodes: 1939, Steps: 3415000, Epsilon: 0.13, Avg. Reward per Ep: -17.00
Episodes: 1942, Steps: 3420000, Epsilon: 0.13, Avg. Reward per Ep: -17.00
Episodes: 1945, Steps: 3425000, Epsilon: 0.13, Avg. Reward per Ep: -18.00
Episodes: 1948, Steps: 3430000, Epsilon: 0.13, Avg. Reward per Ep: -17.67
Episodes: 1950, Steps: 3435000, Epsilon: 0.13, Avg. Reward per Ep: -18.00
Episodes: 1953, Steps: 3440000, Epsilo

Episodes: 2208, Steps: 3930000, Epsilon: 0.12, Avg. Reward per Ep: -16.00
Episodes: 2210, Steps: 3935000, Epsilon: 0.12, Avg. Reward per Ep: -16.50
Episodes: 2213, Steps: 3940000, Epsilon: 0.12, Avg. Reward per Ep: -16.00
Episodes: 2215, Steps: 3945000, Epsilon: 0.12, Avg. Reward per Ep: -17.00
Episodes: 2218, Steps: 3950000, Epsilon: 0.12, Avg. Reward per Ep: -17.00
Episodes: 2220, Steps: 3955000, Epsilon: 0.12, Avg. Reward per Ep: -13.50
Episodes: 2223, Steps: 3960000, Epsilon: 0.12, Avg. Reward per Ep: -18.33
Episodes: 2225, Steps: 3965000, Epsilon: 0.12, Avg. Reward per Ep: -15.50
Episodes: 2227, Steps: 3970000, Epsilon: 0.12, Avg. Reward per Ep: -16.50
Episodes: 2229, Steps: 3975000, Epsilon: 0.12, Avg. Reward per Ep: -16.00
Episodes: 2232, Steps: 3980000, Epsilon: 0.12, Avg. Reward per Ep: -18.00
Episodes: 2234, Steps: 3985000, Epsilon: 0.12, Avg. Reward per Ep: -17.50
Episodes: 2237, Steps: 3990000, Epsilon: 0.12, Avg. Reward per Ep: -19.67
Episodes: 2239, Steps: 3995000, Epsilo

Episodes: 2474, Steps: 4485000, Epsilon: 0.11, Avg. Reward per Ep: -16.67
Episodes: 2476, Steps: 4490000, Epsilon: 0.11, Avg. Reward per Ep: -16.50
Episodes: 2478, Steps: 4495000, Epsilon: 0.11, Avg. Reward per Ep: -17.00
Episodes: 2481, Steps: 4500000, Epsilon: 0.11, Avg. Reward per Ep: -15.67
Episodes: 2483, Steps: 4505000, Epsilon: 0.11, Avg. Reward per Ep: -18.50
Episodes: 2486, Steps: 4510000, Epsilon: 0.11, Avg. Reward per Ep: -16.67
Episodes: 2489, Steps: 4515000, Epsilon: 0.11, Avg. Reward per Ep: -17.67
Episodes: 2492, Steps: 4520000, Epsilon: 0.11, Avg. Reward per Ep: -17.33
Episodes: 2496, Steps: 4525000, Epsilon: 0.11, Avg. Reward per Ep: -18.25
Episodes: 2499, Steps: 4530000, Epsilon: 0.11, Avg. Reward per Ep: -17.67
Episodes: 2501, Steps: 4535000, Epsilon: 0.11, Avg. Reward per Ep: -15.00
Episodes: 2504, Steps: 4540000, Epsilon: 0.11, Avg. Reward per Ep: -18.00
Episodes: 2507, Steps: 4545000, Epsilon: 0.11, Avg. Reward per Ep: -17.67
Episodes: 2509, Steps: 4550000, Epsilo

Episodes: 2745, Steps: 5040000, Epsilon: 0.11, Avg. Reward per Ep: -18.00
Episodes: 2747, Steps: 5045000, Epsilon: 0.11, Avg. Reward per Ep: -17.50
Episodes: 2749, Steps: 5050000, Epsilon: 0.11, Avg. Reward per Ep: -15.00
Episodes: 2752, Steps: 5055000, Epsilon: 0.11, Avg. Reward per Ep: -16.33
Episodes: 2754, Steps: 5060000, Epsilon: 0.11, Avg. Reward per Ep: -15.00
Episodes: 2757, Steps: 5065000, Epsilon: 0.11, Avg. Reward per Ep: -15.67
Episodes: 2759, Steps: 5070000, Epsilon: 0.11, Avg. Reward per Ep: -16.50
Episodes: 2761, Steps: 5075000, Epsilon: 0.11, Avg. Reward per Ep: -15.00
Episodes: 2763, Steps: 5080000, Epsilon: 0.11, Avg. Reward per Ep: -14.50
Episodes: 2765, Steps: 5085000, Epsilon: 0.11, Avg. Reward per Ep: -15.00
Episodes: 2767, Steps: 5090000, Epsilon: 0.11, Avg. Reward per Ep: -16.50
Episodes: 2769, Steps: 5095000, Epsilon: 0.11, Avg. Reward per Ep: -17.50
Episodes: 2772, Steps: 5100000, Epsilon: 0.11, Avg. Reward per Ep: -17.67
Episodes: 2775, Steps: 5105000, Epsilo

Episodes: 3009, Steps: 5595000, Epsilon: 0.10, Avg. Reward per Ep: -16.00
Episodes: 3011, Steps: 5600000, Epsilon: 0.10, Avg. Reward per Ep: -17.50
Episodes: 3014, Steps: 5605000, Epsilon: 0.10, Avg. Reward per Ep: -13.00
Episodes: 3016, Steps: 5610000, Epsilon: 0.10, Avg. Reward per Ep: -14.00
Episodes: 3018, Steps: 5615000, Epsilon: 0.10, Avg. Reward per Ep: -15.50
Episodes: 3020, Steps: 5620000, Epsilon: 0.10, Avg. Reward per Ep: -13.00
Episodes: 3022, Steps: 5625000, Epsilon: 0.10, Avg. Reward per Ep: -14.00
Episodes: 3025, Steps: 5630000, Epsilon: 0.10, Avg. Reward per Ep: -16.00
Episodes: 3026, Steps: 5635000, Epsilon: 0.10, Avg. Reward per Ep: -13.00
Episodes: 3029, Steps: 5640000, Epsilon: 0.10, Avg. Reward per Ep: -14.67
Episodes: 3031, Steps: 5645000, Epsilon: 0.10, Avg. Reward per Ep: -14.00
Episodes: 3033, Steps: 5650000, Epsilon: 0.10, Avg. Reward per Ep: -15.00
Episodes: 3035, Steps: 5655000, Epsilon: 0.10, Avg. Reward per Ep: -14.00
Episodes: 3037, Steps: 5660000, Epsilo

KeyboardInterrupt: 