https://arxiv.org/abs/1511.06581
http://coach.nervanasys.com/algorithms/value_optimization/dueling_dqn/index.html
http://torch.ch/blog/2016/04/30/dueling_dqn.html
https://medium.com/@awjuliani/simple-reinforcement-learning-with-tensorflow-part-4-deep-q-networks-and-beyond-8438a3e2b8df
https://github.com/dxyang/DQN_pytorch

In [1]:
import matplotlib.pyplot as plt
import gym
import cv2
import random
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import deque
from collections import namedtuple
import copy

In [2]:
SEED = 1234
CAPACITY = 10_000
BATCH_SIZE = 32
GAME = 'PongNoFrameskip-v4'
N_ACTIONS = gym.make(GAME).action_space.n
LEARNING_START = CAPACITY
UPDATE_FREQ = 1
EPSILON_START = 1.0
EPSILON_END = 0.01
EPSILON_STEPS = 30_000
GAMMA = 0.99
TARGET_UPDATE = 1_000
PRINT_UPDATE = 5_000

In [3]:
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)
random.seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
class ReplayMemory:
    def __init__(self, capacity, batch_size):
        """
        Replay memory that holds examples in the form of (s, a, r, s')
        
        args:
            capacity (int): the size of the memory
            batch_size (int): size of batches used for training model
        """
        
        self.batch_size = batch_size
        self.capacity = capacity
        
        #the memory holds al the (s, a, r, s') pairs
        #a deque is first-in-first-out, i.e. when you push an example onto the queue
        #and it at maximum capacity, the oldest example is popped off the queue
        self.memory = deque(maxlen=self.capacity) 
        
        #examples in the queue are saved as Transitions
        #makes the code more readable and intuitive when getting  examples from the memory
        self.Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state')) 

    def push(self, state, action, reward, next_state):
        """
        Places an (s, a, r, s') example in the memory
        
        args:
            state (np.array): the observation obtained from the environment before the action
            action (list[int]): the action taken
            reward (list[int]): the reward received from the action taken at the current state
            next_state (np.array or None): the observation obtained from the environment after the action,
                                           is None when the state is a terminal state
        """
        
        #convert all to tensors
        state = torch.FloatTensor(state)
        action = torch.LongTensor([action])
        reward = torch.FloatTensor([reward])
        if next_state is not None:
            next_state = torch.FloatTensor(next_state)
        
        #create a transition
        transition = self.Transition(state=state, action=action, reward=reward, next_state=next_state)
        
        #add to the memory
        self.memory.append(transition)

    def sample(self):
        """
        Gets a random sample of n = batch_size examples from the memory
        The transition returned contains n of each elements, i.e. a batch_size of 32
        means this will return a tuple of (32 states, 32 actions, 32 rewards, 32 next_states)
            
        returns:
            Transitions (namedtuple): a tuple of (s, a, r, s'), 
        """
        
        #sample batch_size transitions for the memory
        transitions = random.sample(self.memory, self.batch_size)
        
        #unzip and then rezip so each element contains batch_size examples 
        return self.Transition(*(zip(*transitions)))

    def __len__(self):
        """
        Returns the length of the memory, i.e. number of examples in the memory
        
        returns:
            length (int): number of examples in the memory
        """
        
        return len(self.memory)

In [6]:
class NoopResetEnv(gym.Wrapper):
    def __init__(self, env, noop_max=30):
        """Sample initial states by taking random number of no-ops on reset.
        No-op is assumed to be action 0.
        """
        gym.Wrapper.__init__(self, env)
        self.noop_max = noop_max
        self.override_num_noops = None
        self.noop_action = 0
        assert env.unwrapped.get_action_meanings()[0] == 'NOOP'

    def reset(self, **kwargs):
        """ Do no-op action for a number of steps in [1, noop_max]."""
        self.env.reset(**kwargs)
        if self.override_num_noops is not None:
            noops = self.override_num_noops
        else:
            noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) 
        assert noops > 0
        obs = None
        for _ in range(noops):
            obs, _, done, _ = self.env.step(self.noop_action)
            if done:
                obs = self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        return self.env.step(ac)
    
class MaxAndSkipEnv(gym.Wrapper):
    def __init__(self, env, skip=4):
        """Return only every `skip`-th frame"""
        gym.Wrapper.__init__(self, env)
        # most recent raw observations (for max pooling across time steps)
        self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
        self._skip       = skip

    def reset(self):
        return self.env.reset()

    def step(self, action):
        """Repeat action, sum reward, and max over last observations."""
        total_reward = 0.0
        done = None
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            if i == self._skip - 2: self._obs_buffer[0] = obs
            if i == self._skip - 1: self._obs_buffer[1] = obs
            total_reward += reward
            if done:
                break
        # Note that the observation on the done=True frame
        # doesn't matter
        max_frame = self._obs_buffer.max(axis=0)

        return max_frame, total_reward, done, info

    def reset(self, **kwargs): 
        return self.env.reset(**kwargs)
    
class FireResetEnv(gym.Wrapper):
    def __init__(self, env):
        """Take action on reset for environments that are fixed until firing."""
        gym.Wrapper.__init__(self, env)
        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
        assert len(env.unwrapped.get_action_meanings()) >= 3

    def reset(self, **kwargs):
        self.env.reset(**kwargs)
        obs, _, done, _ = self.env.step(1)
        if done:
            self.env.reset(**kwargs)
        obs, _, done, _ = self.env.step(2)
        if done:
            self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        return self.env.step(ac)

class EpisodicLifeEnv(gym.Wrapper):
    def __init__(self, env):
        """Make end-of-life == end-of-episode, but only reset on true game over.
        Done by DeepMind for the DQN and co. since it helps value estimation.
        """
        gym.Wrapper.__init__(self, env)
        self.lives = 0
        self.was_real_done  = True

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self.was_real_done = done
        # check current lives, make loss of life terminal,
        # then update lives to handle bonus lives
        lives = self.env.unwrapped.ale.lives()
        if lives < self.lives and lives > 0:
            # for Qbert sometimes we stay in lives == 0 condtion for a few frames
            # so its important to keep lives > 0, so that we only reset once
            # the environment advertises done.
            done = True
        self.lives = lives
        return obs, reward, done, info

    def reset(self, **kwargs):
        """Reset only when lives are exhausted.
        This way all states are still reachable even though lives are episodic,
        and the learner need not know about any of this behind-the-scenes.
        """
        if self.was_real_done:
            obs = self.env.reset(**kwargs)
        else:
            # no-op step to advance from terminal/lost life state
            obs, _, _, _ = self.env.step(0)
        self.lives = self.env.unwrapped.ale.lives()
        return obs
    
class WarpFrame(gym.ObservationWrapper):
    def __init__(self, env):
        """Warp frames to 84x84 as done in the Nature paper and later work."""
        gym.ObservationWrapper.__init__(self, env)
        self.width = 84
        self.height = 84
        self.observation_space = gym.spaces.Box(low=0, high=255,
            shape=(self.height, self.width, 1), dtype=np.uint8)

    def observation(self, frame):
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
        return frame[:, :, None]
    
class ClipRewardEnv(gym.RewardWrapper):
    def __init__(self, env):
        gym.RewardWrapper.__init__(self, env)

    def reward(self, reward):
        """Bin reward to {+1, 0, -1} by its sign."""
        return np.sign(reward)

class LazyFrames(object):
    def __init__(self, frames):
        """This object ensures that common frames between the observations are only stored once.
        It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
        buffers.
        This object should only be converted to numpy array before being passed to the model.
        You'd not believe how complex the previous solution was."""
        self._frames = frames
        self._out = None

    def _force(self):
        if self._out is None:
            self._out = np.concatenate(self._frames, axis=2)
            self._frames = None
        return self._out

    def __array__(self, dtype=None):
        out = self._force()
        if dtype is not None:
            out = out.astype(dtype)
        return out

    def __len__(self):
        return len(self._force())

    def __getitem__(self, i):
        return self._force()[i]    
    
class FrameStack(gym.Wrapper):
    def __init__(self, env, k):
        """Stack k last frames.
        Returns lazy array, which is much more memory efficient.
        See Also
        --------
        baselines.common.atari_wrappers.LazyFrames
        """
        gym.Wrapper.__init__(self, env)
        self.k = k
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8)

    def reset(self):
        ob = self.env.reset()
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, action):
        ob, reward, done, info = self.env.step(action)
        self.frames.append(ob)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        assert len(self.frames) == self.k
        return LazyFrames(list(self.frames))

class ImageToPyTorch(gym.ObservationWrapper):
    """
    Image shape to num_channels x weight x height
    """
    def __init__(self, env):
        super(ImageToPyTorch, self).__init__(env)
        old_shape = self.observation_space.shape
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(old_shape[-1], old_shape[0], old_shape[1]), dtype=np.uint8)

    def observation(self, observation):
        return np.swapaxes(observation, 2, 0)
    
def wrap_pytorch(env):
    return ImageToPyTorch(env)
    
def make_atari(env_id):
    env = gym.make(env_id)
    assert 'NoFrameskip' in env.spec.id
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    return env

def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False):
    """Configure environment for DeepMind-style Atari.
    """
    if episode_life:
        env = EpisodicLifeEnv(env)
    if 'FIRE' in env.unwrapped.get_action_meanings():
        env = FireResetEnv(env)
    env = WarpFrame(env)
    if scale:
        env = ScaledFloatFrame(env)
    if clip_rewards:
        env = ClipRewardEnv(env)
    if frame_stack:
        env = FrameStack(env, 4)
    return env

In [7]:
class DQN(nn.Module):

    def __init__(self, n_actions):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        
        self.a_fc1 = nn.Linear(7*7*64, 512)
        self.a_fc2 = nn.Linear(512, n_actions) #actions from from env.action_space.n

        self.v_fc1 = nn.Linear(7*7*64, 512)
        self.v_fc2 = nn.Linear(512, 1) 
        
    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(x.size(0), -1) #flattens the (N, C, H, W) to (N, C*H*W)
        
        a = F.relu(self.a_fc1(x))
        a = self.a_fc2(a)
        
        v = F.relu(self.v_fc1(x))
        v = self.v_fc2(v)
        
        return v + a - a.mean()

In [8]:
class Agent:
    def __init__(self, env, mem, model, update_freq, learning_start, e_start, e_end, e_steps, gamma, target_update, print_update):
        """
        An agent class that handles training the model

        args:
            mem (ReplayMemory): ReplayMemory object
            env (Environment): Environment object
            model (nn.Module): PyTorch model
            update_freq (int): we only update the model every update_freq steps, 1 means update every step
            learning_start (int): we only start updating the model after learning_start steps
            e_start (int): initial value of epsilon
            e_end (int): minimum value of epsilon
            e_steps (int): controls the rate of decay from e_start to e_end
            gamma (float): decay rate of rewards
            target_update (int): update target model after this many parameter updates
            print_update (int): print summary of performance after this many steps
        """
        
        self.env = env
        self.mem = mem
        self.model = model
        self.update_freq = update_freq
        self.learning_start = learning_start
        self.e_start = e_start
        self.e_end = e_end
        self.e_steps = e_steps
        self.gamma = gamma
        self.target_update = target_update
        self.print_update = print_update
        
        self.steps = 0 #number of steps taken
        self.episodes = 0 #number of episodes
        
        #put model on gpu if available
        self.model = model.to(device)
        
        #create target model
        #set to evaluation mode to turn off batch-norm/dropout if used
        self.target = copy.deepcopy(self.model)
        self.target.eval()
    
        #create optimizer
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-4)
        
    def get_epsilon(self):
        """
        Calculates the value of epsilon from the current number of steps
        
        returns:
            epsilon (float): the probability of doing a random action
        """
        epsilon = self.e_end + (self.e_start - self.e_end) * math.exp(-1. * self.steps / self.e_steps)
        return epsilon
        
    def get_action(self, state):
        """
        Selects action to perform, with probability = epsilon chooses a random action,
        else chooses the best predicted action of the model
        
        args:
            state (np.array): input state to the model
            
        returns:
            action (int): the index of the action 
        """
    
        #get value of epsilon
        epsilon = self.get_epsilon()
        
        #with probablity of epsilon, pick a random action
        if random.random() < epsilon:
            action = self.env.action_space.sample()
        
        else:
            #with probability of (1 - epsilon) pick predicted value
            with torch.no_grad():
                state = torch.FloatTensor(state).unsqueeze(0).to(device) #convert to tensor, reshape and add to gpu
                Qsa = self.model(state) #pass state through model to get Qsa
                action = Qsa.max(1)[1].item() #action is max Qsa value
                
        #make sure the value is an integer
        assert isinstance(action, int)
            
        return action
        
    def train(self):
        """
        Main training loop of the model
        
        Algorithm:
        
        While true:
            - resets environment and gets initial state
            While episode is not done:
                - selects which action to take
                - performs action on environment and receives next state, reward and if the episode has ended
                - sums rewards across episode
                - pushes the (s, a, r, s') tuple onto the memory
                - updates the current state to be the state receives from the environment
                - increase total number of steps and steps within episode
                - updates model parameters every update_freq steps and only after learning_start steps
                - updates target model after target_update parameter updates and only after learning_start steps
                - prints summary every print_update steps
            - increase number of episodes
            - update list of all total episode rewards
        """
        
        training_done = False
        reward_per_episode = []
        
        while not training_done:
            
            episode_done = False
            episode_reward = 0
            episode_steps = 0
            
            #get initial state
            state = self.env.reset()
            
            while not episode_done:
                                
                #get action
                action = self.get_action(state)
                               
                #apply action while skipping frames
                next_state, reward, episode_done, info = self.env.step(action)

                #sum rewards
                episode_reward += reward
       
                #add to memory, if episode has finished set next_state to None
                mem.push(state, action, reward, None if episode_done else next_state)
                    
                #make next_state the new state
                state = next_state
                                
                #increase number of steps
                self.steps += 1
                episode_steps += 1 
            
                #update model parameters
                if self.steps % self.update_freq == 0 and self.steps > self.learning_start:
                    loss = self.optimize()
            
                #update target model
                if self.steps % (self.target_update*self.update_freq) == 0 and self.steps > self.learning_start:
                    self.target.load_state_dict(self.model.state_dict())
            
                #print summary
                if self.steps % self.print_update == 0:
                    avg_reward_per_episode = np.mean(reward_per_episode[-10:]) #average reward of last 10 episodes
                    reward_per_episode = []
                    print(f'Episodes: {self.episodes}, Steps: {self.steps}, Epsilon: {self.get_epsilon():.2f}, Avg. Reward per Ep: {avg_reward_per_episode:.2f}')

            #increase number of episodes
            self.episodes += 1
            reward_per_episode.append(episode_reward)
                            
    
    def optimize(self):
        """
        Update model parameters
        
        Algorithm:
        
        - get a batch of transitions
        - find out which next_states are terminal states
        - 
        """
        
        #get a batch
        transitions = mem.sample()
        
        #need to set the Q value of terminal states to 0
        #this mask will be 1 for non-terminal next_states and 0 for terminal next_states
        non_terminal_mask = torch.ByteTensor(list(map(lambda ns: ns is not None, transitions.next_state)))
        
        #this will be 1 for terminal next_states, and 0 for non-terminal next states
        terminal_mask = 1 - non_terminal_mask
                
        #state_batch = (N*C,H,W), where N is batch_size, C is phi_length, H and W state height and width
        state_batch = torch.cat(transitions.state).to(device)
        
        #action_batch = (N, 1)
        action_batch = torch.cat(transitions.action).unsqueeze(1).to(device)
        
        #reward_batch = (N, 1)
        reward_batch = torch.cat(transitions.reward).unsqueeze(1).to(device)
        
        #next_state_batch = (M*C,H,W), where M is number of non_terminal next_state in the batch
        non_terminal_next_state_batch = torch.cat([ns for ns in transitions.next_state if ns is not None]).to(device)
        
        #reshape to (N,C,H,W)
        state_batch = state_batch.view(mem.batch_size, 4, 84, 84)
        
        #reshape to (V,C,H,W)
        non_terminal_next_state_batch = non_terminal_next_state_batch.view(-1, 4, 84, 84)
        
        #get predicted Q values from model
        Q_preds = self.model(state_batch)
        
        #get Q values of action taken, shape (N,1)
        Q_vals = Q_preds.gather(1, action_batch)
          
        #get Q values from target model  
        target_pred = self.target(non_terminal_next_state_batch)

        #tensor for placing target values
        target_vals = torch.zeros(mem.batch_size, 1).to(device) 

        #fill in target values for non_terminal states
        #the terminal states will stay initialized as zeros
        target_vals[non_terminal_mask] = target_pred.max(1)[0].unsqueeze(1)
            
        expected_vals = reward_batch + (target_vals * self.gamma)
            
        #calculate loss between Q values and target values
        loss = F.smooth_l1_loss(Q_vals, expected_vals.detach())
            
        #zero gradients
        self.optimizer.zero_grad()
        
        #calculate gradients 
        loss.backward()
        
        #clamp gradients
        for p in self.model.parameters():
            p.grad.data.clamp_(-1, 1)
            
        #update parameters
        self.optimizer.step()
        
        return loss.item()

In [9]:
env = make_atari(GAME)
env = wrap_deepmind(env, frame_stack=True)
env = wrap_pytorch(env)
env.seed(SEED)

mem = ReplayMemory(CAPACITY, BATCH_SIZE)
model = DQN(N_ACTIONS)
agent = Agent(env, mem, model, UPDATE_FREQ, LEARNING_START, EPSILON_START, EPSILON_END, EPSILON_STEPS, GAMMA, TARGET_UPDATE, PRINT_UPDATE)

In [10]:
agent.train()

Episodes: 5, Steps: 5000, Epsilon: 0.85, Avg. Reward per Ep: -20.40
Episodes: 10, Steps: 10000, Epsilon: 0.72, Avg. Reward per Ep: -20.00
Episodes: 15, Steps: 15000, Epsilon: 0.61, Avg. Reward per Ep: -19.80
Episodes: 20, Steps: 20000, Epsilon: 0.52, Avg. Reward per Ep: -20.20
Episodes: 26, Steps: 25000, Epsilon: 0.44, Avg. Reward per Ep: -20.17
Episodes: 30, Steps: 30000, Epsilon: 0.37, Avg. Reward per Ep: -19.50
Episodes: 34, Steps: 35000, Epsilon: 0.32, Avg. Reward per Ep: -18.50
Episodes: 38, Steps: 40000, Epsilon: 0.27, Avg. Reward per Ep: -15.25
Episodes: 41, Steps: 45000, Epsilon: 0.23, Avg. Reward per Ep: -17.67
Episodes: 44, Steps: 50000, Epsilon: 0.20, Avg. Reward per Ep: -16.33
Episodes: 47, Steps: 55000, Epsilon: 0.17, Avg. Reward per Ep: -18.00
Episodes: 50, Steps: 60000, Epsilon: 0.14, Avg. Reward per Ep: -16.67
Episodes: 52, Steps: 65000, Epsilon: 0.12, Avg. Reward per Ep: -17.00
Episodes: 55, Steps: 70000, Epsilon: 0.11, Avg. Reward per Ep: -15.00
Episodes: 57, Steps: 7

KeyboardInterrupt: 