### Imports

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
!pip install stable_baselines3
import copy
from collections import namedtuple
from itertools import count
import stable_baselines3
import math
import random
import numpy as np 
import time
import os
# Changes depending on Environment: Only MsPacman was tested (DQN wasn't used after this)
NUM_ACTIONS = 9

Collecting stable_baselines3
  Downloading https://files.pythonhosted.org/packages/18/d3/6ae6e774ac6cf8f5eeca1c30b9125231db901b75f72da7d81e939f293f69/stable_baselines3-1.0-py3-none-any.whl (152kB)

### Neural Network

In [None]:
### Final network architecture
class DQN(nn.Module):
    """Convolutional Q-network mapping stacked frames to one value per action.

    Three convolutional layers feed a 512-unit dense layer and a linear
    output head. The dense layer's input size is fixed at 7 * 7 * 64, which
    is the feature-map size these three convolutions produce for 84x84
    inputs.

    Args:
        in_channels: Number of input channels (stacked frames).
        n_actions: Number of output values, one per action.
    """

    def __init__(self, in_channels=4, n_actions= NUM_ACTIONS):
        super().__init__()
        # Attribute names are part of the saved state_dict keys; keep them.
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc4 = nn.Linear(7 * 7 * 64, 512)  # flattened conv3 output -> hidden
        self.head = nn.Linear(512, n_actions)

    def forward(self, x):
        """Return the per-action outputs for a batch of inputs `x`."""
        scaled = x.float() / 255  # map raw values in 0..255 to 0..1
        features = F.relu(self.conv1(scaled))
        features = F.relu(self.conv2(features))
        features = F.relu(self.conv3(features))
        flat = features.view(features.size(0), -1)  # keep batch dim, flatten the rest
        hidden = F.relu(self.fc4(flat))
        return self.head(hidden)

### Replay Memory

In [None]:
# Record type for one stored transition (experience).
# The typename must match the variable name: pickle looks the class up by
# its type name, and the repr of each instance prints it.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


"""
 Here Transition == Experience. It consists of:
 state: start state - 4 stacked frames 
 action: action chosen in state 
 next_state: the state that the agent ends up in after taking action in state 
 reward: the reward that the agent receives for taking action in state
"""

class ReplayMemory(object):
    """Fixed-capacity circular buffer of Transition records.

    New records are appended until `capacity` is reached; after that each
    push overwrites the slot at `position`, which advances and wraps around.
    """

    def __init__(self, capacity):
        self.memory = []
        self.capacity = capacity
        self.position = 0  # index of the slot the next push writes to

    def push(self, *args):
        """Store one Transition built from `args` at the current position."""
        slot = self.position
        if len(self.memory) < self.capacity:
            # Buffer still growing: add a placeholder slot to fill below.
            self.memory.append(None)
        self.memory[slot] = Transition(*args)
        # Advance the write pointer, wrapping to the start when full.
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """Return `batch_size` stored records chosen at random without replacement."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Return the number of records currently stored."""
        return len(self.memory)

### DQN Functions

In [None]:
def select_action(state):
    """Choose an action for `state` with an epsilon-greedy rule.

    Epsilon decays exponentially from EPS_START towards EPS_END as the global
    `steps_done` counter grows (time constant EPS_DECAY). Every call
    increments `steps_done`.

    Args:
        state: Input tensor accepted by `policy_net`.

    Returns:
        A (1, 1) long tensor holding the chosen action index.
    """
    global steps_done
    sample = random.random()  # compared against the current epsilon
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1  # count every environment step

    if sample > eps_threshold:
        # Exploit: take the action with the highest policy-network output.
        # Use the configured `device` rather than a hard-coded 'cuda' so the
        # function also works on CPU-only machines.
        with torch.no_grad():
            return policy_net(state.to(device)).max(1)[1].view(1, 1)

    # Explore: uniformly random action.
    return torch.tensor([[random.randrange(NUM_ACTIONS)]], device=device, dtype=torch.long)

def optimize_model():
    """Run one optimisation step on `policy_net` from a random replay batch.

    Reads the module-level `memory`, `BATCH_SIZE`, `GAMMA`, `device`,
    `policy_net`, `target_net` and `optimizer`. Returns without doing anything
    while the replay memory holds fewer than BATCH_SIZE transitions.
    """
    # A full batch cannot be sampled until the buffer is large enough.
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)

    # Turn a list of Transition records into one Transition of tuples:
    # batch.state, batch.action, batch.next_state, batch.reward.
    batch = Transition(*zip(*transitions))

    # Build the batch tensors on the configured device (not a hard-coded
    # 'cuda', so CPU-only runs work too).
    state_batch = torch.cat(batch.state).to(device)
    action_batch = torch.cat(
        [torch.tensor([[a]], device=device) for a in batch.action])
    reward_batch = torch.cat(
        [torch.tensor([r], device=device) for r in batch.reward])

    # next_state is None for terminal transitions; those keep a value of 0.
    # A boolean mask is used because uint8 mask indexing is deprecated.
    non_final_mask = torch.tensor(
        [s is not None for s in batch.next_state],
        device=device, dtype=torch.bool)
    non_final_next_states = [s for s in batch.next_state if s is not None]

    # Q(s_t, a): evaluate the policy network and pick the column of the
    # action that was actually taken in each transition.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # V(s_{t+1}) from the target network: max over actions for non-terminal
    # next states, 0 for terminal ones.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    if non_final_next_states:
        # Guard: torch.cat raises on an empty list, which would happen if
        # every sampled transition were terminal.
        with torch.no_grad():
            next_batch = torch.cat(non_final_next_states).to(device)
            next_state_values[non_final_mask] = target_net(next_batch).max(1)[0]

    # Target: discounted next-state value plus the reward just received.
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Huber loss between predicted and target values.
    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))

    optimizer.zero_grad()
    loss.backward()
    # Clamp every gradient element to [-1, 1] before the update.
    nn.utils.clip_grad_value_(policy_net.parameters(), 1)
    optimizer.step()

def get_state(obs):
    """Preprocess an environment observation and return it as a 'state' tensor.

    The observation is copied into a NumPy array, its three axes are
    reordered from (0, 1, 2) to (2, 0, 1), and a leading dimension of size 1
    is added to the resulting tensor.
    """
    frames = np.array(obs).transpose((2, 0, 1))
    return torch.from_numpy(frames).unsqueeze(0)

def train(env, total_timesteps, render=False):
    """Run episodes in `env` until the global step counter reaches `total_timesteps`.

    Each step selects an action with `select_action`, stores the transition in
    the module-level `memory`, and (once `steps_done` exceeds INITIAL_MEMORY)
    calls `optimize_model`. The target network is synced from the policy
    network whenever `steps_done` is a multiple of TARGET_UPDATE. The
    environment is closed on exit, including when an exception is raised.

    Args:
        env: Environment exposing reset/step/render/close; `step` must return
            (obs, reward, done, info).
        total_timesteps: Stop starting new episodes once `steps_done` reaches this.
        render: If true, call `env.render()` before every step.

    Returns:
        A list with one tuple per episode:
        (total_reward, episode_number, last_step_index, steps_done).
    """
    training_history = []
    episode = 0
    try:
        while steps_done < total_timesteps:
            episode += 1
            state = get_state(env.reset())
            total_reward = 0.0
            for t in count():
                action = select_action(state)

                if render:
                    env.render()

                # Hand the environment a plain int rather than a (1, 1) tensor.
                obs, reward, done, _ = env.step(action.item())
                total_reward += reward

                # Terminal transitions are stored with next_state = None.
                next_state = None if done else get_state(obs)

                # Replay entries are kept on the CPU; the reward tensor is
                # created there directly instead of going via the device.
                memory.push(state, action.to('cpu'), next_state,
                            torch.tensor([reward]))
                state = next_state

                if steps_done > INITIAL_MEMORY:
                    # Enough experience collected: take one optimisation step.
                    optimize_model()

                    if steps_done % TARGET_UPDATE == 0:
                        # Periodically copy policy weights into the target net.
                        target_net.load_state_dict(policy_net.state_dict())

                if done:
                    break
            training_history.append((total_reward, episode, t, steps_done))
            if episode % 20 == 0:
                print('Total steps: {} \t Episode: {}/{} \t Total reward: {}'.format(steps_done, episode, t, total_reward))
    finally:
        # Always release the environment, even if training fails midway.
        env.close()
    return training_history

### Define Wrapper of Environment

These wrapper classes were taken from StableBaselines3 (cited in the report).
I switched to a different method later in the project.

In [None]:
from collections import deque
import numpy as np
import gym
import copy
import cv2
cv2.ocl.setUseOpenCL(False)

# Code to wrap environment, taken from StableBaselines3: cited in report

def make_env(env, stack_frames=True, episodic_life=True, clip_rewards=False, scale=False):
    """Wrap `env` with the preprocessing wrappers defined in this file.

    Wrappers are applied in this order (innermost first): EpisodicLifeEnv
    (optional), NoopResetEnv, MaxAndSkipEnv, FireResetEnv (only when the game
    has a 'FIRE' action), WarpFrame, FrameStack (optional), ClipRewardEnv
    (optional).

    Args:
        env: The base environment to wrap.
        stack_frames: If true, stack the last 4 frames with FrameStack.
        episodic_life: If true, wrap with EpisodicLifeEnv.
        clip_rewards: If true, wrap with ClipRewardEnv.
        scale: Accepted but not used by this function.

    Returns:
        The wrapped environment.
    """
    wrapped = EpisodicLifeEnv(env) if episodic_life else env

    wrapped = NoopResetEnv(wrapped, noop_max=30)
    wrapped = MaxAndSkipEnv(wrapped, skip=4)

    action_meanings = wrapped.unwrapped.get_action_meanings()
    if 'FIRE' in action_meanings:
        wrapped = FireResetEnv(wrapped)

    wrapped = WarpFrame(wrapped)
    if stack_frames:
        wrapped = FrameStack(wrapped, 4)
    if clip_rewards:
        wrapped = ClipRewardEnv(wrapped)
    return wrapped

class RewardScaler(gym.RewardWrapper):
    """Reward wrapper that multiplies every reward by 0.1."""

    def reward(self, reward):
        """Return `reward` scaled by 0.1."""
        scaled = reward * 0.1
        return scaled


class ClipRewardEnv(gym.RewardWrapper):
    """Reward wrapper that replaces each reward with its sign."""

    def __init__(self, env):
        super().__init__(env)

    def reward(self, reward):
        """Bin reward to {+1, 0, -1} by its sign."""
        clipped = np.sign(reward)
        return clipped


class LazyFrames(object):
    """Holds a list of frames and concatenates them only when first needed.

    The frames are kept as given until the array is first requested; they are
    then joined along axis 2, the result is cached, and the reference to the
    original list is dropped. This exists to save memory when the same frames
    are shared between consecutive observations.
    """

    def __init__(self, frames):
        """Store `frames` (a list of arrays) without copying or joining them."""
        self._out = None
        self._frames = frames

    def _force(self):
        """Return the concatenated array, building and caching it on first use."""
        if self._out is not None:
            return self._out
        self._out = np.concatenate(self._frames, axis=2)
        self._frames = None  # the individual frames are no longer needed
        return self._out

    def __array__(self, dtype=None):
        """Return the concatenated array, cast to `dtype` when one is given."""
        merged = self._force()
        return merged if dtype is None else merged.astype(dtype)

    def __len__(self):
        """Return the length of the concatenated array's first axis."""
        return len(self._force())

    def __getitem__(self, i):
        """Index into the concatenated array."""
        return self._force()[i]

class FrameStack(gym.Wrapper):
    """Wrapper whose observations are the k most recent frames.

    Observations are returned as LazyFrames objects, which join the frames
    along axis 2 only when converted to an array.
    """

    def __init__(self, env, k):
        """Stack k last frames.

        The observation space keeps the wrapped space's first two dimensions
        and dtype, and multiplies the third dimension by k.
        """
        super().__init__(env)
        self.k = k
        self.frames = deque([], maxlen=k)
        base_shape = env.observation_space.shape
        stacked_shape = (base_shape[0], base_shape[1], base_shape[2] * k)
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=stacked_shape,
            dtype=env.observation_space.dtype)

    def reset(self):
        """Reset the wrapped env and fill the buffer with k copies of the first frame."""
        first = self.env.reset()
        self.frames.extend([first] * self.k)
        return self._get_ob()

    def step(self, action):
        """Step the wrapped env, push the new frame, and return the stacked observation."""
        frame, reward, done, info = self.env.step(action)
        self.frames.append(frame)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        """Return the current k frames wrapped in a LazyFrames object."""
        assert len(self.frames) == self.k
        return LazyFrames(list(self.frames))


class WarpFrame(gym.ObservationWrapper):
    """Observation wrapper that converts frames to 84x84 grayscale with one channel."""

    def __init__(self, env):
        """Warp frames to 84x84 as done in the Nature paper and later work."""
        super().__init__(env)
        self.width = 84
        self.height = 84
        self.observation_space = gym.spaces.Box(
            low=0, high=255,
            shape=(self.height, self.width, 1), dtype=np.uint8)

    def observation(self, frame):
        """Convert an RGB frame to grayscale, resize it, and add a trailing channel axis."""
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        resized = cv2.resize(gray, (self.width, self.height),
                             interpolation=cv2.INTER_AREA)
        return resized[:, :, None]


class FireResetEnv(gym.Wrapper):
    """Wrapper that takes actions 1 and 2 after every reset to start the game."""

    def __init__(self, env=None):
        """For environments where the user need to press FIRE for the game to start."""
        super(FireResetEnv, self).__init__(env)
        # Sanity checks: action 1 must be FIRE and action 2 must exist.
        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
        assert len(env.unwrapped.get_action_meanings()) >= 3

    def step(self, action):
        """Pass the action straight through to the wrapped env."""
        return self.env.step(action)

    def reset(self):
        """Reset the wrapped env, then take action 1 followed by action 2.

        If either step ends the episode the wrapped env is reset again.

        Returns:
            The observation of the state the environment is actually in when
            this method returns.
        """
        self.env.reset()
        obs, _, done, _ = self.env.step(1)
        if done:
            # obs is overwritten by the next step, so it need not be kept here.
            self.env.reset()
        obs, _, done, _ = self.env.step(2)
        if done:
            # Return the fresh observation, not the terminal one from step(2).
            obs = self.env.reset()
        return obs


class EpisodicLifeEnv(gym.Wrapper):
    """Wrapper that reports `done` on every lost life but only truly resets on game over."""

    def __init__(self, env=None):
        """Make end-of-life == end-of-episode, but only reset on true game over.
        Done by DeepMind for the DQN and co. since it helps value estimation.
        """
        super().__init__(env)
        self.lives = 0
        self.was_real_done = True
        self.was_real_reset = False

    def step(self, action):
        """Step the wrapped env and flag `done` when a life was lost."""
        obs, reward, done, info = self.env.step(action)
        # Remember whether the underlying game itself reported done.
        self.was_real_done = done
        remaining = self.env.unwrapped.ale.lives()
        # Requiring remaining > 0 matters for games (e.g. Qbert) that sit at
        # zero lives for a few frames: the episode then ends only once the
        # environment itself advertises done.
        lost_life = 0 < remaining < self.lives
        if lost_life:
            done = True
        # Update afterwards so bonus lives are picked up as well.
        self.lives = remaining
        return obs, reward, done, info

    def reset(self):
        """Reset only when lives are exhausted.
        This way all states are still reachable even though lives are episodic,
        and the learner need not know about any of this behind-the-scenes.
        """
        if not self.was_real_done:
            # Only a life was lost: take a no-op step to move past that state.
            obs, _, _, _ = self.env.step(0)
            self.was_real_reset = False
        else:
            obs = self.env.reset()
            self.was_real_reset = True
        self.lives = self.env.unwrapped.ale.lives()
        return obs


class MaxAndSkipEnv(gym.Wrapper):
    """Wrapper that repeats each action for several frames and max-pools the last two."""

    def __init__(self, env=None, skip=4):
        """Return only every `skip`-th frame"""
        super().__init__(env)
        self._skip = skip
        # Holds the two most recent raw observations for the element-wise max.
        self._obs_buffer = deque(maxlen=2)

    def step(self, action):
        """Repeat `action` up to `skip` times, stopping early when done.

        Returns:
            (element-wise max over the buffered observations, summed reward,
            done flag and info from the last inner step).
        """
        reward_sum = 0.0
        done = None
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            self._obs_buffer.append(obs)
            reward_sum += reward
            if done:
                break

        pooled = np.stack(self._obs_buffer).max(axis=0)
        return pooled, reward_sum, done, info

    def reset(self):
        """Clear past frame buffer and init. to first obs. from inner env."""
        self._obs_buffer.clear()
        first = self.env.reset()
        self._obs_buffer.append(first)
        return first

class NoopResetEnv(gym.Wrapper):
    """Wrapper that takes a number of no-op actions (action 0) after every reset."""

    def __init__(self, env=None, noop_max=30):
        """Sample initial states by taking random number of no-ops on reset.
        No-op is assumed to be action 0.
        """
        super().__init__(env)
        self.noop_max = noop_max
        # When set, this fixed count is used instead of a random one.
        self.override_num_noops = None
        assert env.unwrapped.get_action_meanings()[0] == 'NOOP'

    def step(self, action):
        """Pass the action straight through to the wrapped env."""
        return self.env.step(action)

    def reset(self):
        """ Do no-op action for a number of steps in [1, noop_max]."""
        self.env.reset()
        noops = self.override_num_noops
        if noops is None:
            noops = np.random.randint(1, self.noop_max + 1)
        assert noops > 0
        obs = None
        for _ in range(noops):
            obs, _, done, _ = self.env.step(0)
            if done:
                # Episode ended during the no-ops: start over from a fresh reset.
                obs = self.env.reset()
        return obs

### Train the Model


In [None]:
if __name__ == '__main__':
    import pickle

    # Output location for the training history and the saved model.
    base_path = './drive/MyDrive/Spring 2021/CS 354/project/dqnOutput/'
    # Create the output directories before training so an unusable path fails
    # immediately instead of after the whole run has finished.
    os.makedirs(base_path + 'models/', exist_ok=True)

    # set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # hyperparameters
    BATCH_SIZE = 32
    GAMMA = 0.99 # discount factor
    EPS_START = 1 # how much we explore at start of training
    EPS_END = 0.02 # how much we explore as training steps approach infinity
    EPS_DECAY = 1000 # how rapidly we decay exploration
    TARGET_UPDATE = 10000 # how often we copy over weights from policy network to target network
    RENDER = False
    LEARNING_RATE = .0005
    INITIAL_MEMORY = 1000 # how many experiences need to be stored in memory before we start optimization
    MEMORY_SIZE = 50000 # buffer size of memory
    # NOTE(review): train() only calls optimize_model() once steps_done exceeds
    # INITIAL_MEMORY, so with TOTAL_TIMESTEPS <= INITIAL_MEMORY no optimisation
    # step ever runs. Raise TOTAL_TIMESTEPS for a real training run.
    TOTAL_TIMESTEPS = 1000 # total timesteps: only ended up training ~ 4.5 before training was ended

    # create networks; use NUM_ACTIONS so they agree with select_action()
    policy_net = DQN(n_actions=NUM_ACTIONS).to(device) # chooses actions and is the one being backpropagated on
    target_net = DQN(n_actions=NUM_ACTIONS).to(device) # computes target value and gets weights from policy every x timesteps
    target_net.load_state_dict(policy_net.state_dict()) # copy over weights from policy network

    # setup optimizer
    optimizer = optim.Adam(policy_net.parameters(), lr=LEARNING_RATE)

    steps_done = 0 # track how many steps acted in env

    # create environment and wrap it
    env = gym.make('MsPacmanNoFrameskip-v4')
    env = make_env(env)

    # initialize replay memory
    memory = ReplayMemory(MEMORY_SIZE)

    # train model
    start = time.time() # track total time training took
    # training history is a list of tuples: reward per episode, episode number,
    # time steps per episode, total timesteps so far
    training_history = train(env, TOTAL_TIMESTEPS, render=RENDER)
    print('Total time: {}'.format(time.time() - start))

    with open(base_path + 'training_history.pkl', 'wb') as f:
        pickle.dump(training_history, f)
    # NOTE(review): this pickles the whole module object, so loading requires
    # the DQN class to be importable; saving policy_net.state_dict() would be
    # more portable but changes the file format expected by any loader.
    torch.save(policy_net, base_path + "models/dqn_MsPacman_model")