- https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf
- https://ai.intel.com/demystifying-deep-reinforcement-learning/
- https://danieltakeshi.github.io/2016/11/25/frame-skipping-and-preprocessing-for-deep-q-networks-on-atari-2600-games/

- https://github.com/AndersonJo/dqn-pytorch/blob/master/dqn.py
- https://github.com/hengyuan-hu/rainbow
- https://github.com/transedward/pytorch-dqn

In [1]:
import matplotlib.pyplot as plt
import gym
import cv2
import random
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import deque
from collections import namedtuple
import copy

In [11]:
# --- reproducibility ---
SEED = 1234  # seed handed to the torch / numpy / random RNGs and to the environment

# --- replay memory ---
CAPACITY = 10_000  # maximum number of transitions kept in the replay memory
BATCH_SIZE = 32  # transitions sampled per optimisation step

# --- environment and preprocessing ---
PROCESSED_SIZE = 84  # height and width of a frame after preprocessing
GAME = 'Pong-v0'
# A throwaway environment is created only to read the number of actions.
# Close it straight away so the emulator it owns is not kept alive for the
# whole session.
_probe_env = gym.make(GAME)
N_ACTIONS = _probe_env.action_space.n
_probe_env.close()
del _probe_env
PHI_LENGTH = 4  # number of processed observations stacked into one state

# --- learning schedule ---
UPDATE_FREQ = 1  # run an optimisation step every UPDATE_FREQ environment steps
EPSILON_START = 1.0  # initial exploration probability
EPSILON_END = 0.01  # asymptotic exploration probability
EPSILON_STEPS = 30_000  # decay constant (in steps) of the exponential epsilon schedule
GAMMA = 0.99  # discount factor
TARGET_UPDATE = 1000  # target network is synced every TARGET_UPDATE * UPDATE_FREQ steps
PRINT_UPDATE = 5_000  # a progress line is printed every PRINT_UPDATE steps

In [12]:
# Seed every RNG the notebook draws from so that runs can be repeated.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# manual_seed_all covers every visible GPU (manual_seed only seeds the current
# one); it is safe to call when CUDA is not available.
torch.cuda.manual_seed_all(SEED)

# Seeding alone does not make GPU runs repeatable: cuDNN may auto-tune and pick
# non-deterministic convolution kernels. Trade a little speed for determinism.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [13]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [14]:
class ReplayMemory:
    """
    Fixed-capacity store of (s, a, r, s') transitions; once full, the oldest
    transition is dropped for every new one that is added.
    """

    def __init__(self, capacity, batch_size):
        """
        Replay memory that holds examples in the form of (s, a, r, s')

        args:
            capacity (int): the size of the memory
            batch_size (int): size of batches used for training model
        """

        self.batch_size = batch_size
        self.capacity = capacity
        self.memory = deque(maxlen=self.capacity)
        self.Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state'))
        self._available = False  # latched to True once a full batch can be drawn

    def put(self, state, action, reward, next_state):
        """
        Places an (s, a, r, s') example in the memory

        Everything is converted to tensors on insertion; action and reward are
        wrapped into 1-element tensors so a batch can be built with torch.cat.

        args:
            state (np.array): state the action was taken in
            action (int): index of the action taken
            reward (float): reward received for the action
            next_state (np.array or None): resulting state, None marks a terminal transition
        """

        state = torch.FloatTensor(state)
        action = torch.LongTensor([action])
        reward = torch.FloatTensor([reward])
        if next_state is not None:
            next_state = torch.FloatTensor(next_state)
        transition = self.Transition(state=state, action=action, reward=reward, next_state=next_state)
        self.memory.append(transition)

    def sample(self):
        """
        Gets a random sample of n = batch_size examples from the memory

        returns:
            Transitions (namedtuple): a Transition whose fields are tuples of
                batch_size states, actions, rewards and next_states

        raises:
            ValueError: if the memory holds fewer than batch_size examples
        """

        transitions = random.sample(self.memory, self.batch_size)
        # transpose a list of Transitions into one Transition of tuples
        return self.Transition(*(zip(*transitions)))

    def size(self):
        """
        Returns the length of the memory

        returns:
            length (int): number of examples in the memory
        """
        return len(self.memory)

    def is_available(self):
        """
        Returns True if we have enough examples within the memory

        returns:
            available (bool): True if we have at least n = batch_size examples in the memory
        """
        # >= (not >): exactly batch_size examples are already enough to sample
        if not self._available and len(self.memory) >= self.batch_size:
            self._available = True
        return self._available

In [15]:
class Environment:
    """
    Thin wrapper around a Gym game that adds frame preprocessing
    (grayscale, resize, scale to [0, 1]).
    """

    def __init__(self, game, size, seed):
        """
        A class that has helpful wrappers around the Gym environment

        args:
            game (string): name of Atari game, i.e. Breakout-v0
            size (int): height and width of observation after preprocessing
            seed (int): random seed
        """

        self.size = size

        #init game
        self.game = gym.make(game)

        #set random seed for determinism
        self.game.seed(seed)

        #action_space.sample() is used for the random actions; on Gym releases
        #where a space carries its own RNG, env.seed() does not seed it, so seed
        #it explicitly (skipped on releases without Space.seed)
        action_space = self.game.action_space
        if hasattr(action_space, 'seed'):
            action_space.seed(seed)

    def process(self, obs):
        """
        Process an observation (i.e. convert to grayscale, resize and normalize)

        args:
            obs (np.array): observation from gym of game screen, should be (height, width, channels)

        returns:
            output (np.array): (self.size, self.size) float32 array with all values <= 1
        """

        assert len(obs.shape) == 3 #make sure image is correct shape

        gray = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) #convert to grayscale
        output = cv2.resize(gray, (self.size, self.size)) #resize
        output = output.astype(np.float32, copy=False) #convert to float32
        output /= 255.0 #scale by 255 so 8-bit pixel values end up in [0, 1]

        assert (output <= 1.0).all()

        return output

    def get_obs(self):
        """
        Gets a processed observation by rendering the current game screen

        returns:
            obs (np.array): (self.size, self.size) array with all values <= 1
        """

        return self.process(self.game.render('rgb_array'))

    def init(self):
        """
        Reset the environment and return the initial state (unprocessed)

        returns:
            obs (np.array): observation from gym of game screen, should be (height, width, channels)
        """
        return self.game.reset()

    def reset(self):
        """
        Reset the environment and return the initial state (processed)

        returns:
            output (np.array): (self.size, self.size) array with all values <= 1
        """
        return self.process(self.init())

In [16]:
class DQN(nn.Module):
    """
    Convolutional Q-network: three conv layers followed by two linear layers,
    producing one Q-value per action.
    """

    def __init__(self, n_actions, in_channels=4, input_size=84):
        """
        args:
            n_actions (int): number of outputs, i.e. env.action_space.n
            in_channels (int): number of stacked observations in a state
            input_size (int): height and width of the (square) observations

        raises:
            ValueError: if input_size is too small for the convolution stack
        """
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        #work out the spatial size left after the (unpadded) convolutions so the
        #first linear layer matches the input, e.g. 84 -> 20 -> 9 -> 7
        side = input_size
        for kernel, stride in ((8, 4), (4, 2), (3, 1)):
            side = (side - kernel) // stride + 1
            if side < 1:
                raise ValueError(f'input_size={input_size} is too small for the convolution stack')

        self.fc1 = nn.Linear(side * side * 64, 512)
        self.fc2 = nn.Linear(512, n_actions)

    def forward(self, x):
        """
        args:
            x (torch.Tensor): batch of states, (N, in_channels, input_size, input_size)

        returns:
            q_values (torch.Tensor): (N, n_actions) tensor of Q-values
        """
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.fc1(x.view(x.size(0), -1))) #flattens the (N, C, H, W) to (N, C*H*W)
        return self.fc2(x)

In [17]:
class Agent:
    """
    Epsilon-greedy DQN agent: plays the environment, stores transitions in the
    replay memory and fits the model against a periodically synced target copy.
    """

    def __init__(self, env, mem, model, phi_length, update_freq, e_start, e_end, e_steps, gamma, target_update, print_update, learn_start=10_000):
        """
        An agent class that handles training the model

        args:
            env (Environment): Environment object
            mem (ReplayMemory): ReplayMemory object
            model (nn.Module): PyTorch model
            phi_length (int): number of observations to stack to make a state
            update_freq (int): an optimisation step is run every n = update_freq steps
            e_start (float): initial value of epsilon
            e_end (float): minimum value of epsilon
            e_steps (int): decay constant (in steps) of the exponential epsilon schedule
            gamma (float): decay rate of rewards
            target_update (int): the target model is synced every
                target_update * update_freq steps
            print_update (int): after how many steps (frames) to print summary of performance
            learn_start (int): no optimisation or target sync happens until more
                than this many steps have been taken
        """
        self.env = env
        self.mem = mem
        self.phi_length = phi_length
        self.update_freq = update_freq
        self.e_start = e_start
        self.e_end = e_end
        self.e_steps = e_steps
        self.gamma = gamma
        self.target_update = target_update
        self.print_update = print_update
        self.learn_start = learn_start

        self.steps = 0 #number of steps taken
        self.episodes = 0 #number of episodes
        self.obs_buffer = deque(maxlen=phi_length) #for holding observations to be turned into states

        #remember the module-level device so the methods do not depend on the global
        self.device = device

        #put model on gpu if available
        self.model = model.to(self.device)

        #create target model; it is only ever used for inference, so keep it in
        #eval mode (matters only if the model has mode-dependent layers such as
        #dropout or batch norm)
        self.target = copy.deepcopy(self.model)
        self.target.eval()

        #create optimizer (an RMSprop alternative with lr=0.00025, alpha=0.95,
        #momentum=0.95 is left disabled in favour of Adam)
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-4)

    def get_epsilon(self):
        """
        Calculates the value of epsilon from the current number of steps;
        epsilon decays exponentially from e_start towards e_end

        returns:
            epsilon (float): the probability of doing a random action
        """
        return self.e_end + (self.e_start - self.e_end) * math.exp(-1. * self.steps / self.e_steps)

    def get_action(self, state):
        """
        Selects action to perform, with probability = epsilon chooses a random action,
        else chooses the best predicted action of the model

        args:
            state (np.array): input state to the model

        returns:
            action (int): the index of the action
        """

        #with probability of epsilon, pick a random action
        if random.random() < self.get_epsilon():
            #sample() may hand back a NumPy integer, so coerce to a plain int
            return int(self.env.game.action_space.sample())

        #with probability of (1 - epsilon) pick the action with the highest predicted Q value
        with torch.no_grad():
            state = torch.FloatTensor(state).unsqueeze(0).to(self.device) #convert to tensor, add batch dim, move to device
            return self.model(state).max(1)[1].item()

    def get_initial_state(self):
        """
        Get the initial state to the model, a stack of processed observations

        returns:
            state (np.array): a stack of n = phi_length processed observations
        """

        _ = self.env.reset() #reset environment
        obs = self.env.get_obs() #get a processed observation
        state = np.stack([obs for _ in range(self.phi_length)], axis=0) #stack n = phi_length times to make a state

        #also fill the observation buffer with the first frame, pushing out
        #anything left over from the previous episode
        for _ in range(self.phi_length):
            self.obs_buffer.append(obs)

        return state

    def get_state(self):
        """
        Get a stack from the observation buffer

        returns:
            state (np.array): a stack of n = phi_length processed observations
        """
        return np.array(self.obs_buffer)

    def train(self, max_steps=None):
        """
        Play episodes, storing every transition and updating the model

        args:
            max_steps (int or None): stop once self.steps has reached this value;
                the check is made between episodes, so the episode in progress
                is always finished. None (default) trains until interrupted.

        returns:
            rewards (list): total (unclipped) reward of every finished episode;
                only reached when max_steps is given
        """

        all_episode_rewards = []
        recent_episode_rewards = [] #episodes finished since the last printed summary

        while max_steps is None or self.steps < max_steps:

            episode_done = False
            episode_reward = 0

            #get initial state
            state = self.get_initial_state()

            while not episode_done:

                #get action
                action = self.get_action(state)

                #apply the action for one environment step
                _, reward, episode_done, _ = self.env.game.step(action)

                #sum rewards
                episode_reward += reward

                #append processed observation to a buffer of observations
                #(the frame is re-rendered by get_obs instead of using step()'s observation)
                self.obs_buffer.append(self.env.get_obs())

                #get the next state from the observation buffer
                next_state = self.get_state()

                #add to memory, for terminal states, set next_state to None
                self.mem.put(state, action, reward, None if episode_done else next_state)

                #make new state the old next_state
                state = next_state

                #increase number of steps
                self.steps += 1

                past_warm_up = self.steps > self.learn_start

                #update model parameters
                if self.mem.is_available() and self.steps % self.update_freq == 0 and past_warm_up:
                    self.optimize()

                #sync target model with the trained model
                if past_warm_up and self.steps % (self.target_update * self.update_freq) == 0:
                    self.target.load_state_dict(self.model.state_dict())

                if self.steps % self.print_update == 0:
                    #no episode may have finished since the last summary
                    if recent_episode_rewards:
                        avg_reward_per_episode = np.mean(recent_episode_rewards)
                    else:
                        avg_reward_per_episode = float('nan')
                    recent_episode_rewards = []
                    print(f'Episodes: {self.episodes}, Steps: {self.steps}, Epsilon: {self.get_epsilon():.2f}, Avg. Reward per Ep: {avg_reward_per_episode:.2f}')

            #increase number of episodes
            self.episodes += 1
            recent_episode_rewards.append(episode_reward)
            all_episode_rewards.append(episode_reward)

        return all_episode_rewards

    def optimize(self):
        """
        Update model parameters with one batch sampled from the replay memory

        returns:
            loss (float): smooth L1 loss between predicted and target Q values
        """

        #get a batch
        transitions = self.mem.sample()

        #positions in the batch whose next_state is not terminal (not None)
        non_terminal_states = [ns for ns in transitions.next_state if ns is not None]
        non_terminal_idx = torch.LongTensor(
            [i for i, ns in enumerate(transitions.next_state) if ns is not None]
        ).to(self.device)

        #state_batch = (N,C,H,W), where N is batch_size, C is phi_length, H and W are processed obs size
        state_batch = torch.stack(transitions.state).to(self.device)

        #action_batch = (N, 1)
        action_batch = torch.cat(transitions.action).unsqueeze(1).to(self.device)

        #reward_batch = (N, 1), clipped between +1 and -1
        reward_batch = torch.cat(transitions.reward).unsqueeze(1).clamp(-1, 1).to(self.device)

        #get Q values of action taken, shape (N,1)
        Q_vals = self.model(state_batch).gather(1, action_batch)

        with torch.no_grad():
            #terminal transitions: target is just the reward
            target_vals = reward_batch.clone()

            #non-terminal transitions: add the discounted best Q value of the target model
            #(skipped when every transition in the batch is terminal)
            if non_terminal_states:
                #(V,C,H,W), where V is the number of non-terminal next_states
                next_state_batch = torch.stack(non_terminal_states).to(self.device)
                best_next_Q = self.target(next_state_batch).max(1)[0].unsqueeze(1)
                target_vals[non_terminal_idx] += self.gamma * best_next_Q

        #calculate loss between Q values and target values
        loss = F.smooth_l1_loss(Q_vals, target_vals)

        #zero gradients
        self.optimizer.zero_grad()

        #calculate gradients
        loss.backward()

        #clamp gradients element-wise to [-1, 1]
        for p in self.model.parameters():
            if p.grad is not None:
                p.grad.data.clamp_(-1, 1)

        #update parameters
        self.optimizer.step()

        return loss.item()

In [18]:
# Wire the pieces together: environment, replay memory, Q-network and agent.
env = Environment(game=GAME, size=PROCESSED_SIZE, seed=SEED)
mem = ReplayMemory(capacity=CAPACITY, batch_size=BATCH_SIZE)
model = DQN(n_actions=N_ACTIONS)
agent = Agent(
    env=env,
    mem=mem,
    model=model,
    phi_length=PHI_LENGTH,
    update_freq=UPDATE_FREQ,
    e_start=EPSILON_START,
    e_end=EPSILON_END,
    e_steps=EPSILON_STEPS,
    gamma=GAMMA,
    target_update=TARGET_UPDATE,
    print_update=PRINT_UPDATE,
)

In [19]:
# Agent.train() is not written to hand back a pair of values, so unpacking its
# result into two names could only ever fail; just run it (stop it with an
# interrupt, as in the captured output below).
agent.train()

Episodes: 4, Steps: 5000, Epsilon: 0.85, Avg. Reward per Ep: -20.00
Episodes: 8, Steps: 10000, Epsilon: 0.72, Avg. Reward per Ep: -20.50
Episodes: 12, Steps: 15000, Epsilon: 0.61, Avg. Reward per Ep: -20.25
Episodes: 16, Steps: 20000, Epsilon: 0.52, Avg. Reward per Ep: -19.75
Episodes: 20, Steps: 25000, Epsilon: 0.44, Avg. Reward per Ep: -20.50
Episodes: 24, Steps: 30000, Epsilon: 0.37, Avg. Reward per Ep: -20.50
Episodes: 28, Steps: 35000, Epsilon: 0.32, Avg. Reward per Ep: -20.00
Episodes: 32, Steps: 40000, Epsilon: 0.27, Avg. Reward per Ep: -20.75
Episodes: 37, Steps: 45000, Epsilon: 0.23, Avg. Reward per Ep: -20.80
Episodes: 42, Steps: 50000, Epsilon: 0.20, Avg. Reward per Ep: -20.80
Episodes: 46, Steps: 55000, Epsilon: 0.17, Avg. Reward per Ep: -20.25
Episodes: 50, Steps: 60000, Epsilon: 0.14, Avg. Reward per Ep: -20.00
Episodes: 54, Steps: 65000, Epsilon: 0.12, Avg. Reward per Ep: -20.25
Episodes: 59, Steps: 70000, Epsilon: 0.11, Avg. Reward per Ep: -20.80
Episodes: 63, Steps: 75

Episodes: 265, Steps: 580000, Epsilon: 0.01, Avg. Reward per Ep: -10.00
Episodes: 266, Steps: 585000, Epsilon: 0.01, Avg. Reward per Ep: -9.00
Episodes: 268, Steps: 590000, Epsilon: 0.01, Avg. Reward per Ep: -14.00
Episodes: 270, Steps: 595000, Epsilon: 0.01, Avg. Reward per Ep: -14.00
Episodes: 272, Steps: 600000, Epsilon: 0.01, Avg. Reward per Ep: -12.50
Episodes: 274, Steps: 605000, Epsilon: 0.01, Avg. Reward per Ep: -11.50
Episodes: 276, Steps: 610000, Epsilon: 0.01, Avg. Reward per Ep: -13.50
Episodes: 278, Steps: 615000, Epsilon: 0.01, Avg. Reward per Ep: -13.50
Episodes: 280, Steps: 620000, Epsilon: 0.01, Avg. Reward per Ep: -13.00
Episodes: 282, Steps: 625000, Epsilon: 0.01, Avg. Reward per Ep: -13.00
Episodes: 284, Steps: 630000, Epsilon: 0.01, Avg. Reward per Ep: -15.00
Episodes: 286, Steps: 635000, Epsilon: 0.01, Avg. Reward per Ep: -13.50
Episodes: 288, Steps: 640000, Epsilon: 0.01, Avg. Reward per Ep: -18.00
Episodes: 290, Steps: 645000, Epsilon: 0.01, Avg. Reward per Ep: 

Episodes: 469, Steps: 1150000, Epsilon: 0.01, Avg. Reward per Ep: -15.00
Episodes: 471, Steps: 1155000, Epsilon: 0.01, Avg. Reward per Ep: -15.50
Episodes: 472, Steps: 1160000, Epsilon: 0.01, Avg. Reward per Ep: -13.00
Episodes: 474, Steps: 1165000, Epsilon: 0.01, Avg. Reward per Ep: -12.50
Episodes: 476, Steps: 1170000, Epsilon: 0.01, Avg. Reward per Ep: -11.50
Episodes: 478, Steps: 1175000, Epsilon: 0.01, Avg. Reward per Ep: -16.00
Episodes: 479, Steps: 1180000, Epsilon: 0.01, Avg. Reward per Ep: -8.00
Episodes: 481, Steps: 1185000, Epsilon: 0.01, Avg. Reward per Ep: -15.50
Episodes: 483, Steps: 1190000, Epsilon: 0.01, Avg. Reward per Ep: -15.50
Episodes: 485, Steps: 1195000, Epsilon: 0.01, Avg. Reward per Ep: -13.50
Episodes: 487, Steps: 1200000, Epsilon: 0.01, Avg. Reward per Ep: -16.00
Episodes: 488, Steps: 1205000, Epsilon: 0.01, Avg. Reward per Ep: -13.00
Episodes: 490, Steps: 1210000, Epsilon: 0.01, Avg. Reward per Ep: -11.50
Episodes: 492, Steps: 1215000, Epsilon: 0.01, Avg. R

Episodes: 674, Steps: 1715000, Epsilon: 0.01, Avg. Reward per Ep: -15.00
Episodes: 676, Steps: 1720000, Epsilon: 0.01, Avg. Reward per Ep: -15.00
Episodes: 678, Steps: 1725000, Epsilon: 0.01, Avg. Reward per Ep: -14.50
Episodes: 680, Steps: 1730000, Epsilon: 0.01, Avg. Reward per Ep: -11.00
Episodes: 682, Steps: 1735000, Epsilon: 0.01, Avg. Reward per Ep: -14.50
Episodes: 684, Steps: 1740000, Epsilon: 0.01, Avg. Reward per Ep: -17.00
Episodes: 686, Steps: 1745000, Epsilon: 0.01, Avg. Reward per Ep: -13.00
Episodes: 687, Steps: 1750000, Epsilon: 0.01, Avg. Reward per Ep: -12.00
Episodes: 689, Steps: 1755000, Epsilon: 0.01, Avg. Reward per Ep: -9.00
Episodes: 691, Steps: 1760000, Epsilon: 0.01, Avg. Reward per Ep: -14.00
Episodes: 692, Steps: 1765000, Epsilon: 0.01, Avg. Reward per Ep: -9.00
Episodes: 694, Steps: 1770000, Epsilon: 0.01, Avg. Reward per Ep: -9.50
Episodes: 696, Steps: 1775000, Epsilon: 0.01, Avg. Reward per Ep: -11.00
Episodes: 697, Steps: 1780000, Epsilon: 0.01, Avg. Rew

Episodes: 880, Steps: 2280000, Epsilon: 0.01, Avg. Reward per Ep: -14.00
Episodes: 881, Steps: 2285000, Epsilon: 0.01, Avg. Reward per Ep: -8.00
Episodes: 883, Steps: 2290000, Epsilon: 0.01, Avg. Reward per Ep: -12.50
Episodes: 885, Steps: 2295000, Epsilon: 0.01, Avg. Reward per Ep: -11.00
Episodes: 887, Steps: 2300000, Epsilon: 0.01, Avg. Reward per Ep: -15.00
Episodes: 888, Steps: 2305000, Epsilon: 0.01, Avg. Reward per Ep: -10.00
Episodes: 890, Steps: 2310000, Epsilon: 0.01, Avg. Reward per Ep: -10.50
Episodes: 892, Steps: 2315000, Epsilon: 0.01, Avg. Reward per Ep: -14.00
Episodes: 893, Steps: 2320000, Epsilon: 0.01, Avg. Reward per Ep: -9.00
Episodes: 895, Steps: 2325000, Epsilon: 0.01, Avg. Reward per Ep: -11.50
Episodes: 897, Steps: 2330000, Epsilon: 0.01, Avg. Reward per Ep: -14.50
Episodes: 899, Steps: 2335000, Epsilon: 0.01, Avg. Reward per Ep: -9.50
Episodes: 900, Steps: 2340000, Epsilon: 0.01, Avg. Reward per Ep: -11.00
Episodes: 902, Steps: 2345000, Epsilon: 0.01, Avg. Rew

Episodes: 1073, Steps: 2840000, Epsilon: 0.01, Avg. Reward per Ep: -11.00
Episodes: 1074, Steps: 2845000, Epsilon: 0.01, Avg. Reward per Ep: -9.00
Episodes: 1076, Steps: 2850000, Epsilon: 0.01, Avg. Reward per Ep: -10.00
Episodes: 1078, Steps: 2855000, Epsilon: 0.01, Avg. Reward per Ep: -11.00
Episodes: 1079, Steps: 2860000, Epsilon: 0.01, Avg. Reward per Ep: -15.00
Episodes: 1081, Steps: 2865000, Epsilon: 0.01, Avg. Reward per Ep: -11.00
Episodes: 1083, Steps: 2870000, Epsilon: 0.01, Avg. Reward per Ep: -18.00
Episodes: 1086, Steps: 2875000, Epsilon: 0.01, Avg. Reward per Ep: -14.00
Episodes: 1088, Steps: 2880000, Epsilon: 0.01, Avg. Reward per Ep: -16.50
Episodes: 1090, Steps: 2885000, Epsilon: 0.01, Avg. Reward per Ep: -14.00
Episodes: 1092, Steps: 2890000, Epsilon: 0.01, Avg. Reward per Ep: -13.50
Episodes: 1094, Steps: 2895000, Epsilon: 0.01, Avg. Reward per Ep: -18.00
Episodes: 1096, Steps: 2900000, Epsilon: 0.01, Avg. Reward per Ep: -15.00
Episodes: 1098, Steps: 2905000, Epsilon

Episodes: 1267, Steps: 3395000, Epsilon: 0.01, Avg. Reward per Ep: -14.00
Episodes: 1269, Steps: 3400000, Epsilon: 0.01, Avg. Reward per Ep: -15.00
Episodes: 1271, Steps: 3405000, Epsilon: 0.01, Avg. Reward per Ep: -15.50
Episodes: 1273, Steps: 3410000, Epsilon: 0.01, Avg. Reward per Ep: -17.00
Episodes: 1275, Steps: 3415000, Epsilon: 0.01, Avg. Reward per Ep: -14.50
Episodes: 1277, Steps: 3420000, Epsilon: 0.01, Avg. Reward per Ep: -15.00
Episodes: 1279, Steps: 3425000, Epsilon: 0.01, Avg. Reward per Ep: -16.00
Episodes: 1282, Steps: 3430000, Epsilon: 0.01, Avg. Reward per Ep: -15.67
Episodes: 1284, Steps: 3435000, Epsilon: 0.01, Avg. Reward per Ep: -13.50
Episodes: 1286, Steps: 3440000, Epsilon: 0.01, Avg. Reward per Ep: -17.00
Episodes: 1288, Steps: 3445000, Epsilon: 0.01, Avg. Reward per Ep: -15.00
Episodes: 1290, Steps: 3450000, Epsilon: 0.01, Avg. Reward per Ep: -15.50
Episodes: 1292, Steps: 3455000, Epsilon: 0.01, Avg. Reward per Ep: -16.00
Episodes: 1294, Steps: 3460000, Epsilo

KeyboardInterrupt: 

1 - DQN USES NO WRAPPERS AND THE PARAMETERS FROM THE UNTITLED NOTEBOOK (I.E. EPSILON).
IT ALSO HAS THE FIXED REWARD[TERMINAL_MASK] THING, AND THE TARGET IS SET TO TRAIN (IMPLICITLY).

1 - DQN-Copy2 IS USING WRAPPERS, PARAMETERS FROM NOTEBOOK, FIXED REWARD[TERMINAL_MASK] AND EXPLICITLY SETS TARGET TO TRAIN
I THINK I CAN REMOVE THE EXPLICIT TRAIN

2 - DQN-Copy1 IS THE SAME AS ABOVE BUT WITH THE TARGET SET TO EVAL **THIS SHOULD BE CORRECT**
IT SHOULD ALSO GIVE THE SAME RESULTS AS Copy2 BECAUSE THE SEEDS ARE SET. EVAL SHOULD BE RIGHT BECAUSE IT WOULD MATTER IF THE MODEL USED BATCH NORM. **RESULTS ARE DIFFERENT**, SO SOMEWHERE THE SEED IS NOT BEING SET PROPERLY.

3 - DQN-Copy3 is a copy of copy1, using it to test determinism, i.e. if the seeds are all set correctly (which I believe they are) then results should be exact same as copy1. 

**TEST THE RANDOM SEED THING TOMORROW BY RE-RUNNING DQN-Copy1 AND Copy2 AND SEEING IF THEY GIVE THE SAME RESULTS AS THE ONES TRAINED TONIGHT**