In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from copy import deepcopy
from PIL import Image
import os 

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable, Function
import torchvision.transforms as T

import ale

# if gpu is to be used
use_cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
IntTensor = torch.cuda.IntTensor if use_cuda else torch.IntTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor
Tensor = FloatTensor

In [48]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state'))

class ReplayMemory(object):

    def __init__(self, max_memory):
        self.capacity = max_memory
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, session_size, queue_size):
        idx = np.random.randint(queue_size, len(self.memory)-session_size)
               
        return self.memory[(idx-(queue_size - 1)):(idx+session_size)]

    def __len__(self):
        return len(self.memory)
    
    
"""class ReplayMemory(object):
    
    def __init__(self, max_memory=100):
        self.max_memory = max_memory
        self.memory = []
        self.discount = Tensor([discount])

    def push(self, states, game_over):
        self.memory.append([states, game_over])
        if len(self.memory) > self.max_memory:
            del self.memory[0]


    def sample(self, model, sessions=5, session_size=20, queue_size=1):
        len_memory = len(self.memory)
        num_actions = model.output_shape[-1]

        Oin = None
        Otar = None
        
        for i, idx in enumerate(np.random.randint(queue_size, len_memory-session_size,
                                                  size=min(len_memory-session_size - queue_size, sessions))):

            mems = self.memory[idx - (queue_size - 1):idx+session_size]
            counter_sess = 0 - (queue_size - 1)
            model.reset_queue()
            for mem in mems:
                state_t, action_t, reward_t, state_tp1 = mem[0]
                game_over = mem[1]
                inputs = state_t
                
                # There should be no target values for actions not taken.
                # Thou shalt not correct actions not taken #deep
                targets = model(state_t) # Variable
                counter_sess += 1
                if counter_sess >= 0:
                    temp_queue = model.save_queue()
                    Q_sa = model(state_tp1).data.max(1)[0].view(1, 1)
                    model.load_queue(temp_queue)
                    if game_over:  # if game_over is True
                        targets[0, action_t[0]] = Variable(reward_t)
                    else:
                        targets[0, action_t[0]] = Variable(Q_sa * self.discount + reward_t)
                    print(targets.size(), "inputs")
                    print(inputs.size(), "inputs")
                    if Oin is None:
                        Oin = inputs
                        Otar = targets
                    else:
                        Oin = torch.cat((Oin, inputs), 0)
                        Otar = torch.cat((Otar, targets), 0)
                
            
        return Oin, Otar
    
"""

'class ReplayMemory(object):\n    \n    def __init__(self, max_memory=100):\n        self.max_memory = max_memory\n        self.memory = []\n        self.discount = Tensor([discount])\n\n    def push(self, states, game_over):\n        self.memory.append([states, game_over])\n        if len(self.memory) > self.max_memory:\n            del self.memory[0]\n\n\n    def sample(self, model, sessions=5, session_size=20, queue_size=1):\n        len_memory = len(self.memory)\n        num_actions = model.output_shape[-1]\n\n        Oin = None\n        Otar = None\n        \n        for i, idx in enumerate(np.random.randint(queue_size, len_memory-session_size,\n                                                  size=min(len_memory-session_size - queue_size, sessions))):\n\n            mems = self.memory[idx - (queue_size - 1):idx+session_size]\n            counter_sess = 0 - (queue_size - 1)\n            model.reset_queue()\n            for mem in mems:\n                state_t, action_t, reward_t

In [77]:
class FifoQueue(Function):
    
    def forward(self, old_queue, inputs):
        outputs = None
        queue = old_queue.clone()
        for temp in inputs:
            queue = torch.cat((queue, temp.unsqueeze(0)), 0)
            queue = queue[1:]
            if outputs is None:
                outputs = queue.clone().unsqueeze(0)
            else:
                outputs = torch.cat((outputs, queue.unsqueeze(0)), 0)
        return outputs, queue

    def backward(self, output_grad, bad_boi):
        return bad_boi[-1], output_grad[:,-1]


class Queue(nn.Module):
    def __init__(self, input_features, out_features, queue_size, bias=None):
        super(Queue, self).__init__()
        # Setup the Queue
        self.input_features = input_features
        self.queue_size = queue_size
        self.reset_queue()
        
        # Init the queue function
        self.FifoQueue = FifoQueue()

    def forward(self, inputs):
        # The actual call that is run when you pass data into this model. 
        output, self.queue = self.FifoQueue(self.queue, inputs)

        # Linear layer built into this layer. 
        return output
            
    # reunit the queue
    def reset_queue(self):
        if type(self.input_features) is list:
            queue = Variable(torch.zeros([self.queue_size]+self.input_features).type(FloatTensor))
        else:
            queue = Variable(torch.zeros([self.queue_size]+[self.input_features]).type(FloatTensor))
        self.queue = queue
"""        # Not sure if needed
        self.queue.requires_grad = False"""
        



'        # Not sure if needed\n        self.queue.requires_grad = False'

In [5]:
class Queued_DQN(nn.Module):
    def __init__(self, input_size, hidden_size, num_actions, queue_size):
        super(Queued_DQN, self).__init__()
        
        self.conv1 = nn.Conv2d(1, 16, kernel_size=8, stride=4)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 16, kernel_size=4, stride=2)
        self.bn2 = nn.BatchNorm2d(16)
        self.queue = Queue(1296, 1296*queue_size, queue_size)
        self.lin1 = nn.Linear(1296*queue_size, 1296*queue_size)
        self.lin2 = nn.Linear(1296*queue_size, num_actions)
        self.output_shape = [1, num_actions]
        
    def forward(self, x):
        # Replace queue with another Linear and you 
        # have the same network we use for everything else. 
        # This is the model
        
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = x.view(x.size()[0], -1)
        x = self.queue(x)
        x = x.view(x.size()[0], -1)
        x = F.relu(self.lin1(x))
        x = self.lin2(x)
        return x
    
    def reset_queue(self):
        self.queue.reset_queue()
        
    def save_queue(self):
        return self.queue.queue.clone()
        
    def load_queue(self, input_queue):
        self.queue.queue = input_queue.clone()
        
"""class Queued_DQN(nn.Module):
    def __init__(self, input_size, hidden_size, num_actions, queue_size):
        super(Queued_DQN, self).__init__()
        self.queue = Queue(input_size, input_size*queue_size, queue_size)
        #self.lin1 = nn.Linear(input_size*queue_size, hidden_size)
        self.lin2 = nn.Linear(hidden_size, hidden_size)
        self.lin3 = nn.Linear(hidden_size, num_actions)
        self.output_shape = [1, num_actions]
        
    def forward(self, x):
        # Replace queue with another Linear and you 
        # have the same network we use for everything else. 
        # This is the model
        x = F.relu(self.queue(x))
        #x = F.relu(self.lin1(x))
        x = F.relu(self.lin2(x))
        x = self.lin3(x)
        return x
    
    def reset_queue(self):
        self.queue.reset_queue()
        
    def save_queue(self):
        return self.queue.queue.clone()
        
    def load_queue(self, input_queue):
        self.queue.queue = input_queue.clone() """

'class Queued_DQN(nn.Module):\n    def __init__(self, input_size, hidden_size, num_actions, queue_size):\n        super(Queued_DQN, self).__init__()\n        self.queue = Queue(input_size, input_size*queue_size, queue_size)\n        #self.lin1 = nn.Linear(input_size*queue_size, hidden_size)\n        self.lin2 = nn.Linear(hidden_size, hidden_size)\n        self.lin3 = nn.Linear(hidden_size, num_actions)\n        self.output_shape = [1, num_actions]\n        \n    def forward(self, x):\n        # Replace queue with another Linear and you \n        # have the same network we use for everything else. \n        # This is the model\n        x = F.relu(self.queue(x))\n        #x = F.relu(self.lin1(x))\n        x = F.relu(self.lin2(x))\n        x = self.lin3(x)\n        return x\n    \n    def reset_queue(self):\n        self.queue.reset_queue()\n        \n    def save_queue(self):\n        return self.queue.queue.clone()\n        \n    def load_queue(self, input_queue):\n        self.queue.qu

In [98]:
import numpy as np
import pandas as pd
def train_catch(catch, model, exp_replay, num_games, session_size, sessions,
                per_random_act=.1, num_actions=3, queue_size=1, 
                test_every=None, test_on_games=100, discount=.9):
    
    # Pytorch Adagrad Optimizer linked with the model
    optimizer = optim.Adagrad(model.parameters())

    # Set up the return Loss and Scores dataframes
    rLoss = pd.DataFrame(columns = ['epoch', 'loss'])
    rLoss['epoch'] = rLoss['epoch'].astype(int)

    if test_every is not None:
        rScores = pd.DataFrame(columns = ['epoch', str('mean score over ' + str(test_on_games) + ' games')])
        rScores['epoch'] = rLoss['epoch'].astype(int)
        
    # Record variables for training
    game_cnt = 0
    timestep = 0.0
    counter = 0
    after_first_ball = False
    
    pushed = 0
    
    #train over total epochs
    for e in range(num_games):

        game_over = False
        tot_loss = 0.0

        # get initial input
        input_t = Tensor(catch.observe(flatten=False, expand_dim=True))
    
        #iterate over each game
        while not game_over:
            # t_-1 is the previous observation
            input_tm1 = input_t.clone()
            
            # Calculate next action
            q = model(Variable(input_tm1, volatile=True).type(FloatTensor)).data.max(1)[1].view(1, 1)
            
            # Randomly pick an action, or use the 
            # Preciously calculated action
            if np.random.rand() <= per_random_act:
                action = (torch.rand(1) * (num_actions)).type(LongTensor)
            else:
                action = q.type(LongTensor)[0]
                
            # apply action, get rewards and new state
            reward, timestep, game_over = catch.act(action.numpy()[0]-1)
            # t_0, current timestep
            input_t = Tensor(catch.observe(flatten=False, expand_dim=True))
            
            # store experience
            exp_replay.push(input_tm1, action, Tensor([reward]), input_t)
            
            # game has completed, add one to the total game count
            if game_over:
                game_cnt += 1
                
            pushed = pushed + 1
                
            # adapt model
            if pushed >= session_size*sessions*2 + 1: 
                temp_queue = model.save_queue()
                for pep in range(sessions):
                    model.reset_queue()
                    transitions = exp.sample(session_size, queue_size)

                    batch = Transition(*zip(*transitions))

                    non_final_mask = ByteTensor(tuple(map(lambda s: s is not None,
                                                          batch.next_state)))

                    
                    non_final_next_states = Variable(torch.cat([s for s in batch.next_state
                                                                if s is not None]),
                                                     volatile=True)
                    
                    state_batch = Variable(torch.cat(batch.state))
                    action_batch = Variable(torch.cat(batch.action))
                    reward_batch = Variable(torch.cat(batch.reward))

                    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
                    # columns of actions taken
                    state_action_values = model(state_batch).gather(1, action_batch.unsqueeze(-1))

                    # Compute V(s_{t+1}) for all next states.
                    next_state_values = Variable(torch.zeros(session_size+(queue_size-1)).type(Tensor))
                                      
                    next_state_values[non_final_mask] = model(non_final_next_states).max(1)[0]
                    # Now, we don't want to mess up the loss with a volatile flag, so let's
                    # clear it. After this, we'll just end up with a Variable that has
                    # requires_grad=False
                    next_state_values.volatile = False

                    # Compute the expected Q values
                    expected_state_action_values = (next_state_values * discount) + reward_batch

                    # Compute Huber loss
                    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)
                    tot_loss = tot_loss + loss.data.numpy()[0]

                    # Optimize the model
                    optimizer.zero_grad()
                    loss.backward()
                    for param in model.parameters():
                        if param.grad is not None:
                            param.grad.data.clamp_(-1, 1)
                    optimizer.step()

                model.load_queue(temp_queue)

            

        if pushed >= session_size*sessions*2 + 1:
            # Save and report loss
            rLoss.loc[len(rLoss), :] = [int(e),loss]
            print("Epoch {:03d} | Loss {:.4f}".format(e, tot_loss))
            # Create initial start environment
            
        # Reset Game and Model Queue when the game is over. 
        catch.reset()
        model.reset_queue()
        
        # Test the current model weights if need be.
        if (test_every is not None) and ((e + 1) % test_every == 0):

            scores = test_catch(
                catch=catch, model=model, 
                test_on_games=test_on_games)

            # Save and report mean Score
            ms = scores['score'].mean()
            rScores.loc[len(rScores), :] = [int(e), ms]
            print("Epoch {:03d} | MeanScore {:.2f}".format(e, ms))

            catch.reset()

    # Return rLoss, and rScores if the user requested the model to be tested
    # while training.
    if (test_every is not None):
        return rLoss, rScores
    else:
        return rLoss



def test_catch(catch, model, test_on_games=100,  save_frames=False):

    if save_frames:
        frames_stack = []
    scores = pd.DataFrame(columns=['game', 'score'])
    scores['game'] = scores['game'].astype(int)
    # Iterate over number of games to play
    for game_num in range(test_on_games):

        # Count for num balls that reached the end
        game_over = False

        # Get initial Frame
        frame_num = 1
        input_t = Tensor(catch.observe(flatten=False, expand_dim=True))
        if save_frames:
            frames_stack.append(input_t)

        total_score = 0
        # Iterate until end of testing game
        while not game_over:

            input_tm1 = input_t.clone()

            # Get next action
            q = model(Variable(input_tm1, volatile=True).type(FloatTensor)).data.max(1)[1].view(1, 1)
            action = q.numpy()[0][0]

            # apply action, get rewards and new state
            reward, timestep, game_over = catch.act(action-1)
            input_t = Tensor(catch.observe(flatten=False, expand_dim=True))
            total_score += reward
            # Iterate frame number
            frame_num += 1

            if save_frames:
                frames_stack.append(input_t)
                
        scores.loc[len(scores), :] = [int(game_num), total_score]
        #scores.append([game_num, total_score])
        game_num += 1
        catch.reset()
        model.reset_queue()

    if save_frames:
        return scores, frames_stack
    else:
        return scores
    


In [7]:
sess_size=10
sess = 5

q_size = 4
i=0

test_name = "RUN" + str(i) + "_Q" + str(q_size)+"_breakout"

breakout = ale.AleEnv("breakout.a26", game_over_conditions={"lives":2, "episodes":200}, 
                      frame_skip=4, screen_color='gray', 
                      min_action_set=True, reduce_screen=True)

  warn("The default mode, 'constant', will be changed to 'reflect' in "


In [99]:
exp = ReplayMemory(max_memory=1000)

width = breakout.width
height = breakout.height

num_actions = len(breakout.actions)

print(width, height, num_actions)

hidden_size = width * height

model = Queued_DQN(hidden_size, hidden_size*q_size, num_actions, q_size)
if use_cuda:
    model.cuda()

84 84 4


In [100]:
breakout.reset()


In [101]:
loss, test_on_train = train_catch(catch = breakout, model = model, exp_replay = exp, num_games = 500, per_random_act=.33,
                                  test_every = 50, test_on_games = 100, session_size=sess_size, sessions=sess, queue_size=q_size)
breakout.reset()

tests = test_catch(catch = breakout, model = model, test_on_games = 1000)

  warn("The default mode, 'constant', will be changed to 'reflect' in "


Epoch 000 | Loss 1289.1372


KeyboardInterrupt: 

In [None]:
os.makedirs(".//configs", exist_ok=True)
os.makedirs(".//data", exist_ok=True)
os.makedirs(".//figs", exist_ok=True)

torch.save(model, "configs//" + test_name + ".pt")
loss.to_csv("data//" + test_name + "_loss.csv", index=False)
test_on_train.to_csv("data//" + test_name + "_tests_in_training.csv", index=False)
tests.to_csv("data//" + test_name + "_tests.csv", index=False)


In [None]:
model = torch.load("configs//" + test_name + ".pt")

In [13]:
exp.sample(model, 
          sessions=sess, 
          session_size=sess_size, 
          queue_size=q_size)

(array([[ 1.,  1.,  1., ...,  0.,  0.,  0.],
        [ 1.,  1.,  1., ...,  0.,  0.,  0.],
        [ 1.,  1.,  1., ...,  0.,  0.,  0.],
        ..., 
        [ 1.,  1.,  1., ...,  0.,  0.,  0.],
        [ 1.,  1.,  1., ...,  0.,  0.,  0.],
        [ 1.,  1.,  1., ...,  0.,  0.,  0.]]),
 array([[-0.01113155,  0.02911727, -0.00065977,  0.00394386],
        [-0.02103461, -0.02210965,  0.0265988 ,  0.00849212],
        [-0.02237904, -0.02285606,  0.02704065,  0.0142559 ],
        [-0.02158727, -0.02220373,  0.02559841,  0.0099347 ],
        [-0.02230541, -0.02113417,  0.03136184,  0.01512389],
        [-0.01932661, -0.02191391,  0.02813921,  0.01295937],
        [-0.00233078,  0.00089694,  0.02813921,  0.00447616]], dtype=float32))

In [10]:

exp.memory


[[[array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]]),
   0,
   0,
   array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]])],
  False],
 [[array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]]),
   1,
   0,
   array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]])],
  False],
 [[array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]]),
   0,
   0,
   array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]])],
  False],
 [[array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]]),
   2,
   0,
   array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]])],
  False],
 [[array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]]),
   1,
   0,
   array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]])],
  False],
 [[array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]]),
   2,
   0,
   array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]])],
  False],
 [[array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]]),
   2,
   0,
   array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]])],
  False],
 [[array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]]),
   2,
   0,
   array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]])],
  False],
 [[array([[ 1.,  1.,  1., ...,  

In [28]:
for i in np.random.randint(q_size, len(exp.memory)-sess_size,
                                                  size=min(len(exp.memory)-sess_size - q_size, sess)):
    print(exp.memory[i - (q_size - 1):i - (q_size - 1)+sess_size])

[[[array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]]), 2, 0, array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]])], False], [[array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]]), 2, 0, array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]])], False], [[array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]]), 2, 0, array([[ 1.,  1.,  1., ...,  0.,  0.,  0.]])], False]]
[]
[]
[]
[]


In [25]:
np.random.randint(q_size, len(exp.memory)-sess_size,
                                                  size=min(len(exp.memory)-sess_size - q_size, sess))

array([29, 35, 54, 21, 88])

In [18]:
c = nn.Conv2d(16, 32, kernel_size=4, stride=2)

In [13]:
breakout.observe(flatten=False, expand_dim=True).shape

(1, 84, 84, 1)