## Verify that reuse gets the same (or close) activations as base method
 - Check by doing the reuse method in forward, then redo using base method, then check equality
 - Use CPU, does not fit into CUDA memory

In [13]:
import torch

import numpy as np
import pickle
import matplotlib.pyplot as plt
import sys
import time
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical

import time

import gym

In [14]:
options = {"game":"YarsRevenge-v0",            ## Game to train policy on
           "use_all_channels":False,    ## Use 3 channels (RGB) or 1(Greyscale) (MUST UPDATE PREPRO TO ALLOW FOR RGB)
           "save_model":False,          ## Save model after training (NYI)
           "n_conv_layers":2,           ## Number of layers to use (as of now, only 2 supported)
           "n_channels_out_1":20,       ## Number of channels/filters in conv1
           "n_channels_out_2":40,       ## Number of channels/filters in conv1
           "lr":0.0005,                   ## Learning rate for training
           "batch_size":1,              ## Update policy ever x episodes
           "n_episodes":10,             ## Number of episodes to train for
           "gamma":0.99,                ## Discount factor for reward
           "n_reset_graph":5,           ## Number of episodes to carry computational graph for before resetting
          # "device":"cpu",              ## Always use CPU
           "kernel_size":3,             ## No support for varying kernel sizes yet
           "render":False               ## Render gameplay
          }
options["n_channels_in"] = 3 if options["use_all_channels"] else 1  ## Number of input (color) channels

#### Define helper to turn observations into torch tensor, and read in observation info

In [15]:
## Observation -> torch tensor
def ob2torch(observation):
    return torch.tensor(observation.reshape(1, 
                                            3, # options["n_channels_in"], 
                                            n_dim_x, 
                                            n_dim_y)).float()

In [16]:
env = gym.make(options["game"])
observation = env.reset()
print(observation.shape)
observations = []

(210, 160, 3)


In [17]:
n_dim_x_d2 = observation.shape[0] // 2
n_dim_y_d2 = observation.shape[1] // 2
def prepro(I):
    """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
    I = I[::2,::2,0] # downsample by factor of 2
    return torch.tensor(I.reshape(1, 1, n_dim_x_d2, n_dim_y_d2)).float()

In [18]:
batch_size = 1
n_channels = 3

n_dim_x = prepro(observation).shape[2]
n_dim_y = prepro(observation).shape[3]

In [19]:
## Base: Conv1 -> Conv2 -> FC -> Softmax -> Action Probabilities
class TEST_Reuse_Policy(nn.Module):
    def __init__(self, n_out_channels_1=options["n_channels_out_1"], 
                       n_out_channels_2=options["n_channels_out_2"],
                       n_channels_in=options["n_channels_in"],
                       kernel_size=options["kernel_size"]):
        super(TEST_Reuse_Policy, self).__init__()
        self.kernel_size = kernel_size
        self.x_prev = None
        
        ## Layers
        self.conv1 = nn.Conv2d(n_channels_in, n_out_channels_1, kernel_size=kernel_size)
        self.conv2 = nn.Conv2d(n_out_channels_1, n_out_channels_2, kernel_size=kernel_size)
        self.fc = nn.Linear(n_out_channels_2 * (n_dim_x - 2*(kernel_size-1)) *(n_dim_y - 2*(kernel_size-1)), 
                            env.action_space.n, 
                            bias=False) ## Assumes kernel_size=3
        
        self.last_c1 = None  ## Last activation of conv1 layer
        self.last_c2 = None  ## Last activation of conv2 layer
        
        ## Training
        #self.policy_history = Variable(torch.Tensor()).to(device=options["device"]) 
        self.saved_log_probs = []
        self.rewards = []
        self.reward_history = []         # Overall reward and loss history
        self.loss_history = []
        
    def forward(self, x):
        ## Get difference of frames
        if self.x_prev is not None:
            x_diff = (self.x_prev - x)
            self.x_prev = x
            
        ## Process for the first time
        else:
            out = F.relu(self.conv1(x))
            self.last_c1= out.clone()
            out = F.relu(self.conv2(out))
            self.last_c2= out.clone()
            out = out.view(-1)
            out = self.fc(out)
            out = torch.softmax(out, dim=-1)
            self.x_prev = x
            return out
            
        ## Get indices to redo
        redo_idx = x_diff.nonzero()
        if redo_idx.nelement() == 0:
            out = self.last_c2.view(-1)
            out = self.fc(out)
            return torch.softmax(out, dim=-1)

        ## Get min/max of indices to redo, accounting for size of convolution and borders
        min_idx_x = redo_idx.min(-2)[0][2].item()
        min_idx_y = redo_idx.min(-2)[0][3].item()
        max_idx_x = redo_idx.max(-2)[0][2].item()
        max_idx_y = redo_idx.max(-2)[0][3].item()
        r_x1 = max(min_idx_x - (self.kernel_size - 1), 0)
        r_x2 = min(max_idx_x + self.kernel_size, n_dim_x)
        r_y1 = max(min_idx_y - (self.kernel_size - 1), 0)
        r_y2 = min(max_idx_y + self.kernel_size, n_dim_y)
        
        ## Redo first level of convolutions, assign result to the proper area of the previous activations
        redo_area = x[:,:,r_x1:r_x2,r_y1:r_y2]
        redo_area = self.conv1(redo_area)
        redo_area = F.relu(redo_area)
        c1 = self.last_c1.clone()
        c1[:,:,r_x1:r_x1+redo_area.shape[2],r_y1:r_y1+redo_area.shape[3]] = redo_area
        
        ## Get min/max of indices to redo for 2nd conv, accounting for size of convolution and borders
        r_x1 = max(r_x1 - (self.kernel_size - 1), 0)
        r_x2 = min(r_x2 + self.kernel_size, n_dim_x)
        r_y1 = max(r_y1 - (self.kernel_size - 1), 0)
        r_y2 = min(r_y2 + self.kernel_size, n_dim_y)
        
        ## Redo 2nd layer of convolutions, assign it to the proper area of the previous activations
        redo_area = F.relu(self.conv2(c1[:,:,r_x1:r_x2,r_y1:r_y2]))
        c2 = self.last_c2.clone()
        c2[:,:,r_x1:r_x1+redo_area.shape[2],r_y1:r_y1+redo_area.shape[3]] = redo_area
        
        out = c2.view(-1)
        out = self.fc(out)
        
        self.last_c1 = c1
        self.last_c2 = c2
        
        
        ## Redo the convolution the base way, check for discrepency
        out2 = F.relu(self.conv1(x))
        out2 = F.relu(self.conv2(out2))
        out2 = out2.view(-1)
        out2 = self.fc(out2)
        assert(torch.allclose(out,out2,rtol=1e-4,atol=1e-4))
        
        
        return torch.softmax(out, dim=-1)        

In [20]:
#env = gym.make("Pong-v0")
observation = env.reset()

## REINFORCE

In [21]:
device = "cpu"

def select_action(state, policy):
    state = state.to(device=device)
    #state = torch.from_numpy(state).float().unsqueeze(0)
    probs = policy(state)
    m = Categorical(probs)
    action = m.sample()
    policy.saved_log_probs.append(m.log_prob(action).to(device=device))
    return action.item()

def update_policy(policy, optimizer, retain_graph=False):
    R = 0
    policy_loss = []
    returns = []
    for r in policy.rewards[::-1]:
        R = r + options["gamma"] * R
        returns.insert(0, R)
    returns = torch.tensor(returns).to(device=device)
    returns = (returns - returns.mean()) / (returns.std() + 0.0001)
    for log_prob, R in zip(policy.saved_log_probs, returns):
        policy_loss.append(-log_prob * R)
    optimizer.zero_grad()
    #print(policy_loss)
    #policy_loss = torch.cat(policy_loss).sum()
    policy_loss = sum(policy_loss)
    policy_loss.backward()
    optimizer.step()
    del policy.rewards[:]
    del policy.saved_log_probs[:]

## Training

In [22]:
np.random.seed(0)
torch.manual_seed(0)
env = gym.make(options["game"])
env.seed(0)

[0, 592379725]

In [23]:
policy = TEST_Reuse_Policy()
policy = policy.to(device=options["device"])
optimizer = optim.Adam(policy.parameters(), lr=options["lr"])

episode_number = 0
reward_sum=0
observation = env.reset()
policy.x_prev=None

batch_size = 1
x_prev = torch.zeros((1,1,n_dim_x_d2,n_dim_y_d2))

start=time.time()
while(episode_number < 10):
    curr_img = prepro(observation)
    x = curr_img.reshape(1,1,n_dim_x_d2,n_dim_y_d2)
    
    # forward the policy network and sample an action from the returned probability
    action = select_action(torch.tensor(x - x_prev).float(), policy)
    
    observation, reward, done, info = env.step(action)

    policy.rewards.append(reward)
    reward_sum += reward
    x_prev = x
    
    if done: # an episode finished
        print("Total reward for this ep({0:d}): {1:.2f}".format(episode_number, reward_sum))
        episode_number += 1

        if episode_number % batch_size == 0:
            update_policy(policy, optimizer, retain_graph=False)
            policy.x_prev=None
            #if episode_number % options["n_reset_graph"] != 0 :
            #    update_policy(policy, optimizer, retain_graph=True)
            #else:
            #    update_policy(policy, optimizer, retain_graph=False)
            #    policy.x_prev=None
        
        if episode_number % 50 == 0:
            PATH = 'models/reuse_fix'
            torch.save({
                'episode_number': episode_number,
                'model_state_dict': policy.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, PATH)
            
        observation = env.reset()
        reward_sum=0
        
end = time.time()
print(end - start)

Total reward for this ep(0): 4140.00
Total reward for this ep(1): 1449.00
Total reward for this ep(2): 966.00
Total reward for this ep(3): 2139.00
Total reward for this ep(4): 2691.00
Total reward for this ep(5): 5544.00
Total reward for this ep(6): 5244.00
Total reward for this ep(7): 4899.00
Total reward for this ep(8): 5137.00
Total reward for this ep(9): 5582.00
270.62579560279846
