In [None]:
import numpy as np
import matplotlib.pyplot as plt
import gym
import sys
import random

import torch
from torch import nn
from torch import optim
from torch.nn import Module, Linear, ReLU, Sigmoid, Sequential, MaxPool2d
from torch.nn import functional as F

# Print the Python interpreter version, the PyTorch version, and the CUDA
# version PyTorch reports (torch.version.cuda) for reference.
print(sys.version)
print(torch.__version__)
print(torch.version.cuda)

In [None]:
# True when PyTorch can use a CUDA device; read by enable_cuda() and to_cpu().
GPU_AVAILABLE = torch.cuda.is_available()
# Path for saved models; not referenced anywhere else in the visible notebook.
path_to_save = "./saved_model"
print("GPU:", GPU_AVAILABLE)

def enable_cuda(x):
    """Return ``x.cuda()`` when ``GPU_AVAILABLE`` is true, otherwise ``x`` itself.

    ``x`` can be anything exposing a ``.cuda()`` method (a tensor or a module).
    """
    return x.cuda() if GPU_AVAILABLE else x
def to_cpu(x):
    """Return ``x.cpu()`` when ``GPU_AVAILABLE`` is true, otherwise ``x`` itself.

    Without a GPU the object is handed back untouched (no ``.cpu()`` call).
    """
    return x.cpu() if GPU_AVAILABLE else x


In [None]:
# Ad hoc testing indicates that this RAM decoding works as intended.
def getLatentState(ram_arr, num_steps):
    """Decode a 128-byte RAM snapshot into five normalised scalar features.

    Args:
        ram_arr: Sequence of exactly 128 byte values (a Python list or a
            ``uint8`` NumPy array both work).
        num_steps: Number of steps taken so far in the episode.

    Returns:
        ``[x_coord, y_coord, prev_score, igloo_blocks, num_steps]`` as Python
        floats, each scaled by the fixed constants used below.

    Raises:
        AssertionError: If ``ram_arr`` does not have 128 entries.
        ValueError: If a score byte contains a hex digit A-F (it cannot be
            read as two decimal digits).
    """
    assert len(ram_arr) == 128

    # Convert to plain Python ints before doing arithmetic. RAM observations
    # are usually uint8; under NumPy 2 promotion rules a uint8 scalar minus a
    # Python int stays uint8 and wraps around for values below the offset.
    x_raw = int(ram_arr[100])
    y_raw = int(ram_arr[102])
    score_hi = int(ram_arr[73])
    score_lo = int(ram_arr[74])
    igloo_raw = int(ram_arr[77])

    # Each score byte is read as packed decimal: its hex digits are
    # interpreted as decimal digits (0x12 -> 12).
    prev_score = int(format(score_hi, "x")) * 100 + int(format(score_lo, "x"))

    # Byte value 255 maps to zero blocks; any other value v maps to v + 1.
    igloo_blocks = 0 if igloo_raw == 255 else igloo_raw + 1

    x_coord = (x_raw - 16) / (160.0 - 16.0)
    y_coord = (y_raw - 22) / (140.0 - 22.0)

    return [
        x_coord,
        y_coord,
        prev_score / 1600.0,
        igloo_blocks / 20.0,
        num_steps / 2000.0,
    ]

def getState(policy, num_steps, ram_arr, rgb_prev=None):
    """Build the (frame stack, latent features) state for the current step.

    The current frame is rendered from the module-level ``env`` and passed
    through ``policy.preprocess``.

    Args:
        policy: Object providing ``preprocess(batch_of_frames)``.
        num_steps: Step counter forwarded to ``getLatentState``.
        ram_arr: RAM observation forwarded to ``getLatentState``.
        rgb_prev: Frame stack returned by the previous call, or ``None`` at
            the start of an episode.

    Returns:
        ``(frames, latent)``: ``frames`` is a newly allocated stack whose last
        entry is the current frame, and ``latent`` is a ``float32`` array of
        the latent features. When ``rgb_prev`` is ``None`` the current frame
        is repeated four times.
    """
    latent = np.asarray(getLatentState(ram_arr, num_steps), dtype=np.float32)
    frame = policy.preprocess(np.expand_dims(env.render("rgb_array"), 0))

    if rgb_prev is None:
        frames = np.concatenate((frame, frame, frame, frame), axis=0)
    else:
        # Drop the oldest frame and append the newest into a NEW array.
        # Shifting ``rgb_prev`` in place would silently rewrite every stack the
        # caller stored earlier, since they would all be the same buffer.
        frames = np.concatenate((rgb_prev[1:], frame), axis=0)

    return (frames, latent)
    

In [None]:
class policy_estimator(Module):
    """Convolutional policy network over a 4-frame stack plus latent features.

    Three Conv2d/BatchNorm2d/ReLU stages process the frame stack; their
    flattened output is concatenated with the latent vector and passed through
    a two-layer MLP that emits one logit per action (``output_dim`` = 6).
    """

    def __init__(self, env):
        """Create the layers.

        Args:
            env: Accepted for interface compatibility; no attribute of it is
                read when building the network.
        """
        super(policy_estimator, self).__init__()
        self.input_dim = 6000
        self.output_dim = 6

        self.conv_depth = 3
        self.pool_stride = 1
        hidden_dim = 200
        latent_dim = 5
        # 32 channels x 5 x 6 spatial positions for a 4 x 70 x 76 input stack.
        conv_features = 960

        self.conv_layer = Sequential(
            nn.Conv2d(in_channels=4, out_channels=16, kernel_size=8, stride=4),
            nn.BatchNorm2d(16),
            ReLU(),
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=4, stride=2),
            nn.BatchNorm2d(32),
            ReLU(),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1),
            nn.BatchNorm2d(32),
            ReLU())
        self.linear_layers = Sequential(
            Linear(conv_features + latent_dim, hidden_dim),
            ReLU(),
            Linear(hidden_dim, self.output_dim))

    def preprocess(self, I):
        """Crop, downsample and grayscale a batch of frames.

        Args:
            I: Array indexed as (batch, row, column, channel).

        Returns:
            ``float32`` array (batch, rows, cols): rows 45..184 and columns
            from 8 onward are kept, every second row and column is sampled,
            and channels are mixed as 0.21*ch0 + 0.72*ch1 + 0.07*ch2.
        """
        x = I[:, 45:185, 8:]
        x = x[:, ::2, ::2, :]  # downsample by factor of 2.
        x = 0.07 * x[:, :, :, 2] + 0.72 * x[:, :, :, 1] + 0.21 * x[:, :, :, 0]
        x = x.astype(np.float32)
        return x

    def forward(self, rgb, latent):
        """Compute action logits.

        Args:
            rgb: Either a single unbatched frame stack (anything whose shape
                is not 4-D, e.g. a (4, H, W) NumPy array) or an already
                batched 4-D tensor.
            latent: Latent features matching ``rgb``: unbatched when ``rgb``
                is unbatched, otherwise a (batch, 5) tensor.

        Returns:
            Tensor of shape (batch, ``output_dim``) holding unnormalised logits.
        """
        if len(rgb.shape) != 4:
            # Unbatched input: add a batch axis and place the tensors on the
            # device this module actually lives on, rather than trusting a
            # global GPU flag that may disagree with the module's placement.
            device = next(self.parameters()).device
            rgb = torch.tensor(
                np.expand_dims(np.asarray(rgb, dtype=np.float32), axis=0),
                dtype=torch.float32, device=device)
            latent = torch.tensor(
                np.expand_dims(np.asarray(latent, dtype=np.float32), axis=0),
                dtype=torch.float32, device=device)
        x = self.conv_layer(rgb)
        # Flatten per sample; a wrong input size then fails loudly in the
        # Linear layer instead of being folded into the batch dimension.
        x = x.reshape(x.shape[0], -1)
        x = torch.cat((x, latent), dim=1)
        x = self.linear_layers(x)
        return x

In [None]:
# Seed every random number generator the notebook uses (not only the
# environment) so that a fresh-kernel run is as repeatable as possible.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

env = gym.make('Frostbite-ramDeterministic-v0')
env.unwrapped.seed(0)
rewards = []  # per-episode totals accumulated across training cells
s = env.reset()
print(s.shape)
print(env.observation_space.shape)

pe = policy_estimator(env)
pe = enable_cuda(pe)

# Smoke test: build the initial state and run one forward pass.
(rgb, latent) = getState(pe, 0, s, None)
print("latent", latent)

# Call the module itself rather than .forward() so registered hooks also run.
print(pe(rgb, latent))

# Define optimizer
optimizer = optim.Adam(pe.parameters(),
                       lr=0.0001)


In [None]:
class ProgressTracker:
    """Per-episode record of visited coordinates, with a 2-D histogram plot."""

    def __init__(self):
        # One inner list per episode; both outer lists grow together.
        self.xexplored = []
        self.yexplored = []

    def addEp(self, ep=None):
        """Append a new empty episode, or clear episode ``ep`` if it exists."""
        slot_exists = ep is not None and ep < len(self.xexplored)
        if slot_exists:
            self.xexplored[ep] = []
            self.yexplored[ep] = []
            return
        self.xexplored.append([])
        self.yexplored.append([])

    def addCoord(self, ep, xcoord, ycoord):
        """Record one position for episode ``ep``.

        ``ycoord`` is stored in ``xexplored`` and ``1 - xcoord`` in
        ``yexplored``.
        """
        self.xexplored[ep].append(ycoord)
        flipped = 1 - xcoord
        self.yexplored[ep].append(flipped)

    def plotEps(self, ep, epEnd=None):
        """Show a heat map of positions for episodes ``ep`` to ``epEnd`` (exclusive).

        When ``epEnd`` is omitted only episode ``ep`` is plotted.
        """
        stop = ep + 1 if epEnd is None else epEnd
        xs = [value for episode in self.xexplored[ep:stop] for value in episode]
        ys = [value for episode in self.yexplored[ep:stop] for value in episode]
        counts, _, _ = np.histogram2d(
            xs, ys, bins=(144 // 5, 118 // 5), range=((0, 1), (0, 1)))

        plt.clf()
        plt.imshow(counts.T, extent=[0, 1, 0, 1], origin='lower')
        plt.show()
        
    

In [None]:
# Shared tracker that collects visited coordinates across all training episodes.
progress = ProgressTracker()

In [None]:
def discount_rewards(rewards, gamma=0.99):
    """Turn per-step rewards into mean-centred discounted reward-to-go values.

    Every reward that is not strictly positive is replaced by -1. Step ``i``
    is weighted by ``gamma**i`` (measured from the start of the episode), the
    weighted values are summed from each step to the end, and the mean of the
    result is subtracted. No division by the standard deviation is applied.

    Args:
        rewards: Sequence of per-step rewards.
        gamma: Discount factor.

    Returns:
        1-D NumPy array with one value per step.
    """
    shaped = [reward if reward > 0 else -1 for reward in rewards]
    weighted = np.array([(gamma ** step) * reward
                         for step, reward in enumerate(shaped)])
    # Suffix sums: flip, accumulate, flip back.
    to_go = np.flip(np.cumsum(np.flip(weighted)))
    return to_go - to_go.mean()

def get_action(env, policy, s_0, debug=False):
    """Sample an action from the policy for a single state.

    Args:
        env: Not used by this function.
        policy: Callable invoked as ``policy(*s_0)`` that returns logits.
        s_0: Tuple of positional inputs for ``policy``.
        debug: When true, print the logits.

    Returns:
        ``(action, prob)``: the sampled action and the probability of that
        action, taken from row 0 of the distribution.
    """
    # Evaluate the policy without tracking gradients and move the result
    # through to_cpu().
    with torch.no_grad():
        logits = to_cpu(policy(*s_0).detach())
        if torch.isnan(logits).any():
            print(logits)
            assert False

    if debug:
        print(" logits:", logits)

    distribution = torch.distributions.Categorical(logits=logits)
    sampled = distribution.sample()
    return sampled, distribution.probs[0, sampled]

def do_episode(env, policy, progress, ep_num, gamma, max_steps):
    """Run one episode with ``policy`` and collect its trajectory.

    The episode stops when the environment reports completion, when the
    reported ``'ale.lives'`` count drops below 4, or after ``max_steps`` steps.

    Args:
        env: Environment whose ``step`` returns ``(obs, reward, done, info)``.
        policy: Policy used both for preprocessing and for action selection.
        progress: Tracker receiving one coordinate per step.
        ep_num: Episode index (currently unused).
        gamma: Discount factor passed to ``discount_rewards``.
        max_steps: Upper bound on the number of steps.

    Returns:
        ``(rstates, lstates, actions, actions_probs, total_reward, drewards)``.
        ``drewards`` is an empty array and ``total_reward`` is 0 when no step
        was taken.
    """
    ram_0 = env.reset()
    progress.addEp()
    rgb_0 = None

    rstates = []
    lstates = []
    rewards = []
    actions = []
    actions_probs = []
    complete = False
    while not complete and len(rstates) < max_steps:
        # Use the policy that was passed in, not a module-level global.
        rgb_0, latent = getState(policy, len(rstates), ram_0, rgb_0)

        progress.addCoord(-1, latent[0], latent[1])
        # Logits are printed once per episode (on the third step) for debugging.
        action, prob = get_action(env, policy, (rgb_0, latent), len(rstates) == 2)
        ram_0, r, complete, lives = env.step(action)
        if lives['ale.lives'] < 4:
            complete = True

        # Store a snapshot: the frame-stack buffer may be reused and shifted in
        # place on the next step, which would overwrite earlier entries.
        rstates.append(np.array(rgb_0, copy=True))
        lstates.append(latent)
        rewards.append(r)
        actions.append(action)
        actions_probs.append(prob)

    # Computed once after the loop instead of on every step (which was
    # quadratic in episode length and left both names unbound for zero steps).
    total_reward = sum(rewards)
    drewards = discount_rewards(rewards, gamma) if rewards else np.array([])
    return rstates, lstates, actions, actions_probs, total_reward, drewards

def finish_batch(policy, optimizer, all_rstates, all_lstates, all_actions,
                     all_actions_probs, all_rewards, eps_clip=0.1, max_batch=24325):
    """Apply one clipped-surrogate (PPO-style) update on a random minibatch.

    Args:
        policy: Module called as ``policy(rstate_batch, lstate_batch)`` that
            returns logits of shape (batch, n_actions).
        optimizer: Optimizer over ``policy``'s parameters.
        all_rstates: Per-step frame stacks (array-like, equal shapes).
        all_lstates: Per-step latent feature vectors.
        all_actions: Per-step actions; each entry must be convertible with
            ``int()`` (a one-element tensor works).
        all_actions_probs: Per-step probability of the chosen action under the
            policy that collected the data.
        all_rewards: Per-step advantage estimates.
        eps_clip: Clipping range for the probability ratio.
        max_batch: Maximum number of steps sampled for the update.

    Returns:
        None. Does nothing when there are no steps to sample.
    """
    n_batch = min(len(all_actions), max_batch)
    if n_batch == 0:
        return
    idxs = random.sample(range(len(all_actions)), n_batch)

    # Place the batch on the device the policy actually lives on.
    try:
        device = next(policy.parameters()).device
    except StopIteration:
        device = torch.device("cpu")

    rstate_batch = torch.as_tensor(
        np.stack([np.asarray(all_rstates[i], dtype=np.float32) for i in idxs])).to(device)
    lstate_batch = torch.as_tensor(
        np.stack([np.asarray(all_lstates[i], dtype=np.float32) for i in idxs])).to(device)
    # A flat integer index per row; avoids any dependence on how the stored
    # action objects (e.g. shape-(1,) tensors) would be stacked.
    action_batch = torch.tensor([int(all_actions[i]) for i in idxs],
                                dtype=torch.long, device=device)
    old_probs_batch = torch.cat(
        [torch.as_tensor(all_actions_probs[i], dtype=torch.float32).reshape(-1)
         for i in idxs]).detach().to(device)
    advantage_batch = torch.tensor([float(all_rewards[i]) for i in idxs],
                                   dtype=torch.float32, device=device)

    optimizer.zero_grad()

    # n_batch x output_dim logits -> probability of the action actually taken.
    logits = policy(rstate_batch, lstate_batch)
    new_probs_batch = F.softmax(logits, dim=1).gather(
        1, action_batch.unsqueeze(1)).squeeze(1)

    for checked in (new_probs_batch, old_probs_batch, advantage_batch):
        if torch.isnan(checked).any():
            print(checked)
            assert False

    r = new_probs_batch / old_probs_batch
    unclipped = r * advantage_batch
    clipped = torch.clamp(r, 1 - eps_clip, 1 + eps_clip) * advantage_batch
    # The clipped surrogate is an objective to MAXIMISE; the optimizer
    # minimises, so the loss is its negation.
    loss = -torch.mean(torch.min(unclipped, clipped))
    print("Loss ", loss)

    # Calculate gradients
    loss.backward()
    # Apply gradients
    optimizer.step()

In [None]:
def reinforce(env, policy, optimizer, progress, num_episodes=120,
              batch_size=11, max_steps =5000, gamma=0.99, eps_clip=0.1):
    """Train ``policy`` by collecting episodes and applying batched updates.

    Episodes are accumulated until the batch counter reaches ``batch_size``.
    The counter starts at 1, so each update uses ``batch_size - 1`` episodes.
    Two update passes are then run on the accumulated steps and the buffers
    are cleared. Episodes collected after the last full batch are not used.

    Args:
        env: Environment to roll out in.
        policy: Policy network being trained.
        optimizer: Optimizer over ``policy``'s parameters.
        progress: Tracker passed to ``do_episode``.
        num_episodes: Number of episodes to run.
        batch_size: Batch counter threshold that triggers an update.
        max_steps: Per-episode step limit.
        gamma: Discount factor.
        eps_clip: Clipping range forwarded to ``finish_batch``.

    Returns:
        List with the total (undiscounted) reward of every episode.
    """
    # Set up lists to hold results
    total_rewards = []
    all_rewards = []
    all_actions = []
    all_actions_probs = []
    all_rstates = []
    all_lstates = []
    batch_counter = 1

    try:
        for ep in range(num_episodes):
            rs, ls, a, a_prob, tot_r, dis_r = do_episode(env, policy, progress, ep, gamma, max_steps)

            # If complete, batch data
            all_rewards.extend(dis_r)
            all_rstates.extend(rs)
            all_lstates.extend(ls)
            all_actions_probs.extend(a_prob)
            all_actions.extend(a)
            batch_counter += 1
            total_rewards.append(tot_r)
            print("Ep ", ep, ", len ", len(a),
                      " last_action:", a[-1], " last_action_prob:", a_prob[-1]," reward:", tot_r)

            # If batch is complete, update network
            if batch_counter >= batch_size:
                for _ in range(2):
                    # Train the policy that was passed in and honour the
                    # caller's clipping range.
                    finish_batch(policy, optimizer, all_rstates, all_lstates, all_actions,
                         all_actions_probs, all_rewards, eps_clip=eps_clip)

                all_rewards = []
                all_actions_probs = []
                all_actions = []
                all_rstates = []
                all_lstates = []
                batch_counter = 1

                # Print running average
                print("Ep: {} Average of last 10: {:.2f}\n".format(
                    ep + 1, np.mean(total_rewards[-10:])), end="")

    except Exception as e:
        # Surface the message in the cell output, then re-raise with the
        # original traceback intact.
        print(e)
        raise

    return total_rewards

In [None]:
# Train for 200 episodes and append the per-episode totals to the global
# `rewards` history (which persists across re-runs of this cell).
rewards_ = reinforce(env, pe, optimizer, progress, 200)
rewards.extend(rewards_)
# Trailing moving average: once i > window it spans rewards[i-window .. i]
# (window + 1 samples); before that it averages everything seen so far.
window = 10
smoothed_rewards = [np.mean(rewards[i-window:i+1]) if i > window 
                    else np.mean(rewards[:i+1]) for i in range(len(rewards))]

# Raw and smoothed episode returns on the same axes.
plt.figure(figsize=(12,8))
plt.plot(rewards)
plt.plot(smoothed_rewards)
plt.ylabel('Total Rewards')
plt.xlabel('Episodes')
plt.show()

In [None]:
# Heat map of visited positions for episodes 900..999.
# NOTE(review): a single run of the training cell adds only 200 episodes, so
# this range is empty unless training was run several times; confirm the
# intended range.
progress.plotEps(900,1000)

In [None]:
from IPython import display
# Roll out the current policy for up to 500 steps, displaying each rendered
# frame and printing any positive reward.
ram_0 = env.reset()
rgb_0 = None  # no previous frame stack yet; getState builds the initial one
i = 0
complete = False
while complete == False and i < 500:
    # Build the (frame stack, latent) state for the current step.
    rgb_0, latent = getState(pe, i, ram_0, rgb_0)
    i += 1

    # Sample an action; the debug flag is true only when i == 2.
    action, prob = get_action(env, pe, (rgb_0, latent), i == 2)
    
    # Show the frame the action was chosen for, then advance the environment.
    plt.figure()
    plt.imshow(env.render("rgb_array"))
    plt.show()
    ram_0, r, complete, lives = env.step(action)
    if r > 0:
        print(r)

In [None]:
# Total number of trainable parameters in the policy network.
sum(p.numel() for p in pe.parameters() if p.requires_grad)