In [None]:
import numpy as np
import matplotlib.pyplot as plt
import gym
import sys
import random

import torch
from torch import nn
from torch import optim
from torch.nn import Module, Linear, ReLU, Sigmoid, Sequential, MaxPool2d
from torch.nn import functional as F

# Log the Python version, the PyTorch version, and torch.version.cuda so that
# results from different environments can be compared.
print(sys.version)
print(torch.__version__)
print(torch.version.cuda)

In [None]:
# CUDA availability is probed once here; enable_cuda/to_cpu branch on this flag.
GPU_AVAILABLE = torch.cuda.is_available()
# NOTE(review): path_to_save is not referenced anywhere else in the visible notebook.
path_to_save = "./saved_model"
print("GPU:", GPU_AVAILABLE)

def enable_cuda(x):
    """Return ``x.cuda()`` when GPU_AVAILABLE is set, otherwise ``x`` untouched."""
    return x.cuda() if GPU_AVAILABLE else x
def to_cpu(x):
    """Return ``x.cpu()`` when GPU_AVAILABLE is set, otherwise ``x`` untouched."""
    return x.cpu() if GPU_AVAILABLE else x


In [None]:
# ad hoc testing shows good
def getLatentState(ram_arr, num_steps, done):
    """Build the 6-value latent feature vector from a 128-byte RAM observation.

    Args:
        ram_arr: sequence of 128 byte values (asserted). Indices 100 and 102 are
            read as the x/y coordinates, 73 and 74 as the score bytes, and 77 as
            the igloo block counter.
        num_steps: number of steps taken so far; scaled by 1/2000.
        done: episode-finished flag; stored as 0 or 1.

    Returns:
        ``np.float32`` array ``[x, y, score, igloo_blocks, num_steps, done]``
        with the scalings applied below.

    Raises:
        AssertionError: if ``ram_arr`` does not hold exactly 128 entries.
        ValueError: if a score byte contains a hex digit above 9.
    """
    assert len(ram_arr) == 128

    # Convert to plain Python ints before doing arithmetic: when ram_arr is a
    # uint8 array its elements are np.uint8 scalars, and subtracting the offsets
    # from a small byte can wrap around instead of going negative (depending on
    # the NumPy version's promotion rules).
    x_coord = int(ram_arr[100])
    y_coord = int(ram_arr[102])
    igloo_blocks = int(ram_arr[77])

    # Each score byte is decoded by reading its hex digits as a decimal number
    # (0x12 -> 12); byte 73 carries the hundreds, byte 74 the units.
    prev_score = int(format(int(ram_arr[73]), 'x')) * 100
    prev_score += int(format(int(ram_arr[74]), 'x'))

    x_norm = (x_coord - 16) / (160.0 - 16.0)
    y_norm = (y_coord - 22) / (140.0 - 22.0)
    # A raw value of 255 is mapped to 0 blocks; every other value is shifted by one.
    igloo_blocks = 0 if igloo_blocks == 255 else igloo_blocks + 1

    latent = [
        x_norm,
        y_norm,
        prev_score / 1600.0,
        igloo_blocks / 20.0,
        num_steps / 2000.0,
        int(done),
    ]
    return np.asarray(latent, dtype=np.float32)

def getRGB(policy, ram_arr, rgb_prev=None):
    """Render the current frame and return the updated 4-frame stack.

    The frame is rendered from the module-level ``env`` and passed through
    ``policy.preprocess``.

    Args:
        policy: object exposing ``preprocess(batch_of_frames)``.
        ram_arr: unused; kept so existing positional callers keep working.
        rgb_prev: previous frame stack (oldest first) or ``None``.

    Returns:
        A new array. With ``rgb_prev is None`` it is the current frame repeated
        four times along axis 0; otherwise it is ``rgb_prev`` with its oldest
        frame dropped and the current frame appended.

    The result is always a freshly allocated array. ``rgb_prev`` is never
    modified, so stacks a caller has already stored (e.g. in a replay list)
    are not overwritten by later frames.
    """
    frame = policy.preprocess(np.expand_dims(env.render("rgb_array"), 0))
    if rgb_prev is None:
        return np.concatenate((frame, frame, frame, frame), axis=0)
    return np.concatenate((rgb_prev[1:], frame), axis=0)
    

def latentStateEq(a, b):
    """Tell whether latent state ``b`` matches-or-exceeds latent state ``a``.

    True when the first two entries (coordinates) of ``a`` and ``b`` are less
    than 0.05 apart in Euclidean distance and entries 2 and 3 of ``b`` are both
    at least as large as those of ``a``.
    """
    tolerance = 0.05
    distance = np.linalg.norm(a[:2] - b[:2])
    close_enough = distance < tolerance
    return close_enough and b[2] >= a[2] and b[3] >= a[3]

In [None]:

class policy_estimator(Module):
    """Convolutional policy network producing 6 action logits.

    Inputs to ``forward`` are a 4-frame stack of preprocessed grayscale frames
    plus a 12-value latent vector; the convolutional features are concatenated
    with the latent vector and fed to a two-layer MLP head.
    """

    def __init__(self, env):
        """Build the layers. ``env`` is accepted for API compatibility but not used."""
        super(policy_estimator, self).__init__()
        self.input_dim = 6000
        self.output_dim = 6

        self.conv_depth = 3
        self.pool_stride = 1
        hidden_dim = 200
        # For the 70x76 frames produced by preprocess() on a 210x160 image, the
        # conv stack yields 32 channels of 5x6, i.e. 960 features per sample.
        self.conv_features = 960
        # Length of the latent vector concatenated to the conv features.
        self.latent_dim = 12

        self.conv_layer = Sequential(
            nn.Conv2d(in_channels=4, out_channels=16, kernel_size=8, stride=4),
            nn.BatchNorm2d(16),
            ReLU(),
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=4, stride=2),
            nn.BatchNorm2d(32),
            ReLU(),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1),
            nn.BatchNorm2d(32),
            ReLU())
        self.linear_layers = Sequential(
            Linear(self.conv_features + self.latent_dim, hidden_dim),
            ReLU(),
            Linear(hidden_dim, self.output_dim))

    def preprocess(self, I):
        """Crop, downsample by 2 and convert a batch of RGB frames to grayscale.

        Args:
            I: array indexed as (batch, row, column, channel).

        Returns:
            ``np.float32`` array indexed as (batch, row, column): rows 45..184
            and columns 8.. of the input, every second pixel kept, channels
            mixed with weights 0.21 / 0.72 / 0.07 for channels 0 / 1 / 2.
        """
        cropped = I[:, 45:185, 8:]
        halved = cropped[:, ::2, ::2, :]
        gray = 0.07 * halved[:, :, :, 2] + 0.72 * halved[:, :, :, 1] + 0.21 * halved[:, :, :, 0]
        return gray.astype(np.float32)

    def forward(self, rgb, latent):
        """Return action logits of shape (batch, output_dim).

        Args:
            rgb: either a batched 4-D tensor, or a single unbatched frame stack
                (anything ``np.asarray`` accepts) which is converted to a
                float tensor with a leading batch axis.
            latent: latent vectors matching ``rgb``; converted together with
                ``rgb`` when ``rgb`` is unbatched.
        """
        if len(rgb.shape) != 4:
            # Place single observations on whatever device the weights live on,
            # so the module does not rely on a notebook-level helper.
            device = next(self.parameters()).device
            rgb = torch.tensor(np.asarray(rgb, dtype=np.float32),
                               dtype=torch.float32, device=device).unsqueeze(0)
            latent = torch.tensor(np.asarray(latent, dtype=np.float32),
                                  dtype=torch.float32, device=device).unsqueeze(0)
        features = self.conv_layer(rgb)
        # Flatten per sample (keeps the batch axis intact); a frame size that
        # does not give conv_features values then fails loudly in the linear
        # head instead of being silently reshaped across samples.
        features = torch.flatten(features, start_dim=1)
        features = torch.cat((features, latent), dim=1)
        return self.linear_layers(features)

In [None]:
class ProgressTracker:
    """Per-episode record of visited coordinates, with a heatmap plot."""

    def __init__(self):
        # One inner list per episode.
        self.xexplored = []
        self.yexplored = []

    def addEp(self, ep=None):
        """Start a new episode slot, or reset slot ``ep`` if it already exists."""
        if ep is not None and ep < len(self.xexplored):
            self.xexplored[ep] = []
            self.yexplored[ep] = []
            return
        self.xexplored.append([])
        self.yexplored.append([])

    def addCoord(self, ep, xcoord, ycoord):
        """Record one visited coordinate for episode ``ep``.

        NOTE(review): the plotted x value is ``ycoord`` and the plotted y value
        is ``1 - xcoord``; presumably a deliberate axis swap/flip for display -
        confirm.
        """
        plot_x = ycoord
        plot_y = 1- xcoord
        self.xexplored[ep].append(plot_x)
        self.yexplored[ep].append(plot_y)

    def plotEps(self, ep, epEnd=None):
        """Show a 2-D histogram of the coordinates from episodes ``ep`` to ``epEnd`` (exclusive).

        With ``epEnd`` omitted only episode ``ep`` is plotted.
        """
        stop = ep + 1 if epEnd is None else epEnd
        xs = [value for episode in self.xexplored[ep:stop] for value in episode]
        ys = [value for episode in self.yexplored[ep:stop] for value in episode]
        heatmap, _xedges, _yedges = np.histogram2d(
            xs, ys, bins=(144//5, 118//5), range=((0,1), (0, 1)))

        plt.clf()
        plt.imshow(heatmap.T, extent=[0, 1, 0,  1], origin='lower')
        plt.show()
        
    

In [None]:
# Create the Frostbite RAM-observation environment and seed the unwrapped env with 0.
# NOTE(review): only the env is seeded here; torch / random / numpy are not seeded
# anywhere in the visible notebook, so runs are not fully reproducible.
env = gym.make('Frostbite-ramDeterministic-v0')
env.unwrapped.seed(0)
rewards = []
s = env.reset()
# Parallel lists: index 0 is passed to reinforce() as Alice, index 1 as Bob.
policy = []
optimizer = []
progress = []

for i in range(2):
    policy.append(policy_estimator(env))
    policy[-1] = enable_cuda(policy[-1])
    # NOTE(review): getRGB does not use its second argument; it renders from the global env itself.
    rgb = getRGB(policy[-1], env.render('rgb_array'))
    latent = getLatentState(s, 0, False)
    print("latent", latent)
    # Duplicate the 6-value latent to form the 12-value vector fed to the policy.
    latent = np.concatenate((latent, latent))

    # Smoke test: one forward pass on the initial observation, without gradients.
    with torch.no_grad():
        print(policy[-1].forward(rgb, latent))
    # Define optimizer
    optimizer.append(optim.Adam(policy[-1].parameters(), 
                           lr=0.0001))
    progress.append(ProgressTracker())
    rewards.append([])

In [None]:
def discount_rewards(rewards, gamma=0.99):
    """Return mean-centred, discounted reward-to-go values.

    Every reward that is not strictly positive is first replaced by -1. Step
    ``i`` then receives ``sum_{j >= i} gamma**j * reward[j]`` (the discount
    exponent is the absolute step index), and the mean over all steps is
    subtracted. No division by the standard deviation is applied.

    Args:
        rewards: sequence of per-step rewards.
        gamma: discount factor.

    Returns:
        1-D float array with one entry per input reward; an empty array for
        empty input.
    """
    shaped = np.asarray([r if r > 0 else -1 for r in rewards], dtype=np.float64)
    if shaped.size == 0:
        # Nothing to centre; avoids taking the mean of an empty array.
        return shaped
    weights = gamma ** np.arange(shaped.size)
    # Reverse, accumulate, reverse back: suffix sums of the discounted rewards.
    returns = np.cumsum((weights * shaped)[::-1])[::-1]
    return returns - returns.mean()

def get_action(env, policy, s_0, debug=False):
    """Sample one action from the policy for state ``s_0``.

    ``s_0`` is unpacked into the policy call; the resulting logits are detached
    and passed through ``to_cpu``. ``env`` is not used.

    Returns:
        ``(action, prob)``: the sampled action and the probability the policy
        assigned to it (row 0 of the distribution, indexed by the action).
    """
    with torch.no_grad():
        logits = to_cpu(policy(*s_0).detach())
        has_nan = torch.isnan(logits).any()
        if has_nan:
            print(logits)
            assert False

    if debug:
        print(" logits:", logits)
    distribution = torch.distributions.Categorical(logits=logits)
    sampled = distribution.sample()
    sampled_prob = distribution.probs[0, sampled]
    return sampled, sampled_prob

def finish_batch(policy, optimizer, all_rstates, all_lstates, all_actions,
                     all_actions_probs, all_rewards, eps_clip=0.1):
    """Run one clipped-surrogate (PPO-style) policy update on the collected steps.

    Args:
        policy: ``nn.Module`` called as ``policy(rgb_batch, latent_batch)`` and
            returning logits of shape (batch, n_actions).
        optimizer: optimizer over ``policy``'s parameters.
        all_rstates: per-step frame stacks (array-likes of equal shape).
        all_lstates: per-step latent vectors (array-likes of equal length).
        all_actions: per-step action indices; plain ints or one-element
            tensors/arrays are both accepted.
        all_actions_probs: per-step probability of the taken action under the
            policy that collected the data (floats or one-element tensors).
        all_rewards: per-step advantage estimates.
        eps_clip: clipping range for the probability ratio.

    Returns:
        The scalar loss value as a float, or ``None`` when there are no steps.

    Raises:
        AssertionError: if NaNs appear in the new probabilities, old
            probabilities or advantages.
    """
    n_total = len(all_actions)
    if n_total == 0:
        # Nothing collected: skip the update rather than feeding an empty batch.
        return None
    n_batch = min(n_total, 24325)
    idxs = random.sample(range(n_total), n_batch)
    # Build the batch on the device the policy's weights live on.
    device = next(policy.parameters()).device

    rstate_batch = torch.from_numpy(
        np.stack([np.asarray(all_rstates[i], dtype=np.float32) for i in idxs])).to(device)
    lstate_batch = torch.from_numpy(
        np.stack([np.asarray(all_lstates[i], dtype=np.float32) for i in idxs])).to(device)
    # int(...) collapses one-element tensors to scalars so the index is 1-D.
    action_batch = torch.tensor([int(all_actions[i]) for i in idxs],
                                dtype=torch.long, device=device)
    old_probs_batch = torch.cat(
        [torch.as_tensor(all_actions_probs[i], dtype=torch.float32).reshape(1)
         for i in idxs]).detach().to(device)
    advantage_batch = torch.tensor([float(all_rewards[i]) for i in idxs],
                                   dtype=torch.float32, device=device)

    optimizer.zero_grad()

    # Probability the current policy assigns to each action that was taken.
    logits = policy(rstate_batch, lstate_batch)
    new_probs_batch = F.softmax(logits, dim=1).gather(
        1, action_batch.unsqueeze(1)).squeeze(1)

    for label, values in (("new_probs_batch", new_probs_batch),
                          ("old_probs_batch", old_probs_batch),
                          ("advantage_batch", advantage_batch)):
        if torch.isnan(values).any():
            print(values)
            raise AssertionError("NaN encountered in " + label)

    ratio = new_probs_batch / old_probs_batch
    unclipped = ratio * advantage_batch
    clipped = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage_batch
    # The clipped surrogate is an objective to MAXIMISE; the optimizer minimises,
    # so the loss is its negation.
    loss = -torch.mean(torch.min(unclipped, clipped))
    print("Loss ", loss)

    # Calculate gradients
    loss.backward()
    # Apply gradients
    optimizer.step()
    return loss.item()

In [None]:
# Action index that ends Alice's turn in selfplayEpisode; when Bob samples it,
# it is replaced by kNoop before the environment is stepped.
kStopAction = 1
# Action index substituted for kStopAction during Bob's turn.
kNoop = 0
def selfplayEpisode(env, policy_a, policy_b, progress_a, progress_b,
                        tmax = 30, tdiff = 10, scale=1.0, gamma=0.99):
    """Play one Alice-then-Bob self-play episode and collect both trajectories.

    Alice acts for up to ``tmax`` steps (or until she samples ``kStopAction``);
    her final latent state becomes Bob's target. The env is then reset and Bob
    acts for up to ``tmax + tdiff//2`` steps, stopping early when
    ``latentStateEq`` reports he reached the target or when his game ends.

    Args:
        env: gym environment (old 4-tuple ``step`` API).
        policy_a, policy_b: Alice's and Bob's policies.
        progress_a, progress_b: trackers receiving each visited coordinate.
        tmax: maximum number of Alice steps.
        tdiff: extra time; if Bob's game ends, his time is counted as ``ta + tdiff``.
        scale: weight of the self-play reward versus the game reward.
        gamma: discount factor passed to ``discount_rewards``.

    Returns:
        Two tuples (Alice, Bob), each
        ``(rgb_states, latent_states, actions, action_probs, discounted_rewards, total_reward)``.
        Within each tuple the first five entries have one element per recorded step.
    """
    ta, tb = 0, 0
    obs = env.reset()
    s0_latent = getLatentState(obs, 0, False)
    s0_rgb = getRGB(policy_a, env.render('rgb_array'))
    # Give each player an independent frame stack so updating one can never
    # change the other's.
    sa_rgb, sb_rgb = s0_rgb.copy(), s0_rgb.copy()
    # Defined up front so Bob's loop has a target even if Alice takes no step.
    sa_latent = s0_latent
    done = False
    ra_game = 0
    rb_game = 0
    sra_arr, sla_arr, srb_arr, slb_arr = [], [], [], []
    aa_arr, apa_arr = [], []
    ab_arr, apb_arr = [], []
    progress_a.addEp()
    progress_b.addEp()
    while ta < tmax:
        sa_rgb = getRGB(policy_a, env.render('rgb_array'), sa_rgb)
        sa_latent = getLatentState(obs, ta, done) # don't clobber for bob
        progress_a.addCoord(-1, sa_latent[0], sa_latent[1])
        sa_latent_ = np.concatenate((sa_latent, s0_latent))

        sa = [sa_rgb, sa_latent_]
        action, action_prob = get_action(env, policy_a, sa)
        # Store a snapshot: the frame stack may be updated in place later.
        sra_arr.append(sa_rgb.copy())
        sla_arr.append(sa_latent_)
        aa_arr.append(action)
        apa_arr.append(action_prob)
        ta += 1
        action_idx = int(action)
        if action_idx == kStopAction:
            break
        if not done:
            obs, reward, done, lives = env.step(action_idx)
            if lives['ale.lives'] < 4:
                done = True
            ra_game += reward

    done = False
    obs = env.reset()
    while tb < tmax + tdiff//2:
        sb_rgb = getRGB(policy_b, env.render('rgb_array'), sb_rgb)
        sb_latent = getLatentState(obs, tb, done)
        progress_b.addCoord(-1, sb_latent[0], sb_latent[1])
        sb_latent_ = np.concatenate((sb_latent, sa_latent))

        sb = [sb_rgb, sb_latent_]
        action, action_prob = get_action(env, policy_b, sb)
        srb_arr.append(sb_rgb.copy())
        slb_arr.append(sb_latent_)
        ab_arr.append(action)
        apb_arr.append(action_prob)
        tb += 1
        if latentStateEq(sa_latent, sb_latent):
            print(" Bob got it! ", tb, ta)
            break
        if done:
            # Failure: Bob is charged the maximum time for the reward below.
            tb = ta + tdiff
            break
        # translate stop to noop
        action_idx = int(action)
        action_idx = kNoop if action_idx == kStopAction else action_idx
        obs, reward, done, lives = env.step(action_idx)
        if lives['ale.lives'] < 4:
            done = True
        rb_game += reward


    print("  > game rewards: ", ra_game, rb_game)
    ra = (1 - scale) * ra_game + scale * max(0, tb - ta)
    rb = (1 - scale) * rb_game + -scale * tb
    # update with custom losses
    # The reward vectors must have exactly one entry per RECORDED step. tb may
    # have been overwritten with the failure time above, so Bob's length is
    # taken from his action list rather than from tb.
    steps_a = len(aa_arr)
    steps_b = len(ab_arr)
    dra_arr = discount_rewards([0.0 if i!=(steps_a - 1) else ra for i in range(steps_a)], gamma)
    drb_arr = discount_rewards([0.0 if i!=(steps_b - 1) else rb for i in range(steps_b)], gamma)

    return ((sra_arr, sla_arr, aa_arr, apa_arr, dra_arr, ra),
            (srb_arr, slb_arr, ab_arr, apb_arr, drb_arr, rb))



In [None]:

def reinforce(env, po_a, po_b, o_a, o_b, pr_a, pr_b, num_episodes=120,
              batch_size=11, max_steps =5000, gamma=0.99, eps_clip=0.1):
    """Train Alice and Bob with self-play episodes and batched policy updates.

    Args:
        env: gym environment handed to ``selfplayEpisode``.
        po_a, po_b: Alice's and Bob's policies.
        o_a, o_b: their optimizers.
        pr_a, pr_b: their progress trackers.
        num_episodes: number of self-play episodes to run.
        batch_size: an update is triggered once the episode counter (which
            starts at 1) reaches this value, i.e. after ``batch_size - 1``
            episodes.
        max_steps: unused; kept for interface compatibility.
        gamma: discount factor forwarded to ``selfplayEpisode``.
        eps_clip: clipping range forwarded to ``finish_batch``.

    Returns:
        ``[alice_totals, bob_totals]``: per-episode total rewards.

    Raises:
        Any exception from an episode or update is printed and re-raised.
    """
    # Set up lists to hold results
    # alice @ index 0, bob @ 1
    dr, tr = [[],[]], [[], []]  # discounted, total rewards
    a, ap = [[], []], [[], []]  # action,     action prob
    sr, sl = [[], []], [[], []] # state rgb,  latent
    batch_counter = 1
    players = ((po_a, o_a), (po_b, o_b))

    try:
        for ep in range(num_episodes):
            res = selfplayEpisode(env, po_a, po_b, pr_a, pr_b, gamma=gamma)
            for i in range(2):
                sr[i].extend(res[i][0])
                sl[i].extend(res[i][1])
                a[i].extend(res[i][2])
                ap[i].extend(res[i][3])
                dr[i].extend(res[i][4])
                tr[i].append(res[i][5])

            batch_counter += 1
            print("Ep ", ep, ", len ", len(res[0][2]), len(res[1][2]), ", aa:", a[0][-1], a[1][-1], 
                      " ap[0]:", ap[0][-1], ap[1][-1]," tr:", tr[0][-1],tr[1][-1])

            # If batch is complete, update network
            if batch_counter >= batch_size:
                # Two update passes over the batch, alternating Alice and Bob.
                for _ in range(2):
                    for i, (player_policy, player_optimizer) in enumerate(players):
                        finish_batch(player_policy, player_optimizer, sr[i], sl[i],
                                     a[i], ap[i], dr[i], eps_clip=eps_clip)

                for i in range(2):
                    sr[i].clear()
                    sl[i].clear()
                    a[i].clear()
                    ap[i].clear()
                    dr[i].clear()
                batch_counter = 1

                # Print running average (Alice first, then Bob)
                print("Ep: {} Average of last 10: {:.2f} {:.2f}\n".format(
                    ep + 1, np.mean(tr[0][-10:]), np.mean(tr[1][-10:])), end="")

    except Exception as e:
        print(e)
        # Bare raise keeps the original traceback intact.
        raise

    return tr

In [None]:
# Run 240 self-play episodes and append each player's per-episode totals to the
# global reward history, then plot raw and smoothed curves for Alice and Bob.
# NOTE(review): rewards is extended in place, so re-running this cell appends
# another run to the existing history.
rewards_ = reinforce(env, policy[0], policy[1], optimizer[0], optimizer[1], progress[0], progress[1], 240)
rewards[0].extend(rewards_[0])
rewards[1].extend(rewards_[1])
window = 10
plt.figure(figsize=(12,8))
smoothed_rewards = []
for ii in range(2):
    # Trailing mean over the last window+1 episodes (or all episodes so far for
    # early indices). The final 100 episodes are left out of both curves.
    smoothed_rewards.append([np.mean(rewards[ii][i-window:i+1]) if i > window 
                        else np.mean(rewards[ii][:i+1]) for i in range(len(rewards[ii]) - 100)])
    name = "Alice" if ii == 0 else "Bob"
    plt.plot(rewards[ii][:-100], label = name)
    plt.plot(smoothed_rewards[ii], label = name + "_smooth")
plt.ylabel('Total Rewards')
plt.xlabel('Episodes')
plt.legend()
plt.title("Alice-Bob style training with fixed Tmax")
plt.show()

In [None]:
# Heatmap of every coordinate recorded by progress[1] (the tracker passed to
# reinforce as Bob's) for episode slots 0 up to, but not including, 750; the
# slice simply stops at the last recorded episode if there are fewer.
progress[1].plotEps(0,750)

In [None]:
# Number of trainable parameters in one policy network. The name `pe` used
# previously is not defined anywhere in this notebook, so the cell failed on a
# fresh kernel; policy[0] is the first policy_estimator built above.
sum(p.numel() for p in policy[0].parameters() if p.requires_grad)