In [1]:
# from model import *
from agent_internals import *

In [2]:
from ppo_helper import *

In [3]:
brain = DefaultAgentBrain(7).cuda()

In [4]:
# I will write a demo reward func
# Reward func has to accept a batch of traces and the seed offset
# Reward func must provide 0 for everything after termination

In [5]:
#def reward_func_full(traces, seed_offset, past_terminated, contexts=None):
#    # ignore contexts for now
#    with torch.no_grad():
#        batches, trace_length = traces.size()
#        reward_len = trace_length - seed_offset + 1 # includes reward for initial seed
#        rewards = torch.zeros(batches, reward_len, device=traces.device)
#        for i in range(reward_len):
#            trace_index = i + seed_offset - 1 # includes reward for initial seed
#            rewards[:, i] += torch.logical_and(
#                             torch.logical_and((traces[:, trace_index] > 2),  \
#                                               (torch.logical_not(past_terminated[:, i]))), \
#                             (((traces[:, trace_index] - traces[:, trace_index-1]) % 4) == 1)) * 1.0
#            rewards[:, i] += torch.logical_and((traces[:, trace_index] == 2),  \
#                                              (torch.logical_not(past_terminated[:, i]))) * 5.0 # teach it to finish early
#    return rewards

In [6]:
# only 4's get rewards; nothing else matters.
def reward_func_stupid(traces, seed_offset, past_terminated, contexts=None):
    # ignore contexts for now
    with torch.no_grad():
        batches, trace_length = traces.size()
        reward_len = trace_length - seed_offset + 1 # includes reward for initial seed
        rewards = torch.zeros(batches, reward_len, device=traces.device)
        for i in range(reward_len):
            trace_index = i + seed_offset - 1 # includes reward for initial seed
            rewards[:, i] += torch.logical_and((traces[:, trace_index] == 4),  \
                                              (torch.logical_not(past_terminated[:, i]))) * 5.0 # teach it to finish early
    return rewards

In [7]:
# only 4's get rewards; nothing else matters.
def reward_func_modular(traces, seed_offset, past_terminated, contexts=None):
    # ignore contexts for now
    with torch.no_grad():
        batches, trace_length = traces.size()
        reward_len = trace_length - seed_offset + 1 # includes reward for initial seed
        rewards = torch.zeros(batches, reward_len, device=traces.device)
        for i in range(reward_len):
            trace_index = i + seed_offset - 1 # includes reward for initial seed
            rewards[:, i] += torch.logical_and(
                             torch.logical_and((traces[:, trace_index] > 2),  \
                                               (torch.logical_not(past_terminated[:, i]))), \
                             (((traces[:, trace_index] - traces[:, trace_index-1]) % 4) == 1)) * 1.0
    return rewards

In [8]:
# only a reward at the end
def reward_func_final(traces, seed_offset, past_terminated, contexts=None):
    # ignore contexts for now
    with torch.no_grad():
        batches, trace_length = traces.size()
        reward_len = trace_length - seed_offset + 1 # includes reward for initial seed
        rewards = torch.zeros(batches, reward_len, device=traces.device)
        for i in range(reward_len):
            trace_index = i + seed_offset - 1 # includes reward for initial seed
            rewards[:, i] += torch.logical_and((traces[:, trace_index] == 2),  \
                                              (torch.logical_not(past_terminated[:, i]))) * 5.0 # teach it to finish early
    return rewards

In [9]:
def reward_func_full(traces, seed_offset, past_terminated, contexts=None):
    return reward_func_modular(traces, seed_offset, past_terminated, contexts) + \
           reward_func_final(traces, seed_offset, past_terminated, contexts)

In [10]:
reward_func = reward_func_stupid #reward_func_modular #reward_func_stupid #reward_func_full

In [11]:
#def get_value(x):
#    with torch.no_grad():
#        val = brain.dopamine(brain.text_enc(x))
#    return val

# First attempt will have full gradient for brain - ruining the val func - and full gradient for val func.
# Another attempt will have a smarter gradient for the policy (only the generator, not the encoder)
get_value = SolitaryValueFunc(7).cuda()
get_value.text_enc = brain.text_enc
get_value.dopamine = brain.dopamine

In [12]:
# From the lunar lander demo code: gamma 0.99, tau 0.97.
# Will maybe change later since the env is so different
#                                        policy,  value,  gamma, tau, reward_func
env_buffer = SentenceOutputSingleEpisode(brain, get_value, 0.99, 0.97, reward_func)

In [13]:
seeds = torch.randint(3, 7, (3, 10), device=brain.get_device())

In [14]:
get_value(seeds)

tensor([[-0.6918],
        [-4.1317],
        [-3.6766]], device='cuda:0', grad_fn=<SumBackward1>)

In [15]:
env_buffer.fill(seeds)

In [16]:
# ok, that's tested, now let's test an actual full training session

In [17]:
def get_seeds(minVal=3, maxVal=7, lenSeeds=10, batchSize=16, device='cuda'):
    return torch.randint(minVal, maxVal, (batchSize, lenSeeds), device=device)

In [18]:
0.7**4

0.24009999999999995

In [19]:
def get_bb(num_buffers=64):
    buffer_buffer = []
    for i in range(num_buffers):
        print(i)
        # Let's try to make this particular system simpler on the value func, 
        # make it focus on the here-and-now, more. Old vals: 0.99, 0.97
        #                                                      0.7, 0.6
        buffer = SentenceOutputSingleEpisode(brain, get_value, 0.0, 0.0, reward_func)
        buffer.fill(get_seeds(batchSize=8))
        buffer = buffer.to('cpu') # avoid eating VRAM
        buffer_buffer.append(buffer)
    return buffer_buffer
num_buffers=64
buffer_buffer = get_bb(num_buffers)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63


In [20]:
buffer_buffer

[<ppo_helper.SentenceOutputSingleEpisode at 0x7f29a3150b90>,
 <ppo_helper.SentenceOutputSingleEpisode at 0x7f29a40bae10>,
 <ppo_helper.SentenceOutputSingleEpisode at 0x7f29a3ed1be0>,
 <ppo_helper.SentenceOutputSingleEpisode at 0x7f2a8daa6960>,
 <ppo_helper.SentenceOutputSingleEpisode at 0x7f29a402de20>,
 <ppo_helper.SentenceOutputSingleEpisode at 0x7f2a8da7cc20>,
 <ppo_helper.SentenceOutputSingleEpisode at 0x7f29a5043fb0>,
 <ppo_helper.SentenceOutputSingleEpisode at 0x7f29a3ea6f60>,
 <ppo_helper.SentenceOutputSingleEpisode at 0x7f2a8daa55b0>,
 <ppo_helper.SentenceOutputSingleEpisode at 0x7f29a3c33b60>,
 <ppo_helper.SentenceOutputSingleEpisode at 0x7f29a304a2d0>,
 <ppo_helper.SentenceOutputSingleEpisode at 0x7f29a3049160>,
 <ppo_helper.SentenceOutputSingleEpisode at 0x7f29a304b260>,
 <ppo_helper.SentenceOutputSingleEpisode at 0x7f29a3153c50>,
 <ppo_helper.SentenceOutputSingleEpisode at 0x7f29a3153d40>,
 <ppo_helper.SentenceOutputSingleEpisode at 0x7f2a8daa49e0>,
 <ppo_helper.SentenceOut

In [21]:
mse_loss = nn.MSELoss()

In [22]:
# changing it so that only the dopamine portion is affected
# not sure if this option makes the most sense *here*, but it makes the most sense in general.
# that is, other modes will learn the representation, and the dopamine is only responsible for the evaluation.
val_optimizer = optim.Adam(get_value.dopamine.parameters(), lr=0.0005, betas=(0.9, 0.98), eps=1e-9)
val_epochs = 16 #16 # old is 80, but that's only sampling a few per turn; we're gonna go through the whole buffer-buffer
# 16 was too much, let's go with 2 times through all 64 16-batch sets.

In [23]:
# this training loop is just for 'burning in' to have an initial value func that corresponds to the policy;
# the other one (later) will include clamping and is slightly more advanced
# this is 8 Gb VRAM, by the way (at batchSize 16). Not *too* surprising, but I will need the bigger system right away

In [24]:
import random
def train_val_func(val_optimizer, epochs, buffer_buffer):
    for epoch in range(epochs):
        print(f"==========Epoch {epoch}=====================")
        get_value.train()
        train_loss = 0
        random.shuffle(buffer_buffer)
        i = 0
        for buffer in buffer_buffer:
            i += 1
            buffer = buffer.cuda()
            val_optimizer.zero_grad()
            new_vals = buffer.get_values(evaluation = False) # call value func correctly, with gradients
            loss = mse_loss(new_vals, buffer.returns)
            loss.backward()
            val_optimizer.step()
            train_loss += loss.item()
            print(f"episode {i}, val func loss {loss.item()}\n")
            buffer = buffer.cpu()
        val_optimizer.zero_grad()
        print(f"Val func train loss in epoch {epoch}:{train_loss / (len(buffer_buffer))}")
#train_val_func(val_optimizer, val_epochs, buffer_buffer)

In [25]:
# let's recalculate gaes after burn-in, to get any progress on the policy network at all
def restore_coherence(buffer_buffer):
    for buffer in buffer_buffer:
        buffer=buffer.cuda()
        buffer.values = buffer.get_values() # retrained val func
        buffer.gaes = buffer.get_gaes()
        buffer = buffer.cpu()
    return buffer_buffer
#buffer_buffer = restore_coherence(buffer_buffer)

In [26]:
policy_optimizer = optim.Adam(brain.parameters(), lr=0.0003, betas=(0.9, 0.98), eps=1e-9)
policy_epochs = 4
epochs = policy_epochs

In [27]:
policy_clip_range = 0.1 #0.5

In [28]:
entropy_loss_weight = 5e-2 #0.01 I think this is too high for the policy I'm trying to produce

In [29]:
# batchsize 8 is the correct path. THis is like 7.5 gigs of VRAM
# just barely enough to work well.

In [30]:
def train_policy(policy_optimizer, epochs, buffer_buffer, policy_clip_range=0.1, entropy_loss_weight=0.01):
    for epoch in range(epochs):
        print(f"==========Epoch {epoch}=====================")
        brain.train()
        train_loss = 0
        random.shuffle(buffer_buffer)
        i = 0
        for buffer in buffer_buffer:
            i += 1
            buffer = buffer.cuda()
            #seeds = buffer.traces[:, :buffer.seed_offset]
            policy_optimizer.zero_grad()
            # hopefully no torch.no_grad's in that generation func, huh
            # NO, this is NOT the correct path. I need logpas that correspond to the traces, no others
            # traces, logpas, entropies = brain.generate(seeds)
            # THIS is the correct one. Can be accelerated by passing in masks, later.
            logpas, entropies = brain.compute_probabilities(buffer.traces, buffer.seed_offset, buffer.contexts)
            #print(logpas.size())
            #print(buffer.logpas.size())
            # Add constraints to kill logpas / entropis in past_terminated? Just to avoid confusion?
            ratios = (logpas - buffer.logpas).exp()
            pi_obj = buffer.gaes * ratios
            pi_obj_clipped = buffer.gaes * ratios.clamp(1.0 - policy_clip_range,
                                                       1.0 + policy_clip_range)
            policy_loss = -torch.min(pi_obj, pi_obj_clipped).mean()
            entropy_loss = -entropies.mean() * entropy_loss_weight
            
            loss = policy_loss + entropy_loss
    
            loss.backward()
            policy_optimizer.step()
            train_loss += loss.item()
            print(f"episode {i}, policy loss {loss.item()}\n")
            buffer = buffer.cpu()
        policy_optimizer.zero_grad()
        print(f"Policy train loss in epoch {epoch}:{train_loss / (len(buffer_buffer))}")
#train_policy(policy_optimizer, policy_epochs, buffer_buffer, policy_clip_range, entropy_loss_weight)

In [31]:
# find some way of clearing all the nonesense from the VRAM, I don't need the training artefacts to persist

In [32]:
import time
start = time.time()

In [33]:
time.time() - start

0.10008716583251953

In [34]:
def average_return(bb):
    """The average return (at the end of the seeds alone) from a buffer-buffer"""
    s = torch.zeros(bb[0].returns[:, 0].size(), device = bb[0].returns[:, 0].device)
    for b in bb:
        s += bb[0].returns[:, 0]
    return torch.sum(s).item()/(len(bb) * bb[0].returns.size()[0])

In [35]:
# no policy optimization on first round, only subsequent
def run_round(round_num, policy_optimizer, val_optimizer, num_buffers=64, policy_epochs=4, val_epochs=16, policy_clip_range=0.5, entropy_loss_weight=1e-3):
    # First, get some samples
    brain.eval()
    get_value.eval()
    buffer_buffer = get_bb(num_buffers) # run the inference side
    print(f"Return before training was {average_return(buffer_buffer)}")
    if round_num > 0:
        train_policy(policy_optimizer, policy_epochs, buffer_buffer, policy_clip_range, entropy_loss_weight)
    train_val_func(val_optimizer, val_epochs, buffer_buffer)

In [36]:
import time

num_buffers=16 # keep it simpler
num_rounds = 150 # give it more of a chance to learn policy, which can only change a little over each round.
for i in range(num_rounds):
    start = time.time()
    print(f"**********************ROUND {i} ***************************\n")
    run_round(i, policy_optimizer, val_optimizer, num_buffers, policy_epochs, val_epochs, policy_clip_range, entropy_loss_weight)
    elapsed = time.time() - start
    print(f"***********************TIME WAS {elapsed / 60} min*****************************\n")
    # I think the entropy was too low last time, let's see if this fixes the issue.
    if i > 40:
        entropy_loss_weight = max(entropy_loss_weight / 2, 1e-4)#1e-3)

**********************ROUND 0 ***************************

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
Return before training was 0.0
episode 1, val func loss 16.809741973876953

episode 2, val func loss 7713.18359375

episode 3, val func loss 381.5754089355469

episode 4, val func loss 673.3908081054688

episode 5, val func loss 98.31396484375

episode 6, val func loss 18.95507049560547

episode 7, val func loss 50.3583869934082

episode 8, val func loss 44.9396858215332

episode 9, val func loss 92.9799575805664

episode 10, val func loss 4.098732948303223

episode 11, val func loss 46.45364761352539

episode 12, val func loss 160.7159881591797

episode 13, val func loss 47.65835189819336

episode 14, val func loss 11.609933853149414

episode 15, val func loss 0.7637920379638672

episode 16, val func loss 13.92902946472168

Val func train loss in epoch 0:585.9835059046745
episode 1, val func loss 43.05809020996094

episode 2, val func loss 12.171830177307129

episode 3, val func loss 10.10

In [37]:
bb = get_bb()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63


In [38]:
b = bb[0]

In [39]:
b

<ppo_helper.SentenceOutputSingleEpisode at 0x7f29a38eb950>

In [40]:
b = b.cuda()

In [41]:
b.traces[0, b.seed_offset - 1:]

tensor([3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
       device='cuda:0')

In [42]:
b.traces[0, :b.seed_offset]

tensor([5, 6, 5, 5, 3, 6, 4, 4, 5, 3], device='cuda:0')

In [43]:
b.terminated[0, :b.seed_offset]

tensor([False, False, False, False, False, False, False, False, False, False],
       device='cuda:0')

In [44]:
# Right, this only includes the seed_offset and forward

In [45]:
b.rewards

tensor([[0., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
         5., 5., 5., 5., 5.],
        [0., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
         5., 5., 5., 5., 5.],
        [5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
         5., 5., 5., 5., 5.],
        [0., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
         5., 5., 5., 5., 5.],
        [5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
         5., 5., 5., 5., 5.],
        [0., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
         5., 5., 5., 5., 5.],
        [0., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
         5., 5., 5., 5., 5.],
        [0., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
         5., 5., 5., 5., 5.]], device='cuda:0')

In [46]:
b.returns

tensor([[0., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
         5., 5., 5., 5., 5.],
        [0., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
         5., 5., 5., 5., 5.],
        [5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
         5., 5., 5., 5., 5.],
        [0., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
         5., 5., 5., 5., 5.],
        [5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
         5., 5., 5., 5., 5.],
        [0., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
         5., 5., 5., 5., 5.],
        [0., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
         5., 5., 5., 5., 5.],
        [0., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
         5., 5., 5., 5., 5.]], device='cuda:0')

In [47]:
b.values

tensor([[2.4634, 4.3715, 4.5108, 5.0831, 4.1181, 4.8333, 4.9635, 3.9916, 4.2776,
         4.1128, 4.3017, 5.3786, 3.9250, 5.1134, 4.1784, 4.1687, 5.1195, 4.5503,
         5.3096, 4.7779, 5.1868, 4.6155, 4.4610],
        [2.1432, 4.2237, 4.3687, 4.2824, 4.4731, 4.2677, 4.8788, 4.7567, 4.2276,
         4.3233, 4.9661, 4.9824, 4.4496, 4.8898, 4.9468, 4.0156, 6.0770, 5.4565,
         4.9377, 4.0246, 5.2688, 5.3207, 4.9316],
        [2.7244, 3.8690, 4.6356, 4.1560, 4.4666, 4.2487, 4.4653, 4.2932, 4.2075,
         4.6175, 4.4669, 5.2189, 5.4922, 4.6494, 5.1375, 3.8699, 3.9895, 5.3308,
         4.7931, 4.4217, 4.6379, 5.6374, 5.4216],
        [2.7236, 4.2479, 4.6708, 4.0291, 4.5579, 4.2675, 4.5704, 4.5645, 4.6824,
         4.2967, 4.7550, 5.3980, 5.3288, 6.5061, 5.0135, 4.7007, 5.2294, 5.0667,
         5.0433, 5.0298, 4.7801, 4.7716, 5.0897],
        [2.9831, 4.6749, 4.4412, 4.3566, 3.6912, 4.2229, 4.0773, 4.4642, 4.6372,
         4.5166, 4.2040, 5.0360, 4.5983, 4.7178, 4.5089, 4.9249, 4.6426

In [48]:
b.traces[1, b.seed_offset - 1:]

tensor([6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
       device='cuda:0')

In [49]:
b.rewards[1]

tensor([0., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
        5., 5., 5., 5., 5.], device='cuda:0')

In [50]:
average_return(bb)

1.25

In [53]:
16*64*22*150

3379200