In [1]:
import sys
import torch  
import gym
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import pandas as pd
from torch.distributions.categorical import Categorical
import math
import os
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from preprocess import mean, std, preprocess_input_function
from settings import train_dir, test_dir, train_push_dir, train_batch_size, test_batch_size, train_push_batch_size
from settings import base_architecture, img_size, prototype_shape, num_classes, prototype_activation_function, add_on_layers_type
from receptive_field import compute_rf_prototype
import cv2
from preference_model import construct_PrefNet, paired_cross_entropy_loss, PrefNet
from tqdm import tqdm
from settings import joint_optimizer_lrs, joint_lr_step_size

In [2]:
def unravel_index(index, shape):
    out = []
    for dim in reversed(shape):
        out.append(index % dim)
        index = index // dim
    return reversed(out)

In [3]:
'''
Use PPnet's forward pass as the policy network (actor network); what about the network for value function (critic network)?
Since there are only determinant actions, this is essentially A2C...
'''
class A3C_PPnet(nn.Module):
    def __init__(self, PPnet, preference_model, k=3, p=5, learning_rate=3e-4, dummy_reward=False):
        super(A3C_PPnet, self).__init__()
        
        
        self.PPnet = PPnet.cuda()
        #for param in self.PPnet.features.parameters():
        #    param.requires_grad = True
        self.k = k
        self.pf_model = preference_model.cuda()
        
        self.PPnet_multi = torch.nn.DataParallel(self.PPnet)
        for p in self.PPnet_multi.module.features.parameters():
            p.requires_grad = True
        for p in self.PPnet_multi.module.add_on_layers.parameters():
            p.requires_grad = True
        self.PPnet_multi.module.prototype_vectors.requires_grad = True
        for p in self.PPnet_multi.module.last_layer.parameters():
            p.requires_grad = True
        #self.critic_model = self.construct_critic().cuda()
        self.p = p
        self.critic_model = Critic().cuda()
        policy_optimizer_specs = [{'params': self.PPnet.features.parameters(), 'lr': joint_optimizer_lrs['features'], 'weight_decay': 1e-3}, 
                                  {'params': self.PPnet.add_on_layers.parameters(), 'lr': joint_optimizer_lrs['add_on_layers'], 'weight_decay': 1e-3},
                                  {'params': self.PPnet.prototype_vectors, 'lr': joint_optimizer_lrs['prototype_vectors']},
                                  ]
        self.policy_optimizer = torch.optim.Adam(policy_optimizer_specs)
        #self.policy_optimizer = torch.optim.Adam(self.PPnet.features.parameters())
        self.critic_optimizer = torch.optim.Adam(self.critic_model.parameters())
    
        
    def get_heatmaps(self, batch_x, labels, dummy=False):
        self.PPnet_multi.eval()
        n_prototypes = self.PPnet_multi.module.num_prototypes
        prototype_shape = self.PPnet_multi.module.prototype_shape
        max_dist = prototype_shape[1] * prototype_shape[2] * prototype_shape[3]
        protoL_rf_info = self.PPnet_multi.module.proto_layer_rf_info
        
        batch_x = batch_x.cuda()
        protoL_input_torch, proto_dist_torch = self.PPnet_multi.module.push_forward(batch_x)
        #for param in self.PPnet_multi.parameters():
        #    if param.requires_grad == True:
        #        print(param.grad)
        #print(proto_dist_torch.grad)
        #proto_dist_ = np.copy(proto_dist_torch.detach().cpu().numpy())
        #print(proto_dist_torch.grad_fn)
        proto_dist_ = proto_dist_torch.view(proto_dist_torch.shape[0], proto_dist_torch.shape[1], -1)
        distances = torch.amin(proto_dist_, axis=-1)
        #distances = torch.tensor(distances)
        #print("Distances grad: ", distances.grad)
        actions = self.sample_from_distances(distances, labels)
        proto_dist = torch.clone(proto_dist_torch)
        # Move to cpu and cast to numpy here
        # proto_dist shape: (1000, 80, 7, 7)
        proto_dist = torch.transpose(proto_dist, 0, 1)
        proto_dist = proto_dist.detach().cpu().numpy()
        heatmaps = []
        joint_log_probs = []
        r = []
        for action in actions:
            img_idx, probs, j, class_identity = action[0], action[1], action[2], action[3]
            heatmaps_j = []
            r_j = []
            for i in img_idx:
                closest_patch_indices_in_distance_map_j = list(np.unravel_index(np.argmin(proto_dist[j][i],axis=None), proto_dist[j][i].shape))
                #print(closest_patch_indices_in_distance_map_j)
                closest_patch_indices_in_distance_map_j = [0] + closest_patch_indices_in_distance_map_j
                #print(closest_patch_indices_in_distance_map_j)
                closest_patch_indices_in_img = compute_rf_prototype(batch_x.size(2), closest_patch_indices_in_distance_map_j, protoL_rf_info)
                closest_patch = \
                    batch_x[i, :, closest_patch_indices_in_img[1]:closest_patch_indices_in_img[2], closest_patch_indices_in_img[3]:closest_patch_indices_in_img[4]]
                closest_patch = closest_patch.cpu().numpy()
                closest_patch = np.transpose(closest_patch, (1, 2, 0))

                original_img = batch_x[i].cpu().numpy()
                original_img = np.transpose(original_img, (1, 2, 0))
                if self.PPnet_multi.module.prototype_activation_function == 'log':
                    act_pattern = np.log((proto_dist[j][i] + 1)/(proto_dist[j][i] + self.PPnet_multi.module.epsilon))
                elif self.PPnet_multi.module.prototype_activation_function == 'linear':
                    act_pattern = max_dist - proto_dist[j][i]
                else:
                    act_pattern = prototype_activation_function_in_numpy(proto_dist[j][i])

                patch_indices = closest_patch_indices_in_img[1:5]
                
                img_size = original_img.shape[0]
                
                score = - (img_size//2 - (patch_indices[0]+patch_indices[1])//2)**2 - (img_size//2 - (patch_indices[2]+patch_indices[3])//2)**2
                
                upsampled_act_pattern = cv2.resize(act_pattern, dsize=(img_size, img_size), interpolation=cv2.INTER_CUBIC)
                rescaled_act_pattern = upsampled_act_pattern - np.amin(upsampled_act_pattern)
                rescaled_act_pattern = rescaled_act_pattern / np.amax(rescaled_act_pattern)
                heatmap = cv2.applyColorMap(np.uint8(255*rescaled_act_pattern), cv2.COLORMAP_JET)
                heatmap = np.float32(heatmap) / 255
                heatmap = heatmap[..., ::-1]
                overlayed_original_img = 0.5 * original_img + 1.0 * heatmap
                overlayed_original_img = overlayed_original_img - np.amin(overlayed_original_img)
                overlayed_original_img = overlayed_original_img / np.amax(overlayed_original_img)
                if dummy:
                    heatmaps_j.append(overlayed_original_img)
                    r_j.append(score)
                else:
                    heatmaps_j.append(overlayed_original_img)
            joint_log_prob = torch.prod(probs) * math.factorial(self.k)
            #print(joint_log_prob.grad_fn)
            heatmaps.append(heatmaps_j)
            if dummy:
                r.append(r_j)
            joint_log_probs.append(joint_log_prob)
                
        # num_prototypes * self.k heatmaps in total
        # num_prototypes probs
        #for prob in joint_log_probs:
        #    print(prob.grad_fn)
        if dummy:
            r = np.sum(np.array(r), axis=1)
            r = torch.tensor(r)
            return heatmaps, joint_log_probs, distances, r
        return heatmaps, joint_log_probs, distances
    
    def sample_from_distances(self, distances, labels):
        '''
        Takes in distances of shape (80, 1000)
        returns actions of shape (1000, ), one for each prototype
        '''
        distances = torch.clip(distances, min=1e-7, max=None)
        similarities = 1 / distances
        softmax_dist = F.softmax(similarities, dim=0)
        softmax_dist = torch.transpose(softmax_dist, 0, 1)
        # Maybe using combinatorics?
        actions = []
        for i in range(softmax_dist.shape[0]):
            class_identity = torch.argmax(self.PPnet_multi.module.prototype_class_identity[i])
            #print("Class identity: ", class_identity)
            class_dist = softmax_dist[i][labels==class_identity]
            #print("Class dist shape:", class_dist.shape)
            if len(class_dist) > self.k:
                dist = Categorical(class_dist)
                img_idx = dist.sample(sample_shape=torch.tensor([self.k]))
                probs = dist.log_prob(img_idx)
                probs = torch.exp(probs)
                actions.append([img_idx, probs, i, class_identity])
        return actions
    
    def construct_critic(self):
        critic_model = nn.Sequential(
                        nn.Linear(512 * self.k * 7 * 7, 120),
                        nn.Sigmoid(),
                        nn.Linear(120, 20),
                        nn.Sigmoid(),
                        nn.Linear(20, 1)
                        )
        return critic_model
    
    # Currently just the same architecture as the pref_net
    '''
    def critic(self, heatmaps):
        values = torch.empty(len(heatmaps))
        for i in tqdm(range(len(heatmaps))):
            #x = torch.tensor(heatmaps[i])
            x = np.concatenate(heatmaps[i], axis=1)
            x = torch.tensor(x).cuda()
            x = torch.unsqueeze(x, axis=0)
            x = torch.transpose(x, 1, 3)
            with torch.no_grad():
                x = self.pf_model.conv_features(x)
                x = torch.flatten(x, 1) # flatten all dimensions except batch
            x = self.critic_model(x)
            values[i] = x
            #print(i)
        return values
    '''
    
    # Need to vectorize
    def get_critic_inputs(self, heatmaps, dummy=False):
        critic_inputs = []
        for i in range(len(heatmaps)):
            x = np.concatenate(heatmaps[i], axis=1)
            x = torch.tensor(x).cuda()
            x = torch.unsqueeze(x, axis=0)
            x = torch.transpose(x, 1, 3)
            with torch.no_grad():
                x = self.pf_model.conv_features(x)
                x = torch.flatten(x, 1) # flatten all dimensions except batch
            critic_inputs.append(x)
        critic_inputs = torch.stack(critic_inputs, dim=0)
        critic_inputs = critic_inputs.view(critic_inputs.shape[0], -1)
        #print(critic_inputs.shape)
        return critic_inputs
        
    def get_rewards(self, heatmaps, dummy=False):
        if dummy:
            h = heatmaps
            rewards = np.empty(len(h))
            for i in range(len(h)):
                score = -np.sum(np.square(h[i][0]-h[i][1])) - np.sum(np.square(h[i][1]-h[i][2])) - np.sum(np.square(h[i][0]-h[i][2]))
            rewards[i] = score
            return torch.tensor(rewards)
        with torch.no_grad():
            rewards = torch.empty(len(heatmaps))
            for i in range(len(heatmaps)):
                pf_input = torch.tensor(heatmaps[i]).cuda()
                pf_input = pf_input.view(pf_input.shape[0]*pf_input.shape[1], pf_input.shape[2], pf_input.shape[3])
                pf_input = torch.transpose(pf_input, 0, 2)
                pf_input = torch.transpose(pf_input, 1, 2)
                pf_input = torch.unsqueeze(pf_input, axis=0)
                reward = self.pf_model(pf_input)
                rewards[i] = reward
                #print(i)
        return rewards
        
    def update_v1(self, rewards, values, probs):
        self.policy_optimizer.zero_grad()
        for prob in probs:
            prob = prob.cuda()
        rewards = rewards.cuda()
        values = values.cuda()
        #print(list(self.PPnet_multi.module.features.parameters())[0])
        policy_loss = 0
        for i in range(len(rewards)):
            policy_loss -= probs[i] * (rewards[i] - values[i])    
        policy_loss.backward(retain_graph=True)
        self.policy_optimizer.step()
        #print(list(self.PPnet_multi.module.features.parameters())[0])
        self.critic_optimizer.zero_grad()
        #print(list(self.critic_model.parameters())[0])
        critic_loss = 0
        for i in range(len(rewards)):
            critic_loss += (rewards[i] - values[i]) ** 2
        #print(critic_loss.grad_fn)
        critic_loss.backward()
        self.critic_optimizer.step()
        #print(list(self.critic_model.parameters())[0])
        
        return 
    
    def update_v2(self, rewards, values, probs):
        
        return
    
    
    def run(self, batch_x, labels):

        # action is n_prototypes * k heatmaps
        heatmaps, probs, img_distances = self.get_heatmaps(batch_x, labels)
        #print("Finished calculating heatmaps")
        critic_inputs = self.get_critic_inputs(heatmaps)
        #print("Finished getting critic inputs")
        values = self.critic_model(critic_inputs)
        #print("Finished calculating values")
        rewards = self.get_rewards(heatmaps)
        #print("Finished calculating rewards")
        self.update_v1(rewards, values, probs)
        #print("Finished updating. Done.")
        
        return rewards, values, probs
    
    def run_dummy(self, batch_x):
        heatmaps, probs, img_distances, rewards  = self.get_heatmaps(batch_x, dummy=True)
        #print("Finished calculating heatmaps")
        critic_inputs = self.get_critic_inputs(heatmaps)
        #print("Finished getting critic inputs")
        values = self.critic_model(critic_inputs)
        #print("Finished calculating values")
        #rewards = self.get_rewards(h, dummy=True)
        #print("Finished calculating rewards")
        self.update_v1(rewards, values, probs)
        
        return rewards, values, probs, heatmaps

In [4]:
class Critic(nn.Module):
    def __init__(self, k=3, learning_rate=3e-4):
        super(Critic, self).__init__()
        
        '''
        self.critic_linear1 = nn.Linear(num_inputs, hidden_size)
        self.critic_linear2 = nn.Linear(hidden_size, 1)

        self.actor_linear1 = nn.Linear(num_inputs, hidden_size)
        self.actor_linear2 = nn.Linear(hidden_size, num_actions)
        '''
        
        self.k = k
        self.fc1 = nn.Linear(512 * k * 7 * 7, 120)
        self.fc2 = nn.Linear(120, 20)
        self.fc3 = nn.Linear(20, 1)
        
    
    def forward(self, x):

        out = torch.sigmoid(self.fc1(x))
        out = torch.sigmoid(self.fc2(out))
        out = self.fc3(out)

        return out

In [5]:
ppnet = torch.load(r'../saved_models/vgg19/004/100_7push0.7344.pth')

pf_model = construct_PrefNet("resnet18")
pf_model.load_state_dict(torch.load("./human_comparisons/pref_model_009_65+35_ep50_adam_0.0001"))
#pf_model = torch.load(r'./human_comparisons/pref_model_009_65+35_ep50_adam_0.0001_1')

<All keys matched successfully>

In [6]:
normalize = transforms.Normalize(mean=mean, std=std)

train_dataset = datasets.ImageFolder(
        train_push_dir,
        transforms.Compose([
        transforms.Resize(size=(img_size, img_size)),
        transforms.ToTensor(),
        normalize,
    ]))
dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=100, shuffle=False,
    num_workers=4, pin_memory=False)

In [13]:
a3c = A3C_PPnet(ppnet, pf_model)

In [14]:
prob_records = np.empty((50, 1000))
reward_records = np.empty((50, 1000))
data_iter = iter(dataloader)
for _ in range(20):
    batch, labels = next(data_iter)
for i in tqdm(range(50)):
    rewards, values, probs = a3c.run(batch, labels)
    print(len(probs))
    total_reward = 0
    mse_loss = 0
    for j in range(len(probs)):
        #probs[j] = probs[j].detach().cpu().numpy()
        #rewards[j] = rewards[j].detach().cpu().numpy()
        total_reward += probs[j] * rewards[j]
        mse_loss += (rewards[j] - values[j]) ** 2
        prob_records[i][j] = probs[j].detach().cpu().numpy()
        reward_records[i][j] = rewards[j].detach().cpu().numpy()
    print("Iteration "+str(i)+" total expected reward: ", total_reward)
    print("Iteration "+str(i)+" total mse loss: ", mse_loss)

  0%|          | 0/50 [00:00<?, ?it/s]

20


  2%|▏         | 1/50 [00:05<04:35,  5.62s/it]

Iteration 0 total expected reward:  tensor(-0.0125, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 0 total mse loss:  tensor([75.8457], device='cuda:0', grad_fn=<AddBackward0>)
20


  4%|▍         | 2/50 [00:09<03:46,  4.73s/it]

Iteration 1 total expected reward:  tensor(-0.0076, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 1 total mse loss:  tensor([79.1165], device='cuda:0', grad_fn=<AddBackward0>)
20


  6%|▌         | 3/50 [00:13<03:29,  4.46s/it]

Iteration 2 total expected reward:  tensor(-0.0026, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 2 total mse loss:  tensor([88.0002], device='cuda:0', grad_fn=<AddBackward0>)
20


  8%|▊         | 4/50 [00:17<03:18,  4.32s/it]

Iteration 3 total expected reward:  tensor(0.0010, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 3 total mse loss:  tensor([91.5827], device='cuda:0', grad_fn=<AddBackward0>)
20


 10%|█         | 5/50 [00:22<03:11,  4.26s/it]

Iteration 4 total expected reward:  tensor(0.0068, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 4 total mse loss:  tensor([119.3363], device='cuda:0', grad_fn=<AddBackward0>)
20


 12%|█▏        | 6/50 [00:26<03:04,  4.20s/it]

Iteration 5 total expected reward:  tensor(0.0154, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 5 total mse loss:  tensor([104.2474], device='cuda:0', grad_fn=<AddBackward0>)
20


 14%|█▍        | 7/50 [00:30<03:00,  4.19s/it]

Iteration 6 total expected reward:  tensor(0.0009, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 6 total mse loss:  tensor([99.6993], device='cuda:0', grad_fn=<AddBackward0>)
20


 16%|█▌        | 8/50 [00:34<02:54,  4.17s/it]

Iteration 7 total expected reward:  tensor(0.0065, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 7 total mse loss:  tensor([111.9434], device='cuda:0', grad_fn=<AddBackward0>)
20


 18%|█▊        | 9/50 [00:38<02:50,  4.16s/it]

Iteration 8 total expected reward:  tensor(-0.0006, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 8 total mse loss:  tensor([96.2179], device='cuda:0', grad_fn=<AddBackward0>)
20


 20%|██        | 10/50 [00:42<02:45,  4.14s/it]

Iteration 9 total expected reward:  tensor(0.0086, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 9 total mse loss:  tensor([89.1398], device='cuda:0', grad_fn=<AddBackward0>)
20


 22%|██▏       | 11/50 [00:46<02:41,  4.13s/it]

Iteration 10 total expected reward:  tensor(0.0088, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 10 total mse loss:  tensor([107.7140], device='cuda:0', grad_fn=<AddBackward0>)
20


 24%|██▍       | 12/50 [00:50<02:36,  4.13s/it]

Iteration 11 total expected reward:  tensor(-0.0003, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 11 total mse loss:  tensor([109.7402], device='cuda:0', grad_fn=<AddBackward0>)
20


 26%|██▌       | 13/50 [00:55<02:32,  4.13s/it]

Iteration 12 total expected reward:  tensor(-0.0075, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 12 total mse loss:  tensor([73.0191], device='cuda:0', grad_fn=<AddBackward0>)
20


 28%|██▊       | 14/50 [00:59<02:28,  4.13s/it]

Iteration 13 total expected reward:  tensor(-0.0185, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 13 total mse loss:  tensor([118.2613], device='cuda:0', grad_fn=<AddBackward0>)
20


 30%|███       | 15/50 [01:03<02:23,  4.11s/it]

Iteration 14 total expected reward:  tensor(-0.0155, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 14 total mse loss:  tensor([122.8279], device='cuda:0', grad_fn=<AddBackward0>)
20


 32%|███▏      | 16/50 [01:07<02:20,  4.12s/it]

Iteration 15 total expected reward:  tensor(-0.0236, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 15 total mse loss:  tensor([126.0282], device='cuda:0', grad_fn=<AddBackward0>)
20


 34%|███▍      | 17/50 [01:11<02:15,  4.12s/it]

Iteration 16 total expected reward:  tensor(-0.0189, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 16 total mse loss:  tensor([119.4154], device='cuda:0', grad_fn=<AddBackward0>)
20


 36%|███▌      | 18/50 [01:15<02:11,  4.11s/it]

Iteration 17 total expected reward:  tensor(-0.0092, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 17 total mse loss:  tensor([81.3436], device='cuda:0', grad_fn=<AddBackward0>)
20


 38%|███▊      | 19/50 [01:19<02:07,  4.12s/it]

Iteration 18 total expected reward:  tensor(-0.0219, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 18 total mse loss:  tensor([104.0648], device='cuda:0', grad_fn=<AddBackward0>)
20


 40%|████      | 20/50 [01:23<02:03,  4.11s/it]

Iteration 19 total expected reward:  tensor(-0.0181, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 19 total mse loss:  tensor([103.4849], device='cuda:0', grad_fn=<AddBackward0>)
20


 42%|████▏     | 21/50 [01:28<01:59,  4.13s/it]

Iteration 20 total expected reward:  tensor(-0.0174, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 20 total mse loss:  tensor([104.6459], device='cuda:0', grad_fn=<AddBackward0>)
20


 44%|████▍     | 22/50 [01:32<01:55,  4.12s/it]

Iteration 21 total expected reward:  tensor(-0.0113, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 21 total mse loss:  tensor([70.6046], device='cuda:0', grad_fn=<AddBackward0>)
20


 46%|████▌     | 23/50 [01:36<01:51,  4.12s/it]

Iteration 22 total expected reward:  tensor(-0.0156, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 22 total mse loss:  tensor([82.9409], device='cuda:0', grad_fn=<AddBackward0>)
20


 48%|████▊     | 24/50 [01:40<01:47,  4.12s/it]

Iteration 23 total expected reward:  tensor(-0.0131, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 23 total mse loss:  tensor([93.1980], device='cuda:0', grad_fn=<AddBackward0>)
20


 50%|█████     | 25/50 [01:44<01:42,  4.12s/it]

Iteration 24 total expected reward:  tensor(-0.0118, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 24 total mse loss:  tensor([102.5907], device='cuda:0', grad_fn=<AddBackward0>)
20


 52%|█████▏    | 26/50 [01:48<01:38,  4.11s/it]

Iteration 25 total expected reward:  tensor(-0.0106, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 25 total mse loss:  tensor([90.6459], device='cuda:0', grad_fn=<AddBackward0>)
20


 54%|█████▍    | 27/50 [01:52<01:34,  4.12s/it]

Iteration 26 total expected reward:  tensor(-0.0185, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 26 total mse loss:  tensor([69.3362], device='cuda:0', grad_fn=<AddBackward0>)
20


 56%|█████▌    | 28/50 [01:56<01:30,  4.12s/it]

Iteration 27 total expected reward:  tensor(-0.0082, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 27 total mse loss:  tensor([73.6583], device='cuda:0', grad_fn=<AddBackward0>)
20


 58%|█████▊    | 29/50 [02:00<01:26,  4.11s/it]

Iteration 28 total expected reward:  tensor(-0.0190, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 28 total mse loss:  tensor([76.0025], device='cuda:0', grad_fn=<AddBackward0>)
20


 60%|██████    | 30/50 [02:05<01:22,  4.13s/it]

Iteration 29 total expected reward:  tensor(-0.0101, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 29 total mse loss:  tensor([67.5377], device='cuda:0', grad_fn=<AddBackward0>)
20


 62%|██████▏   | 31/50 [02:09<01:18,  4.11s/it]

Iteration 30 total expected reward:  tensor(-0.0035, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 30 total mse loss:  tensor([72.0587], device='cuda:0', grad_fn=<AddBackward0>)
20


 64%|██████▍   | 32/50 [02:13<01:14,  4.13s/it]

Iteration 31 total expected reward:  tensor(-0.0173, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 31 total mse loss:  tensor([83.1363], device='cuda:0', grad_fn=<AddBackward0>)
20


 66%|██████▌   | 33/50 [02:17<01:10,  4.12s/it]

Iteration 32 total expected reward:  tensor(-0.0134, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 32 total mse loss:  tensor([94.9140], device='cuda:0', grad_fn=<AddBackward0>)
20


 68%|██████▊   | 34/50 [02:21<01:06,  4.13s/it]

Iteration 33 total expected reward:  tensor(-0.0112, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 33 total mse loss:  tensor([83.2455], device='cuda:0', grad_fn=<AddBackward0>)
20


 70%|███████   | 35/50 [02:25<01:01,  4.12s/it]

Iteration 34 total expected reward:  tensor(-0.0201, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 34 total mse loss:  tensor([76.8968], device='cuda:0', grad_fn=<AddBackward0>)
20


 72%|███████▏  | 36/50 [02:29<00:57,  4.11s/it]

Iteration 35 total expected reward:  tensor(-0.0110, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 35 total mse loss:  tensor([93.7090], device='cuda:0', grad_fn=<AddBackward0>)
20


 74%|███████▍  | 37/50 [02:33<00:53,  4.11s/it]

Iteration 36 total expected reward:  tensor(-0.0187, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 36 total mse loss:  tensor([78.2053], device='cuda:0', grad_fn=<AddBackward0>)
20


 76%|███████▌  | 38/50 [02:37<00:49,  4.11s/it]

Iteration 37 total expected reward:  tensor(-0.0178, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 37 total mse loss:  tensor([79.1336], device='cuda:0', grad_fn=<AddBackward0>)
20


 78%|███████▊  | 39/50 [02:42<00:45,  4.11s/it]

Iteration 38 total expected reward:  tensor(-0.0193, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 38 total mse loss:  tensor([60.4648], device='cuda:0', grad_fn=<AddBackward0>)
20


 80%|████████  | 40/50 [02:46<00:41,  4.11s/it]

Iteration 39 total expected reward:  tensor(-0.0190, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 39 total mse loss:  tensor([75.9254], device='cuda:0', grad_fn=<AddBackward0>)
20


 82%|████████▏ | 41/50 [02:50<00:36,  4.11s/it]

Iteration 40 total expected reward:  tensor(-0.0234, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 40 total mse loss:  tensor([70.3113], device='cuda:0', grad_fn=<AddBackward0>)
20


 84%|████████▍ | 42/50 [02:54<00:32,  4.11s/it]

Iteration 41 total expected reward:  tensor(-0.0197, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 41 total mse loss:  tensor([62.8112], device='cuda:0', grad_fn=<AddBackward0>)
20


 86%|████████▌ | 43/50 [02:58<00:28,  4.11s/it]

Iteration 42 total expected reward:  tensor(-0.0245, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 42 total mse loss:  tensor([57.8034], device='cuda:0', grad_fn=<AddBackward0>)
20


 88%|████████▊ | 44/50 [03:02<00:24,  4.13s/it]

Iteration 43 total expected reward:  tensor(-0.0210, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 43 total mse loss:  tensor([55.8984], device='cuda:0', grad_fn=<AddBackward0>)
20


 90%|█████████ | 45/50 [03:06<00:20,  4.11s/it]

Iteration 44 total expected reward:  tensor(-0.0215, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 44 total mse loss:  tensor([62.6374], device='cuda:0', grad_fn=<AddBackward0>)
20


 92%|█████████▏| 46/50 [03:10<00:16,  4.10s/it]

Iteration 45 total expected reward:  tensor(-0.0214, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 45 total mse loss:  tensor([61.2502], device='cuda:0', grad_fn=<AddBackward0>)
20


 94%|█████████▍| 47/50 [03:14<00:12,  4.09s/it]

Iteration 46 total expected reward:  tensor(-0.0162, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 46 total mse loss:  tensor([61.1780], device='cuda:0', grad_fn=<AddBackward0>)
20


 96%|█████████▌| 48/50 [03:19<00:08,  4.10s/it]

Iteration 47 total expected reward:  tensor(-0.0188, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 47 total mse loss:  tensor([51.5845], device='cuda:0', grad_fn=<AddBackward0>)
20


 98%|█████████▊| 49/50 [03:23<00:04,  4.13s/it]

Iteration 48 total expected reward:  tensor(-0.0181, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 48 total mse loss:  tensor([50.4070], device='cuda:0', grad_fn=<AddBackward0>)
20


100%|██████████| 50/50 [03:27<00:00,  4.15s/it]

Iteration 49 total expected reward:  tensor(-0.0204, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 49 total mse loss:  tensor([49.9347], device='cuda:0', grad_fn=<AddBackward0>)





In [12]:
np.save(r'./A3C_results/prob_records_iter10_batch20.npy', prob_records)
np.save(r'./A3C_results/reward_records_iter10_batch20.npy', reward_records)

In [39]:
prob_records_sorted = np.sort(prob_records, axis=1)
sorted_prob_index = np.argsort(prob_records, axis=1)
print(sorted_prob_index.shape)

(10, 1000)


In [42]:
prob_records[0][sorted_prob_index[0][-100:]]

array([3.1546278 , 3.33953476, 3.35639477, 3.66883135, 3.70347977,
       3.7588191 , 3.83299041, 3.83299041, 3.88497162, 3.9135766 ,
       3.95696306, 4.15255356, 4.31025791, 4.33145142, 4.36347342,
       4.46746016, 4.61634111, 4.68425179, 4.71241188, 4.9184804 ,
       4.94390678, 4.95119905, 5.11668777, 5.30823374, 5.35067034,
       5.37403584, 5.37604761, 5.42540026, 5.46156883, 5.51010084,
       5.57676554, 5.58917475, 5.62009096, 5.63833141, 5.65631962,
       5.73786974, 5.74734545, 5.75679827, 5.7850008 , 5.84051895,
       5.84336281, 5.86585045, 5.86763573, 5.88327503, 5.89575529,
       5.9021287 , 5.92162323, 5.92976141, 5.93262672, 5.94436979,
       5.95309639, 5.96435022, 5.96940517, 5.98616838, 5.98733902,
       5.99058199, 5.99097824, 5.9921999 , 5.99560738, 5.99593163,
       5.99617386, 5.99622107, 5.9969244 , 5.99782753, 5.99933243,
       5.99948931, 5.99974155, 5.99978209, 5.99981308, 5.99985504,
       5.99987555, 5.99994183, 5.99994183, 5.9999485 , 5.99995

In [48]:
reward_records[0][sorted_prob_index[0][-100:]]

array([ 2.95592165, -2.82480574, -2.66942024, -2.87655592,  2.65273333,
        2.95444059, -2.83822775, -2.83822775,  2.95611429, -2.87763977,
       -2.55860901, -2.87759066,  1.05071831,  1.99227178, -2.63136625,
        2.9560895 ,  2.95625019,  1.81558704, -2.87426138,  2.95626116,
       -2.87668371,  2.95592642,  0.56000394,  2.95593262,  2.82985115,
       -2.87542105,  2.95610237, -2.7824192 ,  2.95625544,  2.95538521,
       -0.88582468,  1.78973567,  2.84648705, -0.49043298,  1.76294279,
        2.82052374, -2.86940479,  0.74559736,  2.929389  , -2.56735229,
        2.8025589 ,  2.95449209,  2.95450878,  1.80669439,  2.95627499,
        2.39929771, -2.87768412,  2.95241928,  1.55573928, -2.63089275,
        2.95429802, -2.86952496,  2.16030598,  2.95075798,  0.91464406,
        2.95511246,  2.95502377, -2.40150547,  2.77435732,  1.88581896,
        2.95567465,  2.95625257,  1.82907784, -2.84644365,  2.37964201,
        2.9548502 ,  2.93420267,  2.95518255,  1.77334046,  2.95

In [53]:
labels

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,
        3, 3, 3, 3])

In [7]:
a3c_dummy_reward = A3C_PPnet(ppnet, pf_model)

In [8]:

prob_records_dummy = np.empty((100, 1000))
reward_records_dummy = np.empty((100, 1000))
h_records = []
data_iter = iter(dataloader)
for _ in range(20):
    batch, label = next(data_iter)
for i in tqdm(range(100)):
    rewards, values, probs, h = a3c_dummy_reward.run_dummy(batch)
    total_reward = 0
    mse_loss = 0
    
    h_records.append(h)
    for j in range(len(probs)):
        #probs[j] = probs[j].detach().cpu().numpy()
        #rewards[j] = rewards[j].detach().cpu().numpy()
        total_reward += probs[j] * rewards[j]
        mse_loss += (rewards[j] - values[j]) ** 2
        prob_records_dummy[i][j] = probs[j].detach().cpu().numpy()
        reward_records_dummy[i][j] = rewards[j].detach().cpu().numpy()
    print("Iteration "+str(i)+" total expected reward: ", total_reward)
    print("Iteration "+str(i)+" total mse loss: ", mse_loss)


  1%|          | 1/100 [00:26<42:57, 26.04s/it]

Iteration 0 total expected reward:  tensor(-1941382.1250, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 0 total mse loss:  tensor([1.0895e+10], device='cuda:0', grad_fn=<AddBackward0>)


  2%|▏         | 2/100 [00:50<41:12, 25.23s/it]

Iteration 1 total expected reward:  tensor(-571138.7500, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 1 total mse loss:  tensor([1.0007e+10], device='cuda:0', grad_fn=<AddBackward0>)


  3%|▎         | 3/100 [01:15<40:37, 25.12s/it]

Iteration 2 total expected reward:  tensor(-71382.9141, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 2 total mse loss:  tensor([7.6649e+09], device='cuda:0', grad_fn=<AddBackward0>)


  4%|▍         | 4/100 [01:40<39:57, 24.98s/it]

Iteration 3 total expected reward:  tensor(-13.2413, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 3 total mse loss:  tensor([5.0904e+09], device='cuda:0', grad_fn=<AddBackward0>)


  5%|▌         | 5/100 [02:05<39:26, 24.91s/it]

Iteration 4 total expected reward:  tensor(-16.3895, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 4 total mse loss:  tensor([9.8136e+09], device='cuda:0', grad_fn=<AddBackward0>)


  6%|▌         | 6/100 [02:29<38:54, 24.84s/it]

Iteration 5 total expected reward:  tensor(-27.8129, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 5 total mse loss:  tensor([2.4086e+10], device='cuda:0', grad_fn=<AddBackward0>)


  7%|▋         | 7/100 [02:54<38:26, 24.80s/it]

Iteration 6 total expected reward:  tensor(-38.8384, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 6 total mse loss:  tensor([4.3673e+10], device='cuda:0', grad_fn=<AddBackward0>)


  8%|▊         | 8/100 [03:19<37:58, 24.76s/it]

Iteration 7 total expected reward:  tensor(-43.6491, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 7 total mse loss:  tensor([5.3941e+10], device='cuda:0', grad_fn=<AddBackward0>)


  9%|▉         | 9/100 [03:43<37:25, 24.67s/it]

Iteration 8 total expected reward:  tensor(-45.7838, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 8 total mse loss:  tensor([5.8970e+10], device='cuda:0', grad_fn=<AddBackward0>)


 10%|█         | 10/100 [04:08<36:56, 24.63s/it]

Iteration 9 total expected reward:  tensor(-46.1637, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 9 total mse loss:  tensor([5.9781e+10], device='cuda:0', grad_fn=<AddBackward0>)


 11%|█         | 11/100 [04:33<36:34, 24.66s/it]

Iteration 10 total expected reward:  tensor(-46.2589, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 10 total mse loss:  tensor([6.0013e+10], device='cuda:0', grad_fn=<AddBackward0>)


 12%|█▏        | 12/100 [04:57<36:04, 24.60s/it]

Iteration 11 total expected reward:  tensor(-45.1847, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 11 total mse loss:  tensor([5.7369e+10], device='cuda:0', grad_fn=<AddBackward0>)


 13%|█▎        | 13/100 [05:22<35:40, 24.60s/it]

Iteration 12 total expected reward:  tensor(-43.5257, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 12 total mse loss:  tensor([5.3708e+10], device='cuda:0', grad_fn=<AddBackward0>)


 14%|█▍        | 14/100 [05:47<35:22, 24.68s/it]

Iteration 13 total expected reward:  tensor(-37.8697, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 13 total mse loss:  tensor([4.1709e+10], device='cuda:0', grad_fn=<AddBackward0>)


 15%|█▌        | 15/100 [06:11<35:01, 24.72s/it]

Iteration 14 total expected reward:  tensor(-30.4738, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 14 total mse loss:  tensor([2.7813e+10], device='cuda:0', grad_fn=<AddBackward0>)


 16%|█▌        | 16/100 [06:36<34:40, 24.77s/it]

Iteration 15 total expected reward:  tensor(-28.3569, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 15 total mse loss:  tensor([2.4439e+10], device='cuda:0', grad_fn=<AddBackward0>)


 17%|█▋        | 17/100 [07:01<34:17, 24.78s/it]

Iteration 16 total expected reward:  tensor(-24.2169, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 16 total mse loss:  tensor([1.8222e+10], device='cuda:0', grad_fn=<AddBackward0>)


 18%|█▊        | 18/100 [07:26<34:02, 24.91s/it]

Iteration 17 total expected reward:  tensor(-21.4249, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 17 total mse loss:  tensor([1.4833e+10], device='cuda:0', grad_fn=<AddBackward0>)


 19%|█▉        | 19/100 [07:52<34:08, 25.29s/it]

Iteration 18 total expected reward:  tensor(-19.6913, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 18 total mse loss:  tensor([1.2943e+10], device='cuda:0', grad_fn=<AddBackward0>)


 20%|██        | 20/100 [08:18<34:00, 25.51s/it]

Iteration 19 total expected reward:  tensor(-19.0698, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 19 total mse loss:  tensor([1.2612e+10], device='cuda:0', grad_fn=<AddBackward0>)


 21%|██        | 21/100 [08:44<33:39, 25.56s/it]

Iteration 20 total expected reward:  tensor(-15.7033, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 20 total mse loss:  tensor([8.5897e+09], device='cuda:0', grad_fn=<AddBackward0>)


 22%|██▏       | 22/100 [09:11<33:37, 25.86s/it]

Iteration 21 total expected reward:  tensor(-14.6041, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 21 total mse loss:  tensor([7.8885e+09], device='cuda:0', grad_fn=<AddBackward0>)


 23%|██▎       | 23/100 [09:37<33:13, 25.90s/it]

Iteration 22 total expected reward:  tensor(-12.5540, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 22 total mse loss:  tensor([6.0519e+09], device='cuda:0', grad_fn=<AddBackward0>)


 24%|██▍       | 24/100 [10:03<32:48, 25.90s/it]

Iteration 23 total expected reward:  tensor(-10.4966, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 23 total mse loss:  tensor([4.4342e+09], device='cuda:0', grad_fn=<AddBackward0>)


 25%|██▌       | 25/100 [10:28<32:19, 25.86s/it]

Iteration 24 total expected reward:  tensor(-8.0991, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 24 total mse loss:  tensor([2.7540e+09], device='cuda:0', grad_fn=<AddBackward0>)


 26%|██▌       | 26/100 [10:54<31:59, 25.94s/it]

Iteration 25 total expected reward:  tensor(-6.7717, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 25 total mse loss:  tensor([2.2930e+09], device='cuda:0', grad_fn=<AddBackward0>)


 27%|██▋       | 27/100 [11:21<31:37, 25.99s/it]

Iteration 26 total expected reward:  tensor(-6.7465, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 26 total mse loss:  tensor([2.2245e+09], device='cuda:0', grad_fn=<AddBackward0>)


 28%|██▊       | 28/100 [11:46<31:07, 25.93s/it]

Iteration 27 total expected reward:  tensor(-5.8282, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 27 total mse loss:  tensor([1.8189e+09], device='cuda:0', grad_fn=<AddBackward0>)


 29%|██▉       | 29/100 [12:12<30:35, 25.85s/it]

Iteration 28 total expected reward:  tensor(-5.5300, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 28 total mse loss:  tensor([1.5659e+09], device='cuda:0', grad_fn=<AddBackward0>)


 30%|███       | 30/100 [12:38<30:13, 25.90s/it]

Iteration 29 total expected reward:  tensor(-5.7877, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 29 total mse loss:  tensor([1.9903e+09], device='cuda:0', grad_fn=<AddBackward0>)


 31%|███       | 31/100 [13:04<29:47, 25.90s/it]

Iteration 30 total expected reward:  tensor(-5.3885, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 30 total mse loss:  tensor([1.5990e+09], device='cuda:0', grad_fn=<AddBackward0>)


 32%|███▏      | 32/100 [13:30<29:24, 25.94s/it]

Iteration 31 total expected reward:  tensor(-4.9428, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 31 total mse loss:  tensor([1.3764e+09], device='cuda:0', grad_fn=<AddBackward0>)


 33%|███▎      | 33/100 [13:56<28:57, 25.94s/it]

Iteration 32 total expected reward:  tensor(-4.5908, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 32 total mse loss:  tensor([1.2775e+09], device='cuda:0', grad_fn=<AddBackward0>)


 34%|███▍      | 34/100 [14:22<28:31, 25.94s/it]

Iteration 33 total expected reward:  tensor(-4.5497, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 33 total mse loss:  tensor([1.3121e+09], device='cuda:0', grad_fn=<AddBackward0>)


 35%|███▌      | 35/100 [14:48<28:10, 26.01s/it]

Iteration 34 total expected reward:  tensor(-4.8396, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 34 total mse loss:  tensor([1.6628e+09], device='cuda:0', grad_fn=<AddBackward0>)


 36%|███▌      | 36/100 [15:14<27:42, 25.97s/it]

Iteration 35 total expected reward:  tensor(-3.8679, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 35 total mse loss:  tensor([9.4986e+08], device='cuda:0', grad_fn=<AddBackward0>)


 37%|███▋      | 37/100 [15:40<27:15, 25.96s/it]

Iteration 36 total expected reward:  tensor(-4.1962, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 36 total mse loss:  tensor([1.1541e+09], device='cuda:0', grad_fn=<AddBackward0>)


 38%|███▊      | 38/100 [16:06<26:51, 25.98s/it]

Iteration 37 total expected reward:  tensor(-3.9301, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 37 total mse loss:  tensor([9.6886e+08], device='cuda:0', grad_fn=<AddBackward0>)


 39%|███▉      | 39/100 [16:32<26:22, 25.94s/it]

Iteration 38 total expected reward:  tensor(-3.5820, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 38 total mse loss:  tensor([1.0452e+09], device='cuda:0', grad_fn=<AddBackward0>)


 40%|████      | 40/100 [16:58<26:03, 26.06s/it]

Iteration 39 total expected reward:  tensor(-3.2408, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 39 total mse loss:  tensor([7.3408e+08], device='cuda:0', grad_fn=<AddBackward0>)


 41%|████      | 41/100 [17:24<25:34, 26.02s/it]

Iteration 40 total expected reward:  tensor(-3.0675, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 40 total mse loss:  tensor([5.1560e+08], device='cuda:0', grad_fn=<AddBackward0>)


 42%|████▏     | 42/100 [17:50<25:08, 26.01s/it]

Iteration 41 total expected reward:  tensor(-2.5345, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 41 total mse loss:  tensor([3.5418e+08], device='cuda:0', grad_fn=<AddBackward0>)


 43%|████▎     | 43/100 [18:16<24:42, 26.01s/it]

Iteration 42 total expected reward:  tensor(-2.8222, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 42 total mse loss:  tensor([5.3237e+08], device='cuda:0', grad_fn=<AddBackward0>)


 44%|████▍     | 44/100 [18:42<24:18, 26.05s/it]

Iteration 43 total expected reward:  tensor(-5.1784, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 43 total mse loss:  tensor([2.0568e+09], device='cuda:0', grad_fn=<AddBackward0>)


 45%|████▌     | 45/100 [19:08<23:44, 25.89s/it]

Iteration 44 total expected reward:  tensor(-2.6081, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 44 total mse loss:  tensor([3.3563e+08], device='cuda:0', grad_fn=<AddBackward0>)


 46%|████▌     | 46/100 [19:33<23:12, 25.78s/it]

Iteration 45 total expected reward:  tensor(-4.5302, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 45 total mse loss:  tensor([8.0103e+08], device='cuda:0', grad_fn=<AddBackward0>)


 47%|████▋     | 47/100 [19:59<22:45, 25.76s/it]

Iteration 46 total expected reward:  tensor(-7.2452, device='cuda:0', grad_fn=<AddBackward0>)
Iteration 46 total mse loss:  tensor([1.8889e+09], device='cuda:0', grad_fn=<AddBackward0>)


 47%|████▋     | 47/100 [20:01<22:35, 25.57s/it]

Unexpected exception formatting exception. Falling back to standard exception



Traceback (most recent call last):
  File "/usr/local/linux/anaconda3.8/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3369, in run_code
  File "/tmp/ipykernel_1654755/4072965456.py", line 8, in <cell line: 7>
  File "/tmp/ipykernel_1654755/1927103216.py", line 259, in run_dummy
  File "/tmp/ipykernel_1654755/1927103216.py", line 80, in get_heatmaps
RuntimeError: [enforce fail at CPUAllocator.cpp:65] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 602112 bytes. Error code 12 (Cannot allocate memory)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/linux/anaconda3.8/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 1982, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/usr/local/linux/anaconda3.8/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1118, in structured_traceback
  File "/usr/local/linux/anaconda3.8/lib/py

In [9]:
np.save(r'./A3C_results/prob_records_iter10_batch20_dummy.npy', prob_records_dummy)
np.save(r'./A3C_results/reward_records_iter10_batch20_dummy.npy', reward_records_dummy)

In [14]:
a3c.PPnet_multi.module.prototype_class_identity.shape

torch.Size([1000, 200])

(10, 1000)


In [None]:
'''
reward_results = np.empty((50, 60))
value_loss_results = np.empty((50, 60))
for epoch in range(50):
    for i, (train_batch, label) in tqdm(enumerate(dataloader)):
    #for i in tqdm(range(100)):
        rewards, values, probs = a3c.run(train_batch)
        total_reward = 0
        mse_loss = 0
        for j in range(len(probs)):
            total_reward += probs[j] * rewards[j]
            mse_loss += (rewards[j] - values[j]) ** 2

        print("Epoch "+ str(epoch)+" Batch "+str(i)+" total expected reward: ", total_reward)
        print("Batch "+str(i)+" total mse loss: ", mse_loss)
        reward_results[epoch][i] = total_reward.detach().cpu().numpy()
        value_loss_results[epoch][i] = mse_loss.detach().cpu().numpy()
        #if i%10 == 0:
        #    torch.save(a3c.PPnet_multi, '../saved_models/iteration_'+str(i)+'_ppnet_multi_'+str(total_reward))
        #    torch.save(a3c.critic_model, '../saved_models/iteration_'+str(i)+'_critic_model_'+str(mse_loss))
        del train_batch
        for prob in probs:
            del prob
        del values
        del rewards
    avg_reward = np.mean(reward_results[epoch])
    torch.save(a3c.PPnet_multi, '../saved_models/ep_'+str(epoch)+'_ppnet_multi_'+str(avg_reward))
    torch.save(a3c.critic_model, '../saved_models/ep_'+str(epoch)+'_critic_model_'+str(avg_reward))
'''

1it [03:04, 184.88s/it]

Epoch 0 Batch 0 total expected reward:  tensor(1016.9069, device='cuda:0', grad_fn=<AddBackward0>)
Batch 0 total mse loss:  tensor([4916.5386], device='cuda:0', grad_fn=<AddBackward0>)


2it [06:13, 186.97s/it]

Epoch 0 Batch 1 total expected reward:  tensor(404.0566, device='cuda:0', grad_fn=<AddBackward0>)
Batch 1 total mse loss:  tensor([4228.8071], device='cuda:0', grad_fn=<AddBackward0>)


3it [09:24, 188.72s/it]

Epoch 0 Batch 2 total expected reward:  tensor(753.0262, device='cuda:0', grad_fn=<AddBackward0>)
Batch 2 total mse loss:  tensor([4006.8752], device='cuda:0', grad_fn=<AddBackward0>)


4it [12:33, 188.97s/it]

Epoch 0 Batch 3 total expected reward:  tensor(570.3600, device='cuda:0', grad_fn=<AddBackward0>)
Batch 3 total mse loss:  tensor([4125.3672], device='cuda:0', grad_fn=<AddBackward0>)


5it [15:37, 187.11s/it]

Epoch 0 Batch 4 total expected reward:  tensor(-85.6844, device='cuda:0', grad_fn=<AddBackward0>)
Batch 4 total mse loss:  tensor([5101.0708], device='cuda:0', grad_fn=<AddBackward0>)


6it [18:40, 185.69s/it]

Epoch 0 Batch 5 total expected reward:  tensor(243.7928, device='cuda:0', grad_fn=<AddBackward0>)
Batch 5 total mse loss:  tensor([4995.9551], device='cuda:0', grad_fn=<AddBackward0>)


7it [21:43, 184.84s/it]

Epoch 0 Batch 6 total expected reward:  tensor(218.7568, device='cuda:0', grad_fn=<AddBackward0>)
Batch 6 total mse loss:  tensor([3814.1658], device='cuda:0', grad_fn=<AddBackward0>)


8it [24:46, 184.36s/it]

Epoch 0 Batch 7 total expected reward:  tensor(180.2791, device='cuda:0', grad_fn=<AddBackward0>)
Batch 7 total mse loss:  tensor([3603.8333], device='cuda:0', grad_fn=<AddBackward0>)


9it [27:49, 183.86s/it]

Epoch 0 Batch 8 total expected reward:  tensor(228.3367, device='cuda:0', grad_fn=<AddBackward0>)
Batch 8 total mse loss:  tensor([3703.2227], device='cuda:0', grad_fn=<AddBackward0>)


10it [30:58, 185.38s/it]

Epoch 0 Batch 9 total expected reward:  tensor(19.4229, device='cuda:0', grad_fn=<AddBackward0>)
Batch 9 total mse loss:  tensor([4448.5674], device='cuda:0', grad_fn=<AddBackward0>)


11it [34:01, 184.67s/it]

Epoch 0 Batch 10 total expected reward:  tensor(84.8035, device='cuda:0', grad_fn=<AddBackward0>)
Batch 10 total mse loss:  tensor([4350.8662], device='cuda:0', grad_fn=<AddBackward0>)


12it [37:04, 184.19s/it]

Epoch 0 Batch 11 total expected reward:  tensor(43.6808, device='cuda:0', grad_fn=<AddBackward0>)
Batch 11 total mse loss:  tensor([4839.4316], device='cuda:0', grad_fn=<AddBackward0>)


13it [40:06, 183.72s/it]

Epoch 0 Batch 12 total expected reward:  tensor(86.5323, device='cuda:0', grad_fn=<AddBackward0>)
Batch 12 total mse loss:  tensor([4624.2812], device='cuda:0', grad_fn=<AddBackward0>)


14it [43:10, 183.56s/it]

Epoch 0 Batch 13 total expected reward:  tensor(82.4327, device='cuda:0', grad_fn=<AddBackward0>)
Batch 13 total mse loss:  tensor([3737.3894], device='cuda:0', grad_fn=<AddBackward0>)


15it [46:13, 183.39s/it]

Epoch 0 Batch 14 total expected reward:  tensor(2.5676, device='cuda:0', grad_fn=<AddBackward0>)
Batch 14 total mse loss:  tensor([5085.5103], device='cuda:0', grad_fn=<AddBackward0>)


16it [49:16, 183.27s/it]

Epoch 0 Batch 15 total expected reward:  tensor(1.1845, device='cuda:0', grad_fn=<AddBackward0>)
Batch 15 total mse loss:  tensor([4467.2900], device='cuda:0', grad_fn=<AddBackward0>)


17it [52:18, 183.08s/it]

Epoch 0 Batch 16 total expected reward:  tensor(-1.0540, device='cuda:0', grad_fn=<AddBackward0>)
Batch 16 total mse loss:  tensor([7315.7681], device='cuda:0', grad_fn=<AddBackward0>)


18it [55:21, 182.95s/it]

Epoch 0 Batch 17 total expected reward:  tensor(0.0311, device='cuda:0', grad_fn=<AddBackward0>)
Batch 17 total mse loss:  tensor([6836.4512], device='cuda:0', grad_fn=<AddBackward0>)


19it [58:24, 183.04s/it]

Epoch 0 Batch 18 total expected reward:  tensor(-0.0040, device='cuda:0', grad_fn=<AddBackward0>)
Batch 18 total mse loss:  tensor([6357.5249], device='cuda:0', grad_fn=<AddBackward0>)


20it [1:01:27, 183.08s/it]

Epoch 0 Batch 19 total expected reward:  tensor(-0.0082, device='cuda:0', grad_fn=<AddBackward0>)
Batch 19 total mse loss:  tensor([7962.1641], device='cuda:0', grad_fn=<AddBackward0>)


21it [1:04:35, 184.35s/it]

Epoch 0 Batch 20 total expected reward:  tensor(-0.0143, device='cuda:0', grad_fn=<AddBackward0>)
Batch 20 total mse loss:  tensor([8594.5586], device='cuda:0', grad_fn=<AddBackward0>)


22it [1:07:38, 183.90s/it]

Epoch 0 Batch 21 total expected reward:  tensor(-0.0544, device='cuda:0', grad_fn=<AddBackward0>)
Batch 21 total mse loss:  tensor([7550.9956], device='cuda:0', grad_fn=<AddBackward0>)


23it [1:10:47, 185.57s/it]

Epoch 0 Batch 22 total expected reward:  tensor(-0.0122, device='cuda:0', grad_fn=<AddBackward0>)
Batch 22 total mse loss:  tensor([8897.7363], device='cuda:0', grad_fn=<AddBackward0>)


24it [1:13:50, 184.89s/it]

Epoch 0 Batch 23 total expected reward:  tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>)
Batch 23 total mse loss:  tensor([4967.7935], device='cuda:0', grad_fn=<AddBackward0>)


25it [1:16:53, 184.26s/it]

Epoch 0 Batch 24 total expected reward:  tensor(-0.0069, device='cuda:0', grad_fn=<AddBackward0>)
Batch 24 total mse loss:  tensor([5089.6978], device='cuda:0', grad_fn=<AddBackward0>)


26it [1:19:56, 183.81s/it]

Epoch 0 Batch 25 total expected reward:  tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>)
Batch 25 total mse loss:  tensor([4638.7930], device='cuda:0', grad_fn=<AddBackward0>)


27it [1:23:00, 183.89s/it]

Epoch 0 Batch 26 total expected reward:  tensor(0.0007, device='cuda:0', grad_fn=<AddBackward0>)
Batch 26 total mse loss:  tensor([5249.4731], device='cuda:0', grad_fn=<AddBackward0>)


28it [1:26:03, 183.74s/it]

Epoch 0 Batch 27 total expected reward:  tensor(-0.0275, device='cuda:0', grad_fn=<AddBackward0>)
Batch 27 total mse loss:  tensor([6061.5889], device='cuda:0', grad_fn=<AddBackward0>)


29it [1:29:07, 183.66s/it]

Epoch 0 Batch 28 total expected reward:  tensor(-0.0036, device='cuda:0', grad_fn=<AddBackward0>)
Batch 28 total mse loss:  tensor([5953.1812], device='cuda:0', grad_fn=<AddBackward0>)


30it [1:32:10, 183.56s/it]

Epoch 0 Batch 29 total expected reward:  tensor(-0.0053, device='cuda:0', grad_fn=<AddBackward0>)
Batch 29 total mse loss:  tensor([6268.8320], device='cuda:0', grad_fn=<AddBackward0>)


31it [1:35:14, 183.61s/it]

Epoch 0 Batch 30 total expected reward:  tensor(-0.0103, device='cuda:0', grad_fn=<AddBackward0>)
Batch 30 total mse loss:  tensor([7646.6675], device='cuda:0', grad_fn=<AddBackward0>)


32it [1:38:17, 183.58s/it]

Epoch 0 Batch 31 total expected reward:  tensor(-0.0043, device='cuda:0', grad_fn=<AddBackward0>)
Batch 31 total mse loss:  tensor([5503.0444], device='cuda:0', grad_fn=<AddBackward0>)


33it [1:41:20, 183.27s/it]

Epoch 0 Batch 32 total expected reward:  tensor(-0.0031, device='cuda:0', grad_fn=<AddBackward0>)
Batch 32 total mse loss:  tensor([5558.5527], device='cuda:0', grad_fn=<AddBackward0>)


34it [1:44:23, 183.36s/it]

Epoch 0 Batch 33 total expected reward:  tensor(-0.0088, device='cuda:0', grad_fn=<AddBackward0>)
Batch 33 total mse loss:  tensor([7085.7886], device='cuda:0', grad_fn=<AddBackward0>)


35it [1:47:27, 183.30s/it]

Epoch 0 Batch 34 total expected reward:  tensor(-0.0130, device='cuda:0', grad_fn=<AddBackward0>)
Batch 34 total mse loss:  tensor([8239.4844], device='cuda:0', grad_fn=<AddBackward0>)


36it [1:50:29, 183.14s/it]

Epoch 0 Batch 35 total expected reward:  tensor(-0.0043, device='cuda:0', grad_fn=<AddBackward0>)
Batch 35 total mse loss:  tensor([5598.7573], device='cuda:0', grad_fn=<AddBackward0>)


37it [1:53:32, 183.10s/it]

Epoch 0 Batch 36 total expected reward:  tensor(-0.0131, device='cuda:0', grad_fn=<AddBackward0>)
Batch 36 total mse loss:  tensor([7981.6836], device='cuda:0', grad_fn=<AddBackward0>)


38it [1:56:36, 183.10s/it]

Epoch 0 Batch 37 total expected reward:  tensor(-0.0116, device='cuda:0', grad_fn=<AddBackward0>)
Batch 37 total mse loss:  tensor([7371.9585], device='cuda:0', grad_fn=<AddBackward0>)


39it [1:59:39, 183.13s/it]

Epoch 0 Batch 38 total expected reward:  tensor(-0.0119, device='cuda:0', grad_fn=<AddBackward0>)
Batch 38 total mse loss:  tensor([6825.1416], device='cuda:0', grad_fn=<AddBackward0>)


40it [2:02:42, 183.15s/it]

Epoch 0 Batch 39 total expected reward:  tensor(-0.0072, device='cuda:0', grad_fn=<AddBackward0>)
Batch 39 total mse loss:  tensor([5390.9570], device='cuda:0', grad_fn=<AddBackward0>)


41it [2:05:45, 183.12s/it]

Epoch 0 Batch 40 total expected reward:  tensor(0.0010, device='cuda:0', grad_fn=<AddBackward0>)
Batch 40 total mse loss:  tensor([4396.9385], device='cuda:0', grad_fn=<AddBackward0>)


42it [2:08:48, 183.05s/it]

Epoch 0 Batch 41 total expected reward:  tensor(-0.0031, device='cuda:0', grad_fn=<AddBackward0>)
Batch 41 total mse loss:  tensor([4575.1216], device='cuda:0', grad_fn=<AddBackward0>)


43it [2:11:52, 183.37s/it]

Epoch 0 Batch 42 total expected reward:  tensor(-0.0038, device='cuda:0', grad_fn=<AddBackward0>)
Batch 42 total mse loss:  tensor([4108.1245], device='cuda:0', grad_fn=<AddBackward0>)


44it [2:14:55, 183.33s/it]

Epoch 0 Batch 43 total expected reward:  tensor(-0.0027, device='cuda:0', grad_fn=<AddBackward0>)
Batch 43 total mse loss:  tensor([4514.3813], device='cuda:0', grad_fn=<AddBackward0>)


45it [2:17:58, 183.27s/it]

Epoch 0 Batch 44 total expected reward:  tensor(-0.0062, device='cuda:0', grad_fn=<AddBackward0>)
Batch 44 total mse loss:  tensor([4919.3984], device='cuda:0', grad_fn=<AddBackward0>)


46it [2:21:03, 183.76s/it]

Epoch 0 Batch 45 total expected reward:  tensor(-0.0131, device='cuda:0', grad_fn=<AddBackward0>)
Batch 45 total mse loss:  tensor([5993.8628], device='cuda:0', grad_fn=<AddBackward0>)


47it [2:24:06, 183.42s/it]

Epoch 0 Batch 46 total expected reward:  tensor(-0.0101, device='cuda:0', grad_fn=<AddBackward0>)
Batch 46 total mse loss:  tensor([5377.4248], device='cuda:0', grad_fn=<AddBackward0>)


48it [2:27:09, 183.29s/it]

Epoch 0 Batch 47 total expected reward:  tensor(-0.0111, device='cuda:0', grad_fn=<AddBackward0>)
Batch 47 total mse loss:  tensor([5398.7749], device='cuda:0', grad_fn=<AddBackward0>)


49it [2:30:12, 183.14s/it]

Epoch 0 Batch 48 total expected reward:  tensor(-0.0080, device='cuda:0', grad_fn=<AddBackward0>)
Batch 48 total mse loss:  tensor([4665.1250], device='cuda:0', grad_fn=<AddBackward0>)


50it [2:33:15, 183.26s/it]

Epoch 0 Batch 49 total expected reward:  tensor(-0.0071, device='cuda:0', grad_fn=<AddBackward0>)
Batch 49 total mse loss:  tensor([4387.9541], device='cuda:0', grad_fn=<AddBackward0>)


51it [2:36:20, 183.70s/it]

Epoch 0 Batch 50 total expected reward:  tensor(-0.0067, device='cuda:0', grad_fn=<AddBackward0>)
Batch 50 total mse loss:  tensor([4339.1909], device='cuda:0', grad_fn=<AddBackward0>)


52it [2:39:24, 183.69s/it]

Epoch 0 Batch 51 total expected reward:  tensor(-0.0046, device='cuda:0', grad_fn=<AddBackward0>)
Batch 51 total mse loss:  tensor([4309.1201], device='cuda:0', grad_fn=<AddBackward0>)


53it [2:42:27, 183.59s/it]

Epoch 0 Batch 52 total expected reward:  tensor(-0.0065, device='cuda:0', grad_fn=<AddBackward0>)
Batch 52 total mse loss:  tensor([4332.1655], device='cuda:0', grad_fn=<AddBackward0>)


54it [2:45:30, 183.52s/it]

Epoch 0 Batch 53 total expected reward:  tensor(-0.0059, device='cuda:0', grad_fn=<AddBackward0>)
Batch 53 total mse loss:  tensor([4185.9224], device='cuda:0', grad_fn=<AddBackward0>)


55it [2:48:34, 183.47s/it]

Epoch 0 Batch 54 total expected reward:  tensor(-0.0118, device='cuda:0', grad_fn=<AddBackward0>)
Batch 54 total mse loss:  tensor([4696.1245], device='cuda:0', grad_fn=<AddBackward0>)


56it [2:51:37, 183.44s/it]

Epoch 0 Batch 55 total expected reward:  tensor(-0.0072, device='cuda:0', grad_fn=<AddBackward0>)
Batch 55 total mse loss:  tensor([3741.0706], device='cuda:0', grad_fn=<AddBackward0>)


57it [2:54:40, 183.22s/it]

Epoch 0 Batch 56 total expected reward:  tensor(-0.0039, device='cuda:0', grad_fn=<AddBackward0>)
Batch 56 total mse loss:  tensor([4022.4524], device='cuda:0', grad_fn=<AddBackward0>)


58it [2:57:43, 183.18s/it]

Epoch 0 Batch 57 total expected reward:  tensor(-0.0057, device='cuda:0', grad_fn=<AddBackward0>)
Batch 57 total mse loss:  tensor([3942.8516], device='cuda:0', grad_fn=<AddBackward0>)


59it [3:00:46, 183.23s/it]

Epoch 0 Batch 58 total expected reward:  tensor(-0.0080, device='cuda:0', grad_fn=<AddBackward0>)
Batch 58 total mse loss:  tensor([4300.3872], device='cuda:0', grad_fn=<AddBackward0>)


60it [3:03:50, 183.83s/it]

Epoch 0 Batch 59 total expected reward:  tensor(-0.0111, device='cuda:0', grad_fn=<AddBackward0>)
Batch 59 total mse loss:  tensor([4057.8540], device='cuda:0', grad_fn=<AddBackward0>)



1it [03:06, 186.23s/it]

Epoch 1 Batch 0 total expected reward:  tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>)
Batch 0 total mse loss:  tensor([3738.5444], device='cuda:0', grad_fn=<AddBackward0>)


2it [06:09, 184.41s/it]

Epoch 1 Batch 1 total expected reward:  tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>)
Batch 1 total mse loss:  tensor([3778.4324], device='cuda:0', grad_fn=<AddBackward0>)


3it [09:12, 183.96s/it]

Epoch 1 Batch 2 total expected reward:  tensor(-0.0048, device='cuda:0', grad_fn=<AddBackward0>)
Batch 2 total mse loss:  tensor([3351.2896], device='cuda:0', grad_fn=<AddBackward0>)


4it [12:15, 183.51s/it]

Epoch 1 Batch 3 total expected reward:  tensor(-0.0095, device='cuda:0', grad_fn=<AddBackward0>)
Batch 3 total mse loss:  tensor([3887.1760], device='cuda:0', grad_fn=<AddBackward0>)


5it [15:19, 183.62s/it]

Epoch 1 Batch 4 total expected reward:  tensor(-0.0062, device='cuda:0', grad_fn=<AddBackward0>)
Batch 4 total mse loss:  tensor([3920.5427], device='cuda:0', grad_fn=<AddBackward0>)


6it [18:22, 183.54s/it]

Epoch 1 Batch 5 total expected reward:  tensor(-0.0082, device='cuda:0', grad_fn=<AddBackward0>)
Batch 5 total mse loss:  tensor([4219.5688], device='cuda:0', grad_fn=<AddBackward0>)


7it [21:25, 183.36s/it]

Epoch 1 Batch 6 total expected reward:  tensor(-0.0024, device='cuda:0', grad_fn=<AddBackward0>)
Batch 6 total mse loss:  tensor([4577.5127], device='cuda:0', grad_fn=<AddBackward0>)


8it [24:29, 183.39s/it]

Epoch 1 Batch 7 total expected reward:  tensor(-0.0056, device='cuda:0', grad_fn=<AddBackward0>)
Batch 7 total mse loss:  tensor([3830.0105], device='cuda:0', grad_fn=<AddBackward0>)


9it [27:32, 183.38s/it]

Epoch 1 Batch 8 total expected reward:  tensor(-0.0054, device='cuda:0', grad_fn=<AddBackward0>)
Batch 8 total mse loss:  tensor([3400.0171], device='cuda:0', grad_fn=<AddBackward0>)


10it [30:35, 183.25s/it]

Epoch 1 Batch 9 total expected reward:  tensor(-0.0080, device='cuda:0', grad_fn=<AddBackward0>)
Batch 9 total mse loss:  tensor([3770.7754], device='cuda:0', grad_fn=<AddBackward0>)


11it [33:39, 183.40s/it]

Epoch 1 Batch 10 total expected reward:  tensor(-0.0064, device='cuda:0', grad_fn=<AddBackward0>)
Batch 10 total mse loss:  tensor([4412.8281], device='cuda:0', grad_fn=<AddBackward0>)


12it [36:42, 183.29s/it]

Epoch 1 Batch 11 total expected reward:  tensor(-0.0099, device='cuda:0', grad_fn=<AddBackward0>)
Batch 11 total mse loss:  tensor([3628.6902], device='cuda:0', grad_fn=<AddBackward0>)


13it [39:45, 183.26s/it]

Epoch 1 Batch 12 total expected reward:  tensor(-0.0029, device='cuda:0', grad_fn=<AddBackward0>)
Batch 12 total mse loss:  tensor([4459.6392], device='cuda:0', grad_fn=<AddBackward0>)


14it [42:48, 183.28s/it]

Epoch 1 Batch 13 total expected reward:  tensor(-0.0039, device='cuda:0', grad_fn=<AddBackward0>)
Batch 13 total mse loss:  tensor([3476.3945], device='cuda:0', grad_fn=<AddBackward0>)


15it [45:52, 183.45s/it]

Epoch 1 Batch 14 total expected reward:  tensor(-0.0003, device='cuda:0', grad_fn=<AddBackward0>)
Batch 14 total mse loss:  tensor([5175.2651], device='cuda:0', grad_fn=<AddBackward0>)


16it [48:56, 183.56s/it]

Epoch 1 Batch 15 total expected reward:  tensor(-0.0018, device='cuda:0', grad_fn=<AddBackward0>)
Batch 15 total mse loss:  tensor([3797.0889], device='cuda:0', grad_fn=<AddBackward0>)


17it [51:59, 183.51s/it]

Epoch 1 Batch 16 total expected reward:  tensor(-0.0032, device='cuda:0', grad_fn=<AddBackward0>)
Batch 16 total mse loss:  tensor([4408.0161], device='cuda:0', grad_fn=<AddBackward0>)


18it [55:03, 183.46s/it]

Epoch 1 Batch 17 total expected reward:  tensor(-0.0068, device='cuda:0', grad_fn=<AddBackward0>)
Batch 17 total mse loss:  tensor([3442.2710], device='cuda:0', grad_fn=<AddBackward0>)


19it [58:06, 183.29s/it]

Epoch 1 Batch 18 total expected reward:  tensor(-0.0020, device='cuda:0', grad_fn=<AddBackward0>)
Batch 18 total mse loss:  tensor([3804.2090], device='cuda:0', grad_fn=<AddBackward0>)


20it [1:01:09, 183.33s/it]

Epoch 1 Batch 19 total expected reward:  tensor(-0.0074, device='cuda:0', grad_fn=<AddBackward0>)
Batch 19 total mse loss:  tensor([3633.5408], device='cuda:0', grad_fn=<AddBackward0>)


21it [1:04:13, 183.37s/it]

Epoch 1 Batch 20 total expected reward:  tensor(-0.0075, device='cuda:0', grad_fn=<AddBackward0>)
Batch 20 total mse loss:  tensor([3737.6404], device='cuda:0', grad_fn=<AddBackward0>)


22it [1:07:16, 183.31s/it]

Epoch 1 Batch 21 total expected reward:  tensor(-0.0076, device='cuda:0', grad_fn=<AddBackward0>)
Batch 21 total mse loss:  tensor([3082.6763], device='cuda:0', grad_fn=<AddBackward0>)


23it [1:10:22, 184.26s/it]

Epoch 1 Batch 22 total expected reward:  tensor(-0.0100, device='cuda:0', grad_fn=<AddBackward0>)
Batch 22 total mse loss:  tensor([3168.4644], device='cuda:0', grad_fn=<AddBackward0>)


24it [1:13:31, 185.60s/it]

Epoch 1 Batch 23 total expected reward:  tensor(0.0016, device='cuda:0', grad_fn=<AddBackward0>)
Batch 23 total mse loss:  tensor([5579.3506], device='cuda:0', grad_fn=<AddBackward0>)


25it [1:16:38, 185.92s/it]

Epoch 1 Batch 24 total expected reward:  tensor(0.0019, device='cuda:0', grad_fn=<AddBackward0>)
Batch 24 total mse loss:  tensor([5720.8462], device='cuda:0', grad_fn=<AddBackward0>)


26it [1:19:42, 185.50s/it]

Epoch 1 Batch 25 total expected reward:  tensor(-0.0003, device='cuda:0', grad_fn=<AddBackward0>)
Batch 25 total mse loss:  tensor([4319.8481], device='cuda:0', grad_fn=<AddBackward0>)


27it [1:22:46, 185.00s/it]

Epoch 1 Batch 26 total expected reward:  tensor(0.0006, device='cuda:0', grad_fn=<AddBackward0>)
Batch 26 total mse loss:  tensor([5505.6606], device='cuda:0', grad_fn=<AddBackward0>)


28it [1:25:49, 184.51s/it]

Epoch 1 Batch 27 total expected reward:  tensor(-0.0039, device='cuda:0', grad_fn=<AddBackward0>)
Batch 27 total mse loss:  tensor([4418.0225], device='cuda:0', grad_fn=<AddBackward0>)


29it [1:28:52, 184.09s/it]

Epoch 1 Batch 28 total expected reward:  tensor(-0.0032, device='cuda:0', grad_fn=<AddBackward0>)
Batch 28 total mse loss:  tensor([4602.6748], device='cuda:0', grad_fn=<AddBackward0>)


30it [1:31:56, 183.84s/it]

Epoch 1 Batch 29 total expected reward:  tensor(-0.0048, device='cuda:0', grad_fn=<AddBackward0>)
Batch 29 total mse loss:  tensor([4163.5142], device='cuda:0', grad_fn=<AddBackward0>)


31it [1:34:59, 183.73s/it]

Epoch 1 Batch 30 total expected reward:  tensor(-0.0098, device='cuda:0', grad_fn=<AddBackward0>)
Batch 30 total mse loss:  tensor([3517.2683], device='cuda:0', grad_fn=<AddBackward0>)


32it [1:38:03, 183.69s/it]

Epoch 1 Batch 31 total expected reward:  tensor(-0.0051, device='cuda:0', grad_fn=<AddBackward0>)
Batch 31 total mse loss:  tensor([3706.7026], device='cuda:0', grad_fn=<AddBackward0>)


33it [1:41:06, 183.51s/it]

Epoch 1 Batch 32 total expected reward:  tensor(-0.0031, device='cuda:0', grad_fn=<AddBackward0>)
Batch 32 total mse loss:  tensor([4576.5303], device='cuda:0', grad_fn=<AddBackward0>)


34it [1:44:09, 183.46s/it]

Epoch 1 Batch 33 total expected reward:  tensor(-0.0079, device='cuda:0', grad_fn=<AddBackward0>)
Batch 33 total mse loss:  tensor([4024.8303], device='cuda:0', grad_fn=<AddBackward0>)


35it [1:47:12, 183.26s/it]

Epoch 1 Batch 34 total expected reward:  tensor(-0.0136, device='cuda:0', grad_fn=<AddBackward0>)
Batch 34 total mse loss:  tensor([3513.3564], device='cuda:0', grad_fn=<AddBackward0>)


36it [1:50:16, 183.38s/it]

Epoch 1 Batch 35 total expected reward:  tensor(-0.0045, device='cuda:0', grad_fn=<AddBackward0>)
Batch 35 total mse loss:  tensor([4400.9673], device='cuda:0', grad_fn=<AddBackward0>)


37it [1:53:19, 183.35s/it]

Epoch 1 Batch 36 total expected reward:  tensor(-0.0135, device='cuda:0', grad_fn=<AddBackward0>)
Batch 36 total mse loss:  tensor([3478.9106], device='cuda:0', grad_fn=<AddBackward0>)


38it [1:56:22, 183.40s/it]

Epoch 1 Batch 37 total expected reward:  tensor(-0.0107, device='cuda:0', grad_fn=<AddBackward0>)
Batch 37 total mse loss:  tensor([3763.9404], device='cuda:0', grad_fn=<AddBackward0>)


39it [1:59:26, 183.58s/it]

Epoch 1 Batch 38 total expected reward:  tensor(-0.0118, device='cuda:0', grad_fn=<AddBackward0>)
Batch 38 total mse loss:  tensor([3250.6606], device='cuda:0', grad_fn=<AddBackward0>)


40it [2:02:30, 183.70s/it]

Epoch 1 Batch 39 total expected reward:  tensor(-0.0076, device='cuda:0', grad_fn=<AddBackward0>)
Batch 39 total mse loss:  tensor([3580.4814], device='cuda:0', grad_fn=<AddBackward0>)


41it [2:05:34, 183.65s/it]

Epoch 1 Batch 40 total expected reward:  tensor(0.0012, device='cuda:0', grad_fn=<AddBackward0>)
Batch 40 total mse loss:  tensor([5217.1431], device='cuda:0', grad_fn=<AddBackward0>)


42it [2:08:37, 183.45s/it]

Epoch 1 Batch 41 total expected reward:  tensor(-0.0030, device='cuda:0', grad_fn=<AddBackward0>)
Batch 41 total mse loss:  tensor([4416.0093], device='cuda:0', grad_fn=<AddBackward0>)


43it [2:11:40, 183.47s/it]

Epoch 1 Batch 42 total expected reward:  tensor(-0.0044, device='cuda:0', grad_fn=<AddBackward0>)
Batch 42 total mse loss:  tensor([3771.7815], device='cuda:0', grad_fn=<AddBackward0>)


44it [2:14:45, 183.66s/it]

Epoch 1 Batch 43 total expected reward:  tensor(-0.0025, device='cuda:0', grad_fn=<AddBackward0>)
Batch 43 total mse loss:  tensor([4262.9175], device='cuda:0', grad_fn=<AddBackward0>)


45it [2:17:48, 183.55s/it]

Epoch 1 Batch 44 total expected reward:  tensor(-0.0062, device='cuda:0', grad_fn=<AddBackward0>)
Batch 44 total mse loss:  tensor([3898.2798], device='cuda:0', grad_fn=<AddBackward0>)


46it [2:20:51, 183.45s/it]

Epoch 1 Batch 45 total expected reward:  tensor(-0.0132, device='cuda:0', grad_fn=<AddBackward0>)
Batch 45 total mse loss:  tensor([3474.5000], device='cuda:0', grad_fn=<AddBackward0>)


47it [2:23:54, 183.36s/it]

Epoch 1 Batch 46 total expected reward:  tensor(-0.0096, device='cuda:0', grad_fn=<AddBackward0>)
Batch 46 total mse loss:  tensor([3490.7405], device='cuda:0', grad_fn=<AddBackward0>)


48it [2:26:58, 183.58s/it]

Epoch 1 Batch 47 total expected reward:  tensor(-0.0102, device='cuda:0', grad_fn=<AddBackward0>)
Batch 47 total mse loss:  tensor([3794.9155], device='cuda:0', grad_fn=<AddBackward0>)


49it [2:30:01, 183.47s/it]

Epoch 1 Batch 48 total expected reward:  tensor(-0.0075, device='cuda:0', grad_fn=<AddBackward0>)
Batch 48 total mse loss:  tensor([3555.9111], device='cuda:0', grad_fn=<AddBackward0>)


50it [2:33:05, 183.51s/it]

Epoch 1 Batch 49 total expected reward:  tensor(-0.0071, device='cuda:0', grad_fn=<AddBackward0>)
Batch 49 total mse loss:  tensor([3776.5974], device='cuda:0', grad_fn=<AddBackward0>)


51it [2:36:09, 183.50s/it]

Epoch 1 Batch 50 total expected reward:  tensor(-0.0073, device='cuda:0', grad_fn=<AddBackward0>)
Batch 50 total mse loss:  tensor([3559.9253], device='cuda:0', grad_fn=<AddBackward0>)


52it [2:39:12, 183.38s/it]

Epoch 1 Batch 51 total expected reward:  tensor(-0.0053, device='cuda:0', grad_fn=<AddBackward0>)
Batch 51 total mse loss:  tensor([3956.7644], device='cuda:0', grad_fn=<AddBackward0>)


53it [2:42:15, 183.27s/it]

Epoch 1 Batch 52 total expected reward:  tensor(-0.0072, device='cuda:0', grad_fn=<AddBackward0>)
Batch 52 total mse loss:  tensor([3761.3872], device='cuda:0', grad_fn=<AddBackward0>)


54it [2:45:18, 183.26s/it]

Epoch 1 Batch 53 total expected reward:  tensor(-0.0062, device='cuda:0', grad_fn=<AddBackward0>)
Batch 53 total mse loss:  tensor([3745.6973], device='cuda:0', grad_fn=<AddBackward0>)


55it [2:48:20, 183.00s/it]

Epoch 1 Batch 54 total expected reward:  tensor(-0.0114, device='cuda:0', grad_fn=<AddBackward0>)
Batch 54 total mse loss:  tensor([3298.7727], device='cuda:0', grad_fn=<AddBackward0>)


56it [2:51:23, 182.96s/it]

Epoch 1 Batch 55 total expected reward:  tensor(-0.0076, device='cuda:0', grad_fn=<AddBackward0>)
Batch 55 total mse loss:  tensor([3326.0845], device='cuda:0', grad_fn=<AddBackward0>)


57it [2:54:26, 183.06s/it]

Epoch 1 Batch 56 total expected reward:  tensor(-0.0034, device='cuda:0', grad_fn=<AddBackward0>)
Batch 56 total mse loss:  tensor([4174.3384], device='cuda:0', grad_fn=<AddBackward0>)


58it [2:57:30, 183.18s/it]

Epoch 1 Batch 57 total expected reward:  tensor(-0.0050, device='cuda:0', grad_fn=<AddBackward0>)
Batch 57 total mse loss:  tensor([3664.1670], device='cuda:0', grad_fn=<AddBackward0>)


59it [3:00:33, 183.20s/it]

Epoch 1 Batch 58 total expected reward:  tensor(-0.0079, device='cuda:0', grad_fn=<AddBackward0>)
Batch 58 total mse loss:  tensor([3671.9734], device='cuda:0', grad_fn=<AddBackward0>)


60it [3:03:37, 183.32s/it]

Epoch 1 Batch 59 total expected reward:  tensor(-0.0110, device='cuda:0', grad_fn=<AddBackward0>)
Batch 59 total mse loss:  tensor([3481.4668], device='cuda:0', grad_fn=<AddBackward0>)


60it [3:03:37, 183.62s/it]
1it [03:06, 186.15s/it]

Epoch 2 Batch 0 total expected reward:  tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>)
Batch 0 total mse loss:  tensor([4042.6140], device='cuda:0', grad_fn=<AddBackward0>)


2it [06:08, 184.14s/it]

Epoch 2 Batch 1 total expected reward:  tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>)
Batch 1 total mse loss:  tensor([4080.5486], device='cuda:0', grad_fn=<AddBackward0>)


3it [09:12, 183.84s/it]

Epoch 2 Batch 2 total expected reward:  tensor(-0.0044, device='cuda:0', grad_fn=<AddBackward0>)
Batch 2 total mse loss:  tensor([3376.0002], device='cuda:0', grad_fn=<AddBackward0>)


4it [12:15, 183.73s/it]

Epoch 2 Batch 3 total expected reward:  tensor(-0.0088, device='cuda:0', grad_fn=<AddBackward0>)
Batch 3 total mse loss:  tensor([3351.4998], device='cuda:0', grad_fn=<AddBackward0>)


5it [15:19, 183.79s/it]

Epoch 2 Batch 4 total expected reward:  tensor(-0.0063, device='cuda:0', grad_fn=<AddBackward0>)
Batch 4 total mse loss:  tensor([3687.2834], device='cuda:0', grad_fn=<AddBackward0>)


6it [18:22, 183.53s/it]

Epoch 2 Batch 5 total expected reward:  tensor(-0.0076, device='cuda:0', grad_fn=<AddBackward0>)
Batch 5 total mse loss:  tensor([3658.0208], device='cuda:0', grad_fn=<AddBackward0>)


7it [21:26, 183.53s/it]

Epoch 2 Batch 6 total expected reward:  tensor(-0.0026, device='cuda:0', grad_fn=<AddBackward0>)
Batch 6 total mse loss:  tensor([4620.1382], device='cuda:0', grad_fn=<AddBackward0>)


8it [24:29, 183.33s/it]

Epoch 2 Batch 7 total expected reward:  tensor(-0.0056, device='cuda:0', grad_fn=<AddBackward0>)
Batch 7 total mse loss:  tensor([3673.5881], device='cuda:0', grad_fn=<AddBackward0>)


9it [27:33, 183.46s/it]

Epoch 2 Batch 8 total expected reward:  tensor(-0.0054, device='cuda:0', grad_fn=<AddBackward0>)
Batch 8 total mse loss:  tensor([3435.1799], device='cuda:0', grad_fn=<AddBackward0>)


10it [30:36, 183.47s/it]

Epoch 2 Batch 9 total expected reward:  tensor(-0.0084, device='cuda:0', grad_fn=<AddBackward0>)
Batch 9 total mse loss:  tensor([3189.8325], device='cuda:0', grad_fn=<AddBackward0>)


11it [33:39, 183.37s/it]

Epoch 2 Batch 10 total expected reward:  tensor(-0.0078, device='cuda:0', grad_fn=<AddBackward0>)
Batch 10 total mse loss:  tensor([3977.2395], device='cuda:0', grad_fn=<AddBackward0>)


12it [36:42, 183.33s/it]

Epoch 2 Batch 11 total expected reward:  tensor(-0.0104, device='cuda:0', grad_fn=<AddBackward0>)
Batch 11 total mse loss:  tensor([3054.7043], device='cuda:0', grad_fn=<AddBackward0>)


13it [39:46, 183.36s/it]

Epoch 2 Batch 12 total expected reward:  tensor(-0.0031, device='cuda:0', grad_fn=<AddBackward0>)
Batch 12 total mse loss:  tensor([4865.6455], device='cuda:0', grad_fn=<AddBackward0>)


14it [42:49, 183.34s/it]

Epoch 2 Batch 13 total expected reward:  tensor(-0.0041, device='cuda:0', grad_fn=<AddBackward0>)
Batch 13 total mse loss:  tensor([3607.3845], device='cuda:0', grad_fn=<AddBackward0>)


15it [45:52, 183.35s/it]

Epoch 2 Batch 14 total expected reward:  tensor(-0.0001, device='cuda:0', grad_fn=<AddBackward0>)
Batch 14 total mse loss:  tensor([5666.0361], device='cuda:0', grad_fn=<AddBackward0>)


16it [48:56, 183.37s/it]

Epoch 2 Batch 15 total expected reward:  tensor(-0.0016, device='cuda:0', grad_fn=<AddBackward0>)
Batch 15 total mse loss:  tensor([4210.4590], device='cuda:0', grad_fn=<AddBackward0>)


17it [51:59, 183.34s/it]

Epoch 2 Batch 16 total expected reward:  tensor(-0.0034, device='cuda:0', grad_fn=<AddBackward0>)
Batch 16 total mse loss:  tensor([4520.9961], device='cuda:0', grad_fn=<AddBackward0>)


18it [55:02, 183.31s/it]

Epoch 2 Batch 17 total expected reward:  tensor(-0.0061, device='cuda:0', grad_fn=<AddBackward0>)
Batch 17 total mse loss:  tensor([3479.7087], device='cuda:0', grad_fn=<AddBackward0>)


19it [58:05, 183.22s/it]

Epoch 2 Batch 18 total expected reward:  tensor(-0.0016, device='cuda:0', grad_fn=<AddBackward0>)
Batch 18 total mse loss:  tensor([4149.5356], device='cuda:0', grad_fn=<AddBackward0>)


20it [1:01:09, 183.37s/it]

Epoch 2 Batch 19 total expected reward:  tensor(-0.0077, device='cuda:0', grad_fn=<AddBackward0>)
Batch 19 total mse loss:  tensor([3312.1580], device='cuda:0', grad_fn=<AddBackward0>)


21it [1:04:12, 183.24s/it]

Epoch 2 Batch 20 total expected reward:  tensor(-0.0079, device='cuda:0', grad_fn=<AddBackward0>)
Batch 20 total mse loss:  tensor([3595.5771], device='cuda:0', grad_fn=<AddBackward0>)


22it [1:07:15, 183.14s/it]

Epoch 2 Batch 21 total expected reward:  tensor(-0.0067, device='cuda:0', grad_fn=<AddBackward0>)
Batch 21 total mse loss:  tensor([3179.4197], device='cuda:0', grad_fn=<AddBackward0>)


23it [1:10:18, 183.12s/it]

Epoch 2 Batch 22 total expected reward:  tensor(-0.0096, device='cuda:0', grad_fn=<AddBackward0>)
Batch 22 total mse loss:  tensor([3080.9353], device='cuda:0', grad_fn=<AddBackward0>)


24it [1:13:21, 183.01s/it]

Epoch 2 Batch 23 total expected reward:  tensor(0.0020, device='cuda:0', grad_fn=<AddBackward0>)
Batch 23 total mse loss:  tensor([6177.8706], device='cuda:0', grad_fn=<AddBackward0>)


25it [1:16:24, 183.09s/it]

Epoch 2 Batch 24 total expected reward:  tensor(0.0020, device='cuda:0', grad_fn=<AddBackward0>)
Batch 24 total mse loss:  tensor([6203.6353], device='cuda:0', grad_fn=<AddBackward0>)


26it [1:19:27, 183.16s/it]

Epoch 2 Batch 25 total expected reward:  tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>)
Batch 25 total mse loss:  tensor([4347.6274], device='cuda:0', grad_fn=<AddBackward0>)


27it [1:22:30, 183.03s/it]

Epoch 2 Batch 26 total expected reward:  tensor(0.0008, device='cuda:0', grad_fn=<AddBackward0>)
Batch 26 total mse loss:  tensor([5558.8765], device='cuda:0', grad_fn=<AddBackward0>)


28it [1:25:33, 183.07s/it]

Epoch 2 Batch 27 total expected reward:  tensor(-0.0049, device='cuda:0', grad_fn=<AddBackward0>)
Batch 27 total mse loss:  tensor([4157.4863], device='cuda:0', grad_fn=<AddBackward0>)


29it [1:28:36, 183.04s/it]

Epoch 2 Batch 28 total expected reward:  tensor(-0.0030, device='cuda:0', grad_fn=<AddBackward0>)
Batch 28 total mse loss:  tensor([4825.8438], device='cuda:0', grad_fn=<AddBackward0>)


30it [1:31:39, 183.04s/it]

Epoch 2 Batch 29 total expected reward:  tensor(-0.0049, device='cuda:0', grad_fn=<AddBackward0>)
Batch 29 total mse loss:  tensor([3966.2419], device='cuda:0', grad_fn=<AddBackward0>)


31it [1:34:43, 183.10s/it]

Epoch 2 Batch 30 total expected reward:  tensor(-0.0094, device='cuda:0', grad_fn=<AddBackward0>)
Batch 30 total mse loss:  tensor([3411.4763], device='cuda:0', grad_fn=<AddBackward0>)


32it [1:37:46, 183.11s/it]

Epoch 2 Batch 31 total expected reward:  tensor(-0.0045, device='cuda:0', grad_fn=<AddBackward0>)
Batch 31 total mse loss:  tensor([3774.3240], device='cuda:0', grad_fn=<AddBackward0>)


33it [1:40:50, 183.37s/it]

Epoch 2 Batch 32 total expected reward:  tensor(-0.0025, device='cuda:0', grad_fn=<AddBackward0>)
Batch 32 total mse loss:  tensor([4687.8750], device='cuda:0', grad_fn=<AddBackward0>)


34it [1:43:53, 183.22s/it]

Epoch 2 Batch 33 total expected reward:  tensor(-0.0085, device='cuda:0', grad_fn=<AddBackward0>)
Batch 33 total mse loss:  tensor([3927.1980], device='cuda:0', grad_fn=<AddBackward0>)


35it [1:46:56, 183.31s/it]

Epoch 2 Batch 34 total expected reward:  tensor(-0.0128, device='cuda:0', grad_fn=<AddBackward0>)
Batch 34 total mse loss:  tensor([3220.0264], device='cuda:0', grad_fn=<AddBackward0>)


36it [1:49:59, 183.18s/it]

Epoch 2 Batch 35 total expected reward:  tensor(-0.0045, device='cuda:0', grad_fn=<AddBackward0>)
Batch 35 total mse loss:  tensor([4429.3975], device='cuda:0', grad_fn=<AddBackward0>)


37it [1:53:02, 183.16s/it]

Epoch 2 Batch 36 total expected reward:  tensor(-0.0132, device='cuda:0', grad_fn=<AddBackward0>)
Batch 36 total mse loss:  tensor([3199.3093], device='cuda:0', grad_fn=<AddBackward0>)


38it [1:56:05, 183.23s/it]

Epoch 2 Batch 37 total expected reward:  tensor(-0.0115, device='cuda:0', grad_fn=<AddBackward0>)
Batch 37 total mse loss:  tensor([3403.0464], device='cuda:0', grad_fn=<AddBackward0>)


39it [1:59:09, 183.19s/it]

Epoch 2 Batch 38 total expected reward:  tensor(-0.0114, device='cuda:0', grad_fn=<AddBackward0>)
Batch 38 total mse loss:  tensor([3087.5867], device='cuda:0', grad_fn=<AddBackward0>)


40it [2:02:12, 183.30s/it]

Epoch 2 Batch 39 total expected reward:  tensor(-0.0073, device='cuda:0', grad_fn=<AddBackward0>)
Batch 39 total mse loss:  tensor([3478.1777], device='cuda:0', grad_fn=<AddBackward0>)


41it [2:05:16, 183.34s/it]

Epoch 2 Batch 40 total expected reward:  tensor(0.0004, device='cuda:0', grad_fn=<AddBackward0>)
Batch 40 total mse loss:  tensor([5070.8198], device='cuda:0', grad_fn=<AddBackward0>)


42it [2:08:19, 183.33s/it]

Epoch 2 Batch 41 total expected reward:  tensor(-0.0030, device='cuda:0', grad_fn=<AddBackward0>)
Batch 41 total mse loss:  tensor([4566.0264], device='cuda:0', grad_fn=<AddBackward0>)


43it [2:11:22, 183.29s/it]

Epoch 2 Batch 42 total expected reward:  tensor(-0.0037, device='cuda:0', grad_fn=<AddBackward0>)
Batch 42 total mse loss:  tensor([4032.0840], device='cuda:0', grad_fn=<AddBackward0>)


44it [2:14:26, 183.42s/it]

Epoch 2 Batch 43 total expected reward:  tensor(-0.0035, device='cuda:0', grad_fn=<AddBackward0>)
Batch 43 total mse loss:  tensor([4041.1836], device='cuda:0', grad_fn=<AddBackward0>)


45it [2:17:29, 183.34s/it]

Epoch 2 Batch 44 total expected reward:  tensor(-0.0065, device='cuda:0', grad_fn=<AddBackward0>)
Batch 44 total mse loss:  tensor([3574.9272], device='cuda:0', grad_fn=<AddBackward0>)


46it [2:20:32, 183.27s/it]

Epoch 2 Batch 45 total expected reward:  tensor(-0.0131, device='cuda:0', grad_fn=<AddBackward0>)
Batch 45 total mse loss:  tensor([3193.3992], device='cuda:0', grad_fn=<AddBackward0>)


47it [2:23:35, 183.13s/it]

Epoch 2 Batch 46 total expected reward:  tensor(-0.0103, device='cuda:0', grad_fn=<AddBackward0>)
Batch 46 total mse loss:  tensor([3373.6230], device='cuda:0', grad_fn=<AddBackward0>)


48it [2:26:38, 183.08s/it]

Epoch 2 Batch 47 total expected reward:  tensor(-0.0102, device='cuda:0', grad_fn=<AddBackward0>)
Batch 47 total mse loss:  tensor([3460.0049], device='cuda:0', grad_fn=<AddBackward0>)


49it [2:29:41, 183.07s/it]

Epoch 2 Batch 48 total expected reward:  tensor(-0.0073, device='cuda:0', grad_fn=<AddBackward0>)
Batch 48 total mse loss:  tensor([3323.3330], device='cuda:0', grad_fn=<AddBackward0>)


50it [2:32:44, 183.08s/it]

Epoch 2 Batch 49 total expected reward:  tensor(-0.0070, device='cuda:0', grad_fn=<AddBackward0>)
Batch 49 total mse loss:  tensor([3367.0090], device='cuda:0', grad_fn=<AddBackward0>)


51it [2:35:47, 183.08s/it]

Epoch 2 Batch 50 total expected reward:  tensor(-0.0073, device='cuda:0', grad_fn=<AddBackward0>)
Batch 50 total mse loss:  tensor([3522.6333], device='cuda:0', grad_fn=<AddBackward0>)


52it [2:38:50, 183.06s/it]

Epoch 2 Batch 51 total expected reward:  tensor(-0.0048, device='cuda:0', grad_fn=<AddBackward0>)
Batch 51 total mse loss:  tensor([4163.8677], device='cuda:0', grad_fn=<AddBackward0>)


53it [2:41:52, 182.76s/it]

Epoch 2 Batch 52 total expected reward:  tensor(-0.0069, device='cuda:0', grad_fn=<AddBackward0>)
Batch 52 total mse loss:  tensor([3723.0659], device='cuda:0', grad_fn=<AddBackward0>)


54it [2:44:55, 182.84s/it]

Epoch 2 Batch 53 total expected reward:  tensor(-0.0063, device='cuda:0', grad_fn=<AddBackward0>)
Batch 53 total mse loss:  tensor([3660.1062], device='cuda:0', grad_fn=<AddBackward0>)


55it [2:47:59, 183.08s/it]

Epoch 2 Batch 54 total expected reward:  tensor(-0.0117, device='cuda:0', grad_fn=<AddBackward0>)
Batch 54 total mse loss:  tensor([3101.0220], device='cuda:0', grad_fn=<AddBackward0>)


56it [2:51:02, 183.20s/it]

Epoch 2 Batch 55 total expected reward:  tensor(-0.0074, device='cuda:0', grad_fn=<AddBackward0>)
Batch 55 total mse loss:  tensor([3267.7825], device='cuda:0', grad_fn=<AddBackward0>)


57it [2:54:05, 183.18s/it]

Epoch 2 Batch 56 total expected reward:  tensor(-0.0034, device='cuda:0', grad_fn=<AddBackward0>)
Batch 56 total mse loss:  tensor([4290.4360], device='cuda:0', grad_fn=<AddBackward0>)


58it [2:57:08, 183.15s/it]

Epoch 2 Batch 57 total expected reward:  tensor(-0.0048, device='cuda:0', grad_fn=<AddBackward0>)
Batch 57 total mse loss:  tensor([3724.8091], device='cuda:0', grad_fn=<AddBackward0>)


59it [3:00:11, 183.10s/it]

Epoch 2 Batch 58 total expected reward:  tensor(-0.0082, device='cuda:0', grad_fn=<AddBackward0>)
Batch 58 total mse loss:  tensor([3596.4644], device='cuda:0', grad_fn=<AddBackward0>)


60it [3:03:14, 183.07s/it]

Epoch 2 Batch 59 total expected reward:  tensor(-0.0115, device='cuda:0', grad_fn=<AddBackward0>)
Batch 59 total mse loss:  tensor([3246.4705], device='cuda:0', grad_fn=<AddBackward0>)


60it [3:03:15, 183.25s/it]
1it [03:05, 185.85s/it]

Epoch 3 Batch 0 total expected reward:  tensor(-0.0015, device='cuda:0', grad_fn=<AddBackward0>)
Batch 0 total mse loss:  tensor([4083.3494], device='cuda:0', grad_fn=<AddBackward0>)


2it [06:08, 183.91s/it]

Epoch 3 Batch 1 total expected reward:  tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>)
Batch 1 total mse loss:  tensor([4086.1919], device='cuda:0', grad_fn=<AddBackward0>)


3it [09:11, 183.54s/it]

Epoch 3 Batch 2 total expected reward:  tensor(-0.0044, device='cuda:0', grad_fn=<AddBackward0>)
Batch 2 total mse loss:  tensor([3505.6787], device='cuda:0', grad_fn=<AddBackward0>)


4it [12:14, 183.30s/it]

Epoch 3 Batch 3 total expected reward:  tensor(-0.0085, device='cuda:0', grad_fn=<AddBackward0>)
Batch 3 total mse loss:  tensor([3446.6145], device='cuda:0', grad_fn=<AddBackward0>)


5it [15:17, 183.27s/it]

Epoch 3 Batch 4 total expected reward:  tensor(-0.0065, device='cuda:0', grad_fn=<AddBackward0>)
Batch 4 total mse loss:  tensor([3787.4111], device='cuda:0', grad_fn=<AddBackward0>)


6it [18:20, 183.19s/it]

Epoch 3 Batch 5 total expected reward:  tensor(-0.0086, device='cuda:0', grad_fn=<AddBackward0>)
Batch 5 total mse loss:  tensor([3528.7644], device='cuda:0', grad_fn=<AddBackward0>)


7it [21:23, 183.10s/it]

Epoch 3 Batch 6 total expected reward:  tensor(-0.0032, device='cuda:0', grad_fn=<AddBackward0>)
Batch 6 total mse loss:  tensor([4656.1812], device='cuda:0', grad_fn=<AddBackward0>)


8it [24:26, 182.92s/it]

Epoch 3 Batch 7 total expected reward:  tensor(-0.0057, device='cuda:0', grad_fn=<AddBackward0>)
Batch 7 total mse loss:  tensor([3754.2620], device='cuda:0', grad_fn=<AddBackward0>)


9it [27:28, 182.86s/it]

Epoch 3 Batch 8 total expected reward:  tensor(-0.0057, device='cuda:0', grad_fn=<AddBackward0>)
Batch 8 total mse loss:  tensor([3474.1589], device='cuda:0', grad_fn=<AddBackward0>)


10it [30:31, 182.94s/it]

Epoch 3 Batch 9 total expected reward:  tensor(-0.0074, device='cuda:0', grad_fn=<AddBackward0>)
Batch 9 total mse loss:  tensor([3236.4834], device='cuda:0', grad_fn=<AddBackward0>)


11it [33:35, 183.01s/it]

Epoch 3 Batch 10 total expected reward:  tensor(-0.0071, device='cuda:0', grad_fn=<AddBackward0>)
Batch 10 total mse loss:  tensor([4050.5281], device='cuda:0', grad_fn=<AddBackward0>)


12it [36:38, 183.15s/it]

Epoch 3 Batch 11 total expected reward:  tensor(-0.0104, device='cuda:0', grad_fn=<AddBackward0>)
Batch 11 total mse loss:  tensor([2913.4858], device='cuda:0', grad_fn=<AddBackward0>)


13it [39:41, 183.19s/it]

Epoch 3 Batch 12 total expected reward:  tensor(-0.0048, device='cuda:0', grad_fn=<AddBackward0>)
Batch 12 total mse loss:  tensor([4274.2939], device='cuda:0', grad_fn=<AddBackward0>)


14it [42:44, 182.97s/it]

Epoch 3 Batch 13 total expected reward:  tensor(-0.0039, device='cuda:0', grad_fn=<AddBackward0>)
Batch 13 total mse loss:  tensor([3586.6672], device='cuda:0', grad_fn=<AddBackward0>)


15it [45:46, 182.86s/it]

Epoch 3 Batch 14 total expected reward:  tensor(-4.8347e-05, device='cuda:0', grad_fn=<AddBackward0>)
Batch 14 total mse loss:  tensor([5751.4912], device='cuda:0', grad_fn=<AddBackward0>)


16it [48:49, 182.84s/it]

Epoch 3 Batch 15 total expected reward:  tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>)
Batch 15 total mse loss:  tensor([4544.9443], device='cuda:0', grad_fn=<AddBackward0>)


17it [51:53, 183.03s/it]

Epoch 3 Batch 16 total expected reward:  tensor(-0.0030, device='cuda:0', grad_fn=<AddBackward0>)
Batch 16 total mse loss:  tensor([4669.2354], device='cuda:0', grad_fn=<AddBackward0>)


18it [54:56, 183.11s/it]

Epoch 3 Batch 17 total expected reward:  tensor(-0.0064, device='cuda:0', grad_fn=<AddBackward0>)
Batch 17 total mse loss:  tensor([3391.0996], device='cuda:0', grad_fn=<AddBackward0>)


19it [57:59, 183.05s/it]

Epoch 3 Batch 18 total expected reward:  tensor(-0.0020, device='cuda:0', grad_fn=<AddBackward0>)
Batch 18 total mse loss:  tensor([4161.8193], device='cuda:0', grad_fn=<AddBackward0>)


20it [1:01:02, 183.05s/it]

Epoch 3 Batch 19 total expected reward:  tensor(-0.0075, device='cuda:0', grad_fn=<AddBackward0>)
Batch 19 total mse loss:  tensor([3320.8921], device='cuda:0', grad_fn=<AddBackward0>)


21it [1:04:05, 182.97s/it]

Epoch 3 Batch 20 total expected reward:  tensor(-0.0080, device='cuda:0', grad_fn=<AddBackward0>)
Batch 20 total mse loss:  tensor([3346.6970], device='cuda:0', grad_fn=<AddBackward0>)


22it [1:07:08, 183.10s/it]

Epoch 3 Batch 21 total expected reward:  tensor(-0.0072, device='cuda:0', grad_fn=<AddBackward0>)
Batch 21 total mse loss:  tensor([3083.1621], device='cuda:0', grad_fn=<AddBackward0>)


23it [1:10:12, 183.35s/it]

Epoch 3 Batch 22 total expected reward:  tensor(-0.0095, device='cuda:0', grad_fn=<AddBackward0>)
Batch 22 total mse loss:  tensor([3073.6023], device='cuda:0', grad_fn=<AddBackward0>)


24it [1:13:16, 183.44s/it]

Epoch 3 Batch 23 total expected reward:  tensor(0.0023, device='cuda:0', grad_fn=<AddBackward0>)
Batch 23 total mse loss:  tensor([6307.7681], device='cuda:0', grad_fn=<AddBackward0>)


25it [1:16:19, 183.48s/it]

Epoch 3 Batch 24 total expected reward:  tensor(0.0021, device='cuda:0', grad_fn=<AddBackward0>)
Batch 24 total mse loss:  tensor([6260.1890], device='cuda:0', grad_fn=<AddBackward0>)


26it [1:19:24, 183.70s/it]

Epoch 3 Batch 25 total expected reward:  tensor(-3.7965e-05, device='cuda:0', grad_fn=<AddBackward0>)
Batch 25 total mse loss:  tensor([5025.8940], device='cuda:0', grad_fn=<AddBackward0>)


27it [1:22:28, 183.85s/it]

Epoch 3 Batch 26 total expected reward:  tensor(-5.8665e-05, device='cuda:0', grad_fn=<AddBackward0>)
Batch 26 total mse loss:  tensor([5569.6934], device='cuda:0', grad_fn=<AddBackward0>)


28it [1:25:31, 183.67s/it]

Epoch 3 Batch 27 total expected reward:  tensor(-0.0038, device='cuda:0', grad_fn=<AddBackward0>)
Batch 27 total mse loss:  tensor([4576.1484], device='cuda:0', grad_fn=<AddBackward0>)


29it [1:28:34, 183.58s/it]

Epoch 3 Batch 28 total expected reward:  tensor(-0.0032, device='cuda:0', grad_fn=<AddBackward0>)
Batch 28 total mse loss:  tensor([4690.2773], device='cuda:0', grad_fn=<AddBackward0>)


30it [1:31:38, 183.46s/it]

Epoch 3 Batch 29 total expected reward:  tensor(-0.0042, device='cuda:0', grad_fn=<AddBackward0>)
Batch 29 total mse loss:  tensor([4458.0601], device='cuda:0', grad_fn=<AddBackward0>)


31it [1:34:41, 183.37s/it]

Epoch 3 Batch 30 total expected reward:  tensor(-0.0095, device='cuda:0', grad_fn=<AddBackward0>)
Batch 30 total mse loss:  tensor([3213.9563], device='cuda:0', grad_fn=<AddBackward0>)


33it [1:40:45, 182.77s/it]

Epoch 3 Batch 32 total expected reward:  tensor(-0.0023, device='cuda:0', grad_fn=<AddBackward0>)
Batch 32 total mse loss:  tensor([5086.3945], device='cuda:0', grad_fn=<AddBackward0>)


34it [1:43:46, 182.34s/it]

Epoch 3 Batch 33 total expected reward:  tensor(-0.0079, device='cuda:0', grad_fn=<AddBackward0>)
Batch 33 total mse loss:  tensor([4047.1174], device='cuda:0', grad_fn=<AddBackward0>)


35it [1:46:50, 182.56s/it]

Epoch 3 Batch 34 total expected reward:  tensor(-0.0129, device='cuda:0', grad_fn=<AddBackward0>)
Batch 34 total mse loss:  tensor([3282.6880], device='cuda:0', grad_fn=<AddBackward0>)


36it [1:49:58, 184.41s/it]

Epoch 3 Batch 35 total expected reward:  tensor(-0.0049, device='cuda:0', grad_fn=<AddBackward0>)
Batch 35 total mse loss:  tensor([4459.1309], device='cuda:0', grad_fn=<AddBackward0>)


37it [1:53:06, 185.43s/it]

Epoch 3 Batch 36 total expected reward:  tensor(-0.0127, device='cuda:0', grad_fn=<AddBackward0>)
Batch 36 total mse loss:  tensor([3246.9558], device='cuda:0', grad_fn=<AddBackward0>)


38it [1:56:09, 184.81s/it]

Epoch 3 Batch 37 total expected reward:  tensor(-0.0109, device='cuda:0', grad_fn=<AddBackward0>)
Batch 37 total mse loss:  tensor([3416.2495], device='cuda:0', grad_fn=<AddBackward0>)


39it [1:59:14, 184.66s/it]

Epoch 3 Batch 38 total expected reward:  tensor(-0.0118, device='cuda:0', grad_fn=<AddBackward0>)
Batch 38 total mse loss:  tensor([2894.9695], device='cuda:0', grad_fn=<AddBackward0>)


40it [2:02:17, 184.22s/it]

Epoch 3 Batch 39 total expected reward:  tensor(-0.0068, device='cuda:0', grad_fn=<AddBackward0>)
Batch 39 total mse loss:  tensor([3839.3582], device='cuda:0', grad_fn=<AddBackward0>)


NameError: name 'reward_results' is not defined

In [11]:
probs_0[:100]

[tensor(0.7500, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(6.0000, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(6.0000, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(6.0000, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(6.0000, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(6.0000, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(6.0000, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(6.0000, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(6.0000, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(6.0000, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(6.0000, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(6.0000, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(5.9786, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(6.0000, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(6.0000, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(1.5459e-05, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(3.0292e-05, device='cuda:0', grad_fn=<MulBackward0>),
 tenso

In [12]:
probs_1[:50]

[tensor(5.9676, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(5.9924, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(3.9994, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(0.0007, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(0.3579, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(5.0587, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(2.7490e-05, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(6.0000, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(5.4254, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(0.5447, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(0.3528, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(1.0686, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(5.9681, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(3.5836, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(0.2328, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(9.6888e-06, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(1.1284e-05, device='cuda:0', grad_fn=<MulBackward0>),
 t

In [13]:
print(rewards_0[:50])
print(rewards_1[:50])

tensor([-2.7851,  0.9022,  2.8566,  2.9561,  2.5913,  1.4005,  2.8890, -0.4359,
         2.9563, -2.7223, -2.8594, -2.8131,  0.2946,  2.7151,  2.9560,  2.9544,
        -2.7062, -2.4482,  2.8733, -0.1318,  2.9515,  1.0451,  1.4682,  2.9417,
        -0.6140,  2.8225,  2.7343,  2.9544,  1.8293,  2.6736,  2.9162,  2.0157,
         2.9543, -1.0884,  2.9180, -2.7983,  2.9554,  2.1885,  1.7539,  2.9026,
         2.8248,  1.5889,  1.0819,  2.9236,  1.1265,  2.9410,  2.2216, -0.6344,
        -0.2428,  2.0169])
tensor([-2.6608,  2.1664, -2.8370, -2.2139, -1.6710,  2.9525,  1.5394,  2.9504,
         1.2801,  2.1259, -0.7744,  2.9505,  2.4044, -2.8730, -2.8742, -0.4144,
         1.0518, -2.8572,  2.9211,  2.5853,  2.1751, -0.7339,  2.9352,  2.5496,
         2.4637, -0.5362,  0.8194,  2.9032, -1.5252,  2.9526, -0.2861, -1.0242,
         2.9459,  2.2418,  2.9555, -1.8400, -0.4232,  2.9170, -1.2534,  2.3564,
         2.9544,  0.7814,  2.7956,  2.9524,  0.0044,  2.8985,  2.6865,  0.7331,
         2.88

In [None]:
'''



def run_A3C(PPnet, preference_model, data_loader):
    
    a3c_model = A3C_PPnet(PPnet, preference_model)
    
    # TO-DO: need to figure out what optimizers we could use
    ac_optimizer = optim.Adam(PPnet.parameters(), lr=learning_rate)
    
    num_epochs = 50

    for epoch in range(num_epochs):
        for idx, (batch_x, y) in enumerate(dataloader):
            action, value, rewards = a3c_model.forward(batch_x)
            value = value.detach().numpy()
            reward = preference_model(action)

  
        # update actor critic
        
        advantage = rewards - values
        actor_loss = (-advantage).mean()
        critic_loss = 0.5 * advantage.pow(2).mean()
        ac_loss = actor_loss + critic_loss + 0.001 * entropy_term

        ac_optimizer.zero_grad()
        ac_loss.backward()
        ac_optimizer.step()

'''