## Playing Pong Using Only Pixel Values ft. Policy Gradient

In [1]:
# Import stuff
import gym
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time, os
from torch.utils.tensorboard import SummaryWriter

# Seed the global PyTorch and NumPy random generators so runs are repeatable
# (the environment is seeded separately with the same value when it is created)
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# Directory the TensorBoard SummaryWriter logs into
# Make sure to use a new directory for every new run
# (presumably so that separate runs do not end up in the same log - confirm)
log_dir = 'logs/pong_pg_test'
writer = SummaryWriter(log_dir)

# Use GPU if available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [2]:
# Create the Pong environment and seed it with the same value used for the
# PyTorch and NumPy generators
env = gym.make("Pong-v0")
env.seed(seed)

# Number of discrete actions exposed by the environment's action space
n_actions = env.action_space.n
print(n_actions)

6


In [3]:
# A function to grab the current game screen and return it as a 2D numpy array
def get_image(env):
    """Render the current game screen and return it as a 2D grayscale array.

    The RGB frame from ``env.render(mode='rgb_array')`` is converted to
    float32, divided by 255 (so 8-bit pixel values land in [0, 1]) and its
    colour channels are collapsed with the 0.299 / 0.587 / 0.114 weights.
    """
    channel_weights = [0.299, 0.587, 0.114]  # R, G, B contributions to gray
    rgb_frame = env.render(mode='rgb_array')
    scaled_frame = rgb_frame.astype(np.float32) / 255.0
    return np.dot(scaled_frame, channel_weights)

In [4]:
# A class for the policy network. Here, we will use a convolutional neural network
# that will take an entire screen of game state and suggest an action from that.
class PolicyNetwork(nn.Module):
    """Convolutional policy that maps a game frame to action probabilities.

    Two strided convolutions are followed by two fully connected layers; the
    final layer's output is passed through a softmax over the action axis.
    """

    def __init__(self, input_channels=1, input_height=210, input_width=160, output_size=6):
        """Build the layers, sizing the first dense layer from the input dimensions"""
        super().__init__()
        self.conv1 = nn.Conv2d(input_channels, 16, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=4, stride=2)
        # Can try using pooling layers here

        # Follow each spatial dimension through both convolutions to learn how
        # many features reach the first fully connected layer.
        conv_height, conv_width = input_height, input_width
        for kernel_size, stride in ((8, 4), (4, 2)):
            conv_height = self.conv2d_size_out(conv_height, kernel_size, stride)
            conv_width = self.conv2d_size_out(conv_width, kernel_size, stride)
        flat_size = 32 * conv_height * conv_width

        self.fc1 = nn.Linear(flat_size, 256)
        self.fc2 = nn.Linear(256, output_size)

    def conv2d_size_out(self, size, kernel_size, stride):
        """Length of one dimension after an unpadded, undilated convolution"""
        return (size - kernel_size) // stride + 1

    def forward(self, x):
        """Run the network on a batch and return one probability row per sample"""
        features = F.relu(self.conv2(F.relu(self.conv1(x))))
        # Flatten everything except the batch axis before the dense layers
        flattened = features.view(features.size(0), -1)
        hidden = F.relu(self.fc1(flattened))
        return F.softmax(self.fc2(hidden), dim=1)

In [5]:
# A class for the agent
class Agent:
    """Policy-gradient agent that owns a PolicyNetwork and its Adam optimizer.

    Actions are sampled from the network's softmax output, and ``update``
    takes one gradient step on the return-weighted negative log-probabilities
    of the actions that were taken.
    """

    def __init__(self, learning_rate=0.001, n_actions=6):
        """Initialize agent

        Args:
            learning_rate: step size handed to the Adam optimizer.
            n_actions: number of discrete actions; sizes the policy output
                and the range that actions are sampled from.
        """
        self.learning_rate = learning_rate
        # The attribute was originally spelled with a capital "R"; keep that
        # name as an alias so any code reading it keeps working.
        self.learning_Rate = learning_rate
        self.n_actions = n_actions
        # Size the output layer from n_actions so the probability vector always
        # has exactly as many entries as get_action samples from.
        self.policy = PolicyNetwork(output_size=n_actions).to(device)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=learning_rate)

    def _policy_device(self):
        """Return the device the policy's parameters currently live on"""
        return next(self.policy.parameters()).device

    def _state_to_tensor(self, state):
        """Convert a numpy state into a float tensor with two leading axes added"""
        tensor = torch.from_numpy(state).float().unsqueeze(0).unsqueeze(0)
        return tensor.to(self._policy_device())

    def get_action(self, state):
        """Sample an action for ``state``.

        Returns:
            (action, log_prob): the sampled action index and the log of the
            probability the policy assigned to it. ``log_prob`` stays attached
            to the autograd graph so ``update`` can backpropagate through it.
        """
        probs = self.policy(self._state_to_tensor(state)).squeeze(0)
        action = np.random.choice(self.n_actions, p=probs.detach().cpu().numpy())
        log_prob = torch.log(probs[action])
        return action, log_prob

    def get_probs(self, state):
        """Return the policy's action probabilities for ``state`` (batch of one)"""
        return self.policy(self._state_to_tensor(state))

    def update(self, all_rewards, all_log_probs, discount_rate=0.9):
        """Take one policy-gradient step from a batch of episodes.

        Args:
            all_rewards: one sequence of per-step rewards per episode.
            all_log_probs: one sequence of log-probability tensors per episode,
                aligned step-for-step with ``all_rewards``.
            discount_rate: factor applied to future rewards when computing
                each step's discounted return.

        Does nothing when no steps were collected.
        """
        flat_log_probs = [lp for log_probs in all_log_probs for lp in log_probs]
        if not flat_log_probs:
            # torch.stack cannot stack an empty list and there is nothing to learn from
            return

        # Compute discounted returns, walking each episode backwards
        flat_discounted_rewards = []
        for rewards in all_rewards:
            running_return = 0.0
            discounted = [0.0] * len(rewards)
            for i in reversed(range(len(rewards))):
                running_return = rewards[i] + discount_rate * running_return
                discounted[i] = running_return
            flat_discounted_rewards.extend(discounted)

        target_device = self._policy_device()
        # Force a floating dtype: integer rewards would otherwise build an
        # integer tensor, on which mean()/std() are not defined.
        discounted_rewards = torch.tensor(
            flat_discounted_rewards, dtype=torch.float32, device=target_device)
        log_probs = torch.stack(flat_log_probs).to(target_device)

        # Normalize rewards, this should speed up training
        centered = discounted_rewards - discounted_rewards.mean()
        if discounted_rewards.numel() > 1:
            centered = centered / (discounted_rewards.std() + 1e-9)
        # With a single step std() is NaN; the centered value (zero) is used
        # as-is so the parameters are not overwritten with NaNs.
        discounted_rewards = centered

        # Reset parameter gradients
        self.optimizer.zero_grad()

        # Compute "loss" function
        loss = torch.mul(discounted_rewards, -log_probs).sum()

        # Perform backprop
        loss.backward()

        # Optimize
        self.optimizer.step()

    def save_parameters(self, path):
        """Save the policy's state dict to ``path``, creating parent directories"""
        if isinstance(path, (str, os.PathLike)):
            directory = os.path.dirname(os.fspath(path))
            if directory:
                # torch.save does not create missing directories on its own
                os.makedirs(directory, exist_ok=True)
        torch.save(self.policy.state_dict(), path)

    def load_parameters(self, path):
        """Load a state dict saved by ``save_parameters`` into the policy.

        Tensors are mapped onto the policy's current device, so a checkpoint
        written on a GPU machine can be restored on a CPU-only one.
        """
        # NOTE: torch.load unpickles the file - only load checkpoints you trust.
        state_dict = torch.load(path, map_location=self._policy_device())
        self.policy.load_state_dict(state_dict)

In [6]:
# A function to simulate an episode
def simulate(env, agent, render=False, fps=30, max_steps=10000, detailed=False):
    """Play one episode with the agent's current policy and report its return.

    Args:
        env: environment to play in; it is reset at the start of the call.
        agent: object providing ``get_action`` (and ``get_probs`` when
            ``detailed`` is set).
        render: when True, draw every step and sleep ``1 / fps`` seconds.
        fps: rendering speed used when ``render`` is True.
        max_steps: upper bound on the number of environment steps.
        detailed: when True, print the action probabilities before each step.

    Returns:
        The sum of the rewards received during the episode.
    """
    frame_delay = 1 / fps
    episode_return = 0
    env.reset()
    current_frame = get_image(env)
    # The agent observes the difference between consecutive frames, so the
    # very first state is an all-zero image.
    state = current_frame - current_frame

    for _step in range(max_steps):
        if render:
            env.render()
            time.sleep(frame_delay)

        if detailed:
            print(agent.get_probs(state))

        # The log-probability is only needed for training and is ignored here
        action, _log_prob = agent.get_action(state)
        _, reward, done, _ = env.step(action)
        episode_return += reward

        if done:
            env.close()
            break

        previous_frame = current_frame
        current_frame = get_image(env)
        state = current_frame - previous_frame

    print("Simulation complete - total reward:", episode_return)
    return episode_return

In [7]:
# Agent hyperparameters
learning_rate = 0.0001

# Initialize the agent with the action count reported by the environment
# instead of silently relying on Agent's built-in default
agent = Agent(learning_rate, n_actions)

# Autosave settings
save_parameters = True                  # write checkpoints during training
save_interval = 10                      # epochs between checkpoints
save_path = 'models/pg_pong.pth'        # checkpoint file
load_parameters_before_training = True  # resume from save_path when it exists

# Resume from an earlier checkpoint if one is available
if load_parameters_before_training and os.path.exists(save_path):
    agent.load_parameters(save_path)
    print("Parameters loaded successfully")

Parameters loaded successfully


In [8]:
# Training hyperparameters
n_epochs = 10  # number of policy updates (one agent.update call per epoch)
episodes_per_epoch = 1  # episodes collected before each update
max_steps = 10000  # upper bound on environment steps per episode
discount_rate = 0.99  # discount factor passed to agent.update

# Option to show the agent in training
show_simulation = False  # when True, play a rendered episode during training
epoch_per_simulation = 25  # epochs between rendered episodes

# Training loop: every epoch collects `episodes_per_epoch` episodes with the
# current policy and then performs a single policy update on all of them.
for epoch in range(n_epochs):
    all_rewards = []    # one list of per-step rewards per episode
    all_log_probs = []  # log-probabilities of the sampled actions, same layout
    total_reward = 0

    for episode in range(episodes_per_epoch):
        rewards = []
        log_probs = []

        env.reset()
        image = get_image(env)
        prev_image = image
        # The policy observes the difference between consecutive frames, so
        # the first state of an episode is an all-zero image.
        state = image - prev_image
        for step in range(max_steps):
            action, log_prob = agent.get_action(state)
            _, reward, done, _ = env.step(action)
            rewards.append(reward)
            log_probs.append(log_prob)
            total_reward += reward

            if done:
                break
            else:
                prev_image = image
                image = get_image(env)
                state = image - prev_image

        all_rewards.append(rewards)
        all_log_probs.append(log_probs)

    # Update the policy
    agent.update(all_rewards, all_log_probs, discount_rate)

    # Track average rewards in TensorBoard
    writer.add_scalar('average_reward', total_reward / episodes_per_epoch, epoch)

    # Save model parameters every `save_interval` epochs and always after the
    # final epoch; otherwise the updates made since the last periodic save
    # (with n_epochs <= save_interval: everything after epoch 0) would be lost.
    is_last_epoch = epoch == n_epochs - 1
    if save_parameters and (epoch % save_interval == 0 or is_last_epoch):
        agent.save_parameters(save_path)

    # Simulate agent (optional)
    if show_simulation and epoch % epoch_per_simulation == 0:
        simulate(env, agent, render=True)

In [10]:
# Play one rendered episode with the current policy; detailed=True also prints
# the action probabilities before every step, which produces a lot of output
simulate(env, agent, render=True, detailed=True)

tensor([[5.6676e-06, 3.5975e-04, 7.1137e-01, 2.6313e-03, 3.1439e-02, 2.5419e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[9.2033e-07, 1.8681e-04, 7.0845e-01, 7.1929e-04, 1.4705e-02, 2.7594e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.6688e-06, 1.5291e-04, 9.0183e-01, 9.4094e-04, 2.3649e-02, 7.3424e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.0506e-06, 9.3437e-05, 9.5804e-01, 4.4340e-04, 1.8542e-02, 2.2878e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.9157e-06, 2.7490e-04, 7.1588e-01, 2.2243e-03, 2.9341e-02, 2.5228e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.2044e-06, 2.3992e-04, 1.3115e-01, 3.3209e-03, 1.1969e-02, 8.5332e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[5.4815e-06, 3.5456e-04, 7.0629e-01, 2.6227e-03, 3.1142e-02, 2.5958e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[5.4690e-06, 3.5145e-04, 7.1337e-01, 2.5807e-03, 3.114

tensor([[4.7817e-06, 3.4494e-04, 6.4761e-01, 2.7841e-03, 2.9729e-02, 3.1952e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.0390e-06, 3.2179e-04, 5.9823e-01, 2.8064e-03, 2.7935e-02, 3.7071e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.9735e-06, 2.8038e-04, 7.5230e-01, 2.0589e-03, 2.8540e-02, 2.1681e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.0763e-06, 2.8679e-04, 7.4059e-01, 2.1400e-03, 2.8893e-02, 2.2808e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.3179e-06, 2.4383e-04, 1.5011e-01, 3.3464e-03, 1.3179e-02, 8.3312e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.9850e-06, 3.6397e-04, 5.7612e-01, 3.1250e-03, 2.9491e-02, 3.9090e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.5403e-06, 3.1998e-04, 3.3828e-01, 3.4036e-03, 2.2149e-02, 6.3585e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.6890e-06, 1.4273e-04, 9.2467e-01, 7.3334e-04, 2.313

tensor([[2.5994e-06, 1.8356e-04, 8.6195e-01, 1.1975e-03, 2.4384e-02, 1.1229e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.2606e-06, 2.2537e-04, 8.2099e-01, 1.5494e-03, 2.6967e-02, 1.5026e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.4145e-06, 2.5948e-04, 7.4302e-01, 1.9736e-03, 2.8060e-02, 2.2668e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.8842e-06, 2.7338e-04, 7.5319e-01, 1.9651e-03, 2.8861e-02, 2.1570e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.6078e-06, 3.0210e-04, 7.4549e-01, 2.2083e-03, 2.9830e-02, 2.2217e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.2672e-06, 1.8579e-04, 8.8150e-01, 1.1674e-03, 2.5602e-02, 9.1544e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.0064e-07, 1.3255e-05, 9.9301e-01, 3.7212e-05, 5.8624e-03, 1.0726e-03]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.3113e-06, 3.0741e-04, 6.7228e-01, 2.5441e-03, 2.961

tensor([[3.0679e-06, 3.2273e-04, 1.8429e-01, 3.7883e-03, 1.5309e-02, 7.9628e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.5282e-06, 3.1381e-04, 3.2449e-01, 3.5650e-03, 2.1572e-02, 6.5005e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.6616e-06, 2.7990e-04, 6.4792e-01, 2.4474e-03, 2.8613e-02, 3.2073e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.9600e-06, 2.9090e-04, 6.9572e-01, 2.3942e-03, 2.8858e-02, 2.7273e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.6866e-06, 2.4844e-04, 7.8683e-01, 1.8450e-03, 2.8384e-02, 1.8269e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[5.3384e-06, 3.1612e-04, 7.7336e-01, 2.2161e-03, 3.1079e-02, 1.9303e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.4348e-06, 1.2893e-04, 9.2539e-01, 7.2928e-04, 2.2807e-02, 5.0939e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.1300e-06, 1.1492e-04, 9.3201e-01, 6.4023e-04, 2.117

tensor([[3.9340e-07, 7.3990e-05, 1.0564e-02, 1.7117e-03, 1.8828e-03, 9.8577e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.7583e-06, 3.9167e-04, 4.6972e-01, 3.5755e-03, 2.6645e-02, 4.9967e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.2694e-06, 3.7472e-04, 4.3035e-01, 3.5948e-03, 2.4841e-02, 5.4084e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[1.4923e-06, 1.9134e-04, 6.7670e-02, 3.0088e-03, 7.3931e-03, 9.2173e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[1.1975e-06, 1.6865e-04, 5.0108e-02, 2.8488e-03, 5.8200e-03, 9.4105e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.1090e-06, 2.4673e-04, 1.0084e-01, 3.5575e-03, 9.9280e-03, 8.8543e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.1826e-06, 2.5945e-04, 1.2851e-01, 3.5136e-03, 1.1308e-02, 8.5641e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.9601e-06, 3.4713e-04, 4.9327e-01, 3.2335e-03, 2.569

tensor([[1.3409e-06, 1.7681e-04, 5.2540e-02, 3.0201e-03, 6.2034e-03, 9.3806e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.4330e-06, 3.5841e-04, 4.0399e-01, 3.4543e-03, 2.2020e-02, 5.7017e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.8991e-06, 3.1108e-04, 2.4504e-01, 3.7247e-03, 1.7128e-02, 7.3380e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.8419e-06, 3.5800e-04, 4.3141e-01, 3.5684e-03, 2.4034e-02, 5.4063e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.4306e-06, 3.5202e-04, 5.3956e-01, 3.2571e-03, 2.7761e-02, 4.2906e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.9724e-06, 3.6357e-04, 3.6126e-01, 3.7037e-03, 2.2929e-02, 6.1174e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.3482e-06, 2.0180e-04, 8.6350e-01, 1.3522e-03, 2.6443e-02, 1.0850e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[5.0078e-06, 3.7486e-04, 5.7321e-01, 3.1962e-03, 2.922

tensor([[4.7039e-06, 3.6772e-04, 4.6716e-01, 3.5074e-03, 2.6932e-02, 5.0203e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.2786e-06, 3.2373e-04, 2.7117e-01, 3.6863e-03, 1.9269e-02, 7.0554e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.6255e-06, 3.7085e-04, 4.3709e-01, 3.5985e-03, 2.6019e-02, 5.3292e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.8349e-06, 2.8212e-04, 2.9272e-01, 3.2870e-03, 1.9289e-02, 6.8442e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.4048e-06, 2.7964e-04, 1.9997e-01, 3.3336e-03, 1.5023e-02, 7.8139e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[1.8691e-06, 2.4638e-04, 1.7681e-01, 3.1636e-03, 1.2907e-02, 8.0687e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.6758e-06, 2.9412e-04, 1.7556e-01, 3.7018e-03, 1.4209e-02, 8.0623e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[1.6516e-06, 2.2143e-04, 1.1803e-01, 3.1253e-03, 1.012

tensor([[2.1871e-06, 2.3912e-04, 1.6178e-01, 3.2619e-03, 1.3371e-02, 8.2134e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.7450e-06, 2.2644e-04, 8.5449e-01, 1.4207e-03, 2.7074e-02, 1.1679e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.4015e-06, 2.8942e-04, 7.7160e-01, 2.0676e-03, 2.9355e-02, 1.9668e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[5.2001e-06, 3.6039e-04, 6.3851e-01, 2.9147e-03, 3.0400e-02, 3.2781e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[5.4387e-06, 3.6306e-04, 6.7042e-01, 2.8061e-03, 3.0974e-02, 2.9543e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.8173e-06, 3.2696e-04, 7.0909e-01, 2.4741e-03, 3.0102e-02, 2.5800e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.6351e-06, 2.3989e-04, 8.1471e-01, 1.6441e-03, 2.7338e-02, 1.5607e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.3254e-06, 1.1320e-04, 9.5340e-01, 5.1634e-04, 1.925

tensor([[3.3306e-06, 2.1719e-04, 8.2899e-01, 1.5395e-03, 2.7323e-02, 1.4193e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.5964e-06, 1.9184e-04, 8.8713e-01, 1.1835e-03, 2.6448e-02, 8.5047e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.8385e-06, 2.5547e-04, 8.0576e-01, 1.7347e-03, 2.7935e-02, 1.6431e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.3129e-06, 2.4040e-04, 8.5229e-01, 1.5076e-03, 2.8530e-02, 1.1743e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.3725e-06, 1.3906e-04, 9.1483e-01, 7.9443e-04, 2.3022e-02, 6.1215e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.1509e-06, 1.1467e-04, 9.3864e-01, 5.8753e-04, 2.1286e-02, 3.9374e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.1667e-06, 1.0003e-04, 9.5466e-01, 4.4706e-04, 1.9749e-02, 2.5045e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[1.5463e-06, 8.3024e-05, 9.3194e-01, 6.4418e-04, 2.055

tensor([[2.2148e-06, 2.7120e-04, 2.4244e-01, 3.2139e-03, 1.5867e-02, 7.3821e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.6685e-06, 2.8614e-04, 4.3748e-01, 2.9313e-03, 2.2234e-02, 5.3706e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.5188e-06, 2.7104e-04, 2.6137e-01, 3.4077e-03, 1.7460e-02, 7.1749e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.8093e-06, 2.7787e-04, 2.4317e-01, 3.7366e-03, 1.7658e-02, 7.3516e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.9687e-06, 2.2687e-04, 8.6801e-01, 1.3846e-03, 2.7132e-02, 1.0324e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[1.4784e-06, 7.0755e-05, 9.6388e-01, 3.7724e-04, 1.6670e-02, 1.8998e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.6750e-06, 1.6689e-04, 9.0111e-01, 9.9415e-04, 2.3303e-02, 7.4423e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[8.1355e-07, 4.0005e-05, 9.8127e-01, 1.6239e-04, 1.160

tensor([[3.8799e-06, 3.5311e-04, 4.6157e-01, 3.2754e-03, 2.5076e-02, 5.0972e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.9430e-06, 2.9741e-04, 3.7119e-01, 3.1216e-03, 2.1499e-02, 6.0388e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.9495e-06, 3.8849e-04, 4.9706e-01, 3.4859e-03, 2.7857e-02, 4.7120e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.0903e-06, 3.4144e-04, 4.0317e-01, 3.5703e-03, 2.4909e-02, 5.6800e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[6.9108e-07, 9.7711e-05, 2.5517e-02, 2.2194e-03, 3.7291e-03, 9.6844e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.6278e-06, 2.7238e-04, 7.8680e-01, 2.0583e-03, 3.0398e-02, 1.8046e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.0110e-06, 2.0932e-04, 8.7740e-01, 1.3043e-03, 2.7575e-02, 9.3508e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.8538e-06, 1.9314e-04, 8.9823e-01, 1.1054e-03, 2.644

tensor([[3.5881e-06, 3.5435e-04, 2.4306e-01, 3.8879e-03, 1.8301e-02, 7.3440e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.9699e-06, 3.1125e-04, 2.0749e-01, 3.7037e-03, 1.6141e-02, 7.7235e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.0563e-06, 3.7453e-04, 3.1502e-01, 3.9027e-03, 2.1471e-02, 6.5923e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.5677e-06, 3.8981e-04, 4.1843e-01, 3.7446e-03, 2.5043e-02, 5.5239e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.8360e-06, 3.0051e-04, 1.8426e-01, 3.7240e-03, 1.4888e-02, 7.9682e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[1.8923e-06, 2.3361e-04, 9.2470e-02, 3.4063e-03, 9.1421e-03, 8.9475e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.0756e-06, 2.5229e-04, 1.6371e-01, 3.3606e-03, 1.2816e-02, 8.1986e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.7362e-06, 2.9260e-04, 2.9400e-01, 3.3715e-03, 1.839

tensor([[1.8194e-06, 9.8903e-05, 9.5382e-01, 4.8655e-04, 1.7990e-02, 2.7606e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.9698e-06, 2.4698e-04, 7.2937e-01, 1.9320e-03, 2.6713e-02, 2.4173e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.8244e-06, 2.3084e-04, 7.4902e-01, 1.8015e-03, 2.6741e-02, 2.2221e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[1.6632e-06, 9.7394e-05, 9.4984e-01, 4.6614e-04, 1.8310e-02, 3.1282e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[1.5682e-06, 8.2499e-05, 9.6003e-01, 3.9249e-04, 1.6923e-02, 2.2569e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[1.2413e-06, 6.0829e-05, 9.7141e-01, 2.6644e-04, 1.4740e-02, 1.3525e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[1.5132e-06, 7.7814e-05, 9.6408e-01, 3.5907e-04, 1.6420e-02, 1.9061e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.3828e-06, 3.1527e-04, 4.1501e-01, 3.3521e-03, 2.348

tensor([[3.9789e-06, 2.2775e-04, 8.6747e-01, 1.3679e-03, 2.7213e-02, 1.0371e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.7485e-06, 2.7675e-04, 7.3148e-01, 2.0783e-03, 2.8165e-02, 2.3799e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.8527e-06, 1.8707e-04, 8.7618e-01, 1.1453e-03, 2.4617e-02, 9.7868e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[5.1320e-06, 3.3398e-04, 7.2801e-01, 2.4280e-03, 3.0660e-02, 2.3856e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.7982e-06, 3.1643e-04, 7.4739e-01, 2.2463e-03, 3.0024e-02, 2.2002e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.8784e-06, 3.4017e-04, 6.8471e-01, 2.6069e-03, 2.9972e-02, 2.8236e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.6889e-06, 3.7313e-04, 5.3499e-01, 3.2629e-03, 2.7988e-02, 4.3339e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.5225e-06, 2.5648e-04, 7.7271e-01, 1.8488e-03, 2.745

tensor([[4.7223e-06, 3.2964e-04, 6.9392e-01, 2.5350e-03, 2.9881e-02, 2.7332e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.7901e-06, 3.2390e-04, 7.2352e-01, 2.3851e-03, 3.0002e-02, 2.4376e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[5.3207e-06, 3.5701e-04, 6.8182e-01, 2.7224e-03, 3.0768e-02, 2.8432e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[5.5707e-06, 3.5443e-04, 7.1695e-01, 2.5764e-03, 3.1283e-02, 2.4883e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[5.1423e-06, 3.3694e-04, 7.1939e-01, 2.4772e-03, 3.0615e-02, 2.4718e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[5.6624e-06, 3.5969e-04, 7.1075e-01, 2.6323e-03, 3.1423e-02, 2.5483e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[5.6883e-06, 3.6057e-04, 7.1108e-01, 2.6362e-03, 3.1468e-02, 2.5445e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[5.6676e-06, 3.5975e-04, 7.1137e-01, 2.6313e-03, 3.143

tensor([[4.1109e-06, 2.7461e-04, 7.8173e-01, 1.9134e-03, 2.8858e-02, 1.8722e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.1017e-06, 3.5846e-04, 3.0782e-01, 3.8452e-03, 2.1570e-02, 6.6640e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.4455e-06, 2.8461e-04, 7.8751e-01, 1.9462e-03, 2.9366e-02, 1.8089e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.0687e-06, 2.5532e-04, 8.1846e-01, 1.6838e-03, 2.8428e-02, 1.5117e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.1063e-06, 2.4730e-04, 3.3402e-01, 3.6933e-03, 2.2719e-02, 6.3932e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.1752e-06, 3.4358e-04, 4.3232e-01, 3.4425e-03, 2.5471e-02, 5.3842e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.5911e-06, 2.5769e-04, 8.3838e-01, 1.6488e-03, 2.9332e-02, 1.3038e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.2078e-06, 1.0898e-04, 9.4312e-01, 5.8649e-04, 2.096

tensor([[4.4144e-06, 2.4556e-04, 8.5464e-01, 1.5015e-03, 2.8523e-02, 1.1509e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.9941e-06, 2.1347e-04, 8.8234e-01, 1.2474e-03, 2.7064e-02, 8.9135e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.6871e-06, 1.3281e-04, 9.3640e-01, 6.8452e-04, 2.2048e-02, 4.0732e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.0887e-06, 2.5343e-04, 8.2344e-01, 1.6203e-03, 2.8418e-02, 1.4626e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.5097e-06, 2.8205e-04, 7.8907e-01, 1.9450e-03, 2.9590e-02, 1.7911e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.6333e-06, 3.1953e-04, 2.5564e-01, 3.7669e-03, 1.9589e-02, 7.2069e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.1827e-06, 2.9807e-04, 2.0141e-01, 3.6609e-03, 1.6677e-02, 7.7795e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.6814e-06, 2.6509e-04, 1.7990e-01, 3.4852e-03, 1.510

tensor([[2.1209e-06, 2.5786e-04, 1.3784e-01, 3.3358e-03, 1.1937e-02, 8.4663e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.9719e-06, 3.0764e-04, 3.6744e-01, 3.2091e-03, 2.1309e-02, 6.0773e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.6741e-06, 3.7859e-04, 4.9663e-01, 3.4283e-03, 2.7121e-02, 4.7244e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.7139e-06, 2.8190e-04, 1.9905e-01, 3.5909e-03, 1.5407e-02, 7.8167e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[1.5563e-06, 2.0796e-04, 8.4887e-02, 3.1710e-03, 8.2464e-03, 9.0349e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[1.4831e-06, 1.7479e-04, 8.4934e-02, 2.9975e-03, 8.4711e-03, 9.0342e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.8573e-06, 2.9293e-04, 2.9050e-01, 3.4408e-03, 1.8666e-02, 6.8710e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[4.4962e-06, 3.8148e-04, 4.2827e-01, 3.6503e-03, 2.523

tensor([[1.6112e-06, 7.2623e-05, 9.6782e-01, 3.2193e-04, 1.6324e-02, 1.5464e-02]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[2.6411e-06, 1.8765e-04, 8.1912e-01, 1.4554e-03, 2.5961e-02, 1.5327e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[3.1098e-06, 3.0258e-04, 2.6328e-01, 3.5380e-03, 1.8688e-02, 7.1419e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[5.3102e-06, 3.5266e-04, 6.8765e-01, 2.6905e-03, 3.0841e-02, 2.7846e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[5.2080e-06, 3.4276e-04, 7.1060e-01, 2.5450e-03, 3.0730e-02, 2.5578e-01]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
Simulation complete - total reward: -21.0


-21.0

In [11]:
# Close the TensorBoard SummaryWriter created in the setup cell
writer.close()