In [1]:
import gym
import math
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import torch.optim as optim

In [2]:
i = 80 * 80 # input dimensionality: 80x80 grid
h = 200
o = 1

batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
gamma = 0.99 # discount factor for reward
resume = False # resume from previous checkpoint?
render = False

In [3]:
class PolicyNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(PolicyNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x#.cpu().data
    
    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        p = self.forward(state)
        action = 4 if np.random.uniform() < p else 5 
        return action, torch.log(p)

def preprocess_state(state):
    img = state[35:195,:,:]
    img = img[::2, ::2, 0]
    img[img==144] = 0
    img[img==109] = 0
    img[img!=0] = 1
    return img.astype(np.float).ravel()

# Batch GD
_________________________

In [39]:
def play_pong_minibatch(n_batches=30, gamma=0.9, print_every=100, render=False, b=32):
    scores_deque = deque(maxlen=b)
    scores = []
    
    bn=0
    while bn < n_batches:
        saved_log_probs = []
        rewards = []
        batch_saved_log_probs = []
        batch_rewards = []
        for i_episode in range(bn*b, (bn+1)*b):
            ep_rewards = []
            observation = env.reset()
            prev_input = None
            
            t=0####
            while True:
                if render:
                    env.render()
                cur_input = preprocess_state(observation)
                state = cur_input - prev_input if prev_input is not None else cur_input
                prev_input = cur_input

                action, log_prob = agent.act(state)
                saved_log_probs.append(log_prob)

                observation, reward, done, _ = env.step(action)
                rewards.append(reward*gamma**t)   ####
                ep_rewards.append(reward)
                t+=1    ###
                if done:
                    break

            scores_deque.append(sum(ep_rewards))
            scores.append(sum(ep_rewards))
            batch_saved_log_probs.append(sum(saved_log_probs))
            batch_rewards.append(sum(rewards))

        #discounts = [gamma**i for i in range(len(rewards)+1)]
        #R = sum([a*b for a,b in zip(discounts, rewards)])/b
        #nR = [a*b for a,b in zip(discounts, rewards)]
        #nR = (nR - np.mean(nR))/np.std(nR)

        policy_loss = []
        for l, r in zip(batch_saved_log_probs, batch_rewards):
            policy_loss.append(-l * r)
        #for log_prob in saved_log_probs:
        #    policy_loss.append(-log_prob * nR)
        policy_loss = torch.cat(policy_loss).sum()/b

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        print('Batch #{}/{}:\tAverage Score {:.2f}\tLoss {:.2f}'.format(bn+1,
                                                                        n_batches,
                                                                        np.mean(scores_deque),
                                                                       policy_loss))
        
        bn += 1
            
    return scores

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Training on {}.'.format(device))

agent = PolicyNN(i,h,o).to(device)
env = gym.make("PongDeterministic-v4")
optimizer = optim.Adam(agent.parameters(), lr=learning_rate)

scores = play_pong_minibatch(render=False, gamma=0.995, b=16)

Training on cuda:0.
Batch #1/30:	Average Score -19.56	Loss -206758.22


In [24]:
### step by step debug
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Training on {}.'.format(device))

agent = PolicyNN(i,h,o).to(device)
env = gym.make("PongDeterministic-v4")
optimizer = optim.Adam(agent.parameters(), lr=learning_rate)
scores_deque = deque(maxlen=100)
scores = []


Training on cuda:0.


In [25]:
### batch loop
saved_log_probs = []
rewards = []
batch_saved_log_probs = []
batch_rewards = []

In [32]:
### episode loop starts here
ep_rewards = []
observation = env.reset()
prev_input = None
l = len(saved_log_probs)
t = 0

In [33]:
while True:
    if render:
        env.render()
    cur_input = preprocess_state(observation)
    state = cur_input - prev_input if prev_input is not None else cur_input
    prev_input = cur_input

    action, log_prob = agent.act(state)
    saved_log_probs.append(log_prob)

    observation, reward, done, _ = env.step(action)
    rewards.append(reward*0.99**t)
    ep_rewards.append(reward)
    t+=1
    if done:
        print(t)
        break

scores_deque.append(sum(ep_rewards))
scores.append(sum(ep_rewards))
batch_saved_log_probs.append(sum(saved_log_probs))
batch_rewards.append(sum(rewards))



764


In [34]:
print('We played {} matches with scores {}.'.format(len(scores), scores))
[i for i in rewards if i>0]

print('L of log probs is {}, increase of {}.'.format(len(saved_log_probs), len(saved_log_probs)-l))
print(len(batch_saved_log_probs), len(batch_rewards), batch_saved_log_probs, batch_rewards)

We played 3 matches with scores [-19.0, -20.0, -21.0].
L of log probs is 2641, increase of 764.
3 3 [tensor([[-673.6816]], device='cuda:0', grad_fn=<AddBackward0>), tensor([[-1281.4662]], device='cuda:0', grad_fn=<AddBackward0>), tensor([[-1803.2837]], device='cuda:0', grad_fn=<AddBackward0>)] [-1.7123383686895763, -3.494628757364642, -5.283779504893986]


In [35]:
### loop here
#discounts = [gamma**i for i in range(len(rewards)+1)]
#R = [a*b for a,b in zip(discounts, rewards)]
#nR = [a*b for a,b in zip(discounts, rewards)]
#nR = (nR - np.mean(nR))/np.std(nR)
#print(len(discounts))
#print('Discounted R total {:.3f}, avg score {:.1f}'.format(R, np.mean(scores_deque)))
policy_loss = []
for l, r in zip(batch_saved_log_probs, batch_rewards):
    policy_loss.append(-l*r)
policy_loss

[tensor([[-1153.5708]], device='cuda:0', grad_fn=<MulBackward0>),
 tensor([[-4478.2485]], device='cuda:0', grad_fn=<MulBackward0>),
 tensor([[-9528.1533]], device='cuda:0', grad_fn=<MulBackward0>)]

In [38]:
torch.cat(policy_loss).sum()
policy_loss.backward()

AttributeError: 'list' object has no attribute 'backward'

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Training on {}.'.format(device))

agent = PolicyNN(i,h,o).to(device)
env = gym.make("PongDeterministic-v4")
optimizer = optim.Adam(agent.parameters(), lr=learning_rate)

scores = play_pong_minibatch(render=False, gamma=0.97)

In [None]:
32*30