In [1]:
import gym
import math
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import torch.optim as optim
!pip install gym[atari]



In [2]:
# --- Network layout -------------------------------------------------------
i = 80 ** 2  # flattened size of one 80x80 input grid
h = 200      # passed to the policy network as its hidden size
o = 1        # passed to the policy network as its output size

# --- Training hyperparameters ---------------------------------------------
batch_size = 10       # number of episodes between parameter updates
learning_rate = 2e-4  # step size handed to the optimizer
gamma = 0.99          # reward discount factor
decay_rate = 0.99     # RMSProp decay for the leaky sum of squared gradients

# --- Run-time switches ----------------------------------------------------
resume = False  # continue from an earlier checkpoint?
render = False  # draw the environment while playing?

In [3]:
class PolicyNN(nn.Module):
    """Two-layer policy network producing a Bernoulli action probability.

    The network maps a flat input vector through ``Linear -> ReLU -> Linear
    -> sigmoid``.  With ``output_size == 1`` the output ``p`` is read by
    :meth:`act` as the probability of choosing action ``2``; action ``3`` is
    chosen with probability ``1 - p``.

    Args:
        input_size: Number of input features.
        hidden_size: Width of the hidden layer.
        output_size: Number of output units (``act`` requires this to be 1).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return sigmoid-squashed outputs in ``(0, 1)`` for input batch ``x``."""
        x = F.relu(self.fc1(x))
        # torch.sigmoid replaces the deprecated F.sigmoid; same result.
        return torch.sigmoid(self.fc2(x))

    def act(self, state):
        """Sample an action for one state and return its log-probability.

        Args:
            state: 1-D NumPy array of length ``input_size``.

        Returns:
            ``(action, log_prob)`` where ``action`` is ``2`` or ``3`` and
            ``log_prob`` is a CPU tensor of shape ``(1, 1)`` that is attached
            to the autograd graph.
        """
        # Use the device the parameters actually live on instead of relying on
        # a module-level ``device`` global being defined before the first call.
        device = next(self.parameters()).device
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        p = self.forward(state).cpu()

        # The log-probability must belong to the action that was sampled:
        # action 2 has probability p, action 3 has probability 1 - p.
        # Returning log(p) for both pushes the gradient the wrong way whenever
        # action 3 is taken.
        if np.random.uniform() < p.item():
            return 2, torch.log(p)
        return 3, torch.log1p(-p)

In [4]:
def preprocess_state(state):
    """Convert a raw frame into a flat binary float vector.

    Rows 35..194 are kept, rows and columns are subsampled by a factor of 2,
    and only channel 0 is used.  Pixels equal to 144, 109 or 0 become 0.0;
    every other pixel becomes 1.0.

    Args:
        state: Array indexable as ``state[row, col, channel]``
            (a 210x160x3 frame yields 80*80 = 6400 outputs).

    Returns:
        1-D ``float64`` array.  ``state`` itself is left unmodified.
    """
    # Single strided slice: crop + 2x downsample + channel select.
    img = state[35:195:2, ::2, 0]
    # Build the mask out-of-place; assigning into ``img`` would write through
    # the view and corrupt the caller's observation array.
    foreground = (img != 144) & (img != 109) & (img != 0)
    # ``np.float`` was removed in NumPy 1.24; it was an alias of the builtin
    # float, i.e. float64.
    return foreground.astype(np.float64).ravel()

In [5]:
def play_pong(n_episodes=1000, gamma=0.9, print_every=100, render=False):
    """Run REINFORCE with one parameter update after every episode.

    Uses the module-level ``env``, ``agent``, ``optimizer`` and
    ``preprocess_state``.  The policy input at each step is the difference
    between the current and the previous preprocessed frame (all zeros on the
    first step of an episode).

    Args:
        n_episodes: Number of episodes to play.
        gamma: Discount factor used for the episode return.
        print_every: Print the running average score every this many episodes.
        render: Call ``env.render()`` on every step when True.

    Returns:
        List with the undiscounted total reward of every episode.
    """
    scores_deque = deque(maxlen=100)
    scores = []

    for i_episode in range(1, n_episodes + 1):
        saved_log_probs = []
        rewards = []
        observation = env.reset()
        prev_x = None

        while True:
            if render:
                env.render()

            cur_x = preprocess_state(observation)
            if prev_x is None:
                state = np.zeros_like(cur_x)
            else:
                state = cur_x - prev_x
            # Remember this frame; without this the difference is never
            # computed and the policy only ever sees zeros.
            prev_x = cur_x

            action, log_prob = agent.act(state)
            saved_log_probs.append(log_prob)

            # The new frame must land in ``observation`` so that the next
            # iteration preprocesses it (it was previously stored in ``state``
            # and discarded).
            observation, reward, done, _ = env.step(action)
            rewards.append(reward)
            if done:
                break

        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        # Discounted return of the whole episode, shared by all of its steps.
        R = sum(gamma ** t * r for t, r in enumerate(rewards))

        policy_loss = torch.cat([-log_prob * R for log_prob in saved_log_probs]).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))

    return scores

In [6]:
def play_pong_batch(n_episodes=1000, gamma=0.9, print_every=100, render=False, b = batch_size):
    """Run REINFORCE with one parameter update every ``b`` episodes.

    Uses the module-level ``env``, ``agent``, ``optimizer`` and
    ``preprocess_state``.  The policy input at each step is the difference
    between the current and the previous preprocessed frame (all zeros on the
    first step of an episode).  Each episode contributes
    ``-sum(log_probs) * R`` to the batch loss, where ``R`` is that episode's
    own discounted return.

    Args:
        n_episodes: Number of episodes to play.
        gamma: Discount factor used for each episode return.
        print_every: Print the running average score every this many episodes.
        render: Call ``env.render()`` on every step when True.
        b: Number of episodes accumulated per optimizer step.

    Returns:
        List with the undiscounted total reward of every episode.
    """
    scores_deque = deque(maxlen=100)
    scores = []

    # One scalar loss tensor per episode collected since the last update.
    batch_losses = []

    for i_episode in range(1, n_episodes + 1):
        observation = env.reset()
        prev_x = None
        ep_log_probs = []
        ep_rewards = []

        while True:
            if render:
                env.render()

            cur_x = preprocess_state(observation)
            if prev_x is None:
                state = np.zeros_like(cur_x)
            else:
                state = cur_x - prev_x
            # Remember this frame; without this the difference is never
            # computed and the policy only ever sees zeros.
            prev_x = cur_x

            action, log_prob = agent.act(state)
            ep_log_probs.append(log_prob)

            # The new frame must land in ``observation`` so that the next
            # iteration preprocesses it (it was previously stored in ``state``
            # and discarded).
            observation, reward, done, _ = env.step(action)
            ep_rewards.append(reward)
            if done:
                break

        episode_score = sum(ep_rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        # Discount within the episode only.  Letting the exponent run across
        # the concatenated rewards of the whole batch makes every episode but
        # the first contribute ~0 and ties all steps to a single return.
        R = sum(gamma ** t * r for t, r in enumerate(ep_rewards))
        batch_losses.append(-torch.cat(ep_log_probs).sum() * float(R))

        # Update on every b-th episode, and once more at the very end so a
        # trailing partial batch is not silently discarded.
        if i_episode % b == 0 or i_episode == n_episodes:
            policy_loss = torch.stack(batch_losses).sum()

            optimizer.zero_grad()
            policy_loss.backward()
            optimizer.step()
            batch_losses = []

        # Progress reporting is governed by print_every, independently of b.
        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))

    return scores

In [None]:
# Prefer the first CUDA device when torch can see one; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print('Training on {}.'.format(device))

# Policy network sized from the config cell, the Pong environment, and an
# Adam optimizer over the network's parameters.
agent = PolicyNN(input_size=i, hidden_size=h, output_size=o).to(device)
env = gym.make("Pong-v0")
optimizer = optim.Adam(agent.parameters(), lr=learning_rate)

# Batched training run; per-episode scores are kept for later inspection.
scores = play_pong_batch(
    n_episodes=5000,
    gamma=gamma,
    print_every=10,
    render=False,
    b=batch_size,
)

Training on cuda:0.




Episode 10	Average Score: -20.20
Episode 20	Average Score: -20.40
Episode 30	Average Score: -20.53
Episode 40	Average Score: -20.50
Episode 50	Average Score: -20.46
Episode 60	Average Score: -20.45
Episode 70	Average Score: -20.39
Episode 80	Average Score: -20.39
Episode 90	Average Score: -20.41
Episode 100	Average Score: -20.42
Episode 110	Average Score: -20.46
Episode 120	Average Score: -20.45
Episode 130	Average Score: -20.39
Episode 140	Average Score: -20.45
Episode 150	Average Score: -20.43
Episode 160	Average Score: -20.37
Episode 170	Average Score: -20.43
Episode 180	Average Score: -20.43
Episode 190	Average Score: -20.40
Episode 200	Average Score: -20.39
Episode 210	Average Score: -20.37
Episode 220	Average Score: -20.32
Episode 230	Average Score: -20.37
