## TODO: CUDA-fy, add "resume" functionality, punish no-ops less

#### Credit:
Largely adapted from Andrej Karpathy's pong playing agent and [this notebook](https://gist.github.com/ts1829/ebbe2cf946bf36951b724818c52e36b9#file-policy-gradient-with-cartpole-and-pytorch-medium-version-ipynb)

In [13]:
import argparse

In [14]:
cuda = torch.device('cuda')

In [16]:
device=None
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [1]:
"""
Simple Controller module that attempts to navigate to randomly generated target locations
 - Reward based on getting within 7 pixels of target location and having zero velocity
 - Use paddle velocity as input to the reinforcement learning algo as well
 """

import numpy as np
import pickle
import gym
import matplotlib.pyplot as plt
import sys
import time

import gym
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm, trange
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical

In [52]:
class Policy(nn.Module):
    def __init__(self, D_in, h1=128):
        super(Policy, self).__init__()
        #self.state_space = env.observation_space.shape[0]
        self.action_space = env.action_space.n
        
        self.l1 = nn.Linear(D_in, h1, bias=False)
        self.l2 = nn.Linear(h1, self.action_space, bias=False)
        
        self.gamma = gamma
        
        # Episode policy and reward history 
        self.policy_history = Variable(torch.Tensor()).to(device=device) 
        self.reward_episode = []
        # Overall reward and loss history
        self.reward_history = []
        self.loss_history = []

    def forward(self, x):    
        model = torch.nn.Sequential(
            self.l1,
            #nn.Dropout(p=0.6),
            nn.ReLU(),
            self.l2,
            nn.Softmax(dim=-1)
        )
        #print(x)
        #print(self.l1.weight)
        #print(self.l1.bias)
        #print()
        return model(x)

In [163]:
class Policy2(nn.Module):
    def __init__(self, D_in, h1=128, h2=64):
        super(Policy2, self).__init__()
        #self.state_space = env.observation_space.shape[0]
        self.action_space = env.action_space.n
        
        self.l1 = nn.Linear(D_in, h1, bias=False)
        self.l2 = nn.Linear(h1, h2, bias=False)
        self.l3 = nn.Linear(h2, self.action_space, bias=False)
        
        self.gamma = gamma
        
        # Episode policy and reward history 
        self.policy_history = Variable(torch.Tensor()).to(device=device) 
        self.reward_episode = []
        # Overall reward and loss history
        self.reward_history = []
        self.loss_history = []

    def forward(self, x):    
        model = torch.nn.Sequential(
            self.l1,
            nn.Dropout(p=0.6),
            nn.ReLU(),
            self.l2,
            nn.Dropout(p=0.3),
            nn.ReLU(),
            self.l3,
            nn.Softmax(dim=-1)
        )
        #print(x)
        #print(self.l1.weight)
        #print(self.l1.bias)
        #print()
        return model(x)

In [186]:
def update_policy():
    R = 0
    rewards = []
    
    # Discount future rewards back to the present using gamma
    for r in policy.reward_episode[::-1]:
        R = r + policy.gamma * R
        rewards.insert(0,R)
        
    # Scale rewards
    rewards = torch.FloatTensor(rewards).to(device=device)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    
    # Calculate loss
    loss = (torch.sum(torch.mul(policy.policy_history, Variable(rewards)).mul(-1), -1)).to(device=device)
    
    # Update network weights
    optimizer.zero_grad()
    loss.backward(retain_graph=True)
    #torch.nn.utils.clip_grad_norm_(policy.parameters(), 2)
    optimizer.step()
    
    #Save and intialize episode history counters
    policy.loss_history.append(loss.item())
    policy.reward_history.append(np.sum(policy.reward_episode))
    policy.policy_history = Variable(torch.Tensor())
    policy.reward_episode= []

In [187]:
def select_action(state):
    #Select an action (0 or 1) by running policy model and choosing based on the probabilities in state
    state = torch.from_numpy(state).type(torch.FloatTensor).to(device=device)
    #print(state.numpy())
    state = policy(Variable(state))
    c = Categorical(state)
    #print(state.detach().numpy())
    #print()
    action = c.sample()
    
    # Add log probability of our chosen action to our history    
    if policy.policy_history.dim() != 0:
        #print(policy.policy_history.dim(), c.log_prob(action).dim())
        policy.policy_history = torch.cat([policy.policy_history.to(device=device), 
                                           c.log_prob(action).view(1).to(device=device)]).to(device=device)
    else:
        policy.policy_history = (c.log_prob(action)).to(device=device)
    return action

In [188]:
def prepro(I):
    """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
    I = I[35:195] # crop
    #I = I[::2,::2,0] # downsample by factor of 2
    I[I == 144] = 0 # erase background (background type 1)
    I[I == 109] = 0 # erase background (background type 2)
    #I[I != 0] = 1 # everything else (paddles, ball) just set to 1
    I = I[:-1,:,0]
    return I.astype(np.float)

In [189]:
def get_paddle_y(img, display_message=False):
    paddle_2_x = 139 # Leftmost position of paddle 2
    paddle_height = 15

    paddle_1_color = 213
    paddle_2_color = 92
    ball_color = 236

    ## In the beginning of the game, the paddle on the left and the ball are not yet present
    not_all_present = np.where(img == paddle_2_color)[0].size == 0
    if (not_all_present):
        if display_message:
            print("One or more of the objects is missing, returning an empty list of positions")
            print("(This happens at the first few steps of the game)")
        return -1

    paddle_2_top = np.unique(np.where(img == paddle_2_color)[0])[0]
    paddle_2_bot = paddle_2_top + paddle_height

    return (paddle_2_top + paddle_2_bot) / 2

In [195]:
env = gym.make("Pong-v0")

# hyperparameters
learning_rate = 1e-4 * 5
gamma = 0.99 # discount factor for reward
h1 = 20 # number of hidden layer neurons
h2 = 10 # number of hidden layer neurons

D_in = 2 ## 1. (where we are - where we need to go), 2. (paddle center last frame - paddle center this frame)

policy = Policy2(D_in, h1=H, h2=50)
policy = policy.to(device=device)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

batch_size = 10 # every how many episodes to do a param update?
resume = False # resume from previous checkpoint?
LOAD_PATH="models/control_save_vel2_h100"
save_counter = 0
total_reward = 0
paddle_height = 15

render=False
plotting=False

if resume:
    checkpoint = torch.load(LOAD_PATH)
    policy.load_state_dict(checkpoint['model_state_dict'])
    episode_number = checkpoint['episode_number']

observation = env.reset()

steps=0
prev_x = None # used in computing the difference frame
running_reward = None
reward_sum = 0
episode_number = 0
start = time.time()
prev_paddle_y = -1
target_loc = 55
up_down_counter = 0
no_op_counter = 0
while(episode_number < 5000):
    if render: 
        env.render()
        time.sleep(0.5)

    # preprocess the observation
    curr_img = prepro(observation)
    paddle_y = get_paddle_y(curr_img)

    #if paddle_y != -1:
    if paddle_y != -1 and prev_paddle_y != -1:
        #x = np.array([target_loc - paddle_y])
        vel = paddle_y - prev_paddle_y
        x = np.array([target_loc - paddle_y, vel])
    else:
        vel = 0
        x = np.zeros(D_in)

    # forward the policy network and sample an action from the returned probability
    #aprobs, h = policy(x)
    action = select_action(x)
    observation, reward, done, info = env.step(action)
    steps += 1
    
    ## ~~~~~~~~~~~~~~~~~~
    ## Reward Assignment
    ## ~~~~~~~~~~~~~~~~~~
    if paddle_y == -1:
        reward = 0
        #no_op_counter = 0
    elif np.abs(x[0]) < (paddle_height / 2) and vel == 0:
        #print("reward achieved")
        reward = 2.5
        target_loc = int(np.random.random() * 100 + 20)
        #print(target_loc)
    else: # punish no-ops less
        reward = -.01

    policy.reward_episode.append(reward)
    prev_paddle_y = paddle_y
    reward_sum += reward
    
    if done: # an episode finished
        print("Total reward for this ep({0:d}): {1:.2f}".format(episode_number, reward_sum))
        episode_number += 1
        print("This epsiode lasted " + str(steps) + " steps")
        steps = 0
        
        if episode_number % batch_size == 0:
            update_policy()
        
        # boring book-keeping
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
        reward_sum = 0
        if episode_number % 100 == 0:
            PATH = 'models/control_save_vel2_h__2layer'
            torch.save({
                'episode_number': episode_number,
                'model_state_dict': policy.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, PATH)
            #pickle.dump(model, open('models/control_save_vel2_h'+ str(H) +'_' + str(save_counter) + '.p', 'wb'))
            #save_counter +=1
            
        observation = env.reset() # reset env
        prev_x = None
        
end = time.time()
print(end - start)

Total reward for this ep(0): 26.25
This epsiode lasted 1392 steps
Total reward for this ep(1): 21.20
This epsiode lasted 1144 steps
Total reward for this ep(2): 19.96
This epsiode lasted 1017 steps
Total reward for this ep(3): 7.94
This epsiode lasted 1215 steps
Total reward for this ep(4): 13.50
This epsiode lasted 1161 steps
Total reward for this ep(5): 17.33
This epsiode lasted 1029 steps
Total reward for this ep(6): 21.51
This epsiode lasted 1364 steps
Total reward for this ep(7): 17.55
This epsiode lasted 1258 steps
Total reward for this ep(8): 9.91
This epsiode lasted 1018 steps
Total reward for this ep(9): 29.94
This epsiode lasted 1274 steps
Total reward for this ep(10): 11.78
This epsiode lasted 1082 steps
Total reward for this ep(11): 33.82
This epsiode lasted 1388 steps
Total reward for this ep(12): 34.17
This epsiode lasted 1102 steps
Total reward for this ep(13): 13.56
This epsiode lasted 1155 steps
Total reward for this ep(14): 18.23
This epsiode lasted 1190 steps
Total r

Total reward for this ep(123): 56.02
This epsiode lasted 1176 steps
Total reward for this ep(124): 38.81
This epsiode lasted 1140 steps
Total reward for this ep(125): 49.22
This epsiode lasted 1103 steps
Total reward for this ep(126): 46.07
This epsiode lasted 1167 steps
Total reward for this ep(127): 47.53
This epsiode lasted 1272 steps
Total reward for this ep(128): 67.65
This epsiode lasted 1017 steps
Total reward for this ep(129): 56.17
This epsiode lasted 1161 steps
Total reward for this ep(130): 47.42
This epsiode lasted 1032 steps
Total reward for this ep(131): 27.42
This epsiode lasted 1024 steps
Total reward for this ep(132): 32.05
This epsiode lasted 1063 steps
Total reward for this ep(133): 56.72
This epsiode lasted 1106 steps
Total reward for this ep(134): 42.18
This epsiode lasted 1054 steps
Total reward for this ep(135): 44.54
This epsiode lasted 1069 steps
Total reward for this ep(136): 46.73
This epsiode lasted 1101 steps
Total reward for this ep(137): 47.70
This epsiod

Total reward for this ep(244): 69.79
This epsiode lasted 1054 steps
Total reward for this ep(245): 54.69
This epsiode lasted 1058 steps
Total reward for this ep(246): 69.25
This epsiode lasted 1359 steps
Total reward for this ep(247): 70.25
This epsiode lasted 1008 steps
Total reward for this ep(248): 67.51
This epsiode lasted 1031 steps
Total reward for this ep(249): 62.67
This epsiode lasted 1013 steps
Total reward for this ep(250): 89.11
This epsiode lasted 1130 steps
Total reward for this ep(251): 58.96
This epsiode lasted 1133 steps
Total reward for this ep(252): 69.50
This epsiode lasted 1083 steps
Total reward for this ep(253): 90.30
This epsiode lasted 1262 steps
Total reward for this ep(254): 77.68
This epsiode lasted 1018 steps
Total reward for this ep(255): 89.78
This epsiode lasted 1314 steps
Total reward for this ep(256): 86.82
This epsiode lasted 1108 steps
Total reward for this ep(257): 81.76
This epsiode lasted 1112 steps
Total reward for this ep(258): 62.51
This epsiod

Total reward for this ep(365): 107.84
This epsiode lasted 1014 steps
Total reward for this ep(366): 80.32
This epsiode lasted 1005 steps
Total reward for this ep(367): 110.27
This epsiode lasted 1022 steps
Total reward for this ep(368): 133.08
This epsiode lasted 1251 steps
Total reward for this ep(369): 75.10
This epsiode lasted 1025 steps
Total reward for this ep(370): 105.28
This epsiode lasted 1270 steps
Total reward for this ep(371): 95.14
This epsiode lasted 1029 steps
Total reward for this ep(372): 72.64
This epsiode lasted 1020 steps
Total reward for this ep(373): 100.20
This epsiode lasted 1025 steps
Total reward for this ep(374): 85.32
This epsiode lasted 1007 steps
Total reward for this ep(375): 122.67
This epsiode lasted 1037 steps
Total reward for this ep(376): 100.21
This epsiode lasted 1024 steps
Total reward for this ep(377): 85.20
This epsiode lasted 1019 steps
Total reward for this ep(378): 95.26
This epsiode lasted 1017 steps
Total reward for this ep(379): 116.48
Thi

KeyboardInterrupt: 

In [None]:
window = int(episode_number/20/batch_size)

fig, ax1 = plt.subplots(1, 1, sharey=True, figsize=[14,9]);
rolling_mean = pd.Series(policy.reward_history).rolling(window).mean()
std = pd.Series(policy.reward_history).rolling(window).std()
ax1.plot(rolling_mean)
ax1.fill_between(range(len(policy.reward_history)),rolling_mean-std, rolling_mean+std, color='orange', alpha=0.2)
ax1.set_title('Batch Reward Moving Average ({}-episode window)'.format(window))
ax1.set_xlabel('Episode'); ax1.set_ylabel('Average Total reward/batch ({} episodes)'.format(batch_size))

plt.show()
fig.savefig('results_2layer_dropout_h1_20_h2_10.png')

In [None]:
policy.reward_history