In [1]:
# import statements

from torch import nn
import torch
import gym
from collections import deque
import itertools
import numpy as np
import random

In [2]:
# set random seeds
# `random` drives the epsilon-greedy draw, replay sampling and random_dot;
# PyTorch's RNG drives the initial weights of the networks. NumPy's global
# RNG is seeded too in case a dependency draws from it.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

In [3]:
# hyperparameters

GAMMA = 0.99                   # discount applied to the bootstrapped target Q-value
BATCH_SIZE = 32                # transitions sampled from the replay buffer per gradient step
BUFFER_SIZE = 50_000           # maximum number of transitions kept in the replay buffer
MIN_REPLAY_SIZE = 1_000        # random-policy transitions collected before training starts
EPSILON_START = 1.0            # exploration rate at step 0
EPSILON_END = 0.02             # exploration rate once the decay has finished
EPSILON_DECAY = 10_000         # number of steps over which epsilon is interpolated
TARGET_UPDATE_FREQ = 1_000     # steps between copies of the online weights into the target net
LEARNING_RATE = 5e-4           # Adam learning rate
MAX_STEPS = 200_000            # total number of environment steps in the training loop

# Regularization Coefficient
# Weight of the extra penalty term added by the custom loss functions.
REGULARIZATION_COEFFICIENT = 0.01

### Custom Loss Functions

In [4]:
# Module-level state shared between the training loop and the custom losses.
# The training loop overwrites all three on every step.
phi_matrix = []  # read by random_dot and phi_penalty
observations_t = new_observations_t = None  # read by dr3

In [5]:
# default loss function - mean squared error

def default_loss_mse(y_true, y_pred):
    """Return the mean squared error between two tensors.

    Args:
        y_true: first tensor operand.
        y_pred: second tensor operand, broadcastable against ``y_true``.

    Returns:
        A zero-dimensional tensor holding the mean of the element-wise
        squared difference.
    """
    residual = y_true - y_pred
    return torch.square(residual).mean()

In [6]:
# custom loss function - implements explicit DR3 regularizer

# add dot product between each state action and subsequent one's feature vector to loss
def dr3(y_true, y_pred):
    """Mean squared error plus the explicit DR3 feature-alignment penalty.

    The penalty is the sum, over the current batch, of the dot product
    between the feature vector of each observation and the feature vector
    of its successor, scaled by ``REGULARIZATION_COEFFICIENT``.

    The batch is read from the module-level ``observations_t`` and
    ``new_observations_t`` tensors and the features come from
    ``online_net.get_phi``. When either tensor is still ``None`` the plain
    MSE is returned.

    Args:
        y_true: first tensor operand of the squared error.
        y_pred: second tensor operand of the squared error.

    Returns:
        A zero-dimensional tensor containing the regularized loss.
    """
    loss = torch.mean(torch.square(y_true - y_pred))

    if observations_t is not None and new_observations_t is not None:
        # Keep the features inside the autograd graph: detaching them (or
        # converting to NumPy) would turn the penalty into a constant that
        # contributes no gradient and therefore regularizes nothing.
        curr_state_features = online_net.get_phi(observations_t)
        next_state_features = online_net.get_phi(new_observations_t)

        # Summing the element-wise product over every axis equals the sum of
        # the per-sample dot products.
        alignment = torch.sum(curr_state_features * next_state_features)
        loss = loss + REGULARIZATION_COEFFICIENT * alignment

    return loss

In [7]:
# custom loss function - random dot product from phi matrix

# draw two rows of the phi matrix at random and add their dot product to the loss
def random_dot(y_true, y_pred):
    """Mean squared error plus the dot product of two random phi-matrix rows.

    Two row indices are drawn independently with ``random.randrange`` (so
    the same row may be picked twice) from the module-level ``phi_matrix``.
    The penalty is skipped when ``phi_matrix`` is ``None`` or has fewer than
    two rows.

    Note: the rows go through ``np.array`` / ``np.dot``, so this term is a
    plain number and adds no gradient of its own.

    Args:
        y_true: first tensor operand of the squared error.
        y_pred: second tensor operand of the squared error.

    Returns:
        A zero-dimensional tensor containing the loss.
    """
    total = torch.mean(torch.square(y_true - y_pred))

    # Explicit Regularization needs at least two stored feature vectors.
    if phi_matrix is None or len(phi_matrix) <= 1:
        return total

    row_count = len(phi_matrix)
    first_row = np.array(phi_matrix[random.randrange(row_count)])
    second_row = np.array(phi_matrix[random.randrange(row_count)])

    total += REGULARIZATION_COEFFICIENT * np.dot(first_row, second_row)
    return total

In [8]:
# custom loss function - penalizes the gap between the largest and smallest phi-matrix entries

# add (max entry in phi matrix) ** 2 - (min entry in phi matrix) ** 2 to loss
def phi_penalty(y_true, y_pred):
    """Mean squared error plus a penalty on the extreme phi-matrix entries.

    The penalty is ``REGULARIZATION_COEFFICIENT * (max ** 2 - min ** 2)``
    where ``max`` and ``min`` are the largest and smallest scalar entries of
    the module-level ``phi_matrix``. These are element values, not singular
    values. The penalty is skipped when ``phi_matrix`` is ``None`` or holds
    no entries.

    Note: the extremes are computed with NumPy, so this term is a plain
    number and adds no gradient of its own.

    Args:
        y_true: first tensor operand of the squared error.
        y_pred: second tensor operand of the squared error.

    Returns:
        A zero-dimensional tensor containing the loss.
    """
    loss = torch.mean(torch.square(y_true - y_pred))

    # Explicit Regularization
    if phi_matrix is not None and len(phi_matrix) > 0:
        features = np.asarray(phi_matrix)
        # A matrix with rows but no columns has no extremes to penalize.
        if features.size > 0:
            # Vectorized reductions replace the nested Python min/max scans.
            minimum = features.min()
            maximum = features.max()
            loss += REGULARIZATION_COEFFICIENT * float(maximum**2 - minimum**2)

    return loss

### Network Class

In [9]:
# create network class

class Network(nn.Module):
    """Q-network: one 24-unit hidden layer followed by a linear Q-value head.

    ``get_phi`` exposes the hidden representation (the input to the final
    linear layer); ``forward`` maps that representation to one Q-value per
    action.
    """

    def __init__(self, env):
        """Size the layers from the environment's spaces.

        Args:
            env: object exposing ``observation_space.shape`` and
                ``action_space.n``.
        """
        super().__init__()
        in_features = int(np.prod(env.observation_space.shape))
        # Neural Network
        self.layer1 = nn.Linear(in_features, 24)
        self.layer2 = nn.ReLU()
        # NOTE(review): a second ReLU directly after the first leaves the
        # activations unchanged; possibly a hidden Linear layer was intended.
        # Kept as-is so the architecture stays the same.
        self.layer3 = nn.ReLU()
        self.layer4 = nn.Linear(24, env.action_space.n)

    def forward(self, x):
        """Return the Q-values for the observation(s) ``x``."""
        return self.layer4(self.get_phi(x))

    def act(self, obs):
        """Return the greedy action (as a Python int) for one observation.

        Args:
            obs: a single observation; it is converted to a float32 tensor
                and given a leading batch axis before the forward pass.
        """
        obs_t = torch.as_tensor(obs, dtype=torch.float32)
        # Action selection never backpropagates, so skip building the graph.
        with torch.no_grad():
            q_values = self(obs_t.unsqueeze(0))

        max_q_index = torch.argmax(q_values, dim=1)[0]
        return max_q_index.item()

    def get_phi(self, x):
        """Return the hidden features that feed the final linear layer."""
        return self.layer3(self.layer2(self.layer1(x)))

### Create OpenAI Gym Environment

In [10]:
# create environment

env = gym.make('CartPole-v0')

# Online network is the one being trained; the target network starts as an
# exact copy of it.
online_net = Network(env)
target_net = Network(env)
target_net.load_state_dict(online_net.state_dict())

optimizer = torch.optim.Adam(online_net.parameters(), lr=LEARNING_RATE)

# Bounded stores: transitions for replay, and the most recent 100 episode
# returns (seeded with a single 0.0 entry).
replay_buffer = deque(maxlen=BUFFER_SIZE)
reward_buffer = deque([0.0], maxlen=100)

# Feature-matrix rank history: the full record and a 100-entry window, both
# seeded with a single 0 entry.
all_ranks = deque([0])
last_100_ranks = deque([0], maxlen=100)

episode_reward = 0.0

# Fill the replay buffer with MIN_REPLAY_SIZE transitions from randomly
# sampled actions before any learning happens.
obs = env.reset()
for _ in range(MIN_REPLAY_SIZE):
    action = env.action_space.sample()
    new_obs, reward, done, info = env.step(action)
    transition = (obs, action, reward, done, new_obs)
    replay_buffer.append(transition)

    # Start a fresh episode when this one ended, otherwise carry on.
    obs = env.reset() if done else new_obs
    

### Choose Loss Function

In [11]:
# Available loss functions (assign one of these names below):
# default_loss_mse
# dr3
# random_dot
# phi_penalty

loss_function = default_loss_mse

### Training

In [12]:
# training

obs = env.reset()

for step in range(1, MAX_STEPS+1):

    # Linear interpolation from EPSILON_START to EPSILON_END over the first
    # EPSILON_DECAY steps; np.interp holds the end value afterwards.
    epsilon = np.interp(step, [0, EPSILON_DECAY], [EPSILON_START, EPSILON_END])

    # Epsilon-greedy action selection
    rng = random.random()
    if rng <= epsilon:
        action = env.action_space.sample()
    else:
        action = online_net.act(obs)

    new_obs, reward, done, _ = env.step(action)
    transition = (obs, action, reward, done, new_obs)
    replay_buffer.append(transition)
    obs = new_obs

    episode_reward += reward

    if done:
        obs = env.reset()
        reward_buffer.append(episode_reward)
        episode_reward = 0.0

    # watch play
    # NOTE(review): reward_buffer is created with maxlen=100, so this guard
    # can never be true and the branch below never runs. Confirm whether
    # that is deliberate (it keeps training running for all MAX_STEPS)
    # before lowering the threshold: once entered, the loop never exits.
    if len(reward_buffer) >= 200:
        if np.mean(reward_buffer) >= 195: # once model averages score of 195 (max 200)
            while True:
                action = online_net.act(obs)

                obs, _, done, _ = env.step(action)
                env.render()
                if done:
                    # Continue from the fresh episode's first observation
                    # instead of the stale terminal one.
                    obs = env.reset()

    # Start Gradient Step
    transitions = random.sample(replay_buffer, BATCH_SIZE)

    observations = np.asarray([t[0] for t in transitions])
    actions = np.asarray([t[1] for t in transitions])
    rewards = np.asarray([t[2] for t in transitions])
    dones = np.asarray([t[3] for t in transitions])
    new_observations = np.asarray([t[4] for t in transitions])

    # unsqueeze(-1) turns the per-sample scalars into column vectors so they
    # line up with the keepdim=True max and the gathered Q-values below.
    observations_t = torch.as_tensor(observations, dtype=torch.float32)
    actions_t = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(-1)
    rewards_t = torch.as_tensor(rewards, dtype=torch.float32).unsqueeze(-1)
    dones_t = torch.as_tensor(dones, dtype=torch.float32).unsqueeze(-1)
    new_observations_t = torch.as_tensor(new_observations, dtype=torch.float32)

    # Compute Targets
    # The targets are constants for this update: the optimizer only holds
    # online_net's parameters, so tracking gradients through target_net
    # would just build an unused graph.
    with torch.no_grad():
        target_q_values = target_net(new_observations_t)
        max_target_q_values = target_q_values.max(dim=1, keepdim=True)[0]

        # (1 - dones_t) removes the bootstrap term for terminal transitions.
        targets = rewards_t + GAMMA * (1 - dones_t) * max_target_q_values

    # Compute Loss
    q_values = online_net(observations_t)
    action_q_values = torch.gather(input=q_values, dim=1, index=actions_t)
    loss = loss_function(action_q_values, targets)

    # Gradient Descent
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Compute Phi Matrix
    # Features of the batch just trained on, taken after the weight update.
    # Only the values are needed, so no graph is built.
    with torch.no_grad():
        phi_matrix = online_net.get_phi(observations_t).cpu().numpy()
    rank = np.linalg.matrix_rank(phi_matrix)
    all_ranks.append(rank)
    last_100_ranks.append(rank)

    # Update Target Network
    if step % TARGET_UPDATE_FREQ == 0:
        target_net.load_state_dict(online_net.state_dict())

    # Logging
    if step % 1000 == 0:
        print()
        print("Step:", step)
        print("Average Reward:", np.mean(reward_buffer))
        print("Average Rank:", np.mean(last_100_ranks))


Step: 1000
Average Reward: 19.235294117647058
Average Rank: 16.41

Step: 2000
Average Reward: 19.83
Average Rank: 16.37

Step: 3000
Average Reward: 20.32
Average Rank: 15.32

Step: 4000
Average Reward: 19.18
Average Rank: 14.42

Step: 5000
Average Reward: 22.73
Average Rank: 14.07

Step: 6000
Average Reward: 29.56
Average Rank: 14.0

Step: 7000
Average Reward: 37.08
Average Rank: 13.96

Step: 8000
Average Reward: 44.62
Average Rank: 13.79

Step: 9000
Average Reward: 53.14
Average Rank: 13.55

Step: 10000
Average Reward: 60.41
Average Rank: 13.37

Step: 11000
Average Reward: 69.58
Average Rank: 13.29

Step: 12000
Average Reward: 77.87
Average Rank: 13.22

Step: 13000
Average Reward: 86.27
Average Rank: 13.21

Step: 14000
Average Reward: 94.62
Average Rank: 13.9

Step: 15000
Average Reward: 102.22
Average Rank: 13.85

Step: 16000
Average Reward: 108.78
Average Rank: 13.68

Step: 17000
Average Reward: 118.25
Average Rank: 13.67

Step: 18000
Average Reward: 123.75
Average Rank: 13.5

Step


Step: 148000
Average Reward: 179.57
Average Rank: 11.01

Step: 149000
Average Reward: 179.45
Average Rank: 11.05

Step: 150000
Average Reward: 178.93
Average Rank: 11.09

Step: 151000
Average Reward: 177.52
Average Rank: 11.06

Step: 152000
Average Reward: 177.59
Average Rank: 11.08

Step: 153000
Average Reward: 176.39
Average Rank: 11.13

Step: 154000
Average Reward: 175.95
Average Rank: 11.08

Step: 155000
Average Reward: 175.72
Average Rank: 11.08

Step: 156000
Average Reward: 175.14
Average Rank: 11.07

Step: 157000
Average Reward: 176.35
Average Rank: 11.13

Step: 158000
Average Reward: 176.8
Average Rank: 11.12

Step: 159000
Average Reward: 177.58
Average Rank: 11.11

Step: 160000
Average Reward: 177.69
Average Rank: 11.11

Step: 161000
Average Reward: 176.92
Average Rank: 11.13

Step: 162000
Average Reward: 178.9
Average Rank: 11.13

Step: 163000
Average Reward: 180.04
Average Rank: 11.2

Step: 164000
Average Reward: 181.11
Average Rank: 11.19

Step: 165000
Average Reward: 180.