In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import gym
import os
import time
import random
import time
import numpy as np
from skimage import transform
from IPython.display import display, clear_output
import tensorflow as tf

In [None]:
from utils import FrameStack, Scheduler, calculate_expected_return
from a2c import GaussianA2C
from vec_env.subproc_vec_env import SubprocVecEnv

def preprocess_frame(frame):
    """Crop an RGB frame, convert it to grayscale and rescale it.

    The last 12 rows and 6 columns on each side are dropped (a 96x96 frame
    becomes 84x84), the first three channels are combined with the weights
    0.299 / 0.587 / 0.114, and the result is mapped linearly so that an
    input value of 0 becomes -1 and 255 becomes +1.

    Args:
        frame: array indexed as (row, column, channel) with at least three
            channels.

    Returns:
        2-D float array of the cropped, grayscale, rescaled frame.
    """
    luma_weights = [0.299, 0.587, 0.114]
    cropped = frame[:-12, 6:-6]
    grayscale = np.dot(cropped[..., 0:3], luma_weights)
    unit_scaled = grayscale / 255.0
    return unit_scaled * 2 - 1

def make_env():
    """Build and return a new ``CarRacing-v0`` gym environment.

    Used as an environment factory: the function object itself (uncalled) is
    handed to ``SubprocVecEnv`` once per parallel environment.
    """
    env = gym.make("CarRacing-v0")
    return env

def evaluate(test_env, num_steps=None):
    """Run the current policy in ``test_env``, rendering every step.

    Uses the notebook-level ``a2c_model`` to choose actions and
    ``preprocess_frame`` (via ``FrameStack``) to build the model input.

    Args:
        test_env: environment exposing ``reset()``, ``step(action)`` returning
            ``(frame, reward, done, info)``, and ``render()``.
        num_steps: optional maximum number of environment steps. ``None``
            (default) runs until the environment reports ``done``.

    Returns:
        None. The function is used for its rendering side effect.
    """
    initial_frame = test_env.reset()
    frame_stack = FrameStack(initial_frame, preprocess_fn=preprocess_frame)
    done = False
    step = 0
    while not done:
        # `>=` (not `>`): with `>` the loop executed num_steps + 1 steps.
        if num_steps is not None and step >= num_steps:
            break
        step += 1

        # Predict action given state: π(a_t | s_t; θ)
        # The model is fed a batch, so add a leading batch axis of size 1.
        state = frame_stack.get_state()
        action, _ = a2c_model.predict(np.expand_dims(state, axis=0))
        clear_output(wait=True)

        frame, reward, done, info = test_env.step(action[0])
        test_env.render()
        frame_stack.add_frame(frame)

        # Pause ~16 ms per step so the rendered playback is watchable.
        time.sleep(0.016)

In [None]:
# --- Environments -----------------------------------------------------------
num_envs = 4
envs = SubprocVecEnv([make_env for _ in range(num_envs)])  # parallel training envs
test_env = make_env()                                      # separate env used by evaluate()

# --- Schedules --------------------------------------------------------------
# lr_scheduler supplies the learning rate passed to a2c_model.train().
# decay_factor=1 presumably keeps the value constant (0.95 was the earlier
# setting) — confirm against Scheduler.
lr_scheduler     = Scheduler(initial_value=1e-6, interval=10, decay_factor=1)
# action_scheduler supplies (after np.ceil) the number of consecutive steps
# for which each predicted action is repeated during training.
action_scheduler = Scheduler(initial_value=20, interval=20, decay_factor=0.90)

# --- Hyperparameters --------------------------------------------------------
discount_factor  = 0.95  # discount passed to calculate_expected_return
save_interval    = 50    # save the model every N episodes
t_max            = 5     # environment steps collected per training update
frame_stack_size = 4
# NOTE(review): FrameStack is constructed elsewhere in this notebook without an
# explicit stack size; confirm its default equals frame_stack_size before
# changing this value.
input_shape      = (84, 84, frame_stack_size)
num_actions = envs.action_space.shape[0]
action_min = np.array([-1.0, 0.0, 0.0])
action_max = np.array([ 1.0, 1.0, 1.0])
episode = 0  # global episode counter, incremented by the training loop

# --- Model ------------------------------------------------------------------
model_checkpoint = None  # e.g. "./models/CarRacing-v0/run5/step399942.ckpt"
a2c_model = GaussianA2C(num_actions, input_shape, tf.train.RMSPropOptimizer, action_min, action_max,
                        value_scale=0.5, entropy_scale=0.01, model_checkpoint=model_checkpoint, model_name="CarRacing-v0")

In [None]:
# Loop limits (previously inline magic numbers).
warmup_steps = 100              # zero-action steps taken after every reset
max_updates_per_episode = 2000  # cap on training updates (T) per episode
# Stop once the global `episode` counter reaches this value. None (default)
# keeps the original behaviour: train until the kernel is interrupted.
max_episodes = None


def current_states(frame_stacks, dones):
    """Return one model input per environment.

    Environments flagged as done get an all-zero state of ``input_shape``;
    the others get the current contents of their frame stack.
    """
    return [frame_stacks[i].get_state() if not dones[i] else np.zeros(input_shape)
            for i in range(num_envs)]


while max_episodes is None or episode < max_episodes:
    print("Resetting envronments...")
    episode += 1
    T = 0  # number of training updates performed in this episode

    # Reset environments and get initial frame.
    envs.reset()
    envs.get_images()
    # Step with all-zero actions before training starts; the frames of the
    # last such step seed the frame stacks.
    # NOTE(review): presumably skips the environment's intro frames — confirm.
    for _ in range(warmup_steps):
        envs.step_async(np.zeros((num_envs, num_actions)))
        initial_frames, rewards, dones, infos = envs.step_wait()
    frame_stacks = [FrameStack(initial_frames[i], preprocess_fn=preprocess_frame) for i in range(num_envs)]
    learning_rate = lr_scheduler.get_value()
    # Each predicted action is reused for `action_interval` consecutive steps.
    action_interval = np.ceil(action_scheduler.get_value())
    action_step = 0
    total_reward = 0
    episode_loss = episode_policy_loss = episode_value_loss = episode_entropy_loss = 0

    # While there are running environments
    print("Training...")
    dones = [False] * num_envs
    # NOTE(review): `dones` is overwritten by every step_wait() below. If
    # SubprocVecEnv auto-resets finished environments the flag is not sticky,
    # so this loop would only end early when every env finishes on the same
    # step — confirm against the vec_env implementation.
    while sum(dones) < num_envs and T < max_updates_per_episode:
        states_mb, actions_mb, returns_mb, values_mb = [], [], [], []

        # Simulate game for some number of steps
        rewards_mb = []
        for _ in range(t_max):
            states = current_states(frame_stacks, dones)

            # Predict and value action given state: π(a_t | s_t; θ).
            # Values are always refreshed; actions only on every
            # `action_interval`-th step (action_step starts at 0, so
            # `actions` is always set on the first step of an episode).
            predicted_actions, values = a2c_model.predict(states)
            if action_step % action_interval == 0:
                actions = predicted_actions
            action_step += 1

            # Sanity check: report any action outside the configured bounds.
            for i in range(num_envs):
                if np.any(actions[i] < action_min) or np.any(actions[i] > action_max):
                    print("Something's wrong")
                    print(actions[i])

            envs.step_async(actions)
            frames, rewards, dones, infos = envs.step_wait()
            rewards = np.array(rewards)
            envs.get_images() # render

            # Store state, action and reward
            states_mb.append(states)
            actions_mb.append(actions)
            rewards_mb.append(rewards)
            values_mb.append(np.squeeze(values, axis=-1))
            total_reward += np.sum(rewards)

            # Get new state
            for i in range(num_envs):
                frame_stacks[i].add_frame(frames[i])

        # Calculate return (discounted rewards over a trajectory).
        # The value of the state after the rollout is appended as a bootstrap
        # term (0 for environments flagged done) and dropped again by [:-1].
        states = current_states(frame_stacks, dones)
        last_values = a2c_model.predict(states)[-1]
        rewards_mb = np.array(rewards_mb)
        for i in range(num_envs):
            bootstrap_value = 0 if dones[i] else last_values[i]
            returns_mb.append(calculate_expected_return(np.append(rewards_mb[:, i], bootstrap_value), discount_factor)[:-1])

        # Flatten the (t_max, num_envs, ...) mini-batches into one batch axis.
        # returns_mb was built per environment, so it is transposed first to
        # match the step-major ordering of the other arrays.
        states_mb = np.array(states_mb).reshape((-1, *input_shape))
        actions_mb = np.array(actions_mb).reshape((-1, num_actions))
        values_mb = np.array(values_mb).flatten()
        returns_mb = np.array(returns_mb).transpose(1, 0).flatten()

        eploss, pgloss, vloss, entloss = a2c_model.train(states_mb, actions_mb, returns_mb, values_mb, learning_rate=learning_rate)
        episode_loss         += eploss
        episode_policy_loss  += pgloss
        episode_value_loss   += vloss
        episode_entropy_loss += entloss
        T += 1
    average_episode_reward = total_reward / num_envs

    clear_output(wait=True)
    print("-- Episode {} --".format(episode))
    print("Learning rate:", learning_rate)
    print("Episode policy loss:", episode_policy_loss)
    print("Episode value loss:", episode_value_loss)
    print("Episode entropy loss:", episode_entropy_loss)
    print("Episode loss:", episode_loss)
    print("Average episode reward:", average_episode_reward)
    print("")
    a2c_model.write_summary(episode_policy_loss, episode_value_loss,
                            episode_entropy_loss, episode_loss,
                            average_episode_reward, learning_rate)
    if episode % save_interval == 0:
        a2c_model.save()

print("Done!")

In [None]:
# Display the episode counter, which the training loop increments once per outer iteration.
episode

In [None]:
# Render the current a2c_model in the separate test environment until the episode is done.
evaluate(test_env)

In [None]:
# Rebuild the model from a saved checkpoint, replacing the notebook-level
# `a2c_model` (which evaluate() and the training loop both read).
# The constructor is called with the same positional arguments as in the
# configuration cell: the earlier version of this call omitted
# action_min / action_max and repeated the 3 / (84, 84, 4) literals.
a2c_model = GaussianA2C(num_actions, input_shape, tf.train.RMSPropOptimizer, action_min, action_max,
                        value_scale=0.5, entropy_scale=0.01,
                        model_checkpoint="models/CarRacing-v0/run6/step40000.ckpt",
                        model_name="CarRacing-v0")