In [1]:
# import IPython
# IPython.Application.instance().kernel.do_shutdown(True) #automatically restarts kernel
%load_ext autoreload
%autoreload 2
from unityagents import UnityEnvironment
import matplotlib.pyplot as plt
from dqn_agent import DQNAgent
from double_dqn_agent import DoubleDQNAgent
from prioritized_replay_dqn_agent import PrioritizedReplayDQNAgent
from dueling_dqn_agent import DuelingDQNAgent
import sys, math, time, torch, logging, json

# os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
# Run configuration: where the log and the JSON results go, and which agent
# variant the rest of the notebook trains.
log_file_path = 'output.log'
result_file_path = 'result.json'
algorithm = 'DQN'
# ['DQN','Double DQN', 'Prioritized Experience Replay', 'Dueling DQN']

logger = logging.getLogger('p1_navigation')
# Re-running this cell must not stack duplicate handlers. Close every stale
# handler as it is detached so the previous FileHandler releases output.log
# (StreamHandler.close() leaves sys.stdout itself open).
while logger.handlers:
    stale_handler = logger.handlers[0]
    logger.removeHandler(stale_handler)
    stale_handler.close()
# Keep records out of the root logger so they are not printed twice.
logger.propagate = False

formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')

# Console handler: everything from DEBUG upwards.
sh = logging.StreamHandler(sys.stdout)
sh.setFormatter(formatter)
sh.setLevel(logging.DEBUG)

# File handler: INFO and above, using the same timestamped format as the
# console so that the log file is readable on its own.
fh = logging.FileHandler(log_file_path)
fh.setFormatter(formatter)
fh.setLevel(logging.INFO)

logger.addHandler(sh)
logger.addHandler(fh)
logger.setLevel(logging.DEBUG)


# torch.cuda.is_available() reports whether a CUDA device can actually be used;
# torch.backends.cudnn.enabled is only a cuDNN toggle and says nothing about that.
logger.info('CUDA version: %s, CUDA enabled: %s' % (torch.version.cuda, torch.cuda.is_available()))


The current device is cuda:0
2021-02-04 21:40:06,650 INFO CUDA version: 11.0, CUDA enabled: True


In [None]:
# Launch the Unity Banana environment. The path is a raw string so that the
# Windows backslash is taken literally instead of starting an (invalid) '\B'
# escape sequence, which newer Python versions warn about.
env = UnityEnvironment(file_name=r'Banana_Windows_x86_64\Banana.exe')
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [None]:
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents in the environment
logger.debug('Number of agents: %i' % len(env_info.agents))

# examine the state space
state = env_info.vector_observations[0]
logger.debug('States look like: %s' % state)
state_size = len(state)

# number of actions
action_size = brain.vector_action_space_size
logger.info('Number of agents: %i, states %i, actions: %i' % (len(env_info.agents), state_size, action_size))

# Build the agent selected by `algorithm` and bind its hyper-parameter dict to C.
if algorithm == 'DQN':
    from constants import CONSTANTS as C
    agent = DQNAgent(state_size, action_size)
elif algorithm == 'Double DQN':
    from constants import CONSTANTS as C
    agent = DoubleDQNAgent(state_size, action_size)
elif algorithm == 'Prioritized Experience Replay':
    from constants import PRIORITIZED_REPLAY_CONSTANTS as C
    agent = PrioritizedReplayDQNAgent(state_size, action_size)
    # beta is only defined (and only used by the training loop) for this algorithm.
    beta = C['beta_begin']
    logger.info('It takes %f steps for beta to go from %f to %f' % ((C['beta_stable']-C['beta_begin'])/C['beta_increase'], C['beta_begin'], C['beta_stable']))
elif algorithm == 'Dueling DQN':
    from constants import CONSTANTS as C
    agent = DuelingDQNAgent(state_size, action_size)
else:
    # Fail fast: without a recognised algorithm neither `agent` nor `C` is
    # bound, so carrying on would only surface as a NameError a few lines down.
    logger.error('Unknown algorithm: %r' % algorithm)
    raise ValueError('Unknown algorithm: %r' % algorithm)

logger.info(algorithm)
logger.info(json.dumps(C, indent=4))

# Per-episode scores for the whole run, and the running sum used by the
# periodic progress report in the training loop.
episode_score_hist = []
total_score = 0

epsilon = C['epsilon_begin']
# Epsilon is multiplied by epsilon_decay once per episode in the training loop,
# so the solved-for count below is a number of episodes, not environment steps.
logger.info('It takes %f episodes for epsilon to go from %f to %f' % (math.log(C['epsilon_stable']/C['epsilon_begin'], C['epsilon_decay']), C['epsilon_begin'], C['epsilon_stable']))


In [None]:
# Train for C['num_episodes'] + 1 episodes (the extra one makes the final
# progress report land on episode index num_episodes).
start_time = time.time()
step = 0
# Number of episodes summed into total_score since the last progress report.
episodes_since_report = 0
for i in range(C['num_episodes'] + 1):
    env_info = env.reset(train_mode=True)[brain_name]  # reset the env at each episode
    state = env_info.vector_observations[0]
    done = False
    episode_score = 0                                  # initialize the score
    while not done:
        if algorithm == 'Prioritized Experience Replay':
            action = agent.action(state, epsilon=epsilon, beta=beta)          # select an action
            # beta grows by a fixed increment per step until it reaches beta_stable
            beta = min(beta+C['beta_increase'], C['beta_stable'])
        else:
            action = agent.action(state, epsilon=epsilon)
        env_info = env.step(action)[brain_name]        # send the action to the environment
        next_state = env_info.vector_observations[0]   # get the next state
        reward = env_info.rewards[0]                   # get the reward
        done = env_info.local_done[0]                  # see if episode has finished
        agent.replay_buffer.add(state, action, reward, next_state, done)
        step += 1
        episode_score += reward                        # update the score
        state = next_state                             # roll over the state to next time step
    total_score += episode_score
    episodes_since_report += 1
    episode_score_hist.append(episode_score)
    # epsilon decays geometrically once per episode, floored at epsilon_stable
    epsilon = max(epsilon*C['epsilon_decay'], C['epsilon_stable'])
    if i % 100 == 0:
        # Divide by the episodes actually accumulated: the first window
        # (i == 0) holds a single episode, every later one holds 100.
        logger.info('For episode %i, the average score is %.2f, episode history %s' % (i, total_score/episodes_since_report, episode_score_hist[-100:]))
        total_score = 0
        episodes_since_report = 0


In [None]:
# Wall-clock duration since start_time was taken, converted from seconds to minutes.
elapsed_minutes = (time.time() - start_time) / 60
logger.info('The training completes in %f mins' % elapsed_minutes)

In [None]:
# Plot the per-episode score curve and save it next to the notebook.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(episode_score_hist)
ax.set_title('Episode Score with %s' % algorithm)
ax.set_xlabel('Episodes')
ax.set_ylabel('Scores')
fig.savefig('episode_score_with_%s.png' % algorithm, bbox_inches='tight')

In [None]:
# Persist this run's hyper-parameters, agent class name and score history.
# Work on a shallow copy: C is the dict imported from the constants module, and
# adding keys to it in place would leak the score history into any later cell
# that logs or reuses C.
result = dict(C)
result['agent'] = type(agent).__name__
result['episode_score_hist'] = episode_score_hist
# Mode 'a' appends one JSON document per run, so after several runs the file
# holds a sequence of concatenated documents rather than a single one.
with open(result_file_path, 'a', encoding='utf-8') as f:
    json.dump(result, f, ensure_ascii=False, indent=4)

In [None]:
# Play one episode with train_mode=False and report its total score.
env_info = env.reset(train_mode=False)[brain_name] # reset the environment
state = env_info.vector_observations[0]            # get the current state
score = 0                                          # initialize the score
done = False
while not done:
    # Mirror the training loop: only the prioritized-replay agent is called
    # with a beta argument; the other agents are called with epsilon alone.
    if algorithm == 'Prioritized Experience Replay':
        action = agent.action(state, epsilon=epsilon, beta=0)               # select an action
    else:
        action = agent.action(state, epsilon=epsilon)                       # select an action
    env_info = env.step(action)[brain_name]        # send the action to the environment
    next_state = env_info.vector_observations[0]   # get the next state
    reward = env_info.rewards[0]                   # get the reward
    done = env_info.local_done[0]                  # see if episode has finished
    score += reward                                # update the score
    state = next_state                             # roll over the state to next time step

logger.info("Score: {}".format(score))

In [None]:
# Close the Unity environment created earlier in the notebook.
env.close()