## Unity Env loading

In [None]:
from keras.models import load_model
import keras
from utils.memory_buffer import MemoryBuffer
from utils.networks import tfSummary, OrnsteinUhlenbeckProcess
from utils.stats import gather_stats
from random import random, randrange
# from tqdm import tqdm
from IPython.display import clear_output
import os

In [None]:
# import matplotlib.pyplot as plt
import numpy as np
import sys
from mlagents_envs.environment import UnityEnvironment

# %matplotlib inline
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfig, EngineConfigurationChannel
# Report which interpreter is executing this notebook.
print("Python version:", sys.version, sep="\n")

# Abort early on a Python 2 interpreter.
if sys.version_info.major < 3:
    raise Exception("ERROR: ML-Agents Toolkit (v0.3 onwards) requires Python 3")

Load the game environment

In [None]:
# Side channel handed to the environment; the display-settings cell uses it
# to send engine configuration parameters.
channel = EngineConfigurationChannel()

# Name of the Unity environment binary to launch.
env_name = "3Dball//UnityEnvironment"
env = UnityEnvironment(
    file_name=env_name,
    worker_id=0,
    side_channels=[channel],
)

# Print the environment object as a quick sanity check.
print(env)

Edit display settings

In [None]:
# Engine settings sent through the configuration side channel.
display_config = {"time_scale": 3, "width": 300, "height": 300}
channel.set_configuration_parameters(**display_config)

# Reset the environment before querying its behavior specs.
env.reset()

# Notebook output: the behavior names exposed by the environment.
behavior_names = env.behavior_specs.keys()
list(behavior_names)

Get basic environment information

In [None]:
# Advance the simulation one step before reading the behavior specs.
env.step()

# Look the behavior up through the public ``behavior_specs`` mapping (the
# same one used on the next line) instead of the private ``_env_specs``
# attribute, which is an implementation detail of UnityEnvironment.
group_name = list(env.behavior_specs.keys())[0]
group_spec = env.behavior_specs[group_name]

# Shape of the first observation of the behavior.
state_dim = group_spec.observation_shapes[0]

# Action dimensionality: size of the continuous action vector, or the number
# of choices in the first discrete branch.
if group_spec.action_spec.is_continuous():
    action_dim = group_spec.action_spec.continuous_size
else:
    action_dim = group_spec.action_spec.discrete_branches[0]

# Notebook output: display the behavior spec.
group_spec

## Agent import

## DDQN

In [None]:
from DDQN.ddqn import DDQN

# DDQN agent sized from the environment's action/state dimensions.
# with_per=False: presumably disables prioritized experience replay
# (TODO confirm against DDQN.ddqn).
ddqn = DDQN(
    action_dim=action_dim,
    state_dim=state_dim,
    with_per=False,
)

Training

In [None]:
# DDQN training cell: repeatedly acts in the Unity environment with an
# epsilon-greedy policy, stores the transitions, trains the agent and
# periodically saves its weights.
import matplotlib.pyplot as plt

# Filled only by the statistics block after the loop.
results = []

cumul_reward, done  = 0, False
env.reset()
c = 0  # counts loop iterations in which at least one agent was acted for

# Weight file names are derived from the first path component of env_name.
model_name = env_name.split('/')[0] + '_model.h5'
# NOTE(review): target_model_name is assigned but never used in this cell.
target_model_name = env_name.split('/')[0] + '_target_model.h5'
if os.path.isfile(model_name):
    # Resume from previously saved weights when they exist.
    print('load model')
    ddqn.agent.model.load_weights(model_name)

tr = 0  # terminal rewards > 0 since the last report (printed as "reach goal")
tf = 0  # terminal rewards < 0 since the last report (printed as "fail")
# NOTE(review): this loop has no break/return, so it only stops when the
# kernel is interrupted; everything after it is unreachable as written.
while True:
    # Get state information
    decision_steps, terminal_steps = env.get_steps(group_name)
    old_state = decision_steps.obs[0]
    # Length of the first observation array; presumably one row per agent
    # currently requesting a decision (TODO confirm against the ML-Agents API).
    obs_space = len(old_state) # Check obs space
    if obs_space > 0:
        # Actor picks an action (following the policy)
        # Epsilon-greedy: with probability ddqn.epsilon take a random action.
        if random() <= ddqn.epsilon:
#             a = group_spec.action_spec.random_action(obs_space)

            # only for grid world
            # Random integers in [1, 5), one per row of old_state.
            a = np.random.randint(1,5,size = (obs_space,1))
        else:
            a = ddqn.policy_action(old_state)
            # Reshape to a column: one action per row.
            a = np.reshape(a,(a.shape[0],1))
#             print(a)
        env.set_actions(group_name, a)
#         plt.imshow(old_state[0])
#         plt.show()
        env.step()
        # Retrieve new state, reward, and whether the state is terminal
        decision_steps, terminal_steps = env.get_steps(group_name)

        # Boolean terminal flag per row, set at the ids of terminated agents.
        # NOTE(review): indexing by agent_id assumes every id is a valid
        # index into an array of length obs_space - TODO confirm.
        done = np.zeros((obs_space), dtype=bool)
        done[terminal_steps.agent_id] = True

        new_state, r = decision_steps.obs[0], decision_steps.reward
        if terminal_steps.reward.size>0:
            # Overwrite the rewards of terminated agents with their terminal
            # rewards (writes into the decision_steps.reward array in place).
            r[terminal_steps.agent_id] = terminal_steps.reward
#         plt.imshow(new_state[0])
#         plt.show()
#         print('-'*20)
        # Memorize for experience replay
        ddqn.memorize(old_state, a, r, done, new_state)
        # Update current state
        old_state = new_state
        # NOTE(review): r is presumably an array, so cumul_reward becomes a
        # per-row running total rather than a scalar.
        cumul_reward += r
        # Tally positive / negative terminal rewards for the periodic report.
        if len(terminal_steps.reward) > 0:
            for i in terminal_steps.reward:
                if i > 0:
                    tr+= 1
                if i < 0:
                    tf+= 1
        # Train DDQN and transfer weights to target network

        # Only train once the buffer holds more than one batch of samples.
        if(ddqn.buffer.size() > ddqn.batch_size):
#             print('Training')
            ddqn.train_agent(ddqn.batch_size)
            ddqn.agent.transfer_weights()
        c+= 1
        # Every 100 acting iterations: report, reset the tallies, save weights.
        if c % 100 == 0:
            print('reach goal: ', tr)
            print('fail: ', tf)
            tr, tf= 0,0
            print(ddqn.epsilon)
            ddqn.save(model_name)

    else:
        # No rows in the observation array: just advance the simulation.
        env.step()

# NOTE(review): unreachable while the loop above never exits; `e` and
# `summary_writer` have no visible definition in this notebook.
# Gather stats every episode for plotting
if(ddqn.gather_stats):
    mean, stdev = gather_stats(ddqn, env)
    results.append([e, mean, stdev])

# Export results for Tensorboard
score = tfSummary('score', cumul_reward)
summary_writer.add_summary(score, global_step=e)
summary_writer.flush()


## DDPG

In [None]:
from DDPG.ddpg import DDPG

# DDPG agent sized from the environment's action/state dimensions.
ddpg = DDPG(
    act_dim=action_dim,
    env_dim=state_dim,
    act_range=1,
    buffer_size=10000,
    batch_size=300,
    lr=0.0005,
)

# Alternative session-based agent, kept disabled for reference:
# from DDPG.ddpgOld import ActorCritic
# import tensorflow as tf
# import keras.backend as K
# sess = tf.Session()
# K.set_session(sess)
# ddpg = ActorCritic(act_dim = action_dim, env_dim = state_dim, sess= sess)

In [None]:
# Print the summary of the DDPG actor's model (presumably a Keras model).
ddpg.actor.model.summary()

In [None]:
# DDPG training cell: acts in the Unity environment with the actor plus
# exploration noise, stores the transitions and updates actor/critic from a
# sampled batch on every acting step.
results = []
# import matplotlib.pyplot as plt

# Exploration and batch-size settings applied on top of the constructor values.
ddpg.epsilon = 1
ddpg.epsilon_decay = 0.999999
ddpg.batch_size = 300

# NOTE: ``time`` is an integer step index passed to the noise process, not the
# stdlib ``time`` module.
time, cumul_reward, done = 100000, 0, False
# The state is read from the decision steps inside the loop, so the result of
# reset() is not kept.
env.reset()
noise = OrnsteinUhlenbeckProcess(size=ddpg.act_dim)
tr, tf, c = 0, 0, 0  # positive terminals, negative terminals, acting steps

# NOTE(review): this loop has no break/return, so it only stops when the
# kernel is interrupted; everything after it is unreachable as written.
while True:
    decision_steps, terminal_steps = env.get_steps(group_name)
    old_state = decision_steps.obs[0]
    # Length of the first observation array; presumably one row per agent
    # currently requesting a decision (TODO confirm against the ML-Agents API).
    obs_space = len(old_state)
    if obs_space > 0:
        # Actor picks an action (following the deterministic policy).
        a = ddpg.policy_action(old_state)
        a = np.reshape(a, (a.shape[0], a.shape[1]))
        # Add exploration noise, then clip to the valid action range.
        a = np.clip(a + noise.generate(time), -ddpg.act_range, ddpg.act_range)

        # Apply the actions and retrieve the new state, reward and terminals.
        env.set_actions(group_name, a)
        env.step()
        decision_steps, terminal_steps = env.get_steps(group_name)

        # Boolean terminal flag per row, set at the ids of terminated agents.
        # NOTE(review): indexing by agent_id assumes every id is a valid
        # index into an array of length obs_space - TODO confirm.
        done = np.zeros((obs_space), dtype=bool)
        done[terminal_steps.agent_id] = True
        new_state, r = decision_steps.obs[0], decision_steps.reward

        if terminal_steps.reward.size > 0:
            # When any agent terminated, use a flat 0.1 reward for every row
            # and the terminal reward for the terminated agents. The vector
            # is sized from obs_space (like `done`) instead of a hard-coded
            # 12 so the cell works for any number of agents.
            r = np.ones(obs_space) * .1
            r[terminal_steps.agent_id] = terminal_steps.reward

        # Memorize for experience replay.
        ddpg.memorize(old_state, a, r, done, new_state)

        # Sample experience from buffer.
        states, actions, rewards, dones, new_states, _ = ddpg.sample_batch(ddpg.batch_size)
        # Predict target q-values using target networks.
        q_values = ddpg.critic.target_predict([new_states, ddpg.actor.target_predict(new_states)])
        # Compute critic target.
        critic_target = ddpg.bellman(rewards, q_values, dones)
        # Train both networks on sampled batch, update target networks.
        ddpg.update_models(states, actions, critic_target)

        # Update current state.
        old_state = new_state
        cumul_reward += r
        # Tally positive / negative terminal rewards.
        if len(terminal_steps.reward) > 0:
            for i in terminal_steps.reward:
                if i > 0:
                    tr += 1
                if i < 0:
                    tf += 1
        time += 1

        c += 1
#         if c % 100 == 0:
#             print('reach goal: ', tr)
#             print('fail: ', tf)
#             tr, tf= 0,0
#             print(ddpg.epsilon)
#             ddpg.save(model_name)

    else:
        # No rows in the observation array: just advance the simulation.
        env.step()

# NOTE(review): unreachable while the loop above never exits; `args`, `self`,
# `e` and `summary_writer` have no visible definition in this notebook.
# Gather stats every episode for plotting
if(args.gather_stats):
    mean, stdev = gather_stats(self, env)
    results.append([e, mean, stdev])

# Export results for Tensorboard
score = tfSummary('score', cumul_reward)
summary_writer.add_summary(score, global_step=e)
summary_writer.flush()

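Random moves / DDPG policy rollout (the random-action lines are commented out; the cell drives the agents with the DDPG actor)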

In [None]:
import time

# Roll the environment forward for a fixed number of simulation steps,
# driving the agents with the DDPG policy whenever observations are present.
# old_state is reassigned from the decision steps inside the loop.
old_state = env.reset()
for _ in range(10000):
    decision_steps, terminal_steps = env.get_steps(group_name)

    # Random-move alternative (disabled):
    #     a = len(decision_steps.obs[0])
    #     env.set_actions(group_name, group_spec.action_spec.random_action(a))

    # Agent moves: act only when the first observation array is non-empty.
    old_state = decision_steps.obs[0]
    obs_space = len(old_state)
    if obs_space > 0:
        a = ddpg.policy_action(old_state)
        a = np.reshape(a, (a.shape[0], a.shape[1]))
        env.set_actions(group_name, a)

    # Move the simulation forward, whether or not actions were set.
    env.step()

## Neat

In [None]:

# Step the simulation until the decision batch carries at least one reward
# entry (reuses decision_steps from an earlier cell).
while not len(decision_steps.reward):
    env.step()
    decision_steps, terminal_steps = env.get_steps(group_name)

In [None]:
import neat
import numpy as np
import gym
import visualize
# 
# GAME = 'CartPole-v0'
# env = gym.make(GAME).unwrapped

# Path handed to neat.Config in run().
CONFIG = "./config"
# NOTE(review): originally labelled "maximum episode steps", but this
# module-level value is not read by eval_genomes(), which keeps its own
# per-episode step counter.
EP_STEP = 0
# Episodes run per genome; fitness is based on the minimum episode reward.
GENERATION_EP = 10
# True -> run() (training); False -> evaluation().
TRAINING = True
# Number N of the 'neat-checkpoint-N' file restored by run() and evaluation().
CHECKPOINT = 120


def eval_genomes(genomes, config, max_steps=100):
    """Assign a fitness to every genome by rolling it out in the Unity env.

    Each genome is turned into a feed-forward network and run for
    ``GENERATION_EP`` episodes. In every step the network controls the agent
    in the first row of the decision batch; all other rows receive the empty
    action. The controlled agent earns 0.1 per step it survives and its
    terminal reward on the step it terminates, which also ends the episode.

    Args:
        genomes: iterable of ``(genome_id, genome)`` pairs supplied by NEAT.
        config: the ``neat.Config`` used to build the networks.
        max_steps: maximum number of acting steps per episode; also the
            normaliser of the fitness.

    Raises:
        ValueError: if ``max_steps`` is smaller than 1.

    Side effects:
        Sets ``genome.fitness`` to the minimum episode reward divided by
        ``max_steps`` and drives the module-level ``env``.
    """
    if max_steps < 1:
        raise ValueError("max_steps must be at least 1")

    for genome_id, genome in genomes:
        net = neat.nn.FeedForwardNetwork.create(genome, config)
        ep_r = []
        # Run several episodes so one lucky episode does not decide fitness.
        for ep in range(GENERATION_EP):
            accumulative_r = 0.

            # Reset until at least one agent is requesting a decision.
            agent_num = 0
            while agent_num == 0:
                env.reset()
                decision_steps, terminal_steps = env.get_steps(group_name)
                agent_num = len(decision_steps.obs[0])

            for _ in range(max_steps):
                # Skip simulation steps in which no agent asks for an action.
                while len(decision_steps.reward) == 0:
                    env.step()
                    decision_steps, terminal_steps = env.get_steps(group_name)

                # The network drives whichever agent sits in row 0; remember
                # its id so termination is checked for that same agent.
                controlled_id = decision_steps.agent_id[0]
                old_state = decision_steps.obs[0][0]

                # Size the action batch from the current decision batch, not
                # from the agent count seen at reset time.
                action = group_spec.action_spec.empty_action(
                    len(decision_steps.reward))
                action[0] = net.activate(old_state)

                env.set_actions(group_name, action)
                env.step()
                decision_steps, terminal_steps = env.get_steps(group_name)

                # Rows of terminal_steps are not ordered by agent id, so map
                # the controlled agent's id to its row before reading the
                # reward.
                terminal_row = terminal_steps.agent_id_to_index.get(controlled_id)
                if terminal_row is not None:
                    accumulative_r += terminal_steps.reward[terminal_row]
                    break
                accumulative_r += 0.1

            ep_r.append(accumulative_r)

        # Normalise the worst episode by the fixed step budget rather than by
        # the length of whichever episode happened to run last.
        genome.fitness = np.min(ep_r) / float(max_steps)


def run():
    """Train the NEAT population and plot the training statistics.

    Resumes from the ``neat-checkpoint-<CHECKPOINT>`` file when it exists;
    otherwise starts a fresh population from the ``CONFIG`` file instead of
    failing on the missing checkpoint. Reports progress to stdout, writes a
    checkpoint every 10 generations and runs for up to 1000 generations.
    """
    import os

    checkpoint_path = 'neat-checkpoint-%i' % CHECKPOINT
    if os.path.isfile(checkpoint_path):
        # The restored population carries its own config, so no separate
        # (and immediately discarded) Population needs to be built.
        pop = neat.Checkpointer.restore_checkpoint(checkpoint_path)
    else:
        config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                             neat.DefaultSpeciesSet, neat.DefaultStagnation, CONFIG)
        pop = neat.Population(config)

    # Record history and report progress.
    stats = neat.StatisticsReporter()
    pop.add_reporter(stats)
    pop.add_reporter(neat.StdOutReporter(True))
    pop.add_reporter(neat.Checkpointer(10))

    pop.run(eval_genomes, 1000)       # train for up to 1000 generations

    # Visualize training.
    visualize.plot_stats(stats, ylog=False, view=True)
    visualize.plot_species(stats, view=True)


def evaluation():
    """Replay the best genome of the restored checkpoint in the Unity env.

    Restores ``neat-checkpoint-<CHECKPOINT>``, runs one generation to pick
    the winner, draws its network and then replays episodes with it until
    the process is interrupted. As in ``eval_genomes``, the network controls
    the agent in the first row of the decision batch, and an episode ends
    when that agent terminates.
    """
    p = neat.Checkpointer.restore_checkpoint('neat-checkpoint-%i' % CHECKPOINT)
    winner = p.run(eval_genomes, 1)     # find the winner in restored population

    # Show the winner's network.
    node_names = {-1: 'In0', -2: 'In1', -3: 'In3', -4: 'In4', 0: 'act1', 1: 'act2'}
    visualize.draw_net(p.config, winner, True, node_names=node_names)

    net = neat.nn.FeedForwardNetwork.create(winner, p.config)
    while True:
        # Reset until at least one agent is requesting a decision. The
        # observation is read only after this loop, so an empty batch
        # triggers another reset instead of an IndexError.
        agent_num = 0
        while agent_num == 0:
            env.reset()
            decision_steps, terminal_steps = env.get_steps(group_name)
            agent_num = len(decision_steps.obs[0])

        done = False
        while not done:
            # Skip simulation steps in which no agent asks for an action.
            while len(decision_steps.reward) == 0:
                env.step()
                decision_steps, terminal_steps = env.get_steps(group_name)

            # Act on the fresh observation of the agent in row 0.
            controlled_id = decision_steps.agent_id[0]
            action = group_spec.action_spec.empty_action(
                len(decision_steps.reward))
            action[0] = net.activate(decision_steps.obs[0][0])

            env.set_actions(group_name, action)
            env.step()

            # Refresh the steps after every simulation step; otherwise the
            # observation and the terminal check would stay frozen at their
            # reset-time values and the episode could never end.
            decision_steps, terminal_steps = env.get_steps(group_name)
            done = controlled_id in terminal_steps.agent_id_to_index


if __name__ == '__main__':
    # TRAINING chooses between evolving the population and replaying the winner.
    entry_point = run if TRAINING else evaluation
    entry_point()

## Testing Code