In [1]:
import tensorflow as tf

import gym

from tensorforce.agents import Agent
from tensorforce.environments import Environment
from tensorforce.execution import Runner
from tensorforce.core.parameters import Decaying

from mountain_car_wrappers import RewardEnergy

In [None]:
# Train a DQN agent on the Gym CartPole task, then run 100 evaluation episodes.
environment = Environment.create(
    environment='gym', level='CartPole', max_episode_timesteps=500
)

# Policy network: two hidden dense layers of 32 ReLU units each.
network = [
    dict(type='dense', size=32, activation='relu'),
    dict(type='dense', size=32, activation='relu')
]

# Exploration schedule: exponential decay measured in episodes, starting at 1.0.
epsilon = dict(
    type='exponential', unit='episodes', num_steps=500,
    initial_value=1.0, decay_rate=.05, dtype=tf.float32
)

cartpole_agent = Agent.create(
    agent='dqn', environment=environment, memory=10000, batch_size=32, network=network,
    update_frequency=1, start_updating=300, learning_rate=1e-3, exploration=epsilon
)

runner = Runner(
    agent=cartpole_agent, environment=environment, max_episode_timesteps=500
)

try:
    # Training phase; the best-performing agent is saved under ./best_model/.
    runner.run(num_episodes=1000, save_best_agent='./best_model/')
    # Evaluation phase.
    runner.run(num_episodes=100, evaluation=True)
finally:
    # Always release the runner, even when a run raises or the cell is
    # interrupted by hand; previously close() was skipped in that case.
    runner.close()

In [13]:
# Train a DQN agent on MountainCar wrapped in the project's RewardEnergy wrapper,
# then run 100 evaluation episodes. The `with` block closes the Gym environment
# on exit; `mc_agent` is deliberately left open because later cells use it.
with gym.make('MountainCar-v0') as env:
    # NOTE(review): RewardEnergy comes from mountain_car_wrappers; presumably it
    # reshapes the reward signal -- confirm against that module.
    env = RewardEnergy(env)
    environment = Environment.create(
        environment=env, max_episode_timesteps=200
    )

    # Policy network: two hidden dense layers of 32 ReLU units each.
    network = [
        dict(type='dense', size=32, activation='relu'),
        dict(type='dense', size=32, activation='relu')
    ]

    # Exploration schedule: exponential decay measured in episodes, starting at 1.0.
    epsilon = dict(
        type='exponential', unit='episodes', num_steps=300,
        initial_value=1.0, decay_rate=0.05, dtype=tf.float32
    )

    mc_agent = Agent.create(
        agent='dqn', environment=environment, memory=10000, batch_size=128, network=network,
        update_frequency=1, start_updating=1000, learning_rate=1e-3, exploration=epsilon
    )

    runner = Runner(
        agent=mc_agent, environment=environment, max_episode_timesteps=200
    )

    try:
        # Training phase; the best-performing agent is saved under ./best_model_MC/.
        runner.run(num_episodes=300, save_best_agent='./best_model_MC/')
        # Evaluation phase.
        runner.run(num_episodes=100, evaluation=True)
    finally:
        # Training here is long and is often stopped manually (KeyboardInterrupt);
        # make sure the runner is released in that case as well.
        runner.close()

Episodes:  39%|███▉      | 117/300 [01:59, return=5.44, ts/ep=153, sec/ep=0.85, ms/ts=5.5, agent=96.7%] 

KeyboardInterrupt: 

In [None]:
# Manual sanity check of the RewardEnergy wrapper: print the observation and
# reward produced at every step of one scripted episode.
# (Add env.render() inside the loops to watch the rollout.)
with RewardEnergy(gym.make('MountainCar-v0')) as env:
    obs = env.reset()

    # Five steps with action 0 first; their results are ignored.
    for _ in range(5):
        env.step(0)

    # Then repeat action 1 until the environment reports the episode is over.
    done = False
    while True:
        obs, rwd, done, _ = env.step(1)
        print(obs, rwd)
        if done:
            break

In [11]:
from gym.wrappers import Monitor
# Roll out the trained MountainCar agent for one episode on the unwrapped
# environment, recording it with Monitor into ./MC_energy_train_3/, and report
# the total reward collected.
with Monitor(gym.make('MountainCar-v0'), './MC_energy_train_3/', force=True) as env:
    done = False
    obs = env.reset()
    total_reward = 0
    internals = mc_agent.initial_internals()
    while not done:
        env.render()
        # Because `internals` is passed, act() returns an (action, internals)
        # pair. Carry the updated internals into the next step instead of
        # reusing the initial ones, so agents with stateful (e.g. recurrent)
        # networks are evaluated correctly.
        action, internals = mc_agent.act(
            states=obs, internals=internals, independent=True, deterministic=True
        )
        obs, rwd, done, _ = env.step(action)
        total_reward += rwd

    print(f'Total Reward: {total_reward}')



Total Reward: -166.0


In [7]:
# Persist the trained MountainCar agent; the cell displays the value save() returns.
mc_agent.save(directory='./MC_energy_2/')



'./MC_energy_2/agent-1'