In [4]:
from mc_estimation import *

In [5]:
def train_verification(config, dataset, env_string, experiments = 10,
                       training_epochs = 10, test_steps = 1000):
    """Repeatedly train an agent from `dataset` and evaluate it in a Gym env.

    Each experiment:
      1. fits a dynamics model after `dataset.dynamics_init()`,
      2. fits a second model after `dataset.mc_init()`,
      3. builds a `Policy` and trains it through `mcEstimator.train_mc`,
      4. rolls the greedy (argmax) policy out in `gym.make(env_string)` for
         `test_steps` environment steps, recording the total reward of every
         episode that finishes inside that budget.

    Per-experiment mean/std of the episode rewards are printed, and after all
    experiments the mean of the per-experiment means and the mean of the
    per-experiment standard deviations are printed.

    Args:
        config: dict; this function reads `config["action_size"]` and forwards
            the whole dict to `mcEstimator`.
        dataset: object exposing `dynamics_init()`, `mc_init()`, `X`, `X_size`,
            `Y_size`, `num_observations` and `transform_obs_forward()`. The
            same object is reused (not copied) for every experiment.
        env_string: Gym environment id passed to `gym.make`.
        experiments: number of independent train/evaluate repetitions.
        training_epochs: epochs passed to `train_dynamicsModel` for both models.
        test_steps: number of environment steps per evaluation rollout.

    Returns:
        None. All results are reported via `print`.
    """
    total_mean_rew = []
    total_std_rew = []

    # The dataset object is shared across experiments; it is re-initialised
    # through dynamics_init()/mc_init() at the start of each one.
    dynamics_data = dataset

    for _ in range(experiments):
        #Model environment dynamics s' <- (s,a)
        dynamics_data.dynamics_init()
        cartpole_model = DynamicsModel(dynamics_data.X_size, dynamics_data.Y_size)
        train_dynamicsModel(cartpole_model, dynamics_data, epochs=training_epochs)

        #Model Markov Chain dynamics
        dynamics_data.mc_init()
        cartpole_expert_mc = DynamicsModel(dynamics_data.X_size, dynamics_data.Y_size)
        train_dynamicsModel(cartpole_expert_mc, dynamics_data, epochs=training_epochs)

        print("Number of observations trained on: " + str(dynamics_data.num_observations))

        #Initialize Policy (sized from the dataset as it is after mc_init())
        policy = Policy(dynamics_data.X_size, np.array([config["action_size"]]))
        print(dynamics_data.X.shape)
        print(dynamics_data.X_size)

        #Train Agent
        mc = mcEstimator(config=config, mc_model=cartpole_expert_mc, dynamics_model=cartpole_model, policy=policy, dataset=dynamics_data)
        mc.train_mc(mc.dataset.X)

        # Test performance of the agent
        env = gym.make(env_string)
        reward_history = []
        try:
            observation = env.reset()
            reward_total = 0
            steps = 0
            for _step in range(test_steps):
                # Add a batch axis, then apply the dataset's forward transform
                # before querying the policy.
                observation = np.array([observation])
                observation = dynamics_data.transform_obs_forward(observation)
                action_choices = policy(observation)
                action = np.argmax(action_choices)
                steps += 1
                observation, reward, done, info = env.step(action)
                reward_total += reward

                if done:
                    observation = env.reset()
                    print("Reward Total: " + str(reward_total))
                    print("Reset")
                    print("steps: " + str(steps))
                    reward_history.append(reward_total)
                    reward_total = 0
                    steps = 0
        finally:
            # Release the environment even if the rollout raises.
            env.close()

        # Only completed episodes are counted; an episode still running when
        # the step budget ends is discarded. With no completed episode these
        # are NaN (numpy's result for an empty input).
        experiment_mean = np.mean(reward_history)
        experiment_std = np.std(reward_history)
        print(experiment_mean)
        print(experiment_std)
        total_mean_rew.append(experiment_mean)
        total_std_rew.append(experiment_std)

    print("Mean reward over " + str(experiments) + " experiments: " + str(np.mean(total_mean_rew)))
    # Average the per-experiment standard deviations, matching the label.
    print("Mean std over " + str(experiments) + " experiments: " + str(np.mean(total_std_rew)))

In [6]:
import gym

# Environment to evaluate in, and the number of discrete actions the policy
# chooses between (forwarded to Policy through config["action_size"]).
env_string = "CartPole-v1"
action_size = 2
# Alternative setup (also switch the dataset file below):
# env_string = "MountainCar-v0"
# action_size = 3

# Settings forwarded to train_verification, which reads "action_size" itself
# and passes the whole dict on to mcEstimator. The remaining keys are
# presumably consumed there — verify against mc_estimation.
config = {  
        "loss": tf.keras.losses.KLD, 
        "optimizer": tf.keras.optimizers.Adam(learning_rate=0.00001),
        "validation_split": 0.10,
        # Also used below as the max_obs argument when loading the dataset.
        "max_obs": int(1e5),
        "epochs": 16,
        "batch_size": 64,
        # NOTE(review): the run output below prints a 4-feature X for CartPole,
        # so confirm that state_size=2 is intended (or is unused).
        "state_size": 2, 
        "action_size": action_size
            }

#Import dataset
dynamics_data = dc.dataset("cartpole10000.npz", max_obs=config["max_obs"])
# dynamics_data = dc.dataset("MountainCarDiscrete1000.npz", max_obs=config["max_obs"])

# Runs the default number of experiments (10); results are printed, not returned.
train_verification(config=config, dataset=dynamics_data, env_string=env_string)

==] - 1s 1ms/step - loss: 0.0086 - mae: 0.0055 - acc: 0.9970 - val_loss: 0.0055 - val_mae: 0.0053 - val_acc: 0.9970
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[     0. 100000.]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Number of observations trained on: 100000
(99806, 4)
(4,)
Epoch 0 Training Loss: 0.05882313
Epoch 1 Training Loss: 0.045353424
Epoch 2 Training Loss: 0.038969073
Epoch 3 Training Loss: 0.034621984
Epoch 4 Training Loss: 0.031451706
Epoch 5 Training Loss: 0.02900707
Epoch 6 Training Loss: 0.027027912
Epoch 7 Training Loss: 0.02537023
Epoch 8 Training Loss: 0.023949387
Epoch 9 Training Loss: 0.022710444
Epoch 10 Training Loss: 0.021620765
Epoch 11 Training Loss: 0.020654807
Epoch 12 Training Loss: 0.019790523
Epoch 13 Training Loss: 0.019012038
Epoch 14 Training Loss: 0.018306417
Epoch 15 Training Loss: 0.017664153
Reward Total: 500.0
Reset
steps: 500
Reward Total: 500.0
Reset
step